<a href="https://colab.research.google.com/github/mbewustanley/yt_insights/blob/main/7_experiment_6_lightgbm_detailed_hpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
# Install the notebook's third-party dependencies.
# `%pip` (instead of `!pip`) installs into the environment of the running kernel.
# xgboost is listed because the import cell imports XGBClassifier from it.
%pip install -q mlflow boto3 awscli optuna imbalanced-learn lightgbm xgboost



In [None]:
# Interactive AWS CLI credential setup. Presumably required so that MLflow can
# write artifacts to the S3-backed artifact store -- confirm.
# Clear this cell's output before sharing the notebook so no credentials are saved in it.
!aws configure


In [21]:
# mlflow, boto3, awscli, optuna, xgboost, imbalanced-learn

import optuna # a hyperparameter optimization framework for ml
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

In [22]:
# Point MLflow at the remote tracking server, then select the experiment that
# the runs created in this notebook are recorded under.
tracking_uri = "http://ec2-13-244-203-74.af-south-1.compute.amazonaws.com:5000"
experiment_name = 'LightGBM HP Tuning'

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='s3://stanley-mlflow-bucket-27/248940192345400758', creation_time=1764445797771, experiment_id='248940192345400758', last_update_time=1764445797771, lifecycle_stage='active', name='LightGBM HP Tuning', tags={}>

In [23]:
# Read the preprocessed CSV, then discard every row that contains a missing value.
raw_df = pd.read_csv('/content/reddit_preprocessing.csv')
df = raw_df.dropna()
df.shape

(36662, 2)

In [24]:
# Remap the class labels from [-1, 0, 1] to [2, 0, 1] so that no label is negative
# (the original note cites XGBoost not accepting negative values).
# Labels missing from the mapping become NaN.
label_remap = {-1:2, 0:0, 1:1}
df['category'] = df['category'].map(label_remap)

In [25]:
# Keep only the rows whose target label (category) is present.
has_label = df['category'].notna()
df = df[has_label]

In [26]:
# TF-IDF vectorizer settings
max_features = 1000   # upper bound on the number of TF-IDF features
ngram_range = (1, 3)  # n-gram sizes from 1 (unigrams) up to 3 (trigrams)

In [27]:
# Target vector: the (remapped) category labels.
y = df['category']

# Feature matrix: TF-IDF representation of the cleaned comments, built with the
# n-gram range and feature cap configured earlier.
vectorizer = TfidfVectorizer(
    max_features=max_features,
    ngram_range=ngram_range,
)
X = vectorizer.fit_transform(df['clean_comment'])

In [28]:
# Oversample with SMOTE (fixed seed) to obtain the resampled features and labels.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X, y)
X_resampled, y_resampled = resampled

In [29]:
# Train-test split on the ORIGINAL data, then SMOTE on the training fold only.
# Splitting data that has already been resampled lets synthetic points interpolated
# from held-out rows leak into training and puts synthetic rows into the test set,
# which can inflate the reported metrics. The hold-out set therefore keeps the real
# class distribution; `stratify` preserves that distribution in both folds.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)


In [32]:
# function to log results in MLFLOW
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test, params, trial):
    """Fit ``model``, evaluate it on the test split and record it all in one MLflow run.

    Args:
        model_name: Label used for the run name, the ``algo_name`` parameter and
            the name the fitted model is logged under.
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Data the estimator is fitted on.
        X_test, y_test: Data the estimator is evaluated on.
        params: Mapping of hyperparameter name -> value, logged as run parameters.
        trial: Either an object with a ``number`` attribute (such as an Optuna
            trial) or a plain identifier (an int, or a label like ``"best"``).

    Returns:
        The accuracy score of the fitted model on the test split.
    """
    # Callers hand over either a trial object or a bare identifier. Reading
    # ``trial.number`` unconditionally raised AttributeError for the latter.
    trial_number = getattr(trial, "number", trial)

    with mlflow.start_run():
        # log model type
        mlflow.set_tag('mlflow.runName', f"{model_name}_SMOTE_TFIDF_Trigrams")
        mlflow.set_tag('experiment_type', "algorithm_comparison")

        # log algorithm name, hyperparameters and trial identifier
        mlflow.log_param("algo_name", model_name)
        for param, value in params.items():
            mlflow.log_param(param, value)
        mlflow.log_param("trial_number", trial_number)

        # train model and predict on the held-out data
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # log classification report; only the dict-valued entries are flattened
        # into "<label>_<metric>" metrics, scalar entries are skipped
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric_name, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric_name}", value)

        # log the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")

        return accuracy

In [30]:
# optuna objective function for LightGBM
def objective_lightgbm(trial):
    """Optuna objective: sample LightGBM hyperparameters, train, and return the accuracy.

    Every call is logged as a separate MLflow run through ``log_mlflow``.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The accuracy returned by ``log_mlflow`` for the sampled configuration.
    """
    # Hyperparameter space to explore (sampled in the order listed)
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        # NOTE(review): per the LightGBM docs, bagging (`subsample`) only takes effect
        # when `subsample_freq` > 0. It is left at its default here, so confirm
        # whether this dimension actually influences the model.
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        # The upper bound is 10.0. The previous `10,0` passed an extra positional
        # argument, which `suggest_float` rejects (`step`/`log` are keyword-only).
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True),    # L1 regularization
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),  # L2 regularization
    }

    # create LightGBM model from the sampled configuration
    model = LGBMClassifier(**params, random_state=42)

    # Log each trial as a separate run in mlflow. The trial itself is passed,
    # because `log_mlflow` reads `.number` from it.
    accuracy = log_mlflow("LightGBM", model, X_train, X_test, y_train, y_test, params, trial)

    return accuracy

In [33]:
# run optuna for LightGBM, re-log the best configuration, and plot the importance of each parameter
def run_optuna_experiment(n_trials=100, seed=42):
    """Tune LightGBM with Optuna, log the best configuration to MLflow and plot diagnostics.

    Args:
        n_trials: Number of Optuna trials to run (default 100, the previous fixed value).
        seed: Seed for the TPE sampler so that the sequence of suggested
            hyperparameters can be reproduced; pass ``None`` for an unseeded sampler.

    Returns:
        None. Results are recorded in MLflow and shown as Plotly figures.
    """
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective_lightgbm, n_trials=n_trials)

    # Rebuild the best configuration. This must be a LightGBM model: the tuned
    # parameters (num_leaves, min_child_samples, ...) are LightGBM's, and the run
    # is logged under the "LightGBM" name.
    best_params = study.best_params
    best_model = LGBMClassifier(**best_params, random_state=42)

    # Log the best model with mlflow. `log_mlflow` reads `.number` from its last
    # argument, so the best trial is passed and the run records that trial's number.
    log_mlflow("LightGBM", best_model, X_train, X_test, y_train, y_test, best_params, study.best_trial)

    # Plot parameter importance
    optuna.visualization.plot_param_importances(study).show()

    # Plot optimization history
    optuna.visualization.plot_optimization_history(study).show()

In [None]:
# Launch the tuning study: runs the Optuna trials, logs to MLflow and shows the two plots.
run_optuna_experiment()