In [1]:
import numpy as np
import pandas as pd
import mlflow

In [2]:
# MLflow destination for this notebook: the tracking server URI and the
# experiment name passed to mlflow below.
TRACKING_URI = 'http://ec2-51-20-64-139.eu-north-1.compute.amazonaws.com:5000'
EXPERIMENT_NAME = 'exp 6 - lightGBM with class imbalance'

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

KeyboardInterrupt: 

In [1]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Read the dataset and discard every row that contains a missing value,
# then preview the first rows.
df = pd.read_csv('dataset.csv').dropna()
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [None]:
# Hold out 20% of the rows for evaluation.
# train_test_split returns the pieces grouped per input array, i.e.
# (X_train, X_test, y_train, y_test); unpacking them in any other order
# silently swaps features and labels.
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'], df['category'], test_size=0.2, random_state=42
)

# vectorizer
max_features = 2000
ngram_range = (1, 2)
vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
# Fit the vocabulary on the training text only, then reuse it for the test text.
X_train_trf = vectorizer.fit_transform(X_train)
X_test_trf = vectorizer.transform(X_test)

In [5]:
def log_mlflow(model_name, model, X_train, y_train, X_test, y_test):
    """Fit ``model``, evaluate it on the test data and record everything in MLflow.

    Logging goes through the module-level ``mlflow`` API; this function does
    not open or close a run itself.

    Args:
        model_name: Prefix for the logged model artifact (``"<model_name>_model"``).
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training features and labels passed to ``model.fit``.
        X_test, y_test: Evaluation features and labels.

    Returns:
        None. The tags, metrics and the fitted model are logged as side effects.
    """
    run_tags = (
        ("mlflow.runName", "lightGBM_class_weight"),
        ("experiment_type", "lightGBM_class_weight_exp"),
        ("algorithm", "lightGBM"),
    )
    for tag_key, tag_value in run_tags:
        mlflow.set_tag(tag_key, tag_value)

    # Train, then predict on the held-out data.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Compute every metric before logging any of them.
    overall_accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, output_dict=True)

    mlflow.log_metric("accuracy", overall_accuracy)

    # Only report entries that are dicts carry per-label scores; any other
    # entry is skipped.
    per_label_scores = (
        (label, scores) for label, scores in report.items() if isinstance(scores, dict)
    )
    for label, scores in per_label_scores:
        for metric_name, metric_value in scores.items():
            mlflow.log_metric(f"{label}_{metric_name}", metric_value)

    mlflow.sklearn.log_model(model, f"{model_name}_model")

In [6]:
def objective_lightGBM(trial):
    """Optuna objective: mean 3-fold CV accuracy of a LightGBM classifier.

    Reads the notebook-level ``X_train_trf`` and ``y_train``.

    Args:
        trial: Optuna trial used to sample ``n_estimators``, ``max_depth``
            and ``learning_rate``.

    Returns:
        The mean of the accuracy scores returned by ``cross_val_score``.
    """
    # Hyperparameters are sampled in a fixed order: n_estimators, max_depth,
    # learning_rate.
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 3, 20)
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 1e-1)

    classifier = LGBMClassifier(
        objective="multiclass",
        num_class=3,
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        metric="multi_logloss",
        is_unbalance=True,
        class_weight="balanced",
    )

    fold_accuracies = cross_val_score(
        classifier, X_train_trf, y_train, cv=3, scoring="accuracy"
    )
    return fold_accuracies.mean()

In [None]:
def run_experiment_with_optuna():
    """Tune LightGBM with Optuna, then train and log the best model in one MLflow run.

    Runs 50 Optuna trials of ``objective_lightGBM`` (maximising its return
    value), rebuilds a classifier from the best hyperparameters and hands it
    to ``log_mlflow`` together with the notebook-level train/test matrices.

    All MLflow logging happens inside an explicit ``mlflow.start_run()``
    context, so each call gets its own run and the run is closed when the
    function returns or raises. Without the context, the run started
    implicitly by the first logging call stays active, and calling this
    function again in the same kernel would log into that run and fail in
    ``log_params`` whenever a parameter value differs.

    Returns:
        None. Parameters, tags, metrics and the model are logged to MLflow.
    """
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_lightGBM, n_trials=50)

    best_params = study.best_params

    # Open the run only after tuning so it covers just the final fit and logging.
    with mlflow.start_run():
        mlflow.log_params(best_params)
        best_model = LGBMClassifier(
            objective='multiclass',
            num_class=3,
            metric="multi_logloss",
            is_unbalance=True,
            class_weight="balanced",
            # NOTE: reg_alpha / reg_lambda are fixed here and are not part of
            # best_params, so they do not appear in the logged parameters.
            reg_alpha=0.1,  # L1 regularization
            reg_lambda=0.1,  # L2 regularization
            learning_rate=best_params['learning_rate'],
            max_depth=best_params['max_depth'],
            n_estimators=best_params['n_estimators'],
        )

        log_mlflow("lightGBM", best_model, X_train_trf, y_train, X_test_trf, y_test)