In [121]:
import numpy as np
import pandas as pd
import mlflow

In [122]:
mlflow.set_tracking_uri('http://ec2-51-20-64-139.eu-north-1.compute.amazonaws.com:5000')
mlflow.set_experiment('exp 5 - ML Algos with HP Tuning')

<Experiment: artifact_location='s3://neeraj-first-bucket/615931187569087796', creation_time=1737545423333, experiment_id='615931187569087796', last_update_time=1737545423333, lifecycle_stage='active', name='exp 5 - ML Algos with HP Tuning', tags={}>

In [123]:
!pip install lightgbm




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [124]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import ADASYN

In [125]:
df = pd.read_csv('dataset.csv')
df.dropna(inplace=True)
df['category'] = df['category'].map({-1:2, 0:0, 1:1})
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,2
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [126]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_comment'], df['category'], test_size=0.2, random_state=42)

# vectorizer
ngram_range = (1,2)
max_features = 2000
vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
X_train_trf = vectorizer.fit_transform(X_train)
X_test_trf = vectorizer.transform(X_test)

# oversampling
adasyn = ADASYN(random_state=42)
X_train_trf, y_train = adasyn.fit_resample(X_train_trf, y_train)

def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    if mlflow.active_run():
        mlflow.end_run()
        
    with mlflow.start_run(nested=True):
        mlflow.set_tag("mlflow.runName", f"{model_name}_Adasyn_Tfidf_Bigram")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        mlflow.log_param("algorithm", model_name)

        # model
        model.fit(X_train_trf, y_train)
        y_pred = model.predict(X_test_trf)

        # metrics
        accuracy = accuracy_score(y_test, y_pred)
        class_report = classification_report(y_test, y_pred, output_dict=True)

        mlflow.log_metric("accuracy", accuracy)
        for label, metrics in class_report.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        mlflow.sklearn.log_model(model, f"{model_name}_model")

In [127]:
# optuna objective function
def objective_lightgbm(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    lr = trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True)
    max_depth = trial.suggest_int("max_depth", 3, 10)

    model = LGBMClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=lr, random_state=42)
    return accuracy_score(y_test, model.fit(X_train_trf, y_train).predict(X_test_trf))

In [128]:
def run_optuna_experiment():
    study = optuna.create_study(direction='maximize')
    study.optimize(objective_lightgbm, n_trials=30)

    best_params = study.best_params
    best_model = LGBMClassifier(n_estimators=best_params['n_estimators'], learning_rate=best_params['learning_rate'], max_depth=best_params['max_depth'], random_state=42)

    log_mlflow("lightGBM", best_model, X_train, X_test, y_train, y_test)

In [129]:
run_optuna_experiment()

[I 2025-01-23 15:56:54,845] A new study created in memory with name: no-name-3e292d59-3b5c-4bfe-b8dc-c2d7b6fc1124
[I 2025-01-23 15:57:07,563] Trial 0 finished with value: 0.7768989499522706 and parameters: {'n_estimators': 268, 'learning_rate': 0.030653176040354373, 'max_depth': 9}. Best is trial 0 with value: 0.7768989499522706.
[I 2025-01-23 15:57:10,147] Trial 1 finished with value: 0.46106641210964133 and parameters: {'n_estimators': 73, 'learning_rate': 0.0006855155210012858, 'max_depth': 4}. Best is trial 0 with value: 0.7768989499522706.
[I 2025-01-23 15:57:15,553] Trial 2 finished with value: 0.6166643938360835 and parameters: {'n_estimators': 87, 'learning_rate': 0.0022270656108022155, 'max_depth': 9}. Best is trial 0 with value: 0.7768989499522706.
[I 2025-01-23 15:57:32,413] Trial 3 finished with value: 0.6293467884903859 and parameters: {'n_estimators': 258, 'learning_rate': 0.002088879491669765, 'max_depth': 8}. Best is trial 0 with value: 0.7768989499522706.
[I 2025-01-23

🏃 View run LightGBM_Adasyn_Tfidf_Bigram at: http://ec2-51-20-64-139.eu-north-1.compute.amazonaws.com:5000/#/experiments/870835495118995563/runs/008b41d9995b44389e5690cb3627a022
🧪 View experiment at: http://ec2-51-20-64-139.eu-north-1.compute.amazonaws.com:5000/#/experiments/870835495118995563


RestException: INVALID_PARAMETER_VALUE: The run 008b41d9995b44389e5690cb3627a022 must be in 'active' lifecycle_stage.