In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import mlflow
import optuna
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

In [3]:

#Set up the MLflow tracking server
mlflow.set_tracking_uri('http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/')

In [4]:
# Set or create an experiment
mlflow.set_experiment("Exp5: ML Algos with Hyperparameter tuning")

<Experiment: artifact_location='s3://youtubecommentmlflow/149143573475036011', creation_time=1741969037519, experiment_id='149143573475036011', last_update_time=1741969037519, lifecycle_stage='active', name='Exp5: ML Algos with Hyperparameter tuning', tags={}>

In [None]:
df = pd.read_csv('C:/test/youtube_comment_sentiment_analysis/Artifacts/youtubecomment_preprocessing.csv')
df.shape

(15833, 2)

In [6]:
# Step 1: Remap the class labels from [-1, 0, 1] to [2, 0, 1]
df['Sentiment'] = df['Sentiment'].map({-1: 2, 0: 0, 1: 1})



ngram_range = (1, 3)  # Trigram setting
max_features = 8000  # Set max_features to 8000 for TF-IDF

# Step 4: Train-test split before vectorization and resampling
X_train, X_test, y_train, y_test = train_test_split(df['Comment'], df['Sentiment'], test_size=0.2, random_state=42, stratify=df['Sentiment'])

# Step 2: Vectorization using TF-IDF, fit on training data only
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X_train = vectorizer.fit_transform(X_train)  # Fit on training data
X_test = vectorizer.transform(X_test)  # Transform test data

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
# Function to log results in MLflow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test, params, trial_number):
    with mlflow.start_run():
        # Log model type and trial number
        mlflow.set_tag("mlflow.runName", f"Trial_{trial_number}_{model_name}_SMOTE_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)

        # Log hyperparameters
        for key, value in params.items():
            mlflow.log_param(key, value)

        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")

        return accuracy

# Step 6: Optuna objective function for LightGBM
def objective_lightgbm(trial):
    # Hyperparameter space to explore
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    num_leaves = trial.suggest_int('num_leaves', 20, 150)
    min_child_samples = trial.suggest_int('min_child_samples', 10, 100)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    reg_alpha = trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True)  # L1 regularization
    reg_lambda = trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True)  # L2 regularization

    # Log trial parameters
    params = {
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'num_leaves': num_leaves,
        'min_child_samples': min_child_samples,
        'colsample_bytree': colsample_bytree,
        'subsample': subsample,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda
    }

    # Create LightGBM model
    model = LGBMClassifier(n_estimators=n_estimators,
                           learning_rate=learning_rate,
                           max_depth=max_depth,
                           num_leaves=num_leaves,
                           min_child_samples=min_child_samples,
                           colsample_bytree=colsample_bytree,
                           subsample=subsample,
                           reg_alpha=reg_alpha,
                           reg_lambda=reg_lambda,
                           random_state=42)

    # Log each trial as a separate run in MLflow
    accuracy = log_mlflow("LightGBM", model, X_train, X_test, y_train, y_test, params, trial.number)

    return accuracy

# Step 7: Run Optuna for LightGBM, log the best model, and plot the importance of each parameter
def run_optuna_experiment():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_lightgbm, n_trials=20)  # Increased to 20 trials

    # Get the best parameters
    best_params = study.best_params
    best_model = LGBMClassifier(n_estimators=best_params['n_estimators'],
                                learning_rate=best_params['learning_rate'],
                                max_depth=best_params['max_depth'],
                                num_leaves=best_params['num_leaves'],
                                min_child_samples=best_params['min_child_samples'],
                                colsample_bytree=best_params['colsample_bytree'],
                                subsample=best_params['subsample'],
                                reg_alpha=best_params['reg_alpha'],
                                reg_lambda=best_params['reg_lambda'],
                                random_state=42)

    # Log the best model with MLflow and print the classification report
    log_mlflow("LightGBM", best_model, X_train, X_test, y_train, y_test, best_params, "Best")

    # Plot parameter importance
    optuna.visualization.plot_param_importances(study).show()

    # Plot optimization history
    optuna.visualization.plot_optimization_history(study).show()

In [10]:
# Run the experiment for LightGBM
run_optuna_experiment()

[I 2025-03-15 13:38:55,256] A new study created in memory with name: no-name-b98fa491-fc2d-47af-9383-a10d5416a491


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.137590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 92392
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 1154
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_0_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/1863b811b68442fb82e13c9a1f332541
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 13:40:23,379] Trial 0 finished with value: 0.6665614145879382 and parameters: {'n_estimators': 555, 'learning_rate': 0.012359179482890132, 'max_depth': 7, 'num_leaves': 132, 'min_child_samples': 82, 'colsample_bytree': 0.9713643465787258, 'subsample': 0.800171693937636, 'reg_alpha': 0.0003252580869693209, 'reg_lambda': 0.002317349590025096}. Best is trial 0 with value: 0.6665614145879382.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.171511 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 111174
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 2027
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_1_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/22ee3fc9e030484390a8f6ad5b5f8a94
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 13:41:10,665] Trial 1 finished with value: 0.628986422481844 and parameters: {'n_estimators': 201, 'learning_rate': 0.046292076259457915, 'max_depth': 3, 'num_leaves': 20, 'min_child_samples': 46, 'colsample_bytree': 0.6921534179179325, 'subsample': 0.9655835513345894, 'reg_alpha': 6.39032078204093, 'reg_lambda': 0.00016840602118397383}. Best is trial 0 with value: 0.6665614145879382.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.678163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 107769
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 1831
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_2_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/fa410296bea94501b6279419d1ddd258
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 13:43:02,690] Trial 2 finished with value: 0.6949794758446479 and parameters: {'n_estimators': 727, 'learning_rate': 0.015761629205702468, 'max_depth': 7, 'num_leaves': 50, 'min_child_samples': 51, 'colsample_bytree': 0.9184684681378529, 'subsample': 0.8926610683291074, 'reg_alpha': 0.002844912365242915, 'reg_lambda': 0.02007349079655746}. Best is trial 2 with value: 0.6949794758446479.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.157936 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100952
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 1493
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_3_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/8f16a337d8ed49d5aabbe4fed6d1c239
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 13:44:31,823] Trial 3 finished with value: 0.5652036627723398 and parameters: {'n_estimators': 521, 'learning_rate': 0.0029392006614293964, 'max_depth': 4, 'num_leaves': 28, 'min_child_samples': 64, 'colsample_bytree': 0.7690594220872535, 'subsample': 0.9265400809791371, 'reg_alpha': 0.003479357495108568, 'reg_lambda': 2.998497111285186}. Best is trial 2 with value: 0.6949794758446479.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.272834 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 118808
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 2541
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_4_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/29b8ea3c8bad42b698f8bdd7b74c1e41
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 13:47:48,793] Trial 4 finished with value: 0.7265550994632144 and parameters: {'n_estimators': 940, 'learning_rate': 0.021843557585941614, 'max_depth': 10, 'num_leaves': 104, 'min_child_samples': 37, 'colsample_bytree': 0.7544287135238494, 'subsample': 0.9072286373332532, 'reg_alpha': 0.00017754043223264875, 'reg_lambda': 1.844130466453349}. Best is trial 4 with value: 0.7265550994632144.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.083927 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 90133
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 1077
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_5_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/8d6337d846474062b101555a8a221c43
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 13:49:09,251] Trial 5 finished with value: 0.624250078939059 and parameters: {'n_estimators': 431, 'learning_rate': 0.0024609672263978147, 'max_depth': 12, 'num_leaves': 115, 'min_child_samples': 88, 'colsample_bytree': 0.6405501461625547, 'subsample': 0.5173084251301652, 'reg_alpha': 1.2833268541525908, 'reg_lambda': 0.004617209815970665}. Best is trial 4 with value: 0.7265550994632144.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.145099 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 95110
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 1253
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_6_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/86b3e0a0e13a4619ba45ac4e808d5d9c
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 13:50:20,041] Trial 6 finished with value: 0.6908746447742343 and parameters: {'n_estimators': 324, 'learning_rate': 0.023676709187068934, 'max_depth': 13, 'num_leaves': 45, 'min_child_samples': 76, 'colsample_bytree': 0.9734454213160456, 'subsample': 0.9216631354480677, 'reg_alpha': 0.0006450160817062332, 'reg_lambda': 0.0009392153057913932}. Best is trial 4 with value: 0.7265550994632144.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.113907 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 95630
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 1273
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_7_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/bda684e58daa4b40ad2c9ce927a10df2
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 13:51:51,297] Trial 7 finished with value: 0.6125670982001895 and parameters: {'n_estimators': 408, 'learning_rate': 0.00011669849707621382, 'max_depth': 9, 'num_leaves': 148, 'min_child_samples': 75, 'colsample_bytree': 0.5103640109927201, 'subsample': 0.6207914547887421, 'reg_alpha': 0.0011623030880014383, 'reg_lambda': 0.3473456644838845}. Best is trial 4 with value: 0.7265550994632144.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.159895 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97806
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 1359
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_8_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/6f3181c464324097b059fb71694552d8
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 13:53:05,956] Trial 8 finished with value: 0.6362488159141143 and parameters: {'n_estimators': 228, 'learning_rate': 0.00857829353153801, 'max_depth': 13, 'num_leaves': 100, 'min_child_samples': 70, 'colsample_bytree': 0.8060044151208623, 'subsample': 0.8186854368902509, 'reg_alpha': 0.18699093959813484, 'reg_lambda': 0.004487322015857652}. Best is trial 4 with value: 0.7265550994632144.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.602721 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 146571
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 5710
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_9_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/a6834f30674b4bf38c7c8f1e0491a644
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 13:54:54,191] Trial 9 finished with value: 0.7016103568045469 and parameters: {'n_estimators': 255, 'learning_rate': 0.02987325892887392, 'max_depth': 11, 'num_leaves': 133, 'min_child_samples': 14, 'colsample_bytree': 0.9154071817986029, 'subsample': 0.8836860865494647, 'reg_alpha': 0.00015809944297863498, 'reg_lambda': 0.3009039882492276}. Best is trial 4 with value: 0.7265550994632144.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.321648 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127314
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 3274
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_10_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/6c72d5302b3a48b9ad23a408d49753bd
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 13:59:38,146] Trial 10 finished with value: 0.6365645721502999 and parameters: {'n_estimators': 987, 'learning_rate': 0.0005669834985104985, 'max_depth': 15, 'num_leaves': 77, 'min_child_samples': 28, 'colsample_bytree': 0.5739366973824812, 'subsample': 0.6934495399396983, 'reg_alpha': 0.0208996368588101, 'reg_lambda': 9.662974104632351}. Best is trial 4 with value: 0.7265550994632144.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.537937 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 143931
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 5270
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_11_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/a2c5363abe714570a8285f75bbed4b04
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 14:02:51,717] Trial 11 finished with value: 0.747710767287654 and parameters: {'n_estimators': 937, 'learning_rate': 0.07612882793365607, 'max_depth': 11, 'num_leaves': 88, 'min_child_samples': 16, 'colsample_bytree': 0.8363727100870936, 'subsample': 0.9970718747519761, 'reg_alpha': 0.00011767888244644364, 'reg_lambda': 0.6851016216752097}. Best is trial 11 with value: 0.747710767287654.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.615269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 122869
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 2868
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_12_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/94b8223f627b4b29966aedae1b5fb08a
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 14:06:24,488] Trial 12 finished with value: 0.7379223239658983 and parameters: {'n_estimators': 949, 'learning_rate': 0.09676360136819538, 'max_depth': 10, 'num_leaves': 80, 'min_child_samples': 32, 'colsample_bytree': 0.8415133868869746, 'subsample': 0.9806639861733915, 'reg_alpha': 0.00016353186832797977, 'reg_lambda': 0.6734967783929933}. Best is trial 11 with value: 0.747710767287654.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.868088 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 150286
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 6406
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_13_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/8f84cb0d9c3a4fde983fd830f1a86c6a
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 14:09:13,106] Trial 13 finished with value: 0.7445532049257972 and parameters: {'n_estimators': 751, 'learning_rate': 0.08610510372110691, 'max_depth': 9, 'num_leaves': 73, 'min_child_samples': 11, 'colsample_bytree': 0.8486374165094263, 'subsample': 0.9972794971492881, 'reg_alpha': 0.01644250649302146, 'reg_lambda': 0.2431787116012763}. Best is trial 11 with value: 0.747710767287654.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.633098 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149143
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 6177
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_14_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/343b2d9e5dc74961852caca52c2a0ec4
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 14:11:28,362] Trial 14 finished with value: 0.742027155036312 and parameters: {'n_estimators': 752, 'learning_rate': 0.08489484123814015, 'max_depth': 8, 'num_leaves': 61, 'min_child_samples': 12, 'colsample_bytree': 0.8532104077641773, 'subsample': 0.9995798890995392, 'reg_alpha': 0.035512000771427334, 'reg_lambda': 0.07046256813945706}. Best is trial 11 with value: 0.747710767287654.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.438916 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 138261
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 4485
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_15_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/7b526146299f45d2a51d06a3db427a63
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 14:13:56,334] Trial 15 finished with value: 0.6406694032207136 and parameters: {'n_estimators': 792, 'learning_rate': 0.005697620586232921, 'max_depth': 6, 'num_leaves': 72, 'min_child_samples': 20, 'colsample_bytree': 0.8833046582373739, 'subsample': 0.8174820489989822, 'reg_alpha': 0.01647798437728716, 'reg_lambda': 0.040834159859743126}. Best is trial 11 with value: 0.747710767287654.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.382002 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 132758
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 3835
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_16_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/f51cd007a9094f57b08d47fcde7e9bc5
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 14:18:42,596] Trial 16 finished with value: 0.632775497316072 and parameters: {'n_estimators': 849, 'learning_rate': 0.0011721404314702514, 'max_depth': 15, 'num_leaves': 96, 'min_child_samples': 24, 'colsample_bytree': 0.6895844229937756, 'subsample': 0.7238174535256463, 'reg_alpha': 0.22789295756566388, 'reg_lambda': 0.1263170949578322}. Best is trial 11 with value: 0.747710767287654.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.136064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 85515
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 935
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_17_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/b8656475f28743dbb0bc44bef3274af5
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 14:20:15,187] Trial 17 finished with value: 0.7050836754025892 and parameters: {'n_estimators': 619, 'learning_rate': 0.05539556132869762, 'max_depth': 5, 'num_leaves': 64, 'min_child_samples': 100, 'colsample_bytree': 0.8080376490860103, 'subsample': 0.6451610711731846, 'reg_alpha': 0.005803420009211514, 'reg_lambda': 1.0635228910235606}. Best is trial 11 with value: 0.747710767287654.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.290289 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115254
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 2290
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_18_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/b6ae207f8ffe40b684b7e19d5d3e130f
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 14:24:08,004] Trial 18 finished with value: 0.5952005052099779 and parameters: {'n_estimators': 669, 'learning_rate': 0.00029099770458217817, 'max_depth': 9, 'num_leaves': 92, 'min_child_samples': 41, 'colsample_bytree': 0.6798234374453471, 'subsample': 0.8539344813985227, 'reg_alpha': 0.07711067979021405, 'reg_lambda': 0.015866381941562244}. Best is trial 11 with value: 0.747710767287654.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.814193 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 151460
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 6643
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_19_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/1d7025a67b4b473790784eda65a24d60
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011


[I 2025-03-15 14:29:58,160] Trial 19 finished with value: 0.7328702241869277 and parameters: {'n_estimators': 876, 'learning_rate': 0.04419480157143808, 'max_depth': 12, 'num_leaves': 114, 'min_child_samples': 10, 'colsample_bytree': 0.9214918410190789, 'subsample': 0.7697974347550122, 'reg_alpha': 1.592908434831827, 'reg_lambda': 6.539486305110264}. Best is trial 11 with value: 0.747710767287654.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.004816 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 143931
[LightGBM] [Info] Number of data points in the train set: 24759, number of used features: 5270
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run Trial_Best_LightGBM_SMOTE_TFIDF_Trigrams at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011/runs/e37f4e0e35e24668a310dc91e7f164ac
🧪 View experiment at: http://ec2-54-173-92-7.compute-1.amazonaws.com:5000/#/experiments/149143573475036011
