In [1]:
!pip install mlflow boto3 awscli optuna imbalanced-learn lightgbm

Collecting mlflow
  Downloading mlflow-3.3.1-py3-none-any.whl.metadata (30 kB)
Collecting boto3
  Downloading boto3-1.40.15-py3-none-any.whl.metadata (6.7 kB)
Collecting awscli
  Downloading awscli-1.42.15-py3-none-any.whl.metadata (11 kB)
Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting mlflow-skinny==3.3.1 (from mlflow)
  Downloading mlflow_skinny-3.3.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.3.1 (from mlflow)
  Downloading mlflow_tracing-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.2

In [None]:
import mlflow
#Setup MLflow tracking server
mlflow.set_tracking_uri("http://18.117.193.162:5000/")

#set experiment
mlflow.set_experiment("Exp 5- Machine Learning Algorithms with HP tuning")

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
import mlflow
import mlflow.sklearn
import optuna

In [None]:
df = pd.read_csv('reddit_preprocessing.csv')

In [None]:
#Step1: Remap the class labels from [-1, 0, 1] to [2, 0, 1]
df['category'] = df['category'].map({-1:2, 0:0, 1:1})

#Step2: Remove rows where the target labels(category) are NAN
df = df.dropna(subset=['category'])

In [7]:
ngram_range = (1,3)
max_features = 1000
vectorizer = TfidfVectorizer(ngram_range = ngram_range, max_features=max_features)

# Remove rows where 'clean_comment' is null
df = df.dropna(subset=['clean_comment'])

X=vectorizer.fit_transform(df['clean_comment'])
y = df['category']

smote = SMOTE(random_state = 42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
#Step3: Train-test split before vectorization and resampling
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

In [9]:
#Function to log results in MLflow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test, params = None):
  with mlflow.start_run():
     # Log model type
        mlflow.set_tag("mlflow.runName", f"{model_name}_SMOTE_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)

        #Log Hyperparamter
        if params:
          for key,value in params.items():
            mlflow.log_param(key, value)

        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")
        return accuracy

In [10]:
#Step6: Optuna Objective function for LightGBM
def objective_lightgbm(trial):
  #Hyperparameter space to explore
  n_estimators = trial.suggest_int('n_estimators', 100, 1000)
  learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
  max_depth = trial.suggest_int('max_depth',3,15)
  num_leaves = trial.suggest_int('num_leaves', 20, 150)
  min_child_samples = trial.suggest_int('min_child_samples', 10, 100)
  colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
  subsample = trial.suggest_float('subsample', 0.5, 1.0)
  reg_alpha = trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True)
  reg_lambda = trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True)

  #Log trial parameters
  params = {
      'n_estimators': n_estimators,
      'learning_rate': learning_rate,
      'max_depth': max_depth,
      'num_leaves': num_leaves,
      'min_child_samples': min_child_samples,
      'colsample_bytree':colsample_bytree,
      'subsample': subsample,
      'reg_alpha': reg_alpha,
      'reg_lambda': reg_lambda
  }

  #Create LightGBM Model
  model = LGBMClassifier(
      learning_rate=learning_rate,
      n_estimators=n_estimators,
      max_depth=max_depth,
      num_leaves=num_leaves,
      min_child_samples=min_child_samples,
      colsample_bytree=colsample_bytree,
      subsample=subsample,
      reg_alpha=reg_alpha,
      reg_lambda=reg_lambda,
      random_state=42
  )

  accuracy = log_mlflow('LightGBM', model, X_train, X_test, y_train, y_test, params)
  return accuracy

In [11]:
#Step : Run Optuna for LightGBM, log the best model, and plot the importance of each other
def run_optuna_experiment():
  study = optuna.create_study(direction='maximize')
  study.optimize(objective_lightgbm, n_trials=50)

  #Get the best parameters
  best_params = study.best_params
  best_model = LGBMClassifier(
      n_estimators=best_params['n_estimators'],
      learning_rate=best_params['learning_rate'],
      max_depth=best_params['max_depth'],
      num_leaves=best_params['num_leaves'],
      min_child_samples=best_params['min_child_samples'],
      colsample_bytree=best_params['colsample_bytree'],
      subsample=best_params['subsample'],
      reg_alpha=best_params['reg_alpha'],
      reg_lambda=best_params['reg_lambda'],
      random_state=42
  )

  #log the best model with MLflow and print the classifiation report
  log_mlflow("LightGBM", best_model, X_train, X_test, y_train, y_test, params=best_params)

  #Plot parameter importance
  optuna.visualization.plot_param_importances(study).show()

  #Plot optimization history
  optuna.visualization.plot_optimization_history(study).show()

In [None]:
#Run the experiment for LightBGM
run_optuna_experiment()

[I 2025-08-22 03:22:31,885] A new study created in memory with name: no-name-f7c69869-af77-449e-9f6a-b7c6698b9a5c


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.238797 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98439
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 945
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/de26914905894ef0975b47b305fef3ba
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:24:55,975] Trial 0 finished with value: 0.7043965335024308 and parameters: {'n_estimators': 906, 'learning_rate': 0.00020378633310673392, 'max_depth': 14, 'num_leaves': 74, 'min_child_samples': 95, 'colsample_bytree': 0.6577812572155832, 'subsample': 0.679989587117588, 'reg_alpha': 0.0002746177456563449, 'reg_lambda': 0.22137872109017215}. Best is trial 0 with value: 0.7043965335024308.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.256464 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98850
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 959
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/b504e2eba241413f8fb2ffa55ae88282
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:26:07,737] Trial 1 finished with value: 0.6149862608328049 and parameters: {'n_estimators': 507, 'learning_rate': 0.0001464745052336634, 'max_depth': 5, 'num_leaves': 122, 'min_child_samples': 60, 'colsample_bytree': 0.6095956127807229, 'subsample': 0.5580220306686556, 'reg_alpha': 0.00023666851247347833, 'reg_lambda': 0.0016649625196495025}. Best is trial 0 with value: 0.7043965335024308.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.374547 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98695
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 953
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/c87a2e63867f4ae99eeae10d6703a73f
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:27:32,849] Trial 2 finished with value: 0.6061086451067428 and parameters: {'n_estimators': 758, 'learning_rate': 0.00010665527451030345, 'max_depth': 6, 'num_leaves': 26, 'min_child_samples': 90, 'colsample_bytree': 0.84442095507302, 'subsample': 0.7752459902560976, 'reg_alpha': 0.1401108072867582, 'reg_lambda': 0.07958876066658789}. Best is trial 0 with value: 0.7043965335024308.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.257637 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98978
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/f8e88c78122d4fcb8ba0b63c65a196c6
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:28:27,109] Trial 3 finished with value: 0.8040583386176284 and parameters: {'n_estimators': 351, 'learning_rate': 0.0611547810504011, 'max_depth': 7, 'num_leaves': 76, 'min_child_samples': 45, 'colsample_bytree': 0.6450872072048364, 'subsample': 0.641710821853388, 'reg_alpha': 0.007064941322311617, 'reg_lambda': 1.3795190545601812}. Best is trial 3 with value: 0.8040583386176284.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.245622 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98406
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 944
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/ed32f4e5bd1c4c87a75092824b130865
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:29:09,921] Trial 4 finished with value: 0.6255548509828789 and parameters: {'n_estimators': 141, 'learning_rate': 0.004067897359026298, 'max_depth': 6, 'num_leaves': 110, 'min_child_samples': 97, 'colsample_bytree': 0.767871098760295, 'subsample': 0.9255059090543252, 'reg_alpha': 0.07394844638455214, 'reg_lambda': 0.002309570640522853}. Best is trial 3 with value: 0.8040583386176284.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.258143 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98978
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/f5aa00d08da941539f75eabe0bbd150a
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:31:20,708] Trial 5 finished with value: 0.6537729866835764 and parameters: {'n_estimators': 892, 'learning_rate': 0.0005312389730033285, 'max_depth': 9, 'num_leaves': 39, 'min_child_samples': 50, 'colsample_bytree': 0.8600719712211052, 'subsample': 0.9250922779704522, 'reg_alpha': 0.01051141094344988, 'reg_lambda': 2.4789276517851846}. Best is trial 3 with value: 0.8040583386176284.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.255636 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98978
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/2a638c1ceca04ae995c895ae0cfd9707
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:32:15,422] Trial 6 finished with value: 0.7271190023250899 and parameters: {'n_estimators': 168, 'learning_rate': 0.011986395366812992, 'max_depth': 15, 'num_leaves': 44, 'min_child_samples': 47, 'colsample_bytree': 0.7810700432805684, 'subsample': 0.5618174785587702, 'reg_alpha': 7.777718110596112, 'reg_lambda': 0.015149292017047745}. Best is trial 3 with value: 0.8040583386176284.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.256812 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98888
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 961
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/518252b449984af9ac1e3126dddecb86
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:33:16,981] Trial 7 finished with value: 0.7000634115409005 and parameters: {'n_estimators': 269, 'learning_rate': 0.0024171109944103324, 'max_depth': 14, 'num_leaves': 22, 'min_child_samples': 54, 'colsample_bytree': 0.7232094684970549, 'subsample': 0.737218454093655, 'reg_alpha': 1.8932013923262139, 'reg_lambda': 0.20142909186451927}. Best is trial 3 with value: 0.8040583386176284.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.261123 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98978
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/e134b34420ac45319c183f18c4e52da7
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:34:12,602] Trial 8 finished with value: 0.6605368843796238 and parameters: {'n_estimators': 203, 'learning_rate': 0.00104681490602159, 'max_depth': 11, 'num_leaves': 49, 'min_child_samples': 36, 'colsample_bytree': 0.820017899132357, 'subsample': 0.5179869890979465, 'reg_alpha': 7.4937246335213334, 'reg_lambda': 4.893952165387049}. Best is trial 3 with value: 0.8040583386176284.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.253887 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98850
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 959
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/496e0ab00b8b42509032702603bad916
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:35:04,562] Trial 9 finished with value: 0.5516804058338618 and parameters: {'n_estimators': 516, 'learning_rate': 0.00021936130335639722, 'max_depth': 3, 'num_leaves': 83, 'min_child_samples': 60, 'colsample_bytree': 0.8902525602505489, 'subsample': 0.7792643664665414, 'reg_alpha': 6.261315169275978, 'reg_lambda': 0.14337277980346663}. Best is trial 3 with value: 0.8040583386176284.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.246378 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99071
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 978
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/82591a28aed24788810f6af883e4a02f
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:36:30,368] Trial 10 finished with value: 0.8154724159797083 and parameters: {'n_estimators': 379, 'learning_rate': 0.09687643881678176, 'max_depth': 9, 'num_leaves': 150, 'min_child_samples': 14, 'colsample_bytree': 0.517839903431426, 'subsample': 0.6457839690951988, 'reg_alpha': 0.003126626509406514, 'reg_lambda': 0.00020188333719840286}. Best is trial 10 with value: 0.8154724159797083.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.249049 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99119
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 988
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/e36bf58fa1474d84afb5d2b778af3db4
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:37:59,522] Trial 11 finished with value: 0.8165292749947157 and parameters: {'n_estimators': 369, 'learning_rate': 0.09015921636285379, 'max_depth': 9, 'num_leaves': 147, 'min_child_samples': 10, 'colsample_bytree': 0.5273464687357948, 'subsample': 0.6446001974056038, 'reg_alpha': 0.006090734849343507, 'reg_lambda': 0.0003096952765443397}. Best is trial 11 with value: 0.8165292749947157.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.271080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99071
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 978
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/6c53ba5d335b4d2d9a6c5961e79d5893
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:39:34,121] Trial 12 finished with value: 0.818008877615726 and parameters: {'n_estimators': 406, 'learning_rate': 0.07853449603610292, 'max_depth': 10, 'num_leaves': 150, 'min_child_samples': 15, 'colsample_bytree': 0.5115442248440554, 'subsample': 0.647412347735298, 'reg_alpha': 0.002032886733042793, 'reg_lambda': 0.00015380006184658455}. Best is trial 12 with value: 0.818008877615726.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.264261 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99077
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 979
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/d3d90337596842acbebae0df085ad451
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:42:16,252] Trial 13 finished with value: 0.8109279222151765 and parameters: {'n_estimators': 678, 'learning_rate': 0.0250029404693385, 'max_depth': 11, 'num_leaves': 150, 'min_child_samples': 13, 'colsample_bytree': 0.5128948000930608, 'subsample': 0.6971840249714053, 'reg_alpha': 0.0009166476343527911, 'reg_lambda': 0.00013984621049080445}. Best is trial 12 with value: 0.818008877615726.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.246849 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99009
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 969
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/4abe6e7ca58644a1932fc5f292e98a68
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:44:00,958] Trial 14 finished with value: 0.8012048192771084 and parameters: {'n_estimators': 385, 'learning_rate': 0.028500424592124898, 'max_depth': 11, 'num_leaves': 129, 'min_child_samples': 22, 'colsample_bytree': 0.5766641264362407, 'subsample': 0.8372412750211901, 'reg_alpha': 0.0013760963503660077, 'reg_lambda': 0.0007841346857779592}. Best is trial 12 with value: 0.818008877615726.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.456049 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99001
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 968
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/f906c1db914a4439acdb673e5f91bf18
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:45:47,819] Trial 15 finished with value: 0.7683365039103783 and parameters: {'n_estimators': 638, 'learning_rate': 0.010063873951857333, 'max_depth': 8, 'num_leaves': 109, 'min_child_samples': 27, 'colsample_bytree': 0.9961015037118517, 'subsample': 0.6011859819799298, 'reg_alpha': 0.30612077858298276, 'reg_lambda': 0.011628129366275775}. Best is trial 12 with value: 0.818008877615726.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.252087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98781
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 956
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/3dcb7bf7e4dd4017beaa77fc97ad330a
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:47:27,700] Trial 16 finished with value: 0.8063834284506447 and parameters: {'n_estimators': 463, 'learning_rate': 0.0411068015561402, 'max_depth': 12, 'num_leaves': 135, 'min_child_samples': 76, 'colsample_bytree': 0.5609022141750352, 'subsample': 0.5048136568767836, 'reg_alpha': 0.0201663120530134, 'reg_lambda': 0.00046917417782552356}. Best is trial 12 with value: 0.818008877615726.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.256090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98991
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 967
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/1e672017955b4b8d9bdf057460aa55ad
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:48:40,317] Trial 17 finished with value: 0.722997252166561 and parameters: {'n_estimators': 270, 'learning_rate': 0.010254348971451796, 'max_depth': 9, 'num_leaves': 100, 'min_child_samples': 29, 'colsample_bytree': 0.6949939093490547, 'subsample': 0.8236930896373911, 'reg_alpha': 0.0008428988170642206, 'reg_lambda': 0.0037070982663119645}. Best is trial 12 with value: 0.818008877615726.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.253144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99107
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 985
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/e9943defdf524a84b509196aeff28b6a
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:51:02,496] Trial 18 finished with value: 0.8195941661382372 and parameters: {'n_estimators': 614, 'learning_rate': 0.0964906485362646, 'max_depth': 12, 'num_leaves': 138, 'min_child_samples': 11, 'colsample_bytree': 0.5656340551761967, 'subsample': 0.6215662706420778, 'reg_alpha': 0.0001244747035517839, 'reg_lambda': 0.00013838880475804116}. Best is trial 18 with value: 0.8195941661382372.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.258017 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98978
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/11936b53c7f14364b5e4aa25f9c7925b
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:53:29,515] Trial 19 finished with value: 0.8052208835341366 and parameters: {'n_estimators': 627, 'learning_rate': 0.01861168329915589, 'max_depth': 13, 'num_leaves': 135, 'min_child_samples': 35, 'colsample_bytree': 0.5894182836415719, 'subsample': 0.5883542769071916, 'reg_alpha': 0.00010211435894124757, 'reg_lambda': 0.00010933990285348425}. Best is trial 18 with value: 0.8195941661382372.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.454145 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99046
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 974
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/aa88ad9a801b4a6cbb5cb2b75da835c9
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:56:36,961] Trial 20 finished with value: 0.7687592475163814 and parameters: {'n_estimators': 774, 'learning_rate': 0.00535746180156484, 'max_depth': 12, 'num_leaves': 119, 'min_child_samples': 19, 'colsample_bytree': 0.6437661361251076, 'subsample': 0.703525458774214, 'reg_alpha': 0.00025070542706582177, 'reg_lambda': 0.0009654062207381679}. Best is trial 18 with value: 0.8195941661382372.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.261213 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99071
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 978
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/bfa77f9ceb224e1088be164c8cd44fd7
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 03:58:15,252] Trial 21 finished with value: 0.8181145635172268 and parameters: {'n_estimators': 441, 'learning_rate': 0.09822410459351522, 'max_depth': 10, 'num_leaves': 142, 'min_child_samples': 14, 'colsample_bytree': 0.5093511017923735, 'subsample': 0.6318658752270454, 'reg_alpha': 0.0036398587810998763, 'reg_lambda': 0.0003556475282815526}. Best is trial 18 with value: 0.8195941661382372.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.247426 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99009
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 969
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/1d473a5a91094b078a836987b003b0e6
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 04:00:10,418] Trial 22 finished with value: 0.8126188966391883 and parameters: {'n_estimators': 557, 'learning_rate': 0.04841965428237357, 'max_depth': 10, 'num_leaves': 138, 'min_child_samples': 22, 'colsample_bytree': 0.5026870937946398, 'subsample': 0.6142857604302268, 'reg_alpha': 0.0020467218135985215, 'reg_lambda': 0.007480856497526747}. Best is trial 18 with value: 0.8195941661382372.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.256381 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99119
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 988
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/0c7dc6cb4b1147f7a1d75c5e4eac249a
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 04:02:13,664] Trial 23 finished with value: 0.8172690763052208 and parameters: {'n_estimators': 451, 'learning_rate': 0.05488217702411083, 'max_depth': 12, 'num_leaves': 102, 'min_child_samples': 10, 'colsample_bytree': 0.5667622816894563, 'subsample': 0.6767915607785251, 'reg_alpha': 0.000552651934490456, 'reg_lambda': 0.00042631830006834714}. Best is trial 18 with value: 0.8195941661382372.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.248656 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98978
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/99344f6205ea42ff9fcd677f80462943
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 04:04:05,198] Trial 24 finished with value: 0.8191714225322342 and parameters: {'n_estimators': 558, 'learning_rate': 0.09854778582946255, 'max_depth': 10, 'num_leaves': 122, 'min_child_samples': 36, 'colsample_bytree': 0.5551487180362112, 'subsample': 0.7275863801526892, 'reg_alpha': 0.028402099038684955, 'reg_lambda': 0.00010138415597236525}. Best is trial 18 with value: 0.8195941661382372.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.796653 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98978
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/e3d4ac7d70434ba6967d57cdda1cde7a
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 04:06:19,754] Trial 25 finished with value: 0.8137814415556964 and parameters: {'n_estimators': 729, 'learning_rate': 0.033756598758001126, 'max_depth': 13, 'num_leaves': 127, 'min_child_samples': 38, 'colsample_bytree': 0.6180373572586492, 'subsample': 0.7354369829869829, 'reg_alpha': 0.03626078505220372, 'reg_lambda': 0.00010316015512453436}. Best is trial 18 with value: 0.8195941661382372.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.251642 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98991
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 967
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/359f8de173ab422eacec0b5ae1b8076e
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 04:08:24,237] Trial 26 finished with value: 0.7946522933840625 and parameters: {'n_estimators': 581, 'learning_rate': 0.016649531302499812, 'max_depth': 10, 'num_leaves': 115, 'min_child_samples': 29, 'colsample_bytree': 0.5494680736767644, 'subsample': 0.986268732575887, 'reg_alpha': 0.5656294428599756, 'reg_lambda': 0.0009712984487200453}. Best is trial 18 with value: 0.8195941661382372.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.459136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98978
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 966
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/8bf51c86f59745beab255b1b1d337ce4
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 04:09:43,593] Trial 27 finished with value: 0.8095540054956669 and parameters: {'n_estimators': 577, 'learning_rate': 0.04687013183949085, 'max_depth': 8, 'num_leaves': 94, 'min_child_samples': 39, 'colsample_bytree': 0.6844569783024634, 'subsample': 0.5509591351245386, 'reg_alpha': 0.02631011205492757, 'reg_lambda': 0.00532217017510019}. Best is trial 18 with value: 0.8195941661382372.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.247911 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99001
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 968
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/8a448b24d3ad4ce0a109db56e797ef42
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 04:12:37,027] Trial 28 finished with value: 0.813570069752695 and parameters: {'n_estimators': 852, 'learning_rate': 0.09448884953986729, 'max_depth': 13, 'num_leaves': 141, 'min_child_samples': 25, 'colsample_bytree': 0.546492775842517, 'subsample': 0.8242914294863171, 'reg_alpha': 0.07578226598409979, 'reg_lambda': 0.0003467458261476543}. Best is trial 18 with value: 0.8195941661382372.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.248416 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98781
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 956
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/6057c38ebcff4d05bd4341fdf4329406
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 04:15:02,543] Trial 29 finished with value: 0.7277531177340942 and parameters: {'n_estimators': 691, 'learning_rate': 0.001851477084001064, 'max_depth': 15, 'num_leaves': 69, 'min_child_samples': 73, 'colsample_bytree': 0.6101191826493975, 'subsample': 0.6729386070956815, 'reg_alpha': 0.00462181173528338, 'reg_lambda': 0.05670169976913147}. Best is trial 18 with value: 0.8195941661382372.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.257948 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99046
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 974
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/81367caadd154347ada10c84a9dc8908
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


[I 2025-08-22 04:17:38,681] Trial 30 finished with value: 0.8136757556541957 and parameters: {'n_estimators': 989, 'learning_rate': 0.022669366040394448, 'max_depth': 11, 'num_leaves': 126, 'min_child_samples': 19, 'colsample_bytree': 0.6705074095876647, 'subsample': 0.7168612648248753, 'reg_alpha': 0.013317464312240512, 'reg_lambda': 0.6925982135946932}. Best is trial 18 with value: 0.8195941661382372.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.250830 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99060
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 976
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
