In [5]:
!pip install mlflow boto3 awscli optuna imbalanced-learn

Collecting mlflow
  Using cached mlflow-3.3.1-py3-none-any.whl.metadata (30 kB)
Collecting boto3
  Using cached boto3-1.40.14-py3-none-any.whl.metadata (6.7 kB)
Collecting awscli
  Using cached awscli-1.42.14-py3-none-any.whl.metadata (11 kB)
Collecting optuna
  Using cached optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting mlflow-skinny==3.3.1 (from mlflow)
  Downloading mlflow_skinny-3.3.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.3.1 (from mlflow)
  Downloading mlflow_tracing-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>

In [9]:
!pip install -Uq 'lightgbm>=4.0.0'

In [10]:
import mlflow
#Setup MLflow tracking server
mlflow.set_tracking_uri("http://18.117.193.162:5000/")

#set experiment
mlflow.set_experiment("Exp 5- Machine Learning Algorithms with HP tuning")

<Experiment: artifact_location='s3://campusxalidvcbucket/425591744193892905', creation_time=1755800992412, experiment_id='425591744193892905', last_update_time=1755800992412, lifecycle_stage='active', name='Exp 5- Machine Learning Algorithms with HP tuning', tags={}>

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
import mlflow
import mlflow.sklearn
import optuna

In [13]:
df = pd.read_csv('reddit_preprocessing.csv')

In [14]:
#Step1: Remap the class labels from [-1, 0, 1] to [2,0,1]
df['category'] = df['category'].map({-1:2, 0:0, 1:1})

#Step2: Remove rows where the target labels (category) are NaN
df = df.dropna(subset=['category'])

# Fill NaN values in 'clean_comment' with empty strings
df['clean_comment'] = df['clean_comment'].fillna('')

ngram_range = (1,3)
max_features = 1000
vectorizer = TfidfVectorizer(ngram_range = ngram_range, max_features=max_features)
X=vectorizer.fit_transform(df['clean_comment'])
y = df['category']

smote = SMOTE(random_state = 42)
X_resampled, y_resampled = smote.fit_resample(X, y)

#Step3: Train-test split before vectorization and resampling
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)


#Function to log results in MLflow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
  with mlflow.start_run():
     # Log model type
        mlflow.set_tag("mlflow.runName", f"{model_name}_SMOTE_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)

        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")


#Step5: Optuna Objective function for XGBoost
def objective_lightbgm(trial):
  n_estimators = trial.suggest_int('n_estimators', 50, 300)
  learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
  max_depth = trial.suggest_int('max_depth', 3, 10)

  model = LGBMClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth = max_depth, random_state=42)
  return accuracy_score(y_test, model.fit(X_train, y_train).predict(X_test))


#Step7: Run Optuna for XGBoost, log the best model only
def run_optuna_experiment():
  study = optuna.create_study(direction = 'maximize')
  study.optimize(objective_lightbgm, n_trials = 30)

  #Get the best parameters and log only the best model
  best_params = study.best_params
  best_model = LGBMClassifier(n_estimators = best_params['n_estimators'], learning_rate = best_params['learning_rate'], max_depth = best_params['max_depth'], random_state=42)

  #log the model
  log_mlflow("LightGBM", best_model, X_train, X_test, y_train, y_test)

run_optuna_experiment()

[I 2025-08-21 19:08:45,618] A new study created in memory with name: no-name-e7b93684-be56-4926-a065-67a2d95a20cc


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.227077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:09:05,687] Trial 0 finished with value: 0.7808306034027264 and parameters: {'n_estimators': 265, 'learning_rate': 0.03082578272098951, 'max_depth': 10}. Best is trial 0 with value: 0.7808306034027264.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.236785 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:09:09,964] Trial 1 finished with value: 0.5957941456197823 and parameters: {'n_estimators': 63, 'learning_rate': 0.0027517435572243713, 'max_depth': 6}. Best is trial 0 with value: 0.7808306034027264.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.504216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:09:15,490] Trial 2 finished with value: 0.5554263975483462 and parameters: {'n_estimators': 75, 'learning_rate': 0.00026601578719955845, 'max_depth': 5}. Best is trial 0 with value: 0.7808306034027264.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.207906 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:09:20,661] Trial 3 finished with value: 0.5684243897284159 and parameters: {'n_estimators': 81, 'learning_rate': 0.00016192848399253215, 'max_depth': 6}. Best is trial 0 with value: 0.7808306034027264.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.219666 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:09:27,513] Trial 4 finished with value: 0.5526788544858924 and parameters: {'n_estimators': 249, 'learning_rate': 0.0010060004005410838, 'max_depth': 3}. Best is trial 0 with value: 0.7808306034027264.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.229812 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:09:43,726] Trial 5 finished with value: 0.6139701997252457 and parameters: {'n_estimators': 191, 'learning_rate': 0.00025751726493145646, 'max_depth': 9}. Best is trial 0 with value: 0.7808306034027264.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.226076 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:09:56,462] Trial 6 finished with value: 0.5557434217478601 and parameters: {'n_estimators': 257, 'learning_rate': 0.00026286097078382633, 'max_depth': 5}. Best is trial 0 with value: 0.7808306034027264.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.224513 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:10:10,226] Trial 7 finished with value: 0.7857973158617775 and parameters: {'n_estimators': 228, 'learning_rate': 0.05030066566582356, 'max_depth': 8}. Best is trial 7 with value: 0.7857973158617775.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.225594 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:10:19,606] Trial 8 finished with value: 0.6388037620205009 and parameters: {'n_estimators': 124, 'learning_rate': 0.006939812936574263, 'max_depth': 7}. Best is trial 7 with value: 0.7857973158617775.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.235681 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:10:22,613] Trial 9 finished with value: 0.5547923491493184 and parameters: {'n_estimators': 56, 'learning_rate': 0.0009891888408237871, 'max_depth': 5}. Best is trial 7 with value: 0.7857973158617775.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.225264 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:10:34,099] Trial 10 finished with value: 0.7973158617774491 and parameters: {'n_estimators': 195, 'learning_rate': 0.08722346121674643, 'max_depth': 8}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.221974 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:10:45,593] Trial 11 finished with value: 0.7958364155130508 and parameters: {'n_estimators': 192, 'learning_rate': 0.08604540111509643, 'max_depth': 8}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.218017 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:10:55,937] Trial 12 finished with value: 0.7907640283208285 and parameters: {'n_estimators': 170, 'learning_rate': 0.07945452549040383, 'max_depth': 8}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.238746 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:11:11,305] Trial 13 finished with value: 0.7312691535453874 and parameters: {'n_estimators': 184, 'learning_rate': 0.016514758964531088, 'max_depth': 10}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.221284 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:11:32,160] Trial 14 finished with value: 0.7465919898552256 and parameters: {'n_estimators': 300, 'learning_rate': 0.01622135982596169, 'max_depth': 8}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.209840 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:11:41,917] Trial 15 finished with value: 0.7867483884603191 and parameters: {'n_estimators': 143, 'learning_rate': 0.07661335078475065, 'max_depth': 9}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.253742 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:11:54,819] Trial 16 finished with value: 0.7417309521293459 and parameters: {'n_estimators': 209, 'learning_rate': 0.025401235305868273, 'max_depth': 7}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.208408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:12:07,339] Trial 17 finished with value: 0.6647997463806404 and parameters: {'n_estimators': 147, 'learning_rate': 0.00720108524599696, 'max_depth': 9}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.214895 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:12:09,980] Trial 18 finished with value: 0.7344393955405263 and parameters: {'n_estimators': 111, 'learning_rate': 0.09910175098575087, 'max_depth': 3}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.227301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:12:25,141] Trial 19 finished with value: 0.6587762865898764 and parameters: {'n_estimators': 216, 'learning_rate': 0.005953438798742153, 'max_depth': 7}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.224701 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:12:36,314] Trial 20 finished with value: 0.7590616083694389 and parameters: {'n_estimators': 160, 'learning_rate': 0.03695595004656233, 'max_depth': 8}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.208082 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:12:47,123] Trial 21 finished with value: 0.7938285955827962 and parameters: {'n_estimators': 173, 'learning_rate': 0.0899725177725296, 'max_depth': 8}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.221216 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:13:00,443] Trial 22 finished with value: 0.7868540631934904 and parameters: {'n_estimators': 197, 'learning_rate': 0.05521108783548014, 'max_depth': 9}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.222455 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:13:17,348] Trial 23 finished with value: 0.7358131670717531 and parameters: {'n_estimators': 231, 'learning_rate': 0.018000073391671555, 'max_depth': 8}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.369911 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:13:26,726] Trial 24 finished with value: 0.7930888724505971 and parameters: {'n_estimators': 172, 'learning_rate': 0.0953839789078443, 'max_depth': 7}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.219917 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:13:36,048] Trial 25 finished with value: 0.7611751030328648 and parameters: {'n_estimators': 113, 'learning_rate': 0.0445690936762232, 'max_depth': 10}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.257500 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:13:45,480] Trial 26 finished with value: 0.6589876360562189 and parameters: {'n_estimators': 144, 'learning_rate': 0.01111766628727095, 'max_depth': 6}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.208838 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:14:02,614] Trial 27 finished with value: 0.6489485364049455 and parameters: {'n_estimators': 208, 'learning_rate': 0.003161981720003294, 'max_depth': 9}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.216358 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:14:17,742] Trial 28 finished with value: 0.7629715734967769 and parameters: {'n_estimators': 231, 'learning_rate': 0.028864835503748687, 'max_depth': 8}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.214663 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665


[I 2025-08-21 19:14:29,710] Trial 29 finished with value: 0.7846348937968932 and parameters: {'n_estimators': 162, 'learning_rate': 0.057479221040734586, 'max_depth': 10}. Best is trial 10 with value: 0.7973158617774491.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.247667 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99059
[LightGBM] [Info] Number of data points in the train set: 37850, number of used features: 970
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098586
[LightGBM] [Info] Start training from score -1.098665




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://18.117.193.162:5000/#/experiments/425591744193892905/runs/a1ded57215824e6080916bb56248cf5d
🧪 View experiment at: http://18.117.193.162:5000/#/experiments/425591744193892905


In [None]:
# Load the dataframe.
df = pd.read_csv('/content/reddit_preprocessing.csv')

# Display the first 5 rows.
display(df.head())