In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
dataset = pd.read_csv('dataset.csv')

# Drop rows with NaN values in 'clean_comment'
cleaned_dataset = dataset.dropna()

In [3]:
# Separate features and target
X_cleaned = cleaned_dataset['clean_comment']
y_cleaned = cleaned_dataset['category']

In [4]:
# Split the cleaned data into train and test sets (80-20 split)
X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=42)

In [5]:
# Apply TfidfVectorizer with trigram setting and max_features=10_000
tfidf_cleaned = TfidfVectorizer(ngram_range=(1, 3), max_features=10_000)

In [6]:
# Fit the vectorizer on the training data and transform both train and test sets
X_train_tfidf_cleaned = tfidf_cleaned.fit_transform(X_train_cleaned)
X_test_tfidf_cleaned = tfidf_cleaned.transform(X_test_cleaned)

In [7]:
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score
import optuna

In [8]:
# Function to optimize LightGBM hyperparameters
def objective(trial):
    # Define hyperparameters to be tuned
    param = {
        "objective": "multiclass",
        "num_class": 3,  # Assuming 3 categories (-1, 0, 1)
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "metric": "multi_logloss",
        "is_unbalance": True,
        "class_weight": "balanced",
    }

    # Define the LightGBM model with the trial parameters
    model = lgb.LGBMClassifier(**param)

    # Perform cross-validation
    scores = cross_val_score(model, X_train_tfidf_cleaned, y_train_cleaned, cv=3, scoring='accuracy')

    # Return the average score across folds
    return scores.mean()

In [9]:
# Create an Optuna study to optimize the hyperparameters
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2024-12-09 14:58:57,236] A new study created in memory with name: no-name-0d93a0f6-bd4f-41f7-bb98-02f5d19df5fe


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045619 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.0607

[I 2024-12-09 14:59:07,713] Trial 0 finished with value: 0.8036413954256852 and parameters: {'learning_rate': 0.06573946760218026, 'n_estimators': 371, 'max_depth': 5}. Best is trial 0 with value: 0.8036413954256852.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057541 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056278 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 14:59:15,570] Trial 1 finished with value: 0.8060621994592897 and parameters: {'learning_rate': 0.09458686048532158, 'n_estimators': 173, 'max_depth': 8}. Best is trial 1 with value: 0.8060621994592897.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049620 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063258 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-12-09 14:59:37,194] Trial 2 finished with value: 0.8399877797246296 and parameters: {'learning_rate': 0.06665055011805514, 'n_estimators': 309, 'max_depth': 16}. Best is trial 2 with value: 0.8399877797246296.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054439 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.060447 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 14:59:41,239] Trial 3 finished with value: 0.756009359926581 and parameters: {'learning_rate': 0.0912786660288975, 'n_estimators': 141, 'max_depth': 4}. Best is trial 2 with value: 0.8399877797246296.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063722 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 14:59:48,251] Trial 4 finished with value: 0.7014560368615097 and parameters: {'learning_rate': 0.027105537385414966, 'n_estimators': 172, 'max_depth': 5}. Best is trial 2 with value: 0.8399877797246296.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072549 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067327 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-12-09 15:00:21,414] Trial 5 finished with value: 0.7094003004591033 and parameters: {'learning_rate': 0.0037338869810449606, 'n_estimators': 404, 'max_depth': 16}. Best is trial 2 with value: 0.8399877797246296.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067671 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059600 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.0706

[I 2024-12-09 15:00:32,672] Trial 6 finished with value: 0.6514714438581569 and parameters: {'learning_rate': 0.007019843058180148, 'n_estimators': 381, 'max_depth': 4}. Best is trial 2 with value: 0.8399877797246296.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064139 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.060734 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-12-09 15:00:55,286] Trial 7 finished with value: 0.8398513738529602 and parameters: {'learning_rate': 0.0696192534073794, 'n_estimators': 367, 'max_depth': 13}. Best is trial 2 with value: 0.8399877797246296.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066147 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072464 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-12-09 15:01:09,286] Trial 8 finished with value: 0.819632430170433 and parameters: {'learning_rate': 0.056589661453172506, 'n_estimators': 171, 'max_depth': 18}. Best is trial 2 with value: 0.8399877797246296.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067322 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067326 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:01:15,201] Trial 9 finished with value: 0.6899998443188169 and parameters: {'learning_rate': 0.027309940069732745, 'n_estimators': 79, 'max_depth': 9}. Best is trial 2 with value: 0.8399877797246296.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071936 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-12-09 15:01:53,198] Trial 10 finished with value: 0.8451702891278566 and parameters: {'learning_rate': 0.07846216724059343, 'n_estimators': 491, 'max_depth': 20}. Best is trial 10 with value: 0.8451702891278566.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063644 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-12-09 15:02:30,037] Trial 11 finished with value: 0.8457158742522349 and parameters: {'learning_rate': 0.07866127582582125, 'n_estimators': 473, 'max_depth': 20}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069729 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.059508 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:03:09,039] Trial 12 finished with value: 0.8447611377750012 and parameters: {'learning_rate': 0.08122455216795157, 'n_estimators': 498, 'max_depth': 20}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066084 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064131 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.0674

[I 2024-12-09 15:03:48,743] Trial 13 finished with value: 0.8447612110121169 and parameters: {'learning_rate': 0.046088219332069184, 'n_estimators': 499, 'max_depth': 20}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068340 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061741 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.0613

[I 2024-12-09 15:04:19,904] Trial 14 finished with value: 0.8440792618658496 and parameters: {'learning_rate': 0.08259896635314337, 'n_estimators': 451, 'max_depth': 16}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062578 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:04:38,227] Trial 15 finished with value: 0.8149271512502957 and parameters: {'learning_rate': 0.044642751455932564, 'n_estimators': 278, 'max_depth': 13}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.080081 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062754 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-12-09 15:05:10,640] Trial 16 finished with value: 0.8453749415288819 and parameters: {'learning_rate': 0.07793976438232537, 'n_estimators': 436, 'max_depth': 18}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061889 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064008 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:05:42,638] Trial 17 finished with value: 0.8447611586998914 and parameters: {'learning_rate': 0.0972548764173202, 'n_estimators': 436, 'max_depth': 18}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063667 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063847 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:06:04,527] Trial 18 finished with value: 0.8333731115495852 and parameters: {'learning_rate': 0.05573217348519868, 'n_estimators': 321, 'max_depth': 14}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058611 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063572 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:06:28,280] Trial 19 finished with value: 0.8416584543447633 and parameters: {'learning_rate': 0.08500179595225242, 'n_estimators': 432, 'max_depth': 11}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064979 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063479 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:06:48,846] Trial 20 finished with value: 0.8133928301896057 and parameters: {'learning_rate': 0.033661731211058234, 'n_estimators': 253, 'max_depth': 18}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065895 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-12-09 15:07:26,436] Trial 21 finished with value: 0.8452726432282228 and parameters: {'learning_rate': 0.07407084733886433, 'n_estimators': 469, 'max_depth': 20}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071905 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066437 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-12-09 15:08:00,909] Trial 22 finished with value: 0.844693009820023 and parameters: {'learning_rate': 0.07433233845701082, 'n_estimators': 457, 'max_depth': 18}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068904 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064195 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-12-09 15:08:33,670] Trial 23 finished with value: 0.8449316756300526 and parameters: {'learning_rate': 0.06168072809208962, 'n_estimators': 404, 'max_depth': 20}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061326 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.060206 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:09:07,501] Trial 24 finished with value: 0.8444883678814431 and parameters: {'learning_rate': 0.08716546615504113, 'n_estimators': 464, 'max_depth': 17}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063572 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.0736

[I 2024-12-09 15:09:34,676] Trial 25 finished with value: 0.8439769949525259 and parameters: {'learning_rate': 0.07451119420186818, 'n_estimators': 341, 'max_depth': 19}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070099 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069660 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.0624

[I 2024-12-09 15:10:01,542] Trial 26 finished with value: 0.8418630439711179 and parameters: {'learning_rate': 0.07337935921166958, 'n_estimators': 410, 'max_depth': 14}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057046 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:10:19,619] Trial 27 finished with value: 0.8284632883403554 and parameters: {'learning_rate': 0.05752685241753537, 'n_estimators': 249, 'max_depth': 15}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063487 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:10:55,997] Trial 28 finished with value: 0.8440110118490121 and parameters: {'learning_rate': 0.08975655778336751, 'n_estimators': 463, 'max_depth': 19}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066538 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071760 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.0778

[I 2024-12-09 15:11:25,243] Trial 29 finished with value: 0.8423062575577216 and parameters: {'learning_rate': 0.06677872161994769, 'n_estimators': 358, 'max_depth': 17}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068220 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.0726

[I 2024-12-09 15:11:51,028] Trial 30 finished with value: 0.8440110781111643 and parameters: {'learning_rate': 0.09945786685649008, 'n_estimators': 422, 'max_depth': 11}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067195 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074349 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:12:32,741] Trial 31 finished with value: 0.8454771700799069 and parameters: {'learning_rate': 0.07816099992857556, 'n_estimators': 487, 'max_depth': 20}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070376 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064542 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:13:11,812] Trial 32 finished with value: 0.8447611552124097 and parameters: {'learning_rate': 0.07786421942590804, 'n_estimators': 478, 'max_depth': 19}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076431 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064112 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-12-09 15:13:42,802] Trial 33 finished with value: 0.8427154402979123 and parameters: {'learning_rate': 0.06263314086384474, 'n_estimators': 385, 'max_depth': 17}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070008 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-12-09 15:14:19,831] Trial 34 finished with value: 0.8449998524097745 and parameters: {'learning_rate': 0.09197836642242625, 'n_estimators': 444, 'max_depth': 19}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058792 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061590 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:15:00,772] Trial 35 finished with value: 0.8450680222145329 and parameters: {'learning_rate': 0.06962643991955618, 'n_estimators': 475, 'max_depth': 20}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.0707

[I 2024-12-09 15:15:18,570] Trial 36 finished with value: 0.8348392104932912 and parameters: {'learning_rate': 0.0858283841436341, 'n_estimators': 423, 'max_depth': 7}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061494 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-12-09 15:15:49,819] Trial 37 finished with value: 0.8398854744490073 and parameters: {'learning_rate': 0.049756671416744586, 'n_estimators': 389, 'max_depth': 17}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072737 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073286 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:16:25,554] Trial 38 finished with value: 0.8440792200160692 and parameters: {'learning_rate': 0.09309275385560131, 'n_estimators': 478, 'max_depth': 16}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058976 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063035 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:16:54,140] Trial 39 finished with value: 0.7736028332245518 and parameters: {'learning_rate': 0.01226142194728378, 'n_estimators': 309, 'max_depth': 19}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.082397 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069613 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:17:00,908] Trial 40 finished with value: 0.7747961831995896 and parameters: {'learning_rate': 0.07084330345649621, 'n_estimators': 70, 'max_depth': 15}. Best is trial 11 with value: 0.8457158742522349.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.062773 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065357 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.0658

[I 2024-12-09 15:17:43,034] Trial 41 finished with value: 0.8459545644746366 and parameters: {'learning_rate': 0.07940802964603502, 'n_estimators': 496, 'max_depth': 20}. Best is trial 41 with value: 0.8459545644746366.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071812 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067585 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-12-09 15:18:25,098] Trial 42 finished with value: 0.8456817562187796 and parameters: {'learning_rate': 0.07887634468254905, 'n_estimators': 498, 'max_depth': 20}. Best is trial 41 with value: 0.8459545644746366.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062626 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.059674 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:19:04,255] Trial 43 finished with value: 0.844488357418998 and parameters: {'learning_rate': 0.07973125690098765, 'n_estimators': 499, 'max_depth': 18}. Best is trial 41 with value: 0.8459545644746366.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069022 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.0648

[I 2024-12-09 15:19:40,999] Trial 44 finished with value: 0.8448293912793204 and parameters: {'learning_rate': 0.06318045348134564, 'n_estimators': 443, 'max_depth': 19}. Best is trial 41 with value: 0.8459545644746366.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071502 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:20:22,557] Trial 45 finished with value: 0.8449315919304919 and parameters: {'learning_rate': 0.08925788976754984, 'n_estimators': 499, 'max_depth': 20}. Best is trial 41 with value: 0.8459545644746366.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065834 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064256 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:20:31,721] Trial 46 finished with value: 0.8026867147481583 and parameters: {'learning_rate': 0.08321639237955411, 'n_estimators': 480, 'max_depth': 3}. Best is trial 41 with value: 0.8459545644746366.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077032 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063019 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.0599

[I 2024-12-09 15:20:43,135] Trial 47 finished with value: 0.8198028947883688 and parameters: {'learning_rate': 0.07925380507245333, 'n_estimators': 124, 'max_depth': 18}. Best is trial 41 with value: 0.8459545644746366.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075413 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052271 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-12-09 15:20:54,738] Trial 48 finished with value: 0.820280212458501 and parameters: {'learning_rate': 0.09556940296444033, 'n_estimators': 210, 'max_depth': 9}. Best is trial 41 with value: 0.8459545644746366.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072830 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061331 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-12-09 15:21:29,841] Trial 49 finished with value: 0.8442838514922042 and parameters: {'learning_rate': 0.06807576140231852, 'n_estimators': 404, 'max_depth': 20}. Best is trial 41 with value: 0.8459545644746366.


'learning_rate': 0.08081298097796712,
'n_estimators': 367,
'max_depth': 20

In [10]:
# Extract the best hyperparameters
best_params = study.best_params
best_params

{'learning_rate': 0.07940802964603502, 'n_estimators': 496, 'max_depth': 20}

`{'learning_rate': 0.07940802964603502, 'n_estimators': 496, 'max_depth': 20}`

In [11]:
best_model = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=3,
    metric="multi_logloss",
    is_unbalance=True,
    class_weight="balanced",
    reg_alpha=0.1,  # L1 regularization
    reg_lambda=0.1,  # L2 regularization
    learning_rate= 0.07940802964603502, 
    n_estimators=496, 
    max_depth=20
)

In [12]:
# Fit the model on the resampled training data
best_model.fit(X_train_tfidf_cleaned, y_train_cleaned)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.108837 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 131997
[LightGBM] [Info] Number of data points in the train set: 29329, number of used features: 4439
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [13]:
# Predict on the train set
y_train_pred = best_model.predict(X_train_tfidf_cleaned)

In [14]:
# Calculate accuracy on the test set
accuracy_train = accuracy_score(y_train_cleaned, y_train_pred)
accuracy_train

0.9384568174844011

In [15]:
# Generate classification report
report_train = classification_report(y_train_cleaned, y_train_pred)
print(report_train)

              precision    recall  f1-score   support

          -1       0.93      0.92      0.92      6601
           0       0.89      0.98      0.94     10134
           1       0.98      0.91      0.95     12594

    accuracy                           0.94     29329
   macro avg       0.94      0.94      0.94     29329
weighted avg       0.94      0.94      0.94     29329



In [16]:
# Predict on the test set
y_pred = best_model.predict(X_test_tfidf_cleaned)

In [20]:
# Calculate accuracy on the test set
accuracy = accuracy_score(y_test_cleaned, y_pred)
accuracy

0.8651302331924179

In [21]:
# Generate classification report
report = classification_report(y_test_cleaned, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.81      0.77      0.79      1647
           0       0.85      0.97      0.90      2510
           1       0.92      0.83      0.87      3176

    accuracy                           0.87      7333
   macro avg       0.86      0.86      0.85      7333
weighted avg       0.87      0.87      0.86      7333



In [22]:
import re
import numpy as np

# Assuming you have pre-trained tfidf_vectorizer and lgbm_model loaded
# tfidf_vectorizer: Your trained TF-IDF vectorizer
# lgbm_model: Your trained LightGBM model

# Function to clean and preprocess a YouTube comment (same as used during training)
def preprocess_comment(comment):
    # Lowercasing
    comment = comment.lower()

    # Remove special characters, URLs, punctuation, and extra spaces
    comment = re.sub(r"http\S+|www\S+|https\S+", '', comment, flags=re.MULTILINE)  # Remove URLs
    comment = re.sub(r'\W', ' ', comment)  # Remove special characters
    comment = re.sub(r'\s+', ' ', comment).strip()  # Remove extra spaces and newlines

    return comment

# Prediction function
def predict_sentiment(comment, tfidf_vectorizer, lgbm_model):
    # Step 1: Preprocess the YouTube comment
    cleaned_comment = preprocess_comment(comment)

    # Step 2: Transform the comment using the trained TF-IDF vectorizer
    comment_tfidf = tfidf_vectorizer.transform([cleaned_comment])

    # Step 3: Use the trained LightGBM model to predict the sentiment
    prediction = lgbm_model.predict(comment_tfidf)
    prediction_proba = lgbm_model.predict_proba(comment_tfidf)

    # Step 4: Get the predicted sentiment (label) and probability
    sentiment_class = np.argmax(prediction_proba)
    sentiment_proba = np.max(prediction_proba)

    # Step 5: Return the sentiment label and confidence
    return {
        'sentiment_class': int(prediction[0]),  # -1, 0, or 1 depending on your labels
        'confidence': sentiment_proba
    }

# Example usage:
comment1 = "I absolutely hate this video!"
comment2 = "The explanations were confusing and the video quality was poor."
comment3 = "I didn’t learn anything useful. Really disappointed."
comment4 = "Wow, the explanation was so clear and helpful. Definitely subscribing!"
comment5 = "This is the worst video I’ve seen on this topic, very misleading"
comment6 = "Not much to say about this, just a standard video."
comment7 = "The video is okay, but I expected more depth in the content."
comment8 = "Superb content! Mazaa aa gaya dekh ke. Best video on this topic!"
comment9 = "Poor video quality aur explanation bhi weak tha."
comment10 = "Yeh video theek tha, but I was expecting more depth."
result = predict_sentiment(comment10, tfidf_cleaned, best_model)
print(f"Predicted Sentiment: {result['sentiment_class']}, Confidence: {result['confidence']}")

Predicted Sentiment: 0, Confidence: 0.8669254635878679
