In [2]:
pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 12.8 MB/s  0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [2]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer

# ---------------------------------------------------------------------------
# Data preparation: cleaned reviews CSV -> stratified split -> TF-IDF matrices
# ---------------------------------------------------------------------------

# Read the cleaned reviews and keep only the text and label columns,
# discarding any row where either of the two is missing.
df = pd.read_csv("../data/reviews_cleaned.csv")
df = df.loc[:, ['reviewText', 'overall_sentiment']].dropna()

# Model input (raw review text) and target (sentiment label).
X, y = df['reviewText'], df['overall_sentiment']

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The vocabulary (capped at 5000 terms) and IDF weights are learned from the
# training text only; the test text is transformed with that fitted vectorizer.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Names produced by this cell for the modelling cells below:
#   X_train_tfidf, X_test_tfidf, y_train, y_test
print("Pipeline ready: TF-IDF features created.")


Pipeline ready: TF-IDF features created.


In [3]:
from lightgbm import LGBMClassifier

# Hyperparameters for the first LightGBM run, collected in one place so they
# are easy to read and compare against later runs.
lgb_model_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'n_estimators': 300,
    'learning_rate': 0.1,
    'num_leaves': 40,
}

# Build the classifier and fit it on the TF-IDF training matrix.
lgb_model = LGBMClassifier(**lgb_model_params)
lgb_model.fit(X_train_tfidf, y_train)

# Predicted class labels for the held-out test matrix.
lgb_pred = lgb_model.predict(X_test_tfidf)


[LightGBM] [Info] Number of positive: 321872, number of negative: 45942
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.770322 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 815993
[LightGBM] [Info] Number of data points in the train set: 367814, number of used features: 5000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.875094 -> initscore=1.946774
[LightGBM] [Info] Start training from score 1.946774




In [4]:
from sklearn.metrics import accuracy_score, classification_report

# Score the predictions currently held in `lgb_pred` against the test labels:
# first the overall accuracy, then the per-class precision/recall/F1 table.
print("LGBM Accuracy:", accuracy_score(y_true=y_test, y_pred=lgb_pred))

lgb_report = classification_report(y_true=y_test, y_pred=lgb_pred)
print(lgb_report)


LGBM Accuracy: 0.9452443613110904
              precision    recall  f1-score   support

           0       0.85      0.68      0.76     11486
           1       0.96      0.98      0.97     80468

    accuracy                           0.95     91954
   macro avg       0.90      0.83      0.86     91954
weighted avg       0.94      0.95      0.94     91954



In [6]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

# Second LightGBM run: same features, tree count and leaf count as the first
# model cell, but with a smaller learning rate (0.05 instead of 0.1).
# The objective is spelled out so the two configurations differ only in the
# learning rate.
# NOTE: `lgb` is the fitted estimator here, not an alias for the `lightgbm`
# module.
lgb = LGBMClassifier(
    objective='binary',
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=40
)

lgb.fit(X_train_tfidf, y_train)

# NOTE: this rebinds `lgb_pred`, which the first model cell also assigns.
# Any evaluation cell executed after this point scores THIS model's
# predictions, so re-run the first model cell before re-running its report.
lgb_pred = lgb.predict(X_test_tfidf)
lgb_acc = accuracy_score(y_test, lgb_pred)

print("LightGBM Accuracy:", lgb_acc)

# Accuracy alone is not enough to compare the two runs: roughly 87.5% of the
# training rows are positive, so a high accuracy can hide weak performance on
# the negative class. Print the same per-class report used for the first model.
print(classification_report(y_test, lgb_pred))


[LightGBM] [Info] Number of positive: 321872, number of negative: 45942
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.853165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 815993
[LightGBM] [Info] Number of data points in the train set: 367814, number of used features: 5000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.875094 -> initscore=1.946774
[LightGBM] [Info] Start training from score 1.946774




LightGBM Accuracy: 0.9406442351610588
