In [1]:
import os
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Read the pre-vectorized feature matrix from the sibling data directory.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you trust.
x_path = os.path.join(
    "..", "data", "X_tfidf_all.pkl"
)
X_tfidf = joblib.load(x_path)

In [3]:
# Read the label vector that is split together with X_tfidf further down.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you trust.
y_path = os.path.join(
    "..", "data", "y_all.pkl"
)
y = joblib.load(y_path)

In [4]:
# Hold out 20% of the samples for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1311,
)

In [5]:
# 1. Logistic Regression
# class_weight='balanced' reweights classes inversely to their frequency, and
# the solver is allowed up to 1000 iterations.
model_log = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=1311,
)
# fit() returns the estimator itself, so training and prediction are chained;
# model_log still refers to the fitted model afterwards.
y_pred_log = model_log.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print(" Logistic Regression ")
print(classification_report(y_test, y_pred_log))

 Logistic Regression 
              precision    recall  f1-score   support

    negative       0.61      0.71      0.65     11413
     neutral       0.23      0.57      0.33      5951
    positive       0.96      0.80      0.87     61352

    accuracy                           0.77     78716
   macro avg       0.60      0.69      0.62     78716
weighted avg       0.86      0.77      0.80     78716



In [6]:
# 2. Random Forest
# Same balanced class weighting and seed as the logistic-regression baseline;
# n_jobs=-1 lets scikit-learn use every available CPU core.
model_rf = RandomForestClassifier(
    n_jobs=-1,
    class_weight='balanced',
    random_state=1311,
)
# fit() returns the estimator itself, so training and prediction are chained;
# model_rf still refers to the fitted model afterwards.
y_pred_rf = model_rf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print("\n=== Random Forest ===")
print(classification_report(y_test, y_pred_rf))


=== Random Forest ===
              precision    recall  f1-score   support

    negative       0.82      0.36      0.50     11413
     neutral       0.55      0.02      0.05      5951
    positive       0.83      0.99      0.90     61352

    accuracy                           0.83     78716
   macro avg       0.73      0.46      0.48     78716
weighted avg       0.81      0.83      0.78     78716



In [None]:
# 3. XGBoost
# The xgboost import in the first cell is commented out, so the names are
# imported here; otherwise this cell raises NameError on a fresh kernel.
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# XGBClassifier requires integer class ids 0..n_classes-1, but the labels in
# this notebook are strings ('negative' / 'neutral' / 'positive'). Encode them
# for training and decode the predictions afterwards so the report shows the
# same class names as the other models.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)

# use_label_encoder is deprecated in recent xgboost releases, so it is omitted.
# NOTE(review): unlike the two models above, no class weighting is applied
# here; consider passing sample_weight to fit() for a like-for-like comparison.
model_xgb = XGBClassifier(eval_metric='mlogloss', random_state=1311)
model_xgb.fit(X_train, y_train_enc)
y_pred_xgb = label_encoder.inverse_transform(model_xgb.predict(X_test))

# Per-class precision / recall / F1 on the held-out split.
print("\n=== XGBoost ===")
print(classification_report(y_test, y_pred_xgb))