In [1]:
# Data Manipulation
import pandas as pd

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score, silhouette_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Load Dataset

In [3]:
# Training split. low_memory=False makes pandas infer each column's dtype from the
# whole file instead of chunk by chunk, and matches how the test and validation
# splits are read, so all three frames are parsed the same way.
df_train = pd.read_csv("00_dataset/without_stopwords/train_features.csv", low_memory=False)

In [5]:
# Held-out splits. Each file is parsed in a single pass (low_memory=False) so that
# column dtypes are inferred from the full column rather than per chunk.
df_val = pd.read_csv("00_dataset/without_stopwords/val_features.csv", low_memory=False)
df_test = pd.read_csv("00_dataset/without_stopwords/test_features.csv", low_memory=False)

In [4]:
# Preview the first five training rows to sanity-check the loaded columns
df_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,sentiment_score,vader_score,review_length,exclamation_count,question_count,uppercase_ratio,duplicate_word_count,emoji_count,avg_word_length,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.194444,0.9611,73,0,0,0.027708,23,0,4.452055,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.187037,0.9422,46,1,0,0.030534,12,0,4.717391,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.147253,0.7906,38,0,0,0.031579,9,0,4.026316,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.253842,0.9874,108,8,0,0.076923,41,0,4.185185,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.257143,0.8903,53,0,0,0.040134,21,0,4.660377,0


In [6]:
# Separate into X and y.
#
# NOTE(review): df_train.head() shows an "Unnamed: 0" column holding 0, 1, 2, ... --
# presumably the row index that was written out when the CSVs were saved. It is a
# row counter, not a signal about the review, so it must not be fed to the model.
# It is dropped with errors="ignore" so the cell still works if a split file was
# exported without that column; "label" is dropped strictly so a missing target
# still raises.
X_train = df_train.drop(columns=["label"]).drop(columns=["Unnamed: 0"], errors="ignore")
y_train = df_train["label"]

X_test = df_test.drop(columns=["label"]).drop(columns=["Unnamed: 0"], errors="ignore")
y_test = df_test["label"]

X_val = df_val.drop(columns=["label"]).drop(columns=["Unnamed: 0"], errors="ignore")
y_val = df_val["label"]

In [7]:
# Compare the label proportions of the three splits to confirm they are balanced
# against each other (each split is printed as a normalized value_counts table).
for split_title, split_labels in (
    ("Train", y_train),
    ("\nValidation", y_val),
    ("\nTest", y_test),
):
    print(f"{split_title} Class Distribution:\n", split_labels.value_counts(normalize=True))

Train Class Distribution:
 label
0    0.867799
1    0.132201
Name: proportion, dtype: float64

Validation Class Distribution:
 label
0    0.867797
1    0.132203
Name: proportion, dtype: float64

Test Class Distribution:
 label
0    0.867797
1    0.132203
Name: proportion, dtype: float64


In [None]:
# Baseline model: logistic regression fitted on the training features.
# saga is a stochastic solver (it shuffles the data), so random_state is pinned to
# make the fitted coefficients -- and every metric computed from them -- repeatable
# across kernel restarts.
# NOTE(review): the feature columns are on very different scales (e.g. review_length
# in the tens vs. uppercase_ratio near 0.03); saga converges slowly on unscaled
# inputs, so watch for a ConvergenceWarning here.
lr = LogisticRegression(max_iter=1000, solver='saga', random_state=42)

lr.fit(X_train, y_train)

In [None]:
# Score the test set once: hard class predictions for the threshold-based metrics,
# and the predicted probability of class 1 (second predict_proba column) for ROC-AUC.
y_pred = lr.predict(X_test)
y_proba = lr.predict_proba(X_test)[:, 1]

# Baseline performance: overall accuracy followed by per-class precision/recall/F1
print("Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred))
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))

# Threshold-independent ranking quality
print("ROC-AUC Score: ", roc_auc_score(y_true=y_test, y_score=y_proba))

TypeError: 'LogisticRegression' object is not callable

In [None]:
# Tabulate true vs. predicted labels for the test set
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Draw the matrix as a heatmap, labelling both axes with the two class ids
disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1])
disp.plot(cmap="Blues")  # sequential blue palette: darker cell = more samples