## Logistic Regression

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Processed CSV sample; only its 'overall' column is used below (as the target)
df = pd.read_csv('../../../data/processed/tokenized_data_sample_with_custom_stopwords.csv')

# Pre-computed feature matrix stored as a NumPy array on disk
vectorized_features = np.load('../../../data/processed/sample_vectorized_features_with_custom_stopwords_word2vec.npy')

# Wrap the array in a DataFrame whose columns are named
# vec_feature_0, vec_feature_1, ... (prefix + positional column label)
feature_vectors = pd.DataFrame(vectorized_features).add_prefix('vec_feature_')

X = feature_vectors  # model inputs
y = df['overall']    # labels to predict

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Baseline: logistic regression (liblinear solver) fitted on the raw training split,
# with no handling of class imbalance.
# NOTE(review): recent scikit-learn releases reportedly deprecate liblinear for
# multiclass targets — confirm against the installed version.
model = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = model.predict(X_test)

In [5]:
# Per-class precision / recall / F1 of `y_pred` against the held-out labels
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         1.0       0.67      0.39      0.49      1236
         2.0       0.73      0.19      0.31       759
         3.0       0.59      0.27      0.37      1082
         4.0       0.51      0.28      0.36      1669
         5.0       0.65      0.95      0.77      5254

    accuracy                           0.64     10000
   macro avg       0.63      0.41      0.46     10000
weighted avg       0.63      0.64      0.59     10000



## Logistic Regression (w/ class weighting)

In [6]:
from sklearn.utils.class_weight import compute_class_weight

# Distinct labels present in the training split (computed once, reused below)
train_classes = np.unique(y_train)

# Map each training label to its "balanced" weight as returned by compute_class_weight
class_weights = dict(
    zip(
        train_classes,
        compute_class_weight(class_weight="balanced", classes=train_classes, y=y_train),
    )
)

# Same liblinear logistic regression as before, now with the per-class weights applied
model = LogisticRegression(solver='liblinear', class_weight=class_weights)
model.fit(X_train, y_train)


In [8]:
# Score whichever `model` was fitted last (the class-weighted one when the
# notebook is run top to bottom) on the held-out split
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         1.0       0.49      0.52      0.50      1236
         2.0       0.32      0.37      0.34       759
         3.0       0.33      0.43      0.37      1082
         4.0       0.44      0.40      0.42      1669
         5.0       0.80      0.74      0.77      5254

    accuracy                           0.60     10000
   macro avg       0.48      0.49      0.48     10000
weighted avg       0.61      0.60      0.60     10000



## Logistic Regression (w/ random undersampling of all non-minority classes)

In [9]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly undersample the training split only (the test split is left untouched).
# NOTE: for an under-sampler, sampling_strategy='auto' targets every class except
# the minority one, so all larger classes are reduced — not just the 5-star reviews.
undersampler = RandomUnderSampler(random_state=42, sampling_strategy='auto')
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Fit the liblinear logistic regression on the resampled training data
model = LogisticRegression(solver='liblinear').fit(X_train_resampled, y_train_resampled)

# Predicted labels for the original (unresampled) test rows
y_pred2 = model.predict(X_test)

In [10]:
# Per-class precision / recall / F1 of `y_pred2` against the held-out labels
print(classification_report(y_true=y_test, y_pred=y_pred2))

              precision    recall  f1-score   support

         1.0       0.42      0.53      0.47      1236
         2.0       0.26      0.45      0.33       759
         3.0       0.29      0.48      0.36      1082
         4.0       0.41      0.43      0.42      1669
         5.0       0.87      0.60      0.71      5254

    accuracy                           0.54     10000
   macro avg       0.45      0.50      0.46     10000
weighted avg       0.63      0.54      0.56     10000



## Logistic Regression (w/ oversampling minority classes)

In [15]:
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample the training split only (the test split is left untouched).
# For an over-sampler, sampling_strategy='auto' resamples every class except the
# majority one, i.e. the 1- to 4-star reviews here.
oversampler = RandomOverSampler(random_state=42, sampling_strategy='auto')
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Fit the liblinear logistic regression on the resampled training data
model = LogisticRegression(solver='liblinear').fit(X_train_resampled, y_train_resampled)

# Predicted labels for the original (unresampled) test rows
y_pred3 = model.predict(X_test)


In [16]:
# Per-class precision / recall / F1 of `y_pred3` against the held-out labels
print(classification_report(y_true=y_test, y_pred=y_pred3))

              precision    recall  f1-score   support

         1.0       0.44      0.53      0.48      1236
         2.0       0.26      0.46      0.34       759
         3.0       0.29      0.47      0.36      1082
         4.0       0.41      0.43      0.42      1669
         5.0       0.87      0.61      0.72      5254

    accuracy                           0.54     10000
   macro avg       0.45      0.50      0.46     10000
weighted avg       0.63      0.54      0.57     10000



: 

## Logistic Regression (w/ SMOTE sampling technique)

In [13]:
from imblearn.over_sampling import SMOTE

# Resample the training split with SMOTE (seeded for repeatability); the test
# split is left untouched
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Fit the liblinear logistic regression on the SMOTE-resampled training data
model = LogisticRegression(solver='liblinear').fit(X_train_resampled, y_train_resampled)

# Predicted labels for the original test rows (note: this overwrites `y_pred`)
y_pred = model.predict(X_test)

In [14]:
# Per-class precision / recall / F1 of `y_pred` against the held-out labels
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         1.0       0.44      0.54      0.48      1236
         2.0       0.26      0.45      0.33       759
         3.0       0.30      0.47      0.36      1082
         4.0       0.42      0.43      0.43      1669
         5.0       0.86      0.62      0.72      5254

    accuracy                           0.55     10000
   macro avg       0.46      0.50      0.47     10000
weighted avg       0.63      0.55      0.57     10000

