## Logistic Regression

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load your CSV data into a pandas DataFrame
df = pd.read_csv('../../../data/processed/tokenized_data_sample_with_custom_stopwords.csv')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['reviewTextTotal'], df['overall'], test_size=0.2, random_state=42)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

X_train.fillna('', inplace=True)
X_test.fillna('', inplace=True)

# Initialize and fit the CountVectorizer to transform text data into bag-of-words vectors
vectorizer = CountVectorizer()

X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [4]:
# Initialize and train the logistic regression model
model = LogisticRegression(solver='liblinear')
model.fit(X_train_bow, y_train)

# Predict the target variable for the test set
y_pred = model.predict(X_test_bow)

In [5]:
# Calculate classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.75      0.78      0.77      1236
         2.0       0.60      0.43      0.50       759
         3.0       0.57      0.55      0.56      1082
         4.0       0.63      0.52      0.57      1669
         5.0       0.85      0.93      0.89      5254

    accuracy                           0.77     10000
   macro avg       0.68      0.64      0.66     10000
weighted avg       0.75      0.77      0.76     10000



## Logistic Regression (w/ class weighting)

In [5]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
                                        class_weight = "balanced",
                                        classes = np.unique(y_train),
                                        y = y_train                                                    
                                    )
class_weights = dict(zip(np.unique(y_train), class_weights))

# Initialize and train the logistic regression model with class weights
model = LogisticRegression(class_weight=class_weights, solver='liblinear')
model.fit(X_train_bow, y_train)

In [6]:
# Evaluate the model
y_pred = model.predict(X_test_bow)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.73      0.77      0.75      1236
         2.0       0.51      0.49      0.50       759
         3.0       0.54      0.58      0.56      1082
         4.0       0.61      0.55      0.58      1669
         5.0       0.89      0.90      0.89      5254

    accuracy                           0.76     10000
   macro avg       0.66      0.66      0.66     10000
weighted avg       0.76      0.76      0.76     10000



: 

## Logistic Regression (w/ undersampling the majority class)

In [9]:
from imblearn.under_sampling import RandomUnderSampler

# Undersample the majority class (5-star reviews) automatically
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_bow, y_train)

# Initialize and train the logistic regression model
model = LogisticRegression(solver='liblinear')
model.fit(X_train_resampled, y_train_resampled)

# Predict the target variable for the test set
y_pred2 = model.predict(X_test_bow)

In [11]:
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

         1.0       0.73      0.74      0.73      1236
         2.0       0.42      0.53      0.47       759
         3.0       0.49      0.59      0.54      1082
         4.0       0.51      0.59      0.55      1669
         5.0       0.92      0.80      0.86      5254

    accuracy                           0.71     10000
   macro avg       0.61      0.65      0.63     10000
weighted avg       0.74      0.71      0.73     10000



## Logistic Regression (w/ oversampling minority classes)

In [12]:
from imblearn.over_sampling import RandomOverSampler

# Oversample the minority classes (1 to 4-star reviews) automatically
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_bow, y_train)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train_resampled, y_train_resampled)

# Predict the target variable for the test set
y_pred3 = model.predict(X_test_bow)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

         1.0       0.76      0.74      0.75      1236
         2.0       0.46      0.53      0.49       759
         3.0       0.51      0.59      0.55      1082
         4.0       0.53      0.60      0.57      1669
         5.0       0.91      0.83      0.87      5254

    accuracy                           0.73     10000
   macro avg       0.64      0.66      0.65     10000
weighted avg       0.75      0.73      0.74     10000



## Logistic Regression (w/ SMOTE sampling technique)

In [14]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_bow, y_train)

# Initialize and train the logistic regression model
model = LogisticRegression(solver='liblinear')
model.fit(X_train_resampled, y_train_resampled)

# Predict the target variable for the test set
y_pred = model.predict(X_test_bow)

In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.73      0.78      0.75      1236
         2.0       0.51      0.46      0.49       759
         3.0       0.52      0.58      0.55      1082
         4.0       0.57      0.55      0.56      1669
         5.0       0.89      0.87      0.88      5254

    accuracy                           0.74     10000
   macro avg       0.64      0.65      0.64     10000
weighted avg       0.74      0.74      0.74     10000

