## Logistic Regression

In [2]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load your CSV data into a pandas DataFrame
df = pd.read_csv('../../../data/processed/tokenized_data.csv')

# Load the vectorized features from the .npy file
vectorized_features = np.load('../../../data/processed/vectorized_features_with_summary_tokens.npy')

feature_vectors = pd.DataFrame(vectorized_features)


feature_vectors.columns = [f'vec_feature_{i}' for i in range(feature_vectors.shape[1])]

X = feature_vectors # Feature vector
y = df['overall']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict the target variable for the test set
y_pred = model.predict(X_test)

# Save the trained model to a file
with open('logistic_regression_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Logistic Regression TFIDF matrix

In [21]:
from scipy.sparse import load_npz
df = pd.read_csv('../../../data/processed/tokenized_data.csv')

tfidf_matrix_reviewOnly = load_npz('../../../data/processed/tfidf_matrix_reviewOnly.npz')
labels = df['overall'].values

with open('../../../data/processed/feature_names_100vocab.pkl', 'rb') as f:
    feature_names_100vocab = pickle.load(f)

X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix_reviewOnly, labels, test_size=0.2, random_state=42)

lr_classifier_reviewOnly = LogisticRegression(random_state=42)

# Train the classifier
lr_classifier_reviewOnly.fit(X_train, y_train)

# Predict on the test data
y_pred = lr_classifier_reviewOnly.predict(X_test)

# Save the trained model to a file
with open('logistic_regression_model_reviewOnly.pkl', 'wb') as file:
    pickle.dump(model, file)

In [22]:
# Load the trained logistic regression model from the file
with open('logistic_regression_model_reviewOnly.pkl', 'rb') as file:
    model = pickle.load(file)

# Calculate classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.47      0.58      0.52     20917
         2.0       0.27      0.01      0.03     12765
         3.0       0.32      0.14      0.19     19019
         4.0       0.43      0.16      0.23     28682
         5.0       0.66      0.93      0.77     88965

    accuracy                           0.60    170348
   macro avg       0.43      0.36      0.35    170348
weighted avg       0.53      0.60      0.53    170348



## Logistic Regression (w/ class weighting)

In [24]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
                                        class_weight = "balanced",
                                        classes = np.unique(y_train),
                                        y = y_train                                                    
                                    )
class_weights = dict(zip(np.unique(y_train), class_weights))

# Initialize and train the logistic regression model with class weights
model = LogisticRegression(class_weight=class_weights)
model.fit(X_train, y_train)

y_pred4 = model.predict(X_train)

# Save the trained model to a file
with open('logistic_regression_model_with_class_weights.pkl', 'wb') as file:
    pickle.dump(model, file)


In [26]:
# Load the trained logistic regression model from the file
with open('logistic_regression_model_with_class_weights.pkl', 'rb') as file:
    model = pickle.load(file)

# Evaluate the model
print(classification_report(y_))
accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.5025946885199709


## Logistic Regression (w/ undersampling the majority class)

In [17]:
from imblearn.under_sampling import RandomUnderSampler

# Undersample the majority class (5-star reviews) automatically
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train_resampled, y_train_resampled)

# Predict the target variable for the test set
y_pred2 = model.predict(X_test)

# Save the trained model to a file
with open('logistic_regression_model_with_undersampling.pkl', 'wb') as file:
    pickle.dump(model, file)

In [15]:
# Load the trained logistic regression model from the file
with open('logistic_regression_model_with_undersampling.pkl', 'rb') as file:
    model = pickle.load(file)

print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

         1.0       0.37      0.67      0.48     20917
         2.0       0.20      0.25      0.22     12765
         3.0       0.26      0.26      0.26     19019
         4.0       0.32      0.37      0.34     28682
         5.0       0.82      0.59      0.69     88965

    accuracy                           0.50    170348
   macro avg       0.39      0.43      0.40    170348
weighted avg       0.57      0.50      0.52    170348



## Logistic Regression (w/ oversampling minority classes)

In [18]:
from imblearn.over_sampling import RandomOverSampler

# Oversample the minority classes (1 to 4-star reviews) automatically
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train_resampled, y_train_resampled)

# Predict the target variable for the test set
y_pred3 = model.predict(X_test)

# Save the trained model to a file
with open('logistic_regression_model_with_oversampling.pkl', 'wb') as file:
    pickle.dump(model, file)


In [19]:
# Load the trained logistic regression model from the file
with open('logistic_regression_model_with_oversampling.pkl', 'rb') as file:
    model = pickle.load(file)

print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

         1.0       0.37      0.67      0.48     20917
         2.0       0.20      0.25      0.22     12765
         3.0       0.26      0.26      0.26     19019
         4.0       0.32      0.37      0.34     28682
         5.0       0.82      0.59      0.69     88965

    accuracy                           0.50    170348
   macro avg       0.39      0.43      0.40    170348
weighted avg       0.57      0.50      0.52    170348



## Logistic Regression (w/ SMOTE sampling technique)

In [8]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train_resampled, y_train_resampled)

# Predict the target variable for the test set
y_pred = model.predict(X_test)

# Save the trained model to a file
with open('logistic_regression_model_with_SMOTE.pkl', 'wb') as file:
    pickle.dump(model, file)

OSError: [WinError -1066598274] Windows Error 0xc06d007e

In [9]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.49      0.59      0.53     20917
         2.0       0.28      0.31      0.29     12765
         3.0       0.37      0.38      0.37     19019
         4.0       0.41      0.49      0.45     28682
         5.0       0.83      0.71      0.77     88965

    accuracy                           0.59    170348
   macro avg       0.47      0.50      0.48    170348
weighted avg       0.62      0.59      0.60    170348

