In [None]:
import pickle
from imblearn.over_sampling import RandomOverSampler
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Fit a logistic-regression classifier for the 'overall' rating column using
# pre-computed feature vectors, with random oversampling of the training
# split, then persist the model and print a held-out classification report.
# ---------------------------------------------------------------------------

# Input artefacts and the output location, gathered in one place.
TOKENIZED_CSV_PATH = '../../data/processed/tokenized_data.csv'
VECTORIZED_NPY_PATH = '../../data/processed/vectorized_features.npy'
MODEL_PICKLE_PATH = 'logistic_regression_model_with_oversampling2.pkl'

# One seed shared by the split and the oversampler so runs are repeatable.
RANDOM_STATE = 42

# The CSV supplies the labels; the .npy file supplies the feature matrix.
# NOTE(review): rows of both files are assumed to be in the same order --
# confirm against the preprocessing step that produced them.
df = pd.read_csv(TOKENIZED_CSV_PATH)
vectorized_features = np.load(VECTORIZED_NPY_PATH)

# Wrap the matrix in a DataFrame and give each column an explicit name.
feature_vectors = pd.DataFrame(vectorized_features)
_, n_features = feature_vectors.shape
feature_vectors.columns = [f'vec_feature_{i}' for i in range(n_features)]

X = feature_vectors  # model inputs
y = df['overall']    # prediction target

# Keep 20% of the rows aside for evaluation.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Balance the classes by random oversampling. Only the training split is
# resampled, so the test split keeps the distribution produced by the split.
oversampler = RandomOverSampler(
    sampling_strategy='auto',
    random_state=RANDOM_STATE,
)
X_train_resampled, y_train_resampled = oversampler.fit_resample(
    X_train,
    y_train,
)

# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression().fit(X_train_resampled, y_train_resampled)

# Score the untouched test rows.
y_pred3 = model.predict(X_test)

# Persist the fitted estimator for later reuse.
with open(MODEL_PICKLE_PATH, 'wb') as model_file:
    pickle.dump(model, model_file)

# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(y_test, y_pred3)
print(report)
