# Imbalanced Review Dataset Preparation
The following steps preprocess and lemmatize the imbalanced data. No balancing is performed.

In [9]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.utils import resample

nltk.download('punkt')
nltk.download('wordnet')

# Column names
REVIEW_COL = "Text"
LABEL_COL = "Score"

IMBALANCED_PATH = r"D:\Projects\automated-review-rating-system\data\cleaned_dataset\imbalanced_data.csv"
IMBALANCED_SAVE_PATH = r"D:\Projects\automated-review-rating-system\data\cleaned_dataset\imbalanced_data_lemmatized.csv"




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Cleaning
Remove unwanted characters, lowercase all text, and ensure no missing values.



In [10]:
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = ''.join(char if char.isalpha() or char.isspace() else ' ' for char in text)
    text = ' '.join(text.split())
    return text

df_imbalanced[REVIEW_COL] = df_imbalanced[REVIEW_COL].apply(clean_text)
df_imbalanced = df_imbalanced.dropna(subset=[REVIEW_COL, LABEL_COL]).reset_index(drop=True)
df_imbalanced.head()



Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,review_length,lemmatized_text
0,249591,B001LGGH54,AORGKBNQZ83O8,"MacGuffin ""MacGuffin""",0,0,4,1235865600,Interesting,i m always down to review a beverage if it mee...,1709,i m always down to review a beverage if it mee...
1,542900,B0001HAEJY,A289SYWE4BHCF,akilah,0,0,5,1351209600,Great!,very good product great for your blood you wil...,196,very good product great for your blood you wil...
2,310716,B002QZ7ZBY,A8JB6RLAKR0T0,Mary,0,0,3,1340668800,Delicious chips but wrong order!,i love pita chips and i love stacy s unfortuna...,631,i love pita chip and i love stacy s unfortunat...
3,372491,B0083T6HC0,A2QCHBEXUBN2S8,"Alaskan ""Alaskan""",0,0,5,1346716800,K-Cups,i love the assorted k cups it is a great way t...,151,i love the assorted k cup it is a great way to...
4,164438,B0000E2T62,AZ1ZE53AR3EWO,Jane916,0,0,1,1323216000,Gumballs,the ad for gumballs on amazon indicated they w...,137,the ad for gumballs on amazon indicated they w...


## Text Lemmatization
Lemmatize every review so words are reduced to their root forms.


In [11]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized)

df_imbalanced["lemmatized_text"] = df_imbalanced[REVIEW_COL].apply(lemmatize_text)
df_imbalanced.head()


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,review_length,lemmatized_text
0,249591,B001LGGH54,AORGKBNQZ83O8,"MacGuffin ""MacGuffin""",0,0,4,1235865600,Interesting,i m always down to review a beverage if it mee...,1709,i m always down to review a beverage if it mee...
1,542900,B0001HAEJY,A289SYWE4BHCF,akilah,0,0,5,1351209600,Great!,very good product great for your blood you wil...,196,very good product great for your blood you wil...
2,310716,B002QZ7ZBY,A8JB6RLAKR0T0,Mary,0,0,3,1340668800,Delicious chips but wrong order!,i love pita chips and i love stacy s unfortuna...,631,i love pita chip and i love stacy s unfortunat...
3,372491,B0083T6HC0,A2QCHBEXUBN2S8,"Alaskan ""Alaskan""",0,0,5,1346716800,K-Cups,i love the assorted k cups it is a great way t...,151,i love the assorted k cup it is a great way to...
4,164438,B0000E2T62,AZ1ZE53AR3EWO,Jane916,0,0,1,1323216000,Gumballs,the ad for gumballs on amazon indicated they w...,137,the ad for gumballs on amazon indicated they w...


## Save Processed Imbalanced Dataset
Save the cleaned and lemmatized imbalanced dataset for use in model training and evaluation.

In [12]:
save_columns = [REVIEW_COL, "lemmatized_text", LABEL_COL]
df_imbalanced.to_csv(IMBALANCED_SAVE_PATH, index=False, columns=save_columns)
print(f"Imbalanced dataset saved to: {IMBALANCED_SAVE_PATH}")


Imbalanced dataset saved to: D:\Projects\automated-review-rating-system\data\cleaned_dataset\imbalanced_data_lemmatized.csv


## Feature Extraction
Using TF-IDF vectorization on the lemmatized text column to convert reviews into numerical features for model training.

In [17]:
REVIEW_COL = "Text"
LEMMATIZED_COL = "lemmatized_text"
LABEL_COL = "Score"

# Load your data
df_imbalanced = pd.read_csv('data\cleaned_dataset\imbalanced_data_lemmatized.csv')

# Now you can safely use
X = df_imbalanced[LEMMATIZED_COL]
y = df_imbalanced[LABEL_COL]



## Train-Test Split
Split data into training and test sets for unbiased evaluation.

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)
X_vec = tfidf.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42, stratify=y)


## Model Training
Train a Logistic Regression model on the imbalanced dataset.

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Use class_weight='balanced'
lr = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.375
Confusion Matrix:
 [[17  8  8  4  3]
 [23 13  9  7  8]
 [ 8 23 36 22 11]
 [ 7 12 29 43 29]
 [ 7  7  6 19 41]]
Classification Report:
               precision    recall  f1-score   support

           1       0.27      0.42      0.33        40
           2       0.21      0.22      0.21        60
           3       0.41      0.36      0.38       100
           4       0.45      0.36      0.40       120
           5       0.45      0.51      0.48        80

    accuracy                           0.38       400
   macro avg       0.36      0.37      0.36       400
weighted avg       0.39      0.38      0.38       400



GridSearchCV for hyperparameter tuning

In [27]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "solver": ["liblinear", "lbfgs"],
    "max_iter": [500, 1000],
    "class_weight": ["balanced"]  # Ensure imbalance handling
}

lr = LogisticRegression(random_state=42)
grid = GridSearchCV(lr, param_grid, scoring="accuracy", cv=5, n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Best parameters: {grid.best_params_}")
print(f"Best cross-validation accuracy: {grid.best_score_:.4f}")

# Evaluate the best model
best_lr = grid.best_estimator_
y_pred_best = best_lr.predict(X_test)
print("Tuned Model Accuracy:", accuracy_score(y_test, y_pred_best))
print("Classification Report:\n", classification_report(y_test, y_pred_best))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters: {'C': 1, 'class_weight': 'balanced', 'max_iter': 500, 'solver': 'liblinear'}
Best cross-validation accuracy: 0.4381
Tuned Model Accuracy: 0.405
Classification Report:
               precision    recall  f1-score   support

           1       0.35      0.38      0.36        40
           2       0.23      0.20      0.21        60
           3       0.41      0.41      0.41       100
           4       0.45      0.42      0.43       120
           5       0.46      0.55      0.50        80

    accuracy                           0.41       400
   macro avg       0.38      0.39      0.38       400
weighted avg       0.40      0.41      0.40       400



Fetch best model

In [28]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

best_lr = grid.best_estimator_
y_pred_best = best_lr.predict(X_test)

print("Tuned Model Accuracy:", accuracy_score(y_test, y_pred_best))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))


Tuned Model Accuracy: 0.405

Confusion Matrix:
 [[15  8 10  5  2]
 [17 12 14  7 10]
 [ 4 19 41 26 10]
 [ 5  8 28 50 29]
 [ 2  6  6 22 44]]

Classification Report:
               precision    recall  f1-score   support

           1       0.35      0.38      0.36        40
           2       0.23      0.20      0.21        60
           3       0.41      0.41      0.41       100
           4       0.45      0.42      0.43       120
           5       0.46      0.55      0.50        80

    accuracy                           0.41       400
   macro avg       0.38      0.39      0.38       400
weighted avg       0.40      0.41      0.40       400



In [29]:
import joblib

# Save the best Logistic Regression model after tuning
joblib.dump(best_lr, "model_B_imbalanced.pkl")

# Save the TF-IDF vectorizer (make sure to use the one used for train/test split)
joblib.dump(tfidf, "vectorizer_model_B.pkl")

print("Model and vectorizer saved as model_B_imbalanced.pkl and vectorizer_model_B.pkl")


Model and vectorizer saved as model_B_imbalanced.pkl and vectorizer_model_B.pkl
