In [4]:
import pandas as pd
import numpy as np
import nltk

# Download both 'punkt' and 'punkt_tab' – this is now required!
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')  # (for lemmatization, as before)


from nltk.stem import WordNetLemmatizer
from sklearn.utils import resample



# File paths
DATA_PATH = r"D:\Projects\automated-review-rating-system\data\cleaned_dataset\balanced_data.csv"
BALANCED_SAVE_PATH = r"D:\Projects\automated-review-rating-system\data\cleaned_dataset\balanced_data_lemmatized.csv"

# Column names
REVIEW_COL = "Text"
LABEL_COL = "Score"

df = pd.read_csv(DATA_PATH)
df.head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,review_length
0,171065,B00009ZIY2,A362H8YI6T7B2I,Patricia J Gilchrist,0,0,1,1322438400,not what we wanted...,We have two very picky cats who loved the old ...,343
1,9366,B006N3IG4K,A1KJDUS91L5OOE,bears22687,0,1,1,1343952000,not good,The coffee tasted bitter and like it was burnt...,136
2,410515,B000FIXYDC,A2C8O554YMY2ZL,"Bixby ""Bixby""",1,2,1,1315785600,Strange taste and texture,I purchased a can that the store before I boug...,343
3,482561,B000G2UUDO,A145B9FRIAAHAY,Rita's talented tastebuds,3,6,1,1278288000,AWFUL!!! AWFUL!!!,This product is a perfect example for the erro...,1096
4,120766,B005K4Q37A,A1UQFVHBQJ2K8Z,Brenda,2,5,1,1325808000,Grove Square Cappuchino,I found that Grove Square French Vanilla Cappu...,430


Data Cleaning
We remove unwanted characters and lower-case the review text for uniformity. Missing values are handled as well.

In [5]:
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = ''.join(char if char.isalpha() or char.isspace() else ' ' for char in text)
    text = ' '.join(text.split())
    return text

df[REVIEW_COL] = df[REVIEW_COL].apply(clean_text)
df = df.dropna(subset=[REVIEW_COL, LABEL_COL]).reset_index(drop=True)
df.head()


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,review_length
0,171065,B00009ZIY2,A362H8YI6T7B2I,Patricia J Gilchrist,0,0,1,1322438400,not what we wanted...,we have two very picky cats who loved the old ...,343
1,9366,B006N3IG4K,A1KJDUS91L5OOE,bears22687,0,1,1,1343952000,not good,the coffee tasted bitter and like it was burnt...,136
2,410515,B000FIXYDC,A2C8O554YMY2ZL,"Bixby ""Bixby""",1,2,1,1315785600,Strange taste and texture,i purchased a can that the store before i boug...,343
3,482561,B000G2UUDO,A145B9FRIAAHAY,Rita's talented tastebuds,3,6,1,1278288000,AWFUL!!! AWFUL!!!,this product is a perfect example for the erro...,1096
4,120766,B005K4Q37A,A1UQFVHBQJ2K8Z,Brenda,2,5,1,1325808000,Grove Square Cappuchino,i found that grove square french vanilla cappu...,430


Text Lemmatization
Lemmatize each review to reduce words to their root forms.

In [6]:
import nltk

print(nltk.data.path)  # Check where NLTK is looking for resources
nltk.download('punkt', quiet=False, force=True)
nltk.download('wordnet', quiet=False, force=True)

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized)

df["lemmatized_text"] = df[REVIEW_COL].apply(lemmatize_text)
df.head()



['C:\\Users\\HP/nltk_data', 'c:\\Users\\HP\\anaconda3\\nltk_data', 'c:\\Users\\HP\\anaconda3\\share\\nltk_data', 'c:\\Users\\HP\\anaconda3\\lib\\nltk_data', 'C:\\Users\\HP\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,review_length,lemmatized_text
0,171065,B00009ZIY2,A362H8YI6T7B2I,Patricia J Gilchrist,0,0,1,1322438400,not what we wanted...,we have two very picky cats who loved the old ...,343,we have two very picky cat who loved the old o...
1,9366,B006N3IG4K,A1KJDUS91L5OOE,bears22687,0,1,1,1343952000,not good,the coffee tasted bitter and like it was burnt...,136,the coffee tasted bitter and like it wa burnt ...
2,410515,B000FIXYDC,A2C8O554YMY2ZL,"Bixby ""Bixby""",1,2,1,1315785600,Strange taste and texture,i purchased a can that the store before i boug...,343,i purchased a can that the store before i boug...
3,482561,B000G2UUDO,A145B9FRIAAHAY,Rita's talented tastebuds,3,6,1,1278288000,AWFUL!!! AWFUL!!!,this product is a perfect example for the erro...,1096,this product is a perfect example for the erro...
4,120766,B005K4Q37A,A1UQFVHBQJ2K8Z,Brenda,2,5,1,1325808000,Grove Square Cappuchino,i found that grove square french vanilla cappu...,430,i found that grove square french vanilla cappu...


Data Balancing

In [7]:
dfs = []
max_count = df[LABEL_COL].value_counts().max()
for label in df[LABEL_COL].unique():
    df_label = df[df[LABEL_COL] == label]
    df_upsampled = resample(df_label, replace=True, n_samples=max_count, random_state=42)
    dfs.append(df_upsampled)
df_balanced = pd.concat(dfs).sample(frac=1, random_state=42).reset_index(drop=True)
df_balanced[LABEL_COL].value_counts()
df_balanced.head()


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,review_length,lemmatized_text
0,298500,B006N3I0DM,A254CKI391CFLO,Michael,0,0,4,1300838400,Emeril's Bold - great cup of coffee,i keep this in my regular rotation of keurig b...,201,i keep this in my regular rotation of keurig b...
1,121863,B0014X8WIE,AJ5XH2V209QFJ,Donald L. Nagle,3,3,3,1254960000,Very Vanilla,now that this is made in mexico it s much swee...,159,now that this is made in mexico it s much swee...
2,535050,B003DSBCZS,A1L6T8GSSV7BC1,grant,0,0,1,1339545600,not as discribed,these are not as described these are not the c...,145,these are not a described these are not the ch...
3,423930,B0067TI2EY,A1R4IVSZIDVGFZ,Alison Tripp,1,1,3,1326067200,Starbucks K cup,disappointed in the product overly expensive a...,142,disappointed in the product overly expensive a...
4,13810,B0008D6XH8,A2Q1OFIKIEOYGD,"MendoMama ""MendoMama""",0,0,3,1349568000,Adds a nice flavor to plain drinking water.,i use this lemon oil to add a few drops to my ...,303,i use this lemon oil to add a few drop to my d...


## Save the Processed Balanced Dataset
Export the cleaned, lemmatized, and balanced dataset for model training and evaluation.

In [8]:
save_columns = [REVIEW_COL, "lemmatized_text", LABEL_COL]
df_balanced.to_csv(BALANCED_SAVE_PATH, index=False, columns=save_columns)
print(f"Balanced dataset saved to: {BALANCED_SAVE_PATH}")


Balanced dataset saved to: D:\Projects\automated-review-rating-system\data\cleaned_dataset\balanced_data_lemmatized.csv


Data Cleaning

In [9]:
import spacy
import re
nlp = spacy.load('en_core_web_sm')

nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    """
    Lowercase, remove URLs, HTML tags, emojis, punctuation, non-ASCII, digits, extra whitespace.
    """
    text = str(text).lower()
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text) # Remove emojis/unicode
    text = re.sub(r'[^\w\s]', '', text)        # Remove punctuation
    text = re.sub(r'\d+', '', text)            # Remove digits
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def lemmatize_text(text):
    """
    Lemmatize using spaCy, remove stopwords, keep alphabetic tokens longer than 1 character.
    """
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha and len(token.lemma_) > 1]
    return ' '.join(tokens)

df_balanced['clean_text'] = df_balanced['Text'].apply(clean_text)
df_balanced['lemmatized'] = df_balanced['clean_text'].apply(lemmatize_text)
print(df_balanced[['Text', 'clean_text', 'lemmatized']].head())


                                                Text  \
0  i keep this in my regular rotation of keurig b...   
1  now that this is made in mexico it s much swee...   
2  these are not as described these are not the c...   
3  disappointed in the product overly expensive a...   
4  i use this lemon oil to add a few drops to my ...   

                                          clean_text  \
0  i keep this in my regular rotation of keurig b...   
1  now that this is made in mexico it s much swee...   
2  these are not as described these are not the c...   
3  disappointed in the product overly expensive a...   
4  i use this lemon oil to add a few drops to my ...   

                                          lemmatized  
0  regular rotation keurig bold coffee amazon sub...  
1  mexico sweet old nesquick vanilla like strong ...  
2  describe cherry fill gum product taste terribl...  
3  disappoint product overly expensive taste like...  
4  use lemon oil add drop drinking water refreshi..

In [10]:
print(df_balanced.columns)

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text',
       'review_length', 'lemmatized_text', 'clean_text', 'lemmatized'],
      dtype='object')


 Split Data

In [11]:
from sklearn.model_selection import train_test_split

X = df_balanced['lemmatized']  # Using lemmatized column
y = df_balanced['Score']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True
)

print("Train distribution:\n", y_train.value_counts())
print("Test distribution:\n", y_test.value_counts())


Train distribution:
 Score
1    1600
4    1600
3    1600
2    1600
5    1600
Name: count, dtype: int64
Test distribution:
 Score
1    400
2    400
3    400
5    400
4    400
Name: count, dtype: int64


TF-IDF Vectorization

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

tfidf = TfidfVectorizer()
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

print("TF-IDF train shape:", X_train_vec.shape)
print("TF-IDF test shape:", X_test_vec.shape)


TF-IDF train shape: (8000, 12138)
TF-IDF test shape: (2000, 12138)


Model Training with Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  

clf = LogisticRegression(max_iter=1000, random_state=42)
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=3))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.66

Classification Report:
               precision    recall  f1-score   support

           1      0.720     0.715     0.718       400
           2      0.635     0.625     0.630       400
           3      0.628     0.603     0.615       400
           4      0.619     0.625     0.622       400
           5      0.696     0.733     0.714       400

    accuracy                          0.660      2000
   macro avg      0.659     0.660     0.660      2000
weighted avg      0.659     0.660     0.660      2000

Confusion Matrix:
 [[286  53  19  16  26]
 [ 61 250  54  22  13]
 [ 34  45 241  55  25]
 [  9  31  46 250  64]
 [  7  15  24  61 293]]


Hyper Parameter Tuning 

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import pandas as pd


X = df_balanced['lemmatized']
y = df_balanced['Score']

# Apply SMOTE only to numeric vectors (for balanced training)
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_vec, y_train)

# Logistic Regression model
logreg = LogisticRegression()

# Hyperparameter search space
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],   # Regularization strength
    'penalty': ['l1', 'l2'],        # Norm type
    'solver': ['liblinear'],        # Solver that supports l1 and l2 penalties
    'max_iter': [100, 200, 300]
}

# GridSearchCV
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1
)

# 7️⃣ Fit the best model on balanced data
grid_search.fit(X_train_bal, y_train_bal)
best_model = grid_search.best_estimator_

print("\nBest Hyperparameters:", grid_search.best_params_)

# Predictions on test set (using original test vectorized data)
y_pred = best_model.predict(X_test_vec)

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\n Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Fitting 5 folds for each of 30 candidates, totalling 150 fits

Best Hyperparameters: {'C': 10, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}

Classification Report:
              precision    recall  f1-score   support

           1       0.80      0.79      0.79       400
           2       0.71      0.72      0.72       400
           3       0.72      0.73      0.73       400
           4       0.72      0.74      0.73       400
           5       0.79      0.77      0.78       400

    accuracy                           0.75      2000
   macro avg       0.75      0.75      0.75      2000
weighted avg       0.75      0.75      0.75      2000


 Confusion Matrix:
[[314  35  20  13  18]
 [ 41 288  36  24  11]
 [ 18  36 294  31  21]
 [ 15  24  36 295  30]
 [  4  20  25  45 306]]


In [22]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = joblib.load('vectorizer_model_A.pkl')

# Save the best trained balanced model as Model A
joblib.dump(best_model, 'model_A_balanced.pkl')

# Save the TF-IDF vectorizer used for training
joblib.dump(vectorizer, 'vectorizer_model_A.pkl')

print("✅ Model A and its vectorizer have been saved successfully!")


✅ Model A and its vectorizer have been saved successfully!
