In [6]:
import pandas as pd

# Read your cleaned, balanced dataset
bal_path = r'D:\Projects\automated-review-rating-system\data\cleaned_dataset\balanced_data.csv'
df_bal = pd.read_csv(bal_path)
print(df_bal.head())
print(df_bal.columns)


       Id   ProductId          UserId                ProfileName  \
0  171065  B00009ZIY2  A362H8YI6T7B2I       Patricia J Gilchrist   
1    9366  B006N3IG4K  A1KJDUS91L5OOE                 bears22687   
2  410515  B000FIXYDC  A2C8O554YMY2ZL              Bixby "Bixby"   
3  482561  B000G2UUDO  A145B9FRIAAHAY  Rita's talented tastebuds   
4  120766  B005K4Q37A  A1UQFVHBQJ2K8Z                     Brenda   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     0                       0      1  1322438400   
1                     0                       1      1  1343952000   
2                     1                       2      1  1315785600   
3                     3                       6      1  1278288000   
4                     2                       5      1  1325808000   

                     Summary  \
0      not what we wanted...   
1                   not good   
2  Strange taste and texture   
3          AWFUL!!! AWFUL!!!   
4    Grove

In [7]:
print(df_bal.columns)

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text',
       'review_length'],
      dtype='object')


In [8]:
df_bal.drop(['Id', 'ProductId','UserId','ProfileName','HelpfulnessNumerator','HelpfulnessDenominator','Time'], axis=1, inplace=True)

In [9]:
df_bal.to_csv('balanced_dataset.csv', index=False)
print("Columns after dropping unnecessary ones:")
print(df_bal.columns)

Columns after dropping unnecessary ones:
Index(['Score', 'Summary', 'Text', 'review_length'], dtype='object')


In [10]:
print(df_bal.columns)

Index(['Score', 'Summary', 'Text', 'review_length'], dtype='object')


In [10]:
print(df_bal.columns)

Index(['Score', 'Summary', 'Text', 'review_length'], dtype='object')


In [11]:
df_bal.to_csv('balanced_dataset_cleaned.csv', index=False)


In [12]:
print(df_bal.columns)


Index(['Score', 'Summary', 'Text', 'review_length'], dtype='object')


In [12]:
print(df_bal.isnull().sum())
print("No null values found in the dataset after cleaning.")
print(df_bal['Score'].value_counts())
print("Balanced dataset saved as 'balanced_dataset_cleaned.csv'.")

Score            0
Summary          1
Text             0
review_length    0
dtype: int64
No null values found in the dataset after cleaning.
Score
1    2000
2    2000
3    2000
4    2000
5    2000
Name: count, dtype: int64
Balanced dataset saved as 'balanced_dataset_cleaned.csv'.


In [13]:
import spacy
nlp = spacy.load('en_core_web_sm')

def lemmatize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])

df_bal['Text'] = df_bal['Text'].apply(lemmatize_text)
print(df_bal['Text'].head())

0    we have two very picky cat who love the old or...
1    the coffee taste bitter and like it be burn I ...
2    I purchase a can that the store before I buy a...
3    this product be a perfect example for the erro...
4    I find that Grove Square French Vanilla Cappuc...
Name: Text, dtype: object


Data Cleaning

In [14]:
import spacy
import re
nlp = spacy.load('en_core_web_sm')

nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    """
    Lowercase, remove URLs, HTML tags, emojis, punctuation, non-ASCII, digits, extra whitespace.
    """
    text = str(text).lower()
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text) # Remove emojis/unicode
    text = re.sub(r'[^\w\s]', '', text)        # Remove punctuation
    text = re.sub(r'\d+', '', text)            # Remove digits
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def lemmatize_text(text):
    """
    Lemmatize using spaCy, remove stopwords, keep alphabetic tokens longer than 1 character.
    """
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha and len(token.lemma_) > 1]
    return ' '.join(tokens)

df_bal['clean_text'] = df_bal['Text'].apply(clean_text)
df_bal['lemmatized'] = df_bal['clean_text'].apply(lemmatize_text)
print(df_bal[['Text', 'clean_text', 'lemmatized']].head())


                                                Text  \
0  we have two very picky cat who love the old or...   
1  the coffee taste bitter and like it be burn I ...   
2  I purchase a can that the store before I buy a...   
3  this product be a perfect example for the erro...   
4  I find that Grove Square French Vanilla Cappuc...   

                                          clean_text  \
0  we have two very picky cat who love the old or...   
1  the coffee taste bitter and like it be burn i ...   
2  i purchase a can that the store before i buy a...   
3  this product be a perfect example for the erro...   
4  i find that grove square french vanilla cappuc...   

                                          lemmatized  
0  picky cat love old original whiskas think phot...  
1  coffee taste bitter like burn clean machine ta...  
2  purchase store buy pack amazon try gross love ...  
3  product perfect example erroneous notion label...  
4  find grove square french vanilla cappuchino pl..

In [7]:
print(df_bal.columns)

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text',
       'review_length', 'clean_text', 'lemmatized'],
      dtype='object')


Text Cleaning + Lemmatization

In [15]:
nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    """
    Lowercase, remove URLs, HTML tags, emojis, punctuation, non-ASCII, digits, extra whitespace.
    """
    text = str(text).lower()
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text) # Remove emojis/unicode
    text = re.sub(r'[^\w\s]', '', text)        # Remove punctuation
    text = re.sub(r'\d+', '', text)            # Remove digits
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def lemmatize_text(text):
    """
    Lemmatize using spaCy, remove stopwords, keep alphabetic tokens longer than 1 character.
    """
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha and len(token.lemma_) > 1]
    return ' '.join(tokens)

# If not already, apply cleaning + lemmatization
df_bal['clean_text'] = df_bal['Text'].apply(clean_text)
df_bal['lemmatized'] = df_bal['clean_text'].apply(lemmatize_text)
print(df_bal[['Text', 'clean_text', 'lemmatized']].head())


                                                Text  \
0  we have two very picky cat who love the old or...   
1  the coffee taste bitter and like it be burn I ...   
2  I purchase a can that the store before I buy a...   
3  this product be a perfect example for the erro...   
4  I find that Grove Square French Vanilla Cappuc...   

                                          clean_text  \
0  we have two very picky cat who love the old or...   
1  the coffee taste bitter and like it be burn i ...   
2  i purchase a can that the store before i buy a...   
3  this product be a perfect example for the erro...   
4  i find that grove square french vanilla cappuc...   

                                          lemmatized  
0  picky cat love old original whiskas think phot...  
1  coffee taste bitter like burn clean machine ta...  
2  purchase store buy pack amazon try gross love ...  
3  product perfect example erroneous notion label...  
4  find grove square french vanilla cappuchino pl..

 Split Data

In [16]:
X = df_bal['lemmatized']  # Using lemmatized column
y = df_bal['Score']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True
)

print("Train distribution:\n", y_train.value_counts())
print("Test distribution:\n", y_test.value_counts())


Train distribution:
 Score
1    1600
4    1600
3    1600
2    1600
5    1600
Name: count, dtype: int64
Test distribution:
 Score
1    400
2    400
3    400
5    400
4    400
Name: count, dtype: int64


TF-IDF Vectorization

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

tfidf = TfidfVectorizer()
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

print("TF-IDF train shape:", X_train_vec.shape)
print("TF-IDF test shape:", X_test_vec.shape)

TF-IDF train shape: (8000, 18363)
TF-IDF test shape: (2000, 18363)


Model Training with Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  

clf = LogisticRegression(max_iter=1000, random_state=42)
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=3))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.461

Classification Report:
               precision    recall  f1-score   support

           1      0.554     0.620     0.585       400
           2      0.410     0.372     0.391       400
           3      0.410     0.378     0.393       400
           4      0.390     0.380     0.385       400
           5      0.515     0.555     0.534       400

    accuracy                          0.461      2000
   macro avg      0.456     0.461     0.458      2000
weighted avg      0.456     0.461     0.458      2000

Confusion Matrix:
 [[248  77  36  18  21]
 [103 149  75  45  28]
 [ 54  74 151  80  41]
 [ 18  35  76 152 119]
 [ 25  28  30  95 222]]


Model Training with Random Forest


In [19]:
from sklearn.ensemble import RandomForestClassifier

models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42)
}

results = {}

for name, clf in models.items():
    print(f"\n----- {name} -----")
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred, digits=3))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    results[name] = acc



----- RandomForest -----
Accuracy: 0.4775
Classification Report:
               precision    recall  f1-score   support

           1      0.515     0.632     0.568       400
           2      0.469     0.345     0.398       400
           3      0.442     0.350     0.391       400
           4      0.434     0.403     0.418       400
           5      0.499     0.657     0.567       400

    accuracy                          0.477      2000
   macro avg      0.472     0.478     0.468      2000
weighted avg      0.472     0.477     0.468      2000

Confusion Matrix:
 [[253  55  30  24  38]
 [106 138  65  42  49]
 [ 72  54 140  74  60]
 [ 36  29  57 161 117]
 [ 24  18  25  70 263]]


Model Training with Linear SVC


In [20]:
from sklearn.svm import LinearSVC

models = {
    "LinearSVC": LinearSVC(max_iter=1000, random_state=42)
}

results = {}

for name, clf in models.items():
    print(f"\n----- {name} -----")
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred, digits=3))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    results[name] = acc


----- LinearSVC -----
Accuracy: 0.4385
Classification Report:
               precision    recall  f1-score   support

           1      0.547     0.600     0.572       400
           2      0.413     0.380     0.396       400
           3      0.383     0.367     0.375       400
           4      0.347     0.310     0.328       400
           5      0.473     0.535     0.502       400

    accuracy                          0.439      2000
   macro avg      0.433     0.439     0.435      2000
weighted avg      0.433     0.439     0.435      2000

Confusion Matrix:
 [[240  70  35  29  26]
 [ 88 152  83  43  34]
 [ 57  73 147  82  41]
 [ 22  40  77 124 137]
 [ 32  33  42  79 214]]
