In [1]:
import pandas as pd

# Read your cleaned, balanced dataset
bal_path = r'D:\Projects\automated-review-rating-system\data\cleaned_dataset\balanced_data.csv'
df_bal = pd.read_csv(bal_path)
print(df_bal.head())
print(df_bal.columns)


       Id   ProductId          UserId                ProfileName  \
0  171065  B00009ZIY2  A362H8YI6T7B2I       Patricia J Gilchrist   
1    9366  B006N3IG4K  A1KJDUS91L5OOE                 bears22687   
2  410515  B000FIXYDC  A2C8O554YMY2ZL              Bixby "Bixby"   
3  482561  B000G2UUDO  A145B9FRIAAHAY  Rita's talented tastebuds   
4  120766  B005K4Q37A  A1UQFVHBQJ2K8Z                     Brenda   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     0                       0      1  1322438400   
1                     0                       1      1  1343952000   
2                     1                       2      1  1315785600   
3                     3                       6      1  1278288000   
4                     2                       5      1  1325808000   

                     Summary  \
0      not what we wanted...   
1                   not good   
2  Strange taste and texture   
3          AWFUL!!! AWFUL!!!   
4    Grove

In [3]:
print(df_bal.columns)

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text',
       'review_length'],
      dtype='object')


In [4]:
df_bal.drop(['Id', 'ProductId','UserId','ProfileName','HelpfulnessNumerator','HelpfulnessDenominator','Time'], axis=1, inplace=True)

In [6]:
df_bal.to_csv('balanced_dataset.csv', index=False)
print("Columns after dropping unnecessary ones:")
print(df_bal.columns)

Columns after dropping unnecessary ones:
Index(['Score', 'Summary', 'Text', 'review_length'], dtype='object')


In [8]:
print(df_bal.columns)

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text',
       'review_length'],
      dtype='object')


In [9]:
df_bal.drop(['Id', 'ProductId','UserId','ProfileName','HelpfulnessNumerator','HelpfulnessDenominator','Time'], axis=1, inplace=True)

In [10]:
print(df_bal.columns)

Index(['Score', 'Summary', 'Text', 'review_length'], dtype='object')


In [11]:
df_bal.to_csv('balanced_dataset_cleaned.csv', index=False)


In [12]:
print(df_bal.columns)


Index(['Score', 'Summary', 'Text', 'review_length'], dtype='object')


In [13]:
print(df_bal.isnull().sum())
print("No null values found in the dataset after cleaning.")
print(df_bal['Score'].value_counts())
print("Balanced dataset saved as 'balanced_dataset_cleaned.csv'.")

Score            0
Summary          1
Text             0
review_length    0
dtype: int64
No null values found in the dataset after cleaning.
Score
1    2000
2    2000
3    2000
4    2000
5    2000
Name: count, dtype: int64
Balanced dataset saved as 'balanced_dataset_cleaned.csv'.


In [16]:
import spacy
nlp = spacy.load('en_core_web_sm')

def lemmatize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])

df_bal['Text'] = df_bal['Text'].apply(lemmatize_text)
print(df_bal['Text'].head())

0    we have two very picky cat who love the old or...
1    the coffee taste bitter and like it be burn I ...
2    I purchase a can that the store before I buy a...
3    this product be a perfect example for the erro...
4    I find that Grove Square French Vanilla Cappuc...
Name: Text, dtype: object


Data Cleaning

In [None]:
import spacy
import re
nlp = spacy.load('en_core_web_sm')

nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    """
    Lowercase, remove URLs, HTML tags, emojis, punctuation, non-ASCII, digits, extra whitespace.
    """
    text = str(text).lower()
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text) # Remove emojis/unicode
    text = re.sub(r'[^\w\s]', '', text)        # Remove punctuation
    text = re.sub(r'\d+', '', text)            # Remove digits
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def lemmatize_text(text):
    """
    Lemmatize using spaCy, remove stopwords, keep alphabetic tokens longer than 1 character.
    """
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha and len(token.lemma_) > 1]
    return ' '.join(tokens)

df_bal['clean_text'] = df_bal['Text'].apply(clean_text)
df_bal['lemmatized'] = df_bal['clean_text'].apply(lemmatize_text)
print(df_bal[['Text', 'clean_text', 'lemmatized']].head())


                                                Text  \
0  We have two very picky cats who loved the old ...   
1  The coffee tasted bitter and like it was burnt...   
2  I purchased a can that the store before I boug...   
3  This product is a perfect example for the erro...   
4  I found that Grove Square French Vanilla Cappu...   

                                          clean_text  \
0  we have two very picky cats who loved the old ...   
1  the coffee tasted bitter and like it was burnt...   
2  i purchased a can that the store before i boug...   
3  this product is a perfect example for the erro...   
4  i found that grove square french vanilla cappu...   

                                          lemmatized  
0  picky cat love old original whiskas think phot...  
1  coffee taste bitter like burn clean machine ta...  
2  purchase store buy pack amazon try gross love ...  
3  product perfect example erroneous notion label...  
4  find grove square french vanilla cappuchino pl..

In [7]:
print(df_bal.columns)

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text',
       'review_length', 'clean_text', 'lemmatized'],
      dtype='object')


Text Cleaning + Lemmatization

In [None]:

nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    """
    Lowercase, remove URLs, HTML tags, emojis, punctuation, non-ASCII, digits, extra whitespace.
    """
    text = str(text).lower()
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text) # Remove emojis/unicode
    text = re.sub(r'[^\w\s]', '', text)        # Remove punctuation
    text = re.sub(r'\d+', '', text)            # Remove digits
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def lemmatize_text(text):
    """
    Lemmatize using spaCy, remove stopwords, keep alphabetic tokens longer than 1 character.
    """
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha and len(token.lemma_) > 1]
    return ' '.join(tokens)

# If not already, apply cleaning + lemmatization
df_bal['clean_text'] = df_bal['Text'].apply(clean_text)
df_bal['lemmatized'] = df_bal['clean_text'].apply(lemmatize_text)
print(df_bal[['Text', 'clean_text', 'lemmatized']].head())


                                                Text  \
0  We have two very picky cats who loved the old ...   
1  The coffee tasted bitter and like it was burnt...   
2  I purchased a can that the store before I boug...   
3  This product is a perfect example for the erro...   
4  I found that Grove Square French Vanilla Cappu...   

                                          clean_text  \
0  we have two very picky cats who loved the old ...   
1  the coffee tasted bitter and like it was burnt...   
2  i purchased a can that the store before i boug...   
3  this product is a perfect example for the erro...   
4  i found that grove square french vanilla cappu...   

                                          lemmatized  
0  picky cat love old original whiskas think phot...  
1  coffee taste bitter like burn clean machine ta...  
2  purchase store buy pack amazon try gross love ...  
3  product perfect example erroneous notion label...  
4  find grove square french vanilla cappuchino pl..

 Split Data

In [None]:
X = df_bal['lemmatized']  # Using lemmatized column
y = df_bal['Score']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True
)

print("Train distribution:\n", y_train.value_counts())
print("Test distribution:\n", y_test.value_counts())


Train distribution:
 Score
1    1600
4    1600
3    1600
2    1600
5    1600
Name: count, dtype: int64
Test distribution:
 Score
1    400
2    400
3    400
5    400
4    400
Name: count, dtype: int64


TF-IDF Vectorization

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

tfidf = TfidfVectorizer()
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

print("TF-IDF train shape:", X_train_vec.shape)
print("TF-IDF test shape:", X_test_vec.shape)

TF-IDF train shape: (8000, 21702)
TF-IDF test shape: (2000, 21702)


Model Training with Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  

clf = LogisticRegression(max_iter=1000, random_state=42)
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=3))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



Accuracy: 0.456

Classification Report:
               precision    recall  f1-score   support

           1      0.557     0.610     0.582       400
           2      0.399     0.378     0.388       400
           3      0.416     0.372     0.393       400
           4      0.374     0.355     0.364       400
           5      0.507     0.565     0.534       400

    accuracy                          0.456      2000
   macro avg      0.451     0.456     0.452      2000
weighted avg      0.451     0.456     0.452      2000

Confusion Matrix:
 [[244  81  35  20  20]
 [102 151  73  44  30]
 [ 49  78 149  81  43]
 [ 19  41  71 142 127]
 [ 24  27  30  93 226]]


Model Training with Random Forest


In [21]:
from sklearn.ensemble import RandomForestClassifier

models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42)
}

results = {}

for name, clf in models.items():
    print(f"\n----- {name} -----")
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred, digits=3))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    results[name] = acc



----- RandomForest -----
Accuracy: 0.4630
Classification Report:
               precision    recall  f1-score   support

           1      0.507     0.632     0.563       400
           2      0.432     0.335     0.377       400
           3      0.440     0.350     0.390       400
           4      0.416     0.367     0.390       400
           5      0.485     0.630     0.548       400

    accuracy                          0.463      2000
   macro avg      0.456     0.463     0.454      2000
weighted avg      0.456     0.463     0.454      2000

Confusion Matrix:
 [[253  64  28  17  38]
 [107 134  74  41  44]
 [ 73  60 140  72  55]
 [ 35  37  50 147 131]
 [ 31  15  26  76 252]]


Model Training with Linear SVC


In [23]:
from sklearn.svm import LinearSVC

models = {
    "LinearSVC": LinearSVC(max_iter=1000, random_state=42)
}

results = {}

for name, clf in models.items():
    print(f"\n----- {name} -----")
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred, digits=3))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    results[name] = acc


----- LinearSVC -----
Accuracy: 0.4405
Classification Report:
               precision    recall  f1-score   support

           1      0.544     0.578     0.560       400
           2      0.408     0.393     0.400       400
           3      0.407     0.398     0.402       400
           4      0.355     0.315     0.334       400
           5      0.468     0.520     0.493       400

    accuracy                          0.441      2000
   macro avg      0.436     0.440     0.438      2000
weighted avg      0.436     0.441     0.438      2000

Confusion Matrix:
 [[231  75  40  30  24]
 [ 88 157  76  45  34]
 [ 52  71 159  75  43]
 [ 23  40  76 126 135]
 [ 31  42  40  79 208]]
