In [15]:
import pandas as pd
import re
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load your imbalanced dataset
df = pd.read_csv(r'D:\Projects\automated-review-rating-system\data\cleaned_dataset\imbalanced_data.csv')
print(df.head())
print(df.columns)


       Id   ProductId          UserId            ProfileName  \
0  249591  B001LGGH54   AORGKBNQZ83O8  MacGuffin "MacGuffin"   
1  542900  B0001HAEJY   A289SYWE4BHCF                 akilah   
2  310716  B002QZ7ZBY   A8JB6RLAKR0T0                   Mary   
3  372491  B0083T6HC0  A2QCHBEXUBN2S8      Alaskan "Alaskan"   
4  164438  B0000E2T62   AZ1ZE53AR3EWO                Jane916   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     0                       0      4  1235865600   
1                     0                       0      5  1351209600   
2                     0                       0      3  1340668800   
3                     0                       0      5  1346716800   
4                     0                       0      1  1323216000   

                            Summary  \
0                       Interesting   
1                            Great!   
2  Delicious chips but wrong order!   
3                            K-Cups   

In [17]:
nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove emojis/unicode
    text = re.sub(r'[^\w\s]', '', text)         # Remove punctuation
    text = re.sub(r'\d+', '', text)             # Remove digits
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def lemmatize_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha and len(token.lemma_) > 1]
    return ' '.join(tokens)

if 'lemmatized' not in df.columns:
    df['clean_text'] = df['Text'].apply(clean_text)
    df['lemmatized'] = df['clean_text'].apply(lemmatize_text)


In [19]:
X = df['lemmatized']
y = df['Score']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True
)
print("Training samples:", len(X_train), "Test samples:", len(X_test))


Training samples: 1600 Test samples: 400


In [20]:
tfidf = TfidfVectorizer()
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)


Random Forest

In [21]:
clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
clf.fit(X_train_vec, y_train)


In [22]:
y_pred = clf.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=3))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.38

Classification Report:
               precision    recall  f1-score   support

           1      0.600     0.150     0.240        40
           2      0.455     0.083     0.141        60
           3      0.350     0.410     0.378       100
           4      0.340     0.567     0.425       120
           5      0.516     0.400     0.451        80

    accuracy                          0.380       400
   macro avg      0.452     0.322     0.327       400
weighted avg      0.421     0.380     0.357       400

Confusion Matrix:
 [[ 6  2 15 17  0]
 [ 3  5 22 24  6]
 [ 1  1 41 52  5]
 [ 0  3 30 68 19]
 [ 0  0  9 39 32]]


Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


clf = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
clf.fit(X_train_vec, y_train)
    
    # Predictions
y_pred = clf.predict(X_test_vec)
    
    # Results
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=3))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.3675

Classification Report:
               precision    recall  f1-score   support

           1      0.431     0.550     0.484        40
           2      0.141     0.167     0.153        60
           3      0.278     0.250     0.263       100
           4      0.500     0.383     0.434       120
           5      0.458     0.550     0.500        80

    accuracy                          0.367       400
   macro avg      0.362     0.380     0.367       400
weighted avg      0.375     0.367     0.367       400

Confusion Matrix:
 [[22  6  6  4  2]
 [15 10 23  4  8]
 [10 30 25 20 15]
 [ 2 22 23 46 27]
 [ 2  3 13 18 44]]


SVM

In [25]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

clf = LinearSVC(max_iter=1000, random_state=42, class_weight='balanced')
clf.fit(X_train_vec, y_train)

    # Predictions
y_pred = clf.predict(X_test_vec)

    # Results
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=3))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.35

Classification Report:
               precision    recall  f1-score   support

           1      0.314     0.275     0.293        40
           2      0.185     0.200     0.192        60
           3      0.298     0.310     0.304       100
           4      0.443     0.392     0.416       120
           5      0.433     0.487     0.459        80

    accuracy                          0.350       400
   macro avg      0.335     0.333     0.333       400
weighted avg      0.353     0.350     0.351       400

Confusion Matrix:
 [[11 10 11  5  3]
 [12 12 21  5 10]
 [ 5 23 31 29 12]
 [ 4 15 28 47 26]
 [ 3  5 13 20 39]]


In [36]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'))
])

param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],    
    'tfidf__max_df': [0.7, 1.0],               
    'tfidf__min_df': [1, 5],                    
    'clf__C': [0.01, 0.1, 1, 10],               
    'clf__penalty': ['l2'],                     
    'clf__solver': ['lbfgs']                    
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,                     
    scoring='f1_weighted',     
    n_jobs=-1,                 
    verbose=2                  
)

print("Starting hyperparameter tuning...")
grid_search.fit(X_train, y_train)

# Display best hyperparameters found
print("\nBest hyperparameters:")
print(grid_search.best_params_)

print(f"Best cross-validation weighted F1-score: {grid_search.best_score_:.4f}")

# Use the best model to predict on test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate and print classification metrics on test set
print("\nClassification report on test set:")
print(classification_report(y_test, y_pred, digits=4))

Starting hyperparameter tuning...
Fitting 5 folds for each of 32 candidates, totalling 160 fits

Best hyperparameters:
{'clf__C': 1, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs', 'tfidf__max_df': 0.7, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}
Best cross-validation weighted F1-score: 0.3950

Classification report on test set:
              precision    recall  f1-score   support

           1     0.4865    0.4500    0.4675        40
           2     0.2623    0.2667    0.2645        60
           3     0.3545    0.3900    0.3714       100
           4     0.4906    0.4333    0.4602       120
           5     0.4767    0.5125    0.4940        80

    accuracy                         0.4150       400
   macro avg     0.4141    0.4105    0.4115       400
weighted avg     0.4191    0.4150    0.4161       400



In [38]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

pipeline = ImbPipeline([
    ('tfidf', TfidfVectorizer()),
    ('smote', SMOTE(random_state=42)),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])

# ---------- 5. Expanded Hyperparameter Grid ----------
param_grid = {
    # TF-IDF improvements
    'tfidf__ngram_range': [(1,1), (1,2), (1,3)],  # unigrams, bigrams, trigrams
    'tfidf__max_df': [0.7, 0.85, 1.0],
    'tfidf__min_df': [1, 3, 5],
    'tfidf__max_features': [5000, 10000, None],
    'tfidf__stop_words': ['english', None],

    # Logistic Regression tuning
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'clf__penalty': ['l2'],
    'clf__solver': ['lbfgs'],
    'clf__class_weight': [None, 'balanced']
}

# ---------- 6. Grid Search ----------
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring='f1_weighted',  # better for class imbalance
    cv=5,
    n_jobs=-1,
    verbose=2
)

print("Starting Hyperparameter Tuning with SMOTE...")
grid_search.fit(X_train, y_train)

# ---------- 7. Best Params & CV Score ----------
print("\nBest Parameters Found:")
print(grid_search.best_params_)
print(f"Best Cross-Validation Weighted F1: {grid_search.best_score_:.4f}")

# ---------- 8. Final Evaluation on Test Data ----------
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))

Starting Hyperparameter Tuning with SMOTE...
Fitting 5 folds for each of 1944 candidates, totalling 9720 fits

Best Parameters Found:
{'clf__C': 1, 'clf__class_weight': None, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs', 'tfidf__max_df': 0.7, 'tfidf__max_features': 10000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}
Best Cross-Validation Weighted F1: 0.3971

Test Accuracy: 0.3875

Classification Report:
               precision    recall  f1-score   support

           1     0.4222    0.4750    0.4471        40
           2     0.1765    0.2000    0.1875        60
           3     0.3402    0.3300    0.3350       100
           4     0.4953    0.4417    0.4670       120
           5     0.4578    0.4750    0.4663        80

    accuracy                         0.3875       400
   macro avg     0.3784    0.3843    0.3806       400
weighted avg     0.3939    0.3875    0.3899       400



In [None]:
|