In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score, precision_score, recall_score, f1_score
import joblib

## Loading the Dataset

In [2]:
# Read the preprocessed review dataset (raw text, cleaned text and the
# vector_* embedding columns) from a path relative to the working directory.
df = pd.read_csv(filepath_or_buffer='preprocessed datase.csv')

In [4]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,rating,label,text_,review_length,cleaned_text,vector_0,vector_1,vector_2,vector_3,vector_4,...,vector_290,vector_291,vector_292,vector_293,vector_294,vector_295,vector_296,vector_297,vector_298,vector_299
0,5,1,"Love this! Well made, sturdy, and very comfor...",75,love well made sturdy comfortable love itvery ...,0.009552,0.055578,-0.074036,-0.004832,-0.087739,...,0.13559,0.035983,0.118541,0.009863,-0.006641,0.030139,-0.006002,-0.066576,0.032783,-0.117144
1,5,1,"love it, a great upgrade from the original. I...",80,love great upgrade original mine couple year,0.024314,0.011938,-0.050561,-0.000509,-0.047107,...,0.06564,0.011398,0.079855,-0.008035,-0.018,-0.024986,-0.004174,-0.053297,0.013269,-0.065177
2,5,1,This pillow saved my back. I love the look and...,67,pillow saved back love look feel pillow,0.037266,0.021903,-0.104903,-0.024621,-0.121138,...,0.136369,0.025341,0.071049,-0.008753,0.033459,0.084169,0.008969,-0.039396,0.08463,-0.111436
3,1,1,"Missing information on how to use it, but it i...",81,missing information use great product price,0.038293,-0.021084,-0.03479,0.013776,-0.031484,...,0.06417,0.03757,0.096194,-0.001511,0.026777,-0.060727,-0.007324,-0.053665,0.04203,-0.102379
4,5,1,Very nice set. Good quality. We have had the s...,85,nice set good quality set two month not,-0.018134,0.016582,0.006793,0.012251,-0.033542,...,0.127616,0.023219,0.063599,-0.049727,0.007967,-0.039801,-0.004324,-0.021565,0.060416,-0.048606


## Splitting the dataset into training and testing sets

In [5]:
# Build the feature matrix from the numeric columns only.
# - 'label' is the target, 'text_' / 'cleaned_text' are raw strings.
# - 'Unnamed: 0' is the row index written out by an earlier to_csv call; it
#   carries no signal about a review and would let the models learn row-order
#   artifacts, so it is removed as well. errors='ignore' keeps this cell
#   working if the CSV is later re-exported without that index column.
X = (
    df.drop(columns=['label', 'text_', 'cleaned_text'])
      .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Target vector: the class label of each review.
y = df['label']

In [6]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Training models 

In [8]:
# Standardize the features using statistics learned from the training split
# only, then apply that same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Persist the fitted scaler alongside the models: the estimators below are
# trained on standardized inputs, so they can only be reused on new data if
# the exact same transform is available.
joblib.dump(scaler, 'scaler.pkl')


# Candidate classifiers, keyed by the display name used when reporting results
# and when building each saved model's file name.
pipelines = {
    'Random Forest': RandomForestClassifier(random_state=42),
    # probability=True additionally enables predict_proba on the fitted SVC.
    'SVM': SVC(random_state=42, probability=True),
    # The default max_iter (100) stopped before convergence on this data
    # (lbfgs "iterations reached limit" warning), so allow more iterations.
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

## Evaluating each model’s performance

In [None]:
# Train every candidate, report its test-set metrics, save it to disk, and
# remember which one performed best.
best_model = None          # fitted estimator with the highest weighted F1 so far
best_model_name = None     # display name of best_model
best_performance = None    # weighted F1 of best_model; None until one is scored

for model_name, model in pipelines.items():
    print(f"\nTraining and evaluating {model_name}...")

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Support-weighted averages so both classes contribute in proportion to
    # their share of the test set.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # File name is derived from the display name, e.g. "Random_Forest_model.pkl".
    model_filename = f"{model_name.replace(' ', '_')}_model.pkl"
    joblib.dump(model, model_filename)
    print(f"Model saved as {model_filename}")

    # Keep the strongest model seen so far (ties keep the earlier one).
    if best_performance is None or f1 > best_performance:
        best_model = model
        best_model_name = model_name
        best_performance = f1

if best_model is not None:
    print(f"\nBest model by weighted F1: {best_model_name} ({best_performance:.4f})")
     


Training and evaluating Random Forest...

Accuracy: 0.6888
Precision: 0.6905
Recall: 0.6888
F1 Score: 0.6883

Confusion Matrix:
[[2636 1438]
 [1078 2932]]

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.65      0.68      4074
           1       0.67      0.73      0.70      4010

    accuracy                           0.69      8084
   macro avg       0.69      0.69      0.69      8084
weighted avg       0.69      0.69      0.69      8084

Model saved as Random_Forest_model.pkl

Training and evaluating SVM...

Accuracy: 0.7006
Precision: 0.7009
Recall: 0.7006
F1 Score: 0.7006

Confusion Matrix:
[[2799 1275]
 [1145 2865]]

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.69      0.70      4074
           1       0.69      0.71      0.70      4010

    accuracy                           0.70      8084
   macro avg       0.70      0.70      0.70      8084
weighted avg

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Loading Models

In [10]:
# Reload the three persisted classifiers from disk, in the same order they
# were saved. NOTE: joblib/pickle files should only be loaded from trusted
# sources; these are the files written earlier by this notebook.
random_forest_model, svm_model, logistic_regression_model = (
    joblib.load(model_path)
    for model_path in (
        'Random_Forest_model.pkl',
        'SVM_model.pkl',
        'Logistic_Regression_model.pkl',
    )
)

# Score the held-out test features with each reloaded model.
rf_predictions, svm_predictions, logistic_predictions = (
    classifier.predict(X_test)
    for classifier in (random_forest_model, svm_model, logistic_regression_model)
)

## Test Predictions

In [11]:
# Side-by-side preview of the first ten test labels and the SVM's predictions
# for the same rows; the frame keeps the original row index from y_test.
print("SVM Model Predictions:")
comparison_df_SVM = pd.DataFrame(
    {'Actual': y_test.head(10), 'Predicted by SVM': svm_predictions[:10]}
)
comparison_df_SVM

SVM Model Predictions:


Unnamed: 0,Actual,Predicted by SVM
33075,1,1
16852,1,0
24930,0,0
29281,1,1
4990,0,1
6860,1,1
24467,1,1
16177,1,1
1487,1,0
13713,0,1
