In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load dataset
# The CSV is read relative to the notebook's working directory.
data_path = 'data/amazon.csv'  # Replace with actual dataset path
df = pd.read_csv(data_path)

In [3]:
# Print the frame summary (row count, column names, non-null counts, dtypes)
# as a sanity check on the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reviewText  20000 non-null  object
 1   Positive    20000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


In [4]:
# Show the first rows of the raw frame: the review text and its Positive label.
df.head()

Unnamed: 0,reviewText,Positive
0,This is a one of the best apps acording to a b...,1
1,This is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1


In [5]:
# Count missing values per column.
df.isnull().sum()

reviewText    0
Positive      0
dtype: int64

In [6]:
# Text preprocessing
class TextPreprocessor(TransformerMixin):
    """Stateless transformer that normalises raw review strings.

    Each text is lower-cased, stripped of every character that is not an
    ASCII letter or whitespace, split on whitespace, and filtered against a
    stop-word collection. The surviving words are re-joined with single
    spaces.

    Parameters
    ----------
    stop_words : collection of str, optional
        Words to drop. When left as ``None`` the notebook-level
        ``stop_words`` variable is looked up every time a text is cleaned,
        so that variable must be defined before ``transform`` is called.
        Passing the collection explicitly removes this dependency on
        cell execution order.
    """

    # Compiled once for the class. Texts are lower-cased before this runs,
    # so the A-Z range never matches anything; the pattern is kept unchanged.
    _NON_LETTER = re.compile(r'[^a-zA-Z\s]')

    def __init__(self, stop_words=None):
        self.stop_words = stop_words

    def transform(self, X, **transform_params):
        """Clean every text in ``X``.

        Parameters
        ----------
        X : iterable
            Raw texts. Missing entries (``None``/NaN) are treated as empty
            strings; other non-string entries are converted with ``str``.

        Returns
        -------
        list of str
            One cleaned string per input element, in input order.
        """
        return [self._clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; no state is learned from ``X``."""
        return self

    def _clean_text(self, text):
        """Return the lower-cased, letters-only, stop-word-free form of ``text``."""
        if not isinstance(text, str):
            # Avoid AttributeError on .lower() for NaN/None or numeric cells.
            text = '' if pd.isna(text) else str(text)

        # Explicit vocabulary wins; otherwise fall back to the notebook global.
        vocabulary = stop_words if self.stop_words is None else self.stop_words

        text = self._NON_LETTER.sub('', text.lower())
        return ' '.join(word for word in text.split() if word not in vocabulary)

In [7]:
# List of stop words
# Small hand-picked vocabulary that the text cleaning step removes from each review.
stop_words = {
    'the', 'and', 'is', 'in', 'to', 'of', 'for',
    'it', 'on', 'with', 'as', 'this', 'that', 'are',
}


In [8]:
# Applying text preprocessing
# Clean every raw review and store the result next to the original text.
preprocessor = TextPreprocessor()
cleaned_reviews = preprocessor.transform(df['reviewText'])
df['cleaned_reviewText'] = cleaned_reviews


In [9]:
# Show the first rows again, now including the cleaned_reviewText column.
df.head()

Unnamed: 0,reviewText,Positive,cleaned_reviewText
0,This is a one of the best apps acording to a b...,1,a one best apps acording a bunch people i agre...
1,This is a pretty good version of the game for ...,1,a pretty good version game being free there lo...
2,this is a really cool game. there are a bunch ...,1,a really cool game there a bunch levels you ca...
3,"This is a silly game and can be frustrating, b...",1,a silly game can be frustrating but lots fun d...
4,This is a terrific game on any pad. Hrs of fun...,1,a terrific game any pad hrs fun my grandkids l...


In [10]:
# Splitting the dataset
# 80/20 hold-out on the cleaned text; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_reviewText'],
    df['Positive'],
    test_size=0.2,
    random_state=42,
)


In [11]:
# 3. Model Selection
# Candidate classifiers, keyed by the display name used in the result tables
# and plots. The tree ensembles are randomised, so they are seeded with the
# same value as the train/test split to make the comparison repeatable.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'Naïve Bayes': MultinomialNB(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

In [12]:
# Vectorization and training pipeline
# TF-IDF features feed a classifier step named 'clf'; the LogisticRegression
# here is only the initial occupant of that step.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression()),
    ]
)

In [13]:
# 4. Model Training and 5. Formal Evaluation
# Every candidate is dropped into the shared pipeline's 'clf' step, fitted on
# the training split and scored on the held-out split.
results = {}

for name, model in models.items():
    pipeline.set_params(clf=model)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Metrics are computed against the held-out labels.
    accuracy, precision, recall, f1 = (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    )
    cm = confusion_matrix(y_test, y_pred)

    results[name] = dict(
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1=f1,
        confusion_matrix=cm,
    )

    # One report per model; a blank line separates consecutive reports.
    print(
        f"{name} Results:",
        f"Accuracy: {accuracy:.2f}",
        f"Precision: {precision:.2f}",
        f"Recall: {recall:.2f}",
        f"F1 Score: {f1:.2f}",
        f"Confusion Matrix:\n{cm}\n",
        sep="\n",
    )

Logistic Regression Results:
Accuracy: 0.90
Precision: 0.91
Recall: 0.96
F1 Score: 0.93
Confusion Matrix:
[[ 653  305]
 [ 108 2934]]

Random Forest Results:
Accuracy: 0.87
Precision: 0.86
Recall: 0.98
F1 Score: 0.92
Confusion Matrix:
[[ 480  478]
 [  53 2989]]

SVM Results:
Accuracy: 0.90
Precision: 0.91
Recall: 0.96
F1 Score: 0.94
Confusion Matrix:
[[ 677  281]
 [ 107 2935]]

Naïve Bayes Results:
Accuracy: 0.79
Precision: 0.78
Recall: 1.00
F1 Score: 0.88
Confusion Matrix:
[[ 112  846]
 [   7 3035]]

Gradient Boosting Results:
Accuracy: 0.85
Precision: 0.86
Recall: 0.97
F1 Score: 0.91
Confusion Matrix:
[[ 467  491]
 [  94 2948]]



In [None]:
# 6. Hyperparameter Tuning for Gradient Boosting as an example
# The grid below has 3**5 = 243 candidates; with 5-fold cross-validation that
# is 1,215 pipeline fits before the final refit, so expect a long run.
gb_param_grid = {
    'clf__n_estimators': [50, 100, 150],
    'clf__learning_rate': [0.01, 0.1, 0.2],
    'clf__max_depth': [3, 5, 7],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

# Update pipeline for Gradient Boosting (seeded so repeated searches score
# each candidate identically)
pipeline.set_params(clf=GradientBoostingClassifier(random_state=42))

# Perform Grid Search; n_jobs=-1 runs the independent fits on all CPU cores
grid_search = GridSearchCV(
    pipeline,
    gb_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
# Best parameters and results
# Pull the winning parameter combination and its cross-validated score
# out of the fitted search object.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

In [None]:
# Report the outcome of the grid search. The search was run on the
# Gradient Boosting pipeline, so the label names that model.
print(f"Best parameters for Gradient Boosting: {best_params}")
print(f"Best cross-validated score: {best_score:.2f}")

In [None]:
# 7. Comparative Analysis
# Plotting metrics for comparison: one bar chart per metric, one bar per model.
metrics = ['accuracy', 'precision', 'recall', 'f1']
for metric in metrics:
    model_names = list(models.keys())
    scores = [results[model_name][metric] for model_name in model_names]

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(x=model_names, y=scores, ax=ax)
    ax.set_title(f'Comparison of {metric.capitalize()}')
    ax.set_ylabel(metric.capitalize())
    ax.set_xlabel('Model')
    plt.show()