In [30]:
import pandas as pd
import numpy as np
import nltk
import re
import os
import contractions

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

%load_ext autotime


The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 14.1 ms (started: 2025-01-29 07:05:26 +00:00)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
!pip install bs4 # in case you don't have it installed
!pip install contractions
!pip install ipython-autotime
# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz

time: 8.34 s (started: 2025-01-29 07:05:15 +00:00)


In [31]:
class ConfigValues:
    RANDOM_STATE_VALUE = 42
    MAX_TFIDF_FEATURES = 45000

time: 697 µs (started: 2025-01-29 07:05:38 +00:00)


## Read Data
We are using Amazon Reviews dataset for Sentiment Analysis.

- `sep='\t'`: tab-separated values in our dataset

- `compression="gzip"` : to decompress our dataset

- `on_bad_lines="skip"`: in case or incorrect format / error in lines, skip it

###### Extract the dataset and store in data, for further processing


In [32]:
url = "https://web.archive.org/web/20201127142707if_/https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Office_Products_v1_00.tsv.gz"
data = pd.read_csv(url, sep='\t', compression='gzip', on_bad_lines='skip')

  data = pd.read_csv(url, sep='\t', compression='gzip', on_bad_lines='skip')


time: 55 s (started: 2025-01-29 07:05:41 +00:00)


## Keep Reviews and Ratings

Extract reviews and ratings for further processing, peek at what the data looks like.

In [33]:
data = data[['review_body', 'star_rating']]
data.head(10)

Unnamed: 0,review_body,star_rating
0,Great product.,5
1,What's to say about this commodity item except...,5
2,"Haven't used yet, but I am sure I will like it.",5
3,Although this was labeled as &#34;new&#34; the...,1
4,Gorgeous colors and easy to use,4
5,Perfect for planning weekly meals. Removrd the...,5
6,Gold plated fusers are the best! It will never...,5
7,I have used these highlighters for my bible fo...,5
8,Heavy pen that writes very well. I've had it ...,5
9,Not sure if they work but sent quickly and fit...,5


time: 93.6 ms (started: 2025-01-29 07:06:47 +00:00)


In [34]:
sample_reviews = data.sample(3, random_state=ConfigValues.RANDOM_STATE_VALUE)
for index, row in sample_reviews.iterrows():
    print(f"Review: {row['review_body']}")
    print(f"Rating: {row['star_rating']}")
    print('-' * 80)


Review: Works get
Rating: 5
--------------------------------------------------------------------------------
Review: Can't handle most credit card solicitations , very poor shredder . Amazon puts it's name on this device it should of been tested more, at 1/2 it's price it might be okay for light shredding  abilities.
Rating: 1
--------------------------------------------------------------------------------
Review: I was very pleased with the quality of Wilson Jones Insertable Binder Tab Dividers. They held together well and came with blank inserts for labeling.
Rating: 5.0
--------------------------------------------------------------------------------
time: 177 ms (started: 2025-01-29 07:06:51 +00:00)


### Handle Data inconsistency:

- Drop data values with 'NaN'

- Maintain uniform data type: convert 'star_rating' from 'str' to 'int'



In [35]:
# removing NaN colums
data = data.dropna(subset=['review_body'])

# due to data type inconsistency, converting star_rating to numeric,
data['star_rating'] = pd.to_numeric(data['star_rating'], errors='coerce')
data = data.dropna(subset=['star_rating'])

# Count rating instances
rating_counts = data['star_rating'].value_counts().sort_index()

# Print stats
print("Rating Statistics:")
for rating, count in rating_counts.items():
    print(f"Rating {rating}: {count} reviews")


Rating Statistics:
Rating 1.0: 306967 reviews
Rating 2.0: 138381 reviews
Rating 3.0: 193680 reviews
Rating 4.0: 418348 reviews
Rating 5.0: 1582704 reviews
time: 1.37 s (started: 2025-01-29 07:06:57 +00:00)


### Create Binary Classification & Map ratings:

- Ratings > 3 --> Positive (`1`)

- Ratings <= 2 --> Negative (`0`)

- Drop ratings = 3.



In [36]:
# dataf = data.copy()
neutral_count = (data['star_rating'] == 3).sum()
data = data[data['star_rating'] != 3]
data['sentiment'] = data['star_rating'].apply(lambda x : 1 if x > 3 else 0)

# Count the number of reviews in each class
positive_count = (data['sentiment'] == 1).sum()
negative_count = (data['sentiment'] == 0).sum()

# Print counts
print("Positive reviews:", positive_count)
print("Negative reviews:", negative_count)
print("Neutral reviews (discarded):", neutral_count)

Positive reviews: 2001052
Negative reviews: 445348
Neutral reviews (discarded): 193680
time: 1.14 s (started: 2025-01-29 07:07:08 +00:00)


In [37]:
data.head(10)

Unnamed: 0,review_body,star_rating,sentiment
0,Great product.,5.0,1
1,What's to say about this commodity item except...,5.0,1
2,"Haven't used yet, but I am sure I will like it.",5.0,1
3,Although this was labeled as &#34;new&#34; the...,1.0,0
4,Gorgeous colors and easy to use,4.0,1
5,Perfect for planning weekly meals. Removrd the...,5.0,1
6,Gold plated fusers are the best! It will never...,5.0,1
7,I have used these highlighters for my bible fo...,5.0,1
8,Heavy pen that writes very well. I've had it ...,5.0,1
9,Not sure if they work but sent quickly and fit...,5.0,1


time: 9.4 ms (started: 2025-01-29 07:07:30 +00:00)


 - randomly sample 100000 +ve & -ve reviews

- shuffle with some random_state value for variation.



In [38]:
positive_reviews = data[data['sentiment'] == 1].sample(
    n=min(100000, len(data[data['sentiment'] == 1])), random_state=ConfigValues.RANDOM_STATE_VALUE
)
negative_reviews = data[data['sentiment'] == 0].sample(
    n=min(100000, len(data[data['sentiment'] == 0])), random_state=ConfigValues.RANDOM_STATE_VALUE
)

my_downsized_data = pd.concat([positive_reviews, negative_reviews]).sample(
    frac=1, random_state=ConfigValues.RANDOM_STATE_VALUE
).reset_index(drop=True)

time: 635 ms (started: 2025-01-29 07:07:38 +00:00)


##### Split the dataset into Train:Test = 80:20 and perform further data cleaning & preprocessing tasks on both


In [39]:
# Split data
my_downsized_data = my_downsized_data.dropna(subset=['review_body', 'sentiment'])
x_train, x_test, y_train, y_test = train_test_split(my_downsized_data['review_body'], my_downsized_data['sentiment'], test_size=0.2, random_state=ConfigValues.RANDOM_STATE_VALUE)

time: 97.6 ms (started: 2025-01-29 07:07:48 +00:00)


# Data Cleaning

Using regex expressions to match and replace the below items with empty strings:
- change all to lower case

- URLs

- emails

- HTML tags

- punctuations

- extra spaces

- special / non-alphabetical characters



In [40]:
# converting to lower case
x_train = x_train.str.lower()
x_test = x_test.str.lower()

time: 270 ms (started: 2025-01-29 07:07:52 +00:00)


In [41]:
# removing html tags and url
def remove_html_urls(text):
  text = BeautifulSoup(text, "html.parser").get_text()
  text = re.sub(r'https?://\S+|www\.\S+', '', text)
  return text

x_train = x_train.apply(remove_html_urls)
x_test = x_test.apply(remove_html_urls)

  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()


time: 12.9 s (started: 2025-01-29 07:08:01 +00:00)


In [42]:
# remove spaces, special characters, email addresses
def remove_space_characters(text):
  text = re.sub(r'\s+', ' ', text)
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  text = re.sub(r'[a-zA-Z0-9_\-\.]+@[a-zA-Z0-9_\-\.]+\.[a-zA-Z]{2,5}', ' ', text)
  return text

x_train = x_train.apply(remove_space_characters)
x_test = x_test.apply(remove_space_characters)

time: 8.94 s (started: 2025-01-29 07:08:19 +00:00)


In [43]:
# contractions
data['review_body'] = data['review_body'].apply(lambda x: contractions.fix(x))
print(data['review_body'].sample(10))

1663656    Terrific, well-designed planner that holds up ...
1230119    Inconsistant. Printing makes noise and printer...
1887230    I must admit that the setup was not as direct ...
1851540    I have given this as a gift, and it has always...
1021007                                                 good
146524     Great colors, flexibility of color depending o...
2455899    Just like another reviewer, after replacing th...
142486                                                  Good
1045828    O.M.G. Seriously?  They work PERFECTLY.  I am ...
241791                  Exactly what I needed.  Perfect fit.
Name: review_body, dtype: object
time: 49.1 s (started: 2025-01-29 07:08:58 +00:00)


In [46]:
# Report avg length before/after cleaning
avg_length_before_cln = my_downsized_data['review_body'].str.len().mean()
avg_length_after_cln = x_train.str.len().mean()
print(f"Average length before cleaning: {avg_length_before_cln:.4f}")
print(f"Average length after cleaning: {avg_length_after_cln:.4f}")

Average length before cleaning: 318.0072
Average length after cleaning: 301.1237
time: 333 ms (started: 2025-01-29 07:10:55 +00:00)


# Pre-processing

## remove the stop words

In [47]:
# from nltk.corpus import stopwords
 # removing stop words
stop_owrds = set(stopwords.words('english'))
# handle negatove words
negitive_words = ['nor', 'no', 'not', 'none', 'nowhere' 'never', 'neither', 'nobody']
refined_stopwords = [word for word in stop_owrds if word not in negitive_words]
def remove_stop_words(text):
  words = word_tokenize(text)
  filtered_words = [word for word in words if word.lower() not in refined_stopwords]
  return ' '.join(filtered_words)

x_train = x_train.apply(remove_stop_words)
x_test = x_test.apply(remove_stop_words)

time: 1min 10s (started: 2025-01-29 07:11:45 +00:00)


## perform lemmatization  

In [48]:
# from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
  words = word_tokenize(text)
  lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
  return ' '.join(lemmatized_words)

x_train = x_train.apply(lemmatize_text)
x_test = x_test.apply(lemmatize_text)

time: 1min 3s (started: 2025-01-29 07:13:15 +00:00)


In [49]:
# print avg length before/after cleaning + processing
avg_length_before_cln = my_downsized_data['review_body'].str.len().mean()
avg_length_after_cln = x_train.str.len().mean()
print(f"Average length before cleaning + processing: {avg_length_before_cln:.4f}")
print(f"Average length after cleaning + processing: {avg_length_after_cln:.4f}")

Average length before cleaning + processing: 318.0072
Average length after cleaning + processing: 194.0846
time: 212 ms (started: 2025-01-29 07:14:22 +00:00)


# TF-IDF Feature Extraction

In [50]:
vectorizer = TfidfVectorizer(max_features=ConfigValues.MAX_TFIDF_FEATURES, stop_words='english',max_df=0.95,min_df=2)
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)
print(f"Total number of features extracted: {x_train_tfidf.shape[1]}")

Total number of features extracted: 45000
time: 5.47 s (started: 2025-01-29 07:14:44 +00:00)


# Perceptron

For hyperparameter tuning,

max_iter - shows number of epochs

alpha - intensity of regualarization in case of penalty

penalty - controls model's penalty in case of larger weights

In [54]:
# Hyperparameter tuning for Perceptron
param_grid = {
    'penalty': ['l2', 'elasticnet'],
    'alpha': [0.00005, 0.0001, 0.001, 0.005],
    'max_iter': [2000, 3000, 5000, 7000]
}

perceptron = Perceptron()
percp_grid_search = GridSearchCV(perceptron, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
percp_grid_search.fit(x_train_tfidf, y_train)

# Best parameters and model
best_params = percp_grid_search.best_params_
best_perceptron = percp_grid_search.best_estimator_

# Predictions for train/test
train_predictions = best_perceptron.predict(x_train_tfidf)
test_predictions = best_perceptron.predict(x_test_tfidf)

# Metrics for training data
print(f"Perceptron - Training Data Accuracy: {accuracy_score(y_train, train_predictions):.4f}")
print(f"Perceptron - Training Data Precision: {precision_score(y_train, train_predictions):.4f}")
print(f"Perceptron - Training Data Recall: {recall_score(y_train, train_predictions):.4f}")
print(f"Perceptron - Training Data F1-Score: {f1_score(y_train, train_predictions):.4f}")

# Metrics for test data
print(f"Perceptron - Testing Data Accuracy: {accuracy_score(y_test, test_predictions):.4f}")
print(f"Perceptron - Testing Data Precision: {precision_score(y_test, test_predictions):.4f}")
print(f"Perceptron - Testing Data Recall: {recall_score(y_test, test_predictions):.4f}")
print(f"Perceptron - Testing Data F1-Score: {f1_score(y_test, test_predictions):.4f}")

Perceptron - Training Data Accuracy: 0.8523
Perceptron - Training Data Precision: 0.8615
Perceptron - Training Data Recall: 0.8391
Perceptron - Training Data F1-Score: 0.8501
Perceptron - Testing Data Accuracy: 0.8490
Perceptron - Testing Data Precision: 0.8585
Perceptron - Testing Data Recall: 0.8376
Perceptron - Testing Data F1-Score: 0.8479
time: 51.3 s (started: 2025-01-29 07:30:34 +00:00)


# SVM

- C : Regularization intensity, to help balance overfitting/underfitting

- max_iter - shows number of epochs

- loss : loss functions

In [58]:
# Hyperparameter tuning for LinearSVC
param_grid = {
    'C': [0.1, 1, 10],
    'max_iter': [1000, 3000],
    'loss': ['squared_hinge'],
}

# Build & train LinearSVC
my_linear_svc = LinearSVC(random_state=ConfigValues.RANDOM_STATE_VALUE, dual=False)
svc_grid_search = GridSearchCV(my_linear_svc, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
svc_grid_search.fit(x_train_tfidf, y_train)

# Best parameters and model
best_params = svc_grid_search.best_params_
best_linear_svc = svc_grid_search.best_estimator_

# Predictions for train/test
train_predictions = best_linear_svc.predict(x_train_tfidf)
test_predictions = best_linear_svc.predict(x_test_tfidf)

# Metrics for training data
print(f"LinearSVC - Training Data Accuracy: {accuracy_score(y_train, train_predictions):.4f}")
print(f"LinearSVC - Training Data Precision: {precision_score(y_train, train_predictions):.4f}")
print(f"LinearSVC - Training Data Recall: {recall_score(y_train, train_predictions):.4f}")
print(f"LinearSVC - Training Data F1-Score: {f1_score(y_train, train_predictions):.4f}")

# Metrics for test data
print(f"LinearSVC - Testing Data Accuracy: {accuracy_score(y_test, test_predictions):.4f}")
print(f"LinearSVC - Testing Data Precision: {precision_score(y_test, test_predictions):.4f}")
print(f"LinearSVC - Testing Data Recall: {recall_score(y_test, test_predictions):.4f}")
print(f"LinearSVC - Testing Data F1-Score: {f1_score(y_test, test_predictions):.4f}")


LinearSVC - Training Data Accuracy: 0.9247
LinearSVC - Training Data Precision: 0.9278
LinearSVC - Training Data Recall: 0.9208
LinearSVC - Training Data F1-Score: 0.9243
LinearSVC - Testing Data Accuracy: 0.9124
LinearSVC - Testing Data Precision: 0.9144
LinearSVC - Testing Data Recall: 0.9109
LinearSVC - Testing Data F1-Score: 0.9126
time: 2min 15s (started: 2025-01-29 07:53:04 +00:00)


# Logistic Regression

In [59]:
my_logistic_model = LogisticRegression()
my_logistic_model.fit(x_train_tfidf, y_train)

# predictions for train/test
lr_train_predictions = my_logistic_model.predict(x_train_tfidf)
lr_test_predictions = my_logistic_model.predict(x_test_tfidf)

# metrics for training data
print(f"Logistic Reg - Training Data Accuracy: {accuracy_score(y_train, lr_train_predictions):.4f}")
print(f"Logistic Reg - Training Data Precision: {precision_score(y_train, lr_train_predictions):.4f}")
print(f"Logistic Reg - Training Data Recall: {recall_score(y_train, lr_train_predictions):.4f}")
print(f"Logistic Reg - Training Data F1-Score: {f1_score(y_train, lr_train_predictions):.4f}")

# metrics for test data
print(f"Logistic Reg - Testing Data Accuracy: {accuracy_score(y_test, lr_test_predictions):.4f}")
print(f"Logistic Reg - Testing Data Precision: {precision_score(y_test, lr_test_predictions):.4f}")
print(f"Logistic Reg - Testing Data Recall: {recall_score(y_test, lr_test_predictions):.4f}")
print(f"Logistic Reg - Testing Data F1-Score: {f1_score(y_test, lr_test_predictions):.4f}")

Logistic Reg - Training Data Accuracy: 0.9212
Logistic Reg - Training Data Precision: 0.9236
Logistic Reg - Training Data Recall: 0.9182
Logistic Reg - Training Data F1-Score: 0.9209
Logistic Reg - Testing Data Accuracy: 0.9122
Logistic Reg - Testing Data Precision: 0.9134
Logistic Reg - Testing Data Recall: 0.9116
Logistic Reg - Testing Data F1-Score: 0.9125
time: 3.79 s (started: 2025-01-29 07:56:39 +00:00)


# Naive Bayes

In [60]:
param_grid = {
    'alpha': [0.01, 0.1, 0.3, 0.5, 0.7, 1.0, 1.5, 2.0, 5.0, 10.0]
}

my_nb_model = MultinomialNB()
my_grid_search = GridSearchCV(my_nb_model, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1)
my_grid_search.fit(x_train_tfidf, y_train)
best_nb_model = my_grid_search.best_estimator_

# predictions for train/test
nb_train_predictions = best_nb_model.predict(x_train_tfidf)
nb_test_predictions = best_nb_model.predict(x_test_tfidf)

# metrics for training data
print(f"NB - Training Data Accuracy: {accuracy_score(y_train, nb_train_predictions):.4f}")
print(f"NB - Training Data Precision: {precision_score(y_train, nb_train_predictions):.4f}")
print(f"NB - Training Data Recall: {recall_score(y_train, nb_train_predictions):.4f}")
print(f"NB - Training Data F1-Score: {f1_score(y_train, nb_train_predictions):.4f}")

# metrics for test data
print(f"NB - Testing Data Accuracy: {accuracy_score(y_test, nb_test_predictions):.4f}")
print(f"NB - Testing Data Precision: {precision_score(y_test, nb_test_predictions):.4f}")
print(f"NB - Testing Data Recall: {recall_score(y_test, nb_test_predictions):.4f}")
print(f"NB - Testing Data F1-Score: {f1_score(y_test, nb_test_predictions):.4f}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'alpha': 0.7}
NB - Training Data Accuracy: 0.8834
NB - Training Data Precision: 0.8923
NB - Training Data Recall: 0.8718
NB - Training Data F1-Score: 0.8819
NB - Testing Data Accuracy: 0.8692
NB - Testing Data Precision: 0.8810
NB - Testing Data Recall: 0.8552
NB - Testing Data F1-Score: 0.8679
time: 6.55 s (started: 2025-01-29 07:57:06 +00:00)


### Observations

- Increase in performance after data cleaning (removal of urls, tags, spaces) and data pre-processing.

