# Title: Sentiment Analysis on Amazon Product Reviews

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
import string
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("amazon.csv")
df.head()

Unnamed: 0,reviewText,Positive
0,This is a one of the best apps acording to a b...,1
1,This is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1


In [3]:
df.shape

(20000, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reviewText  20000 non-null  object
 1   Positive    20000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


In [5]:
df.isnull().sum()

reviewText    0
Positive      0
dtype: int64

In [6]:
df[df.duplicated()]

Unnamed: 0,reviewText,Positive


# Text Preprocessing

In [7]:
df.reviewText[0]

'This is a one of the best apps acording to a bunch of people and I agree it has bombs eggs pigs TNT king pigs and realustic stuff'

## Lower-casing

In [8]:
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

In [9]:
df.head()

Unnamed: 0,reviewText,Positive
0,this is a one of the best apps acording to a b...,1
1,this is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"this is a silly game and can be frustrating, b...",1
4,this is a terrific game on any pad. hrs of fun...,1


## Removing punctuation and stopwords

In [10]:
nltk.download('stopwords')
stopword = set(stopwords.words('english')) 

[nltk_data] Downloading package stopwords to C:\Users\IT
[nltk_data]     BD\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def preprocess_text(text):
    if isinstance(text, str):  # Checking if the input is a string
        # Remove punctuation
        remove_punc = [char for char in text if char not in string.punctuation]
        clean_words = ''.join(remove_punc)  # Char joining

        # Remove stopwords
        text = ' '.join([word for word in clean_words.split() if word.lower() not in stopword])
        
    return text
df['reviewText'] = df['reviewText'].apply(preprocess_text)
df['reviewText']

0        one best apps acording bunch people agree bomb...
1        pretty good version game free lots different l...
2        really cool game bunch levels find golden eggs...
3        silly game frustrating lots fun definitely rec...
4        terrific game pad hrs fun grandkids love great...
                               ...                        
19995    app fricken stupidit froze kindle wont allow p...
19996    please add need neighbors ginger1016 thanks bu...
19997    love game awesome wish free stuff houses didnt...
19998    love love love app side fashion story fights w...
19999    game rip list things make betterbull first nee...
Name: reviewText, Length: 20000, dtype: object

## Lemmatization

In [12]:
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    if isinstance(text, str):  # Checking if the input is a string
        lemmatized_text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
        return lemmatized_text
    else:
        return text  # Returning non-string values as is
df['reviewText'] = df['reviewText'].apply(lemmatize_text)
df['reviewText']

0        one best apps acording bunch people agree bomb...
1        pretty good version game free lot different le...
2        really cool game bunch level find golden egg s...
3        silly game frustrating lot fun definitely reco...
4        terrific game pad hr fun grandkids love great ...
                               ...                        
19995    app fricken stupidit froze kindle wont allow p...
19996    please add need neighbor ginger1016 thanks bun...
19997    love game awesome wish free stuff house didnt ...
19998    love love love app side fashion story fight wo...
19999    game rip list thing make betterbull first need...
Name: reviewText, Length: 20000, dtype: object

## Splitting into training and testing sets

In [13]:
df['reviewText'].fillna('', inplace=True)

X_train, X_test, y_train, y_test = train_test_split(df["reviewText"], df["Positive"], test_size=0.3)

## TF-IDF vectorization

In [14]:
#Vectorization
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


## Model Selection, Training and Performance Evaluation

In [15]:
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Random Forest", RandomForestClassifier()),
    ("Support Vector Machine", SVC()),
]

for name, model in models:
    
    #Training the model
    model.fit(X_train_vec, y_train)

    #Evaluating performance
    y_pred = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(confusion_matrix(y_test, y_pred))

Logistic Regression Results:
Accuracy: 0.8817
Precision: 0.8837
Recall: 0.9689
F1-Score: 0.9243
[[ 953  571]
 [ 139 4337]]
Random Forest Results:
Accuracy: 0.8595
Precision: 0.8549
Recall: 0.9777
F1-Score: 0.9121
[[ 781  743]
 [ 100 4376]]
Support Vector Machine Results:
Accuracy: 0.8853
Precision: 0.8878
Recall: 0.9687
F1-Score: 0.9265
[[ 976  548]
 [ 140 4336]]


## Hyperparameter Tuning

In [16]:
# Random Forest Hyperparameter Tuning
random_forest = RandomForestClassifier()
param_grid_rf = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 10, 15],
    "min_samples_split": [2, 5, 10],
}
grid_search_rf = GridSearchCV(random_forest, param_grid_rf, cv=5)
grid_search_rf.fit(X_train_vec, y_train)
best_rf = grid_search_rf.best_estimator_

# Evaluating the best Random Forest model
y_pred = best_rf.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Tunned Random Forest Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(confusion_matrix(y_test, y_pred))

print("Best Random Forest Parameters:", grid_search_rf.best_params_)

Tunned Random Forest Results:
Accuracy: 0.7463
Precision: 0.7462
Recall: 1.0000
F1-Score: 0.8547
[[   2 1522]
 [   0 4476]]
Best Random Forest Parameters: {'max_depth': 15, 'min_samples_split': 10, 'n_estimators': 100}


In [17]:
# SVM Hyperparameter Tuning
svm = SVC()
param_grid_svm = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf", "poly"],
    "gamma": ["scale", "auto"],
}
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5)
grid_search_svm.fit(X_train_vec, y_train)
best_svm = grid_search_svm.best_estimator_

# Evaluating the best SVM model
y_pred = best_svm.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Tunned SVM Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(confusion_matrix(y_test, y_pred))
print("Best SVM Parameters:", grid_search_svm.best_params_)

Tunned SVM Results:
Accuracy: 0.8902
Precision: 0.9058
Recall: 0.9517
F1-Score: 0.9282
[[1081  443]
 [ 216 4260]]
Best SVM Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
