In [2]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report

import joblib

nltk.download("stopwords",quiet = True)
nltk.download("wordnet",quiet = True)

True

#### Load the Dataset

In [3]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust Ã¢?Â¹620 ..from retailer.I didn'...,1


In [4]:
df.shape

(8518, 8)

#### EDA

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8518 entries, 0 to 8517
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Reviewer Name    8508 non-null   object 
 1   Review Title     8508 non-null   object 
 2   Place of Review  8468 non-null   object 
 3   Up Votes         8508 non-null   float64
 4   Down Votes       8508 non-null   float64
 5   Month            8053 non-null   object 
 6   Review text      8510 non-null   object 
 7   Ratings          8518 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 532.5+ KB


In [6]:
df.isnull().sum()

Reviewer Name       10
Review Title        10
Place of Review     50
Up Votes            10
Down Votes          10
Month              465
Review text          8
Ratings              0
dtype: int64

In [7]:
df = df[df["Ratings"]!= 3]

df['sentiment'] = df['Ratings'].apply(lambda x:1 if x>=4 else 0)

df['text'] = df["Review Title"].fillna("") + " " + df['Review text'].fillna("")
df = df[['text','sentiment']]
df.head()

Unnamed: 0,text,sentiment
0,"Nice product Nice product, good quality, but p...",1
1,Don't waste your money They didn't supplied Yo...,0
2,Did not meet expectations Worst product. Damag...,0
4,Over priced Over pricedJust Ã¢?Â¹620 ..from reta...,0
5,Mind-blowing purchase Good quality product. De...,1


#### Text Preprocessing

In [8]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

In [9]:
df['clean_review'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0,text,sentiment,clean_review
0,"Nice product Nice product, good quality, but p...",1,nice product nice product good quality price r...
1,Don't waste your money They didn't supplied Yo...,0,waste money supplied yonex mavis outside cover...
2,Did not meet expectations Worst product. Damag...,0,meet expectation worst product damaged shuttle...
4,Over priced Over pricedJust Ã¢?Â¹620 ..from reta...,0,priced pricedjust retailer understand wat adva...
5,Mind-blowing purchase Good quality product. De...,1,mind blowing purchase good quality product del...


In [10]:
X = df['clean_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [11]:
tfidf = TfidfVectorizer(max_features=5000)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [12]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Linear SVM": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=200)
}

print("TF-IDF MODEL RESULTS\n")

for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    preds = model.predict(X_test_tfidf)
    print(f"{name} F1-score: {f1_score(y_test, preds):.4f}")

TF-IDF MODEL RESULTS

Logistic Regression F1-score: 0.9576
Naive Bayes F1-score: 0.9409
Linear SVM F1-score: 0.9590
Random Forest F1-score: 0.9514


#### Final Model is Logistic Regression

In [13]:

print("\nLogistic Regression selected as final model based on best F1-score and simplicity.")

best_model = models["Logistic Regression"]

final_preds = best_model.predict(X_test_tfidf)
print("\nFinal Model Classification Report:")
print(classification_report(y_test, final_preds))


Logistic Regression selected as final model based on best F1-score and simplicity.

Final Model Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.54      0.66       217
           1       0.93      0.99      0.96      1364

    accuracy                           0.92      1581
   macro avg       0.90      0.76      0.81      1581
weighted avg       0.92      0.92      0.92      1581



In [14]:
def predict_sentiment(review):
    review = clean_text(review)
    vector = tfidf.transform([review])
    pred = model.predict(vector)[0]
    return "Positive ðŸ˜Š" if pred == 1 else "Negative ðŸ˜ž"

predict_sentiment("The shuttle quality is excellent and durable")


'Positive ðŸ˜Š'

In [15]:

joblib.dump(best_model, "sentiment_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

print("\nâœ… Model and Vectorizer saved successfully")


âœ… Model and Vectorizer saved successfully
