In [1]:
import numpy as np
import pandas as pd

In [2]:
df=pd.read_csv(r"D:\Downloads\reviews_badminton\data.csv")
df

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1
...,...,...,...,...,...,...,...,...
8513,,,,,,,,5
8514,,,,,,,,2
8515,,,,,,,,4
8516,,,,,,,,1


In [None]:
import re
import nltk
import unicodedata
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text)
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")
    text = text.lower()
    text = re.sub(r'pricedjust', 'priced just', text)
    text = re.sub(r'pricejust', 'price just', text)
    text = text.replace("o. k.", "ok").replace("o.k.", "ok")
    text = re.sub(r'read more', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    words = [w for w in text.split() if w not in stop_words]

    return " ".join(words)

df['clean_text'] = df['Review text'].apply(clean_text)
df[['Review text', 'clean_text']].head()




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fb5cd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Review text,clean_text
0,"Nice product, good quality, but price is now r...",nice product good quality price rising bad sig...
1,They didn't supplied Yonex Mavis 350. Outside ...,supplied yonex mavis outside cover yonex ad in...
2,Worst product. Damaged shuttlecocks packed in ...,worst product damaged shuttlecocks packed new ...
3,"Quite O. K. , but nowadays the quality of the...",quite ok nowadays quality corks like years bac...
4,Over pricedJust â?¹620 ..from retailer.I didn'...,priced retailer understand wat advantage buyin...


In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

df['normalized_text'] = df['clean_text'].apply(lemmatize_text)

df[['clean_text', 'normalized_text']].head()


pd.set_option('display.max_colwidth', None)

df[['Review text','clean_text', 'normalized_text']].head()


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fb5cd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\fb5cd\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,Review text,clean_text,normalized_text
0,"Nice product, good quality, but price is now rising which is a bad sign. 800-850 was an affordable price, especially when we play everyday. So kindly help us out in terms of the price. Thank You.READ MORE",nice product good quality price rising bad sign affordable price especially play everyday kindly help us terms price thank,nice product good quality price rising bad sign affordable price especially play everyday kindly help u term price thank
1,They didn't supplied Yonex Mavis 350. Outside cover was Yonex Ad inside was a cheapest.... Sad to hear this.READ MORE,supplied yonex mavis outside cover yonex ad inside cheapest sad hear,supplied yonex mavis outside cover yonex ad inside cheapest sad hear
2,Worst product. Damaged shuttlecocks packed in new box. It's not a original yonex product. Don't buy.flipkart platform is chosen to fraud the buyers.READ MORE,worst product damaged shuttlecocks packed new box original yonex product buy flipkart platform chosen fraud buyers,worst product damaged shuttlecock packed new box original yonex product buy flipkart platform chosen fraud buyer
3,"Quite O. K. , but nowadays the quality of the corks like not as before 3 to 5 years back.. I am using MAVIS 350 for more than 15 years quality of corks was very very good at that times, but now I am not getting the quality corks as like before, rate of corks also too much now, I am very sorry to say like this, but in my experience , my Statment is very true to my knowledgeREAD MORE",quite ok nowadays quality corks like years back using mavis years quality corks good times getting quality corks like rate corks also much sorry say like experience statment true knowledge,quite ok nowadays quality cork like year back using mavis year quality cork good time getting quality cork like rate cork also much sorry say like experience statment true knowledge
4,Over pricedJust â?¹620 ..from retailer.I didn't understand.. Wat is d advantage of buying dis frm flipkrtREAD MORE,priced retailer understand wat advantage buying dis frm flipkrt,priced retailer understand wat advantage buying dis frm flipkrt


In [None]:
def sentiment_label(rating):
    if rating >= 4:
        return 1
    elif rating <= 2:
        return 0
    else:
        return None

df['sentiment'] = df['Ratings'].apply(sentiment_label)


df = df.dropna(subset=['sentiment'])

df['sentiment'].value_counts()


sentiment
1.0    6826
0.0    1077
Name: count, dtype: int64

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf = TfidfVectorizer(
    max_features=5000,      
    ngram_range=(1,2)       
)

X_tfidf = tfidf.fit_transform(df['normalized_text'])


y = df['sentiment']

print("TF-IDF shape:", X_tfidf.shape)


TF-IDF shape: (7903, 5000)


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(X_train.shape, X_test.shape)


(6322, 5000) (1581, 5000)


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
# Train predictions
y_train_pred_lr = lr.predict(X_train)
f1_train_lr = f1_score(y_train, y_train_pred_lr)

# Test predictions (already done, but safe)
y_test_pred_lr = lr.predict(X_test)
f1_test_lr = f1_score(y_test, y_test_pred_lr)

print("Logistic Regression")
print("Train F1-score:", f1_train_lr)
print("Test  F1-score:", f1_test_lr)


Logistic Regression
Train F1-score: 0.9631081803600071
Test  F1-score: 0.9540960451977402


In [9]:
from sklearn.svm import LinearSVC

svm = LinearSVC()
svm.fit(X_train, y_train)

y_pred_svm = svm.predict(X_test)
# Train predictions
y_train_pred_svm = svm.predict(X_train)
f1_train_svm = f1_score(y_train, y_train_pred_svm)

# Test predictions
y_test_pred_svm = svm.predict(X_test)
f1_test_svm = f1_score(y_test, y_test_pred_svm)

print("\nSVM")
print("Train F1-score:", f1_train_svm)
print("Test  F1-score:", f1_test_svm)


SVM
Train F1-score: 0.9841645431379686
Test  F1-score: 0.9567723342939481


In [None]:
from gensim.models import Word2Vec


tokenized_text = df['normalized_text'].apply(lambda x: x.split())

tokenized_text.head()


0    [nice, product, good, quality, price, rising, bad, sign, affordable, price, especially, play, everyday, kindly, help, u, term, price, thank]
1                                                                [supplied, yonex, mavis, outside, cover, yonex, ad, inside, cheapest, sad, hear]
2               [worst, product, damaged, shuttlecock, packed, new, box, original, yonex, product, buy, flipkart, platform, chosen, fraud, buyer]
4                                                                       [priced, retailer, understand, wat, advantage, buying, dis, frm, flipkrt]
5                                                                                                       [good, quality, product, delivered, time]
Name: normalized_text, dtype: object

In [None]:
w2v_model = Word2Vec(
    sentences=tokenized_text,
    vector_size=100,  
    window=5,
    min_count=2,
    workers=4,
    epochs=20
)


In [12]:
w2v_model.wv['quality']

array([-0.10152441,  0.3588058 ,  0.12280393, -0.02490501,  0.1075304 ,
       -0.45404196,  0.13657664,  0.721426  , -0.05800816, -0.42107877,
        0.10780936, -0.26699352,  0.08864351,  0.11090741,  0.16531539,
       -0.14879386,  0.3329303 , -0.23939049, -0.11676656, -0.6412459 ,
        0.11533391, -0.09068041,  0.14868869,  0.1315513 , -0.43875206,
        0.00609252, -0.17837986, -0.08016957, -0.17706096,  0.10323599,
        0.226128  ,  0.14820804,  0.1054971 , -0.29803243, -0.02149524,
        0.4266876 ,  0.1540714 , -0.48857668, -0.24793978, -0.27109057,
        0.09086198, -0.28123274,  0.01147188, -0.08326028,  0.45738927,
       -0.04807765,  0.06157202, -0.40189958,  0.05994327,  0.03977657,
        0.29402336, -0.05513592, -0.09360323, -0.1317047 ,  0.07962222,
       -0.04468029,  0.03289628,  0.14782202, -0.07306768, -0.05339321,
        0.04037099, -0.04481783,  0.06309081, -0.18179299, -0.22642952,
        0.20852731,  0.14204587,  0.13608871, -0.27531594,  0.31

In [13]:
import numpy as np

def sentence_vector(sentence, model, vector_size=100):
    words = sentence.split()
    word_vectors = [
        model.wv[word] for word in words if word in model.wv
    ]
    if len(word_vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)


In [14]:
X_w2v = np.array([
    sentence_vector(text, w2v_model, 100)
    for text in df['normalized_text']
])

y = df['sentiment']

print(X_w2v.shape)


(7903, 100)


In [15]:
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X_w2v, y, test_size=0.2, random_state=42, stratify=y
)

In [16]:
lr = LogisticRegression(max_iter=1000)

lr.fit(X_train, y_train)

# Predictions
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

print("Logistic Regression (Word2Vec)")
print("Train F1-score:", f1_score(y_train, y_train_pred))
print("Test  F1-score:", f1_score(y_test, y_test_pred))

Logistic Regression (Word2Vec)
Train F1-score: 0.9488651417468869
Test  F1-score: 0.945031712473573


In [17]:
svm = LinearSVC()

svm.fit(X_train, y_train)

# Predictions
y_train_pred = svm.predict(X_train)
y_test_pred = svm.predict(X_test)

print("SVM (Word2Vec)")
print("Train F1-score:", f1_score(y_train, y_train_pred))
print("Test  F1-score:", f1_score(y_test, y_test_pred))

SVM (Word2Vec)
Train F1-score: 0.9503083700440529
Test  F1-score: 0.9469244288224956
