In [1]:
import torch
import numpy as np
import pandas as pd

from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score


In [2]:
df=pd.read_csv(r"D:\Downloads\reviews_badminton\data.csv")
df

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1
...,...,...,...,...,...,...,...,...
8513,,,,,,,,5
8514,,,,,,,,2
8515,,,,,,,,4
8516,,,,,,,,1


In [None]:
import re
import nltk
import unicodedata
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text)
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")
    text = text.lower()
    text = re.sub(r'pricedjust', 'priced just', text)
    text = re.sub(r'pricejust', 'price just', text)
    text = text.replace("o. k.", "ok").replace("o.k.", "ok")
    text = re.sub(r'read more', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    words = [w for w in text.split() if w not in stop_words]
    return " ".join(words)

df['clean_text'] = df['Review text'].apply(clean_text)
df[['Review text', 'clean_text']].head()




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fb5cd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Review text,clean_text
0,"Nice product, good quality, but price is now r...",nice product good quality price rising bad sig...
1,They didn't supplied Yonex Mavis 350. Outside ...,supplied yonex mavis outside cover yonex ad in...
2,Worst product. Damaged shuttlecocks packed in ...,worst product damaged shuttlecocks packed new ...
3,"Quite O. K. , but nowadays the quality of the...",quite ok nowadays quality corks like years bac...
4,Over pricedJust â?¹620 ..from retailer.I didn'...,priced retailer understand wat advantage buyin...


In [4]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

df['normalized_text'] = df['clean_text'].apply(lemmatize_text)

df[['clean_text', 'normalized_text']].head()


pd.set_option('display.max_colwidth', None)

df[['Review text','clean_text', 'normalized_text']].head()


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fb5cd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\fb5cd\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,Review text,clean_text,normalized_text
0,"Nice product, good quality, but price is now rising which is a bad sign. 800-850 was an affordable price, especially when we play everyday. So kindly help us out in terms of the price. Thank You.READ MORE",nice product good quality price rising bad sign affordable price especially play everyday kindly help us terms price thank,nice product good quality price rising bad sign affordable price especially play everyday kindly help u term price thank
1,They didn't supplied Yonex Mavis 350. Outside cover was Yonex Ad inside was a cheapest.... Sad to hear this.READ MORE,supplied yonex mavis outside cover yonex ad inside cheapest sad hear,supplied yonex mavis outside cover yonex ad inside cheapest sad hear
2,Worst product. Damaged shuttlecocks packed in new box. It's not a original yonex product. Don't buy.flipkart platform is chosen to fraud the buyers.READ MORE,worst product damaged shuttlecocks packed new box original yonex product buy flipkart platform chosen fraud buyers,worst product damaged shuttlecock packed new box original yonex product buy flipkart platform chosen fraud buyer
3,"Quite O. K. , but nowadays the quality of the corks like not as before 3 to 5 years back.. I am using MAVIS 350 for more than 15 years quality of corks was very very good at that times, but now I am not getting the quality corks as like before, rate of corks also too much now, I am very sorry to say like this, but in my experience , my Statment is very true to my knowledgeREAD MORE",quite ok nowadays quality corks like years back using mavis years quality corks good times getting quality corks like rate corks also much sorry say like experience statment true knowledge,quite ok nowadays quality cork like year back using mavis year quality cork good time getting quality cork like rate cork also much sorry say like experience statment true knowledge
4,Over pricedJust â?¹620 ..from retailer.I didn't understand.. Wat is d advantage of buying dis frm flipkrtREAD MORE,priced retailer understand wat advantage buying dis frm flipkrt,priced retailer understand wat advantage buying dis frm flipkrt


In [None]:
def sentiment_label(rating):
    if rating >= 4:
        return 1
    elif rating <= 2:
        return 0
    else:
        return None

df['sentiment'] = df['Ratings'].apply(sentiment_label)


df = df.dropna(subset=['sentiment'])

df['sentiment'].value_counts()


sentiment
1.0    6826
0.0    1077
Name: count, dtype: int64

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [7]:
print(torch.cuda.is_available())

False


In [8]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

device = torch.device("cpu")
model.to(device)
model.eval()

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [9]:
texts = df["Review text"].astype(str).tolist()
labels = df["sentiment"].values

In [10]:
def bert_embeddings(texts, batch_size=16, max_len=128):
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]

        encoded = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt"
        )

        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)

        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=attention_mask)

        cls_embeddings = output.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(cls_embeddings)

    return np.vstack(embeddings)


In [11]:
X_bert = bert_embeddings(texts)
print("BERT shape:", X_bert.shape)

BERT shape: (7903, 768)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X_bert, labels, test_size=0.2, random_state=42, stratify=labels
)

In [13]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

print("BERT + LR")
print("Train F1:", f1_score(y_train, lr.predict(X_train)))
print("Test  F1:", f1_score(y_test, lr.predict(X_test)))


BERT + LR
Train F1: 0.9813211845102505
Test  F1: 0.964611455673112


In [14]:
svm = LinearSVC()
svm.fit(X_train, y_train)

print("BERT + SVM")
print("Train F1:", f1_score(y_train, svm.predict(X_train)))
print("Test  F1:", f1_score(y_test, svm.predict(X_test)))


BERT + SVM
Train F1: 0.9872954940133443
Test  F1: 0.9579524680073126


In [None]:
import pickle


with open("sentiment_lr_bert.pkl", "wb") as f:
    pickle.dump(lr, f)


tokenizer.save_pretrained("bert_tokenizer")
model.save_pretrained("bert_model")

print("Models saved successfully")


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Models saved successfully
