In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
import nltk

In [2]:
df=pd.read_csv('reviews.csv')
df

Unnamed: 0,rating,date,review,platform
0,2,30 December 2024,"I was using it for long time, but have to stop...",blinkit
1,1,4 November 2024,Loving the fast deliveries and mostly they are...,blinkit
2,1,31 October 2024,The customer support is very disappointing. I ...,blinkit
3,5,29 August 2024,"I've been using Blinkit for a while now, and i...",blinkit
4,2,31 December 2024,Blinkit was my go to app and it was rare that ...,blinkit
...,...,...,...,...
4615,1,27 November 2024,I have deposited ₹1300 to my zepto wallet to o...,zepto
4616,4,5 December 2024,Prices r competitive for certain items. Not fo...,zepto
4617,1,13 November 2024,Worst delivery ever. They delivered a broken p...,zepto
4618,5,17 November 2024,"I had a great experience with zepto, the quali...",zepto


In [3]:
df['review'].shape

(4620,)

In [4]:
df['rating'].unique()

array([2, 1, 5, 3, 4], dtype=int64)

In [5]:
df1=df[['review','rating']]
df1

Unnamed: 0,review,rating
0,"I was using it for long time, but have to stop...",2
1,Loving the fast deliveries and mostly they are...,1
2,The customer support is very disappointing. I ...,1
3,"I've been using Blinkit for a while now, and i...",5
4,Blinkit was my go to app and it was rare that ...,2
...,...,...
4615,I have deposited ₹1300 to my zepto wallet to o...,1
4616,Prices r competitive for certain items. Not fo...,4
4617,Worst delivery ever. They delivered a broken p...,1
4618,"I had a great experience with zepto, the quali...",5


In [6]:
import re

In [7]:
# step 1 = cleaned \\ [^A-Za-z]+', ' ' ,, puntuation, fullstop, exclamatory....

def clean(text):
    text = re.sub('[^A-Za-z]+', ' ', text)
    return text

In [8]:
df1['cleaned']=df1['review'].apply(clean)
df1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['cleaned']=df1['review'].apply(clean)


Unnamed: 0,review,rating,cleaned
0,"I was using it for long time, but have to stop...",2,I was using it for long time but have to stop ...
1,Loving the fast deliveries and mostly they are...,1,Loving the fast deliveries and mostly they are...
2,The customer support is very disappointing. I ...,1,The customer support is very disappointing I r...
3,"I've been using Blinkit for a while now, and i...",5,I ve been using Blinkit for a while now and it...
4,Blinkit was my go to app and it was rare that ...,2,Blinkit was my go to app and it was rare that ...
...,...,...,...
4615,I have deposited ₹1300 to my zepto wallet to o...,1,I have deposited to my zepto wallet to order a...
4616,Prices r competitive for certain items. Not fo...,4,Prices r competitive for certain items Not for...
4617,Worst delivery ever. They delivered a broken p...,1,Worst delivery ever They delivered a broken pr...
4618,"I had a great experience with zepto, the quali...",5,I had a great experience with zepto the qualit...


In [12]:
# step 2 = lowercase/upercase remove

df1['Lowercase']= df1['cleaned'].apply(lambda x: " ".join(word.lower() for word in x.split()))
df1

Unnamed: 0,review,rating,cleaned,Lowercase
0,"I was using it for long time, but have to stop...",2,I was using it for long time but have to stop ...,i was using it for long time but have to stop ...
1,Loving the fast deliveries and mostly they are...,1,Loving the fast deliveries and mostly they are...,loving the fast deliveries and mostly they are...
2,The customer support is very disappointing. I ...,1,The customer support is very disappointing I r...,the customer support is very disappointing i r...
3,"I've been using Blinkit for a while now, and i...",5,I ve been using Blinkit for a while now and it...,i ve been using blinkit for a while now and it...
4,Blinkit was my go to app and it was rare that ...,2,Blinkit was my go to app and it was rare that ...,blinkit was my go to app and it was rare that ...
...,...,...,...,...
4615,I have deposited ₹1300 to my zepto wallet to o...,1,I have deposited to my zepto wallet to order a...,i have deposited to my zepto wallet to order a...
4616,Prices r competitive for certain items. Not fo...,4,Prices r competitive for certain items Not for...,prices r competitive for certain items not for...
4617,Worst delivery ever. They delivered a broken p...,1,Worst delivery ever They delivered a broken pr...,worst delivery ever they delivered a broken pr...
4618,"I had a great experience with zepto, the quali...",5,I had a great experience with zepto the qualit...,i had a great experience with zepto the qualit...


In [14]:
# step 3 = remove stopwords

from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [15]:
df1['stopwords'] = df1['Lowercase'].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))
df1

Unnamed: 0,review,rating,cleaned,Lowercase,stopwords
0,"I was using it for long time, but have to stop...",2,I was using it for long time but have to stop ...,i was using it for long time but have to stop ...,using long time stop using recently got used i...
1,Loving the fast deliveries and mostly they are...,1,Loving the fast deliveries and mostly they are...,loving the fast deliveries and mostly they are...,loving fast deliveries mostly great quick deli...
2,The customer support is very disappointing. I ...,1,The customer support is very disappointing I r...,the customer support is very disappointing i r...,customer support disappointing recently bought...
3,"I've been using Blinkit for a while now, and i...",5,I ve been using Blinkit for a while now and it...,i ve been using blinkit for a while now and it...,using blinkit become go app grocery shopping a...
4,Blinkit was my go to app and it was rare that ...,2,Blinkit was my go to app and it was rare that ...,blinkit was my go to app and it was rare that ...,blinkit go app rare disappointed today disappo...
...,...,...,...,...,...
4615,I have deposited ₹1300 to my zepto wallet to o...,1,I have deposited to my zepto wallet to order a...,i have deposited to my zepto wallet to order a...,deposited zepto wallet order watch depositing ...
4616,Prices r competitive for certain items. Not fo...,4,Prices r competitive for certain items Not for...,prices r competitive for certain items not for...,prices r competitive certain items packing goo...
4617,Worst delivery ever. They delivered a broken p...,1,Worst delivery ever They delivered a broken pr...,worst delivery ever they delivered a broken pr...,worst delivery ever delivered broken product s...
4618,"I had a great experience with zepto, the quali...",5,I had a great experience with zepto the qualit...,i had a great experience with zepto the qualit...,great experience zepto quality level faster de...


In [16]:
# step 4 = remove token words, pos tag

from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk import pos_tag

# POS tagger dictionary
pos_dict = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'N': wordnet.NOUN, 'R': wordnet.ADV}

def token_pos(text):
    return [
        (word, pos_dict.get(tag[0]))
        for word, tag in pos_tag(word_tokenize(text))
    ]

In [17]:
df1['POS tagged'] = df1["stopwords"].apply(token_pos)
df1

Unnamed: 0,review,rating,cleaned,Lowercase,stopwords,POS tagged
0,"I was using it for long time, but have to stop...",2,I was using it for long time but have to stop ...,i was using it for long time but have to stop ...,using long time stop using recently got used i...,"[(using, v), (long, a), (time, n), (stop, n), ..."
1,Loving the fast deliveries and mostly they are...,1,Loving the fast deliveries and mostly they are...,loving the fast deliveries and mostly they are...,loving fast deliveries mostly great quick deli...,"[(loving, v), (fast, a), (deliveries, n), (mos..."
2,The customer support is very disappointing. I ...,1,The customer support is very disappointing I r...,the customer support is very disappointing i r...,customer support disappointing recently bought...,"[(customer, n), (support, n), (disappointing, ..."
3,"I've been using Blinkit for a while now, and i...",5,I ve been using Blinkit for a while now and it...,i ve been using blinkit for a while now and it...,using blinkit become go app grocery shopping a...,"[(using, v), (blinkit, n), (become, n), (go, v..."
4,Blinkit was my go to app and it was rare that ...,2,Blinkit was my go to app and it was rare that ...,blinkit was my go to app and it was rare that ...,blinkit go app rare disappointed today disappo...,"[(blinkit, n), (go, v), (app, r), (rare, a), (..."
...,...,...,...,...,...,...
4615,I have deposited ₹1300 to my zepto wallet to o...,1,I have deposited to my zepto wallet to order a...,i have deposited to my zepto wallet to order a...,deposited zepto wallet order watch depositing ...,"[(deposited, v), (zepto, None), (wallet, a), (..."
4616,Prices r competitive for certain items. Not fo...,4,Prices r competitive for certain items Not for...,prices r competitive for certain items not for...,prices r competitive certain items packing goo...,"[(prices, n), (r, v), (competitive, a), (certa..."
4617,Worst delivery ever. They delivered a broken p...,1,Worst delivery ever They delivered a broken pr...,worst delivery ever they delivered a broken pr...,worst delivery ever delivered broken product s...,"[(worst, r), (delivery, n), (ever, r), (delive..."
4618,"I had a great experience with zepto, the quali...",5,I had a great experience with zepto the qualit...,i had a great experience with zepto the qualit...,great experience zepto quality level faster de...,"[(great, a), (experience, n), (zepto, n), (qua..."


In [18]:
# step 5 = lemmatation method

from nltk.stem import WordNetLemmatizer

# Initialize the lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
        if not pos:  # If no POS tag is available, use the word as is
            lemma = word
        else:
            # Lemmatize based on the POS tag
            lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
        lemma_rew = lemma_rew + " " + lemma  # Add the lemma to the result string
    return lemma_rew

# Assuming df2 is your dataframe and 'POS tagged' contains POS-tagged text
df1['Lemma'] = df1['POS tagged'].apply(lemmatize)
df1

Unnamed: 0,review,rating,cleaned,Lowercase,stopwords,POS tagged,Lemma
0,"I was using it for long time, but have to stop...",2,I was using it for long time but have to stop ...,i was using it for long time but have to stop ...,using long time stop using recently got used i...,"[(using, v), (long, a), (time, n), (stop, n), ...",use long time stop use recently get used ite...
1,Loving the fast deliveries and mostly they are...,1,Loving the fast deliveries and mostly they are...,loving the fast deliveries and mostly they are...,loving fast deliveries mostly great quick deli...,"[(loving, v), (fast, a), (deliveries, n), (mos...",love fast delivery mostly great quick delive...
2,The customer support is very disappointing. I ...,1,The customer support is very disappointing I r...,the customer support is very disappointing i r...,customer support disappointing recently bought...,"[(customer, n), (support, n), (disappointing, ...",customer support disappoint recently buy ele...
3,"I've been using Blinkit for a while now, and i...",5,I ve been using Blinkit for a while now and it...,i ve been using blinkit for a while now and it...,using blinkit become go app grocery shopping a...,"[(using, v), (blinkit, n), (become, n), (go, v...",use blinkit become go app grocery shopping a...
4,Blinkit was my go to app and it was rare that ...,2,Blinkit was my go to app and it was rare that ...,blinkit was my go to app and it was rare that ...,blinkit go app rare disappointed today disappo...,"[(blinkit, n), (go, v), (app, r), (rare, a), (...",blinkit go app rare disappoint today disappo...
...,...,...,...,...,...,...,...
4615,I have deposited ₹1300 to my zepto wallet to o...,1,I have deposited to my zepto wallet to order a...,i have deposited to my zepto wallet to order a...,deposited zepto wallet order watch depositing ...,"[(deposited, v), (zepto, None), (wallet, a), (...",deposit zepto wallet order watch deposit mon...
4616,Prices r competitive for certain items. Not fo...,4,Prices r competitive for certain items Not for...,prices r competitive for certain items not for...,prices r competitive certain items packing goo...,"[(prices, n), (r, v), (competitive, a), (certa...",price r competitive certain item pack good d...
4617,Worst delivery ever. They delivered a broken p...,1,Worst delivery ever They delivered a broken pr...,worst delivery ever they delivered a broken pr...,worst delivery ever delivered broken product s...,"[(worst, r), (delivery, n), (ever, r), (delive...",worst delivery ever deliver broken product s...
4618,"I had a great experience with zepto, the quali...",5,I had a great experience with zepto the qualit...,i had a great experience with zepto the qualit...,great experience zepto quality level faster de...,"[(great, a), (experience, n), (zepto, n), (qua...",great experience zepto quality level faster ...


In [23]:
#for i in df1['rating']:
 #   if i<=2:
  #      print('Bad')
   # elif i>2 and i<=4:
    #    print('Average')
    #else:
     #   print('Good')
df1['rating_category'] = df1['rating'].apply(lambda x: 'Bad' if x <= 2 else 'Average' if x <= 4 else 'Good')
df1       

Unnamed: 0,review,rating,cleaned,Lowercase,stopwords,POS tagged,Lemma,rating_category
0,"I was using it for long time, but have to stop...",2,I was using it for long time but have to stop ...,i was using it for long time but have to stop ...,using long time stop using recently got used i...,"[(using, v), (long, a), (time, n), (stop, n), ...",use long time stop use recently get used ite...,Bad
1,Loving the fast deliveries and mostly they are...,1,Loving the fast deliveries and mostly they are...,loving the fast deliveries and mostly they are...,loving fast deliveries mostly great quick deli...,"[(loving, v), (fast, a), (deliveries, n), (mos...",love fast delivery mostly great quick delive...,Bad
2,The customer support is very disappointing. I ...,1,The customer support is very disappointing I r...,the customer support is very disappointing i r...,customer support disappointing recently bought...,"[(customer, n), (support, n), (disappointing, ...",customer support disappoint recently buy ele...,Bad
3,"I've been using Blinkit for a while now, and i...",5,I ve been using Blinkit for a while now and it...,i ve been using blinkit for a while now and it...,using blinkit become go app grocery shopping a...,"[(using, v), (blinkit, n), (become, n), (go, v...",use blinkit become go app grocery shopping a...,Good
4,Blinkit was my go to app and it was rare that ...,2,Blinkit was my go to app and it was rare that ...,blinkit was my go to app and it was rare that ...,blinkit go app rare disappointed today disappo...,"[(blinkit, n), (go, v), (app, r), (rare, a), (...",blinkit go app rare disappoint today disappo...,Bad
...,...,...,...,...,...,...,...,...
4615,I have deposited ₹1300 to my zepto wallet to o...,1,I have deposited to my zepto wallet to order a...,i have deposited to my zepto wallet to order a...,deposited zepto wallet order watch depositing ...,"[(deposited, v), (zepto, None), (wallet, a), (...",deposit zepto wallet order watch deposit mon...,Bad
4616,Prices r competitive for certain items. Not fo...,4,Prices r competitive for certain items Not for...,prices r competitive for certain items not for...,prices r competitive certain items packing goo...,"[(prices, n), (r, v), (competitive, a), (certa...",price r competitive certain item pack good d...,Average
4617,Worst delivery ever. They delivered a broken p...,1,Worst delivery ever They delivered a broken pr...,worst delivery ever they delivered a broken pr...,worst delivery ever delivered broken product s...,"[(worst, r), (delivery, n), (ever, r), (delive...",worst delivery ever deliver broken product s...,Bad
4618,"I had a great experience with zepto, the quali...",5,I had a great experience with zepto the qualit...,i had a great experience with zepto the qualit...,great experience zepto quality level faster de...,"[(great, a), (experience, n), (zepto, n), (qua...",great experience zepto quality level faster ...,Good


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [25]:
X = df1['Lemma']
y = df1['rating_category']

In [26]:
# Vectorize the text
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

In [30]:
lg=LogisticRegression()

In [32]:
lg.fit(X_train,y_train)

In [34]:
y_pred=lg.predict(X_test)

In [35]:
print(accuracy_score(y_pred,y_test)*100)

85.71428571428571
