In [1]:
!pip install nltk
!pip install textblob



In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(["stopwords", "wordnet"])
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')


import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
dft = pd.read_csv("chrome_reviews.csv")
dft.head()

Unnamed: 0,ID,Review URL,Text,Star,Thumbs Up,User Name,Developer Reply,Version,Review Date,App ID
0,3886,https://play.google.com/store/apps/details?id=...,This is very helpfull aap.,5,0,INDIAN Knowledge,,83.0.4103.106,2020-12-19,com.android.chrome
1,3887,https://play.google.com/store/apps/details?id=...,Good,3,2,Ijeoma Happiness,,85.0.4183.127,2020-12-19,com.android.chrome
2,3888,https://play.google.com/store/apps/details?id=...,Not able to update. Neither able to uninstall.,1,0,Priti D BtCFs-29,,85.0.4183.127,2020-12-19,com.android.chrome
3,3889,https://play.google.com/store/apps/details?id=...,Nice app,4,0,Ajeet Raja,,77.0.3865.116,2020-12-19,com.android.chrome
4,3890,https://play.google.com/store/apps/details?id=...,Many unwanted ads,1,0,Rams Mp,,87.0.4280.66,2020-12-19,com.android.chrome


In [4]:
dft.shape

(7204, 10)

## Text Preprocessing

In [5]:
# I am only considering columns which will help to solve the problems
df = dft[["ID", "Text","Star"]].set_index("ID")
df.head()

Unnamed: 0_level_0,Text,Star
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
3886,This is very helpfull aap.,5
3887,Good,3
3888,Not able to update. Neither able to uninstall.,1
3889,Nice app,4
3890,Many unwanted ads,1


In [6]:
## lets identify each ratings based on its reviews
df_5 = df[df["Star"]==5]
print(df_5.shape)
df_5.head()   #We have 3871 most positive reviews

(3871, 2)


Unnamed: 0_level_0,Text,Star
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
3886,This is very helpfull aap.,5
3892,Yes yes,5
3893,Awesome,5
3896,Good,5
3897,Very good app,5


In [7]:
#similarly for each ratings, but we are most interested in low review.
df_4= df[df["Star"]==4]
df_3 = df[df["Star"]==3]
df_2= df[df["Star"]==2]
df_1 = df[df["Star"]==1]

In [8]:
print("The shape for 1-rating datas are:-", df_1.shape)
df_1.head()

The shape for 1-rating datas are:- (1894, 2)


Unnamed: 0_level_0,Text,Star
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
3888,Not able to update. Neither able to uninstall.,1
3890,Many unwanted ads,1
3894,Very bad app 😞,1
3895,Many times I tried to update its not updating....,1
3898,App is not getting update and it is not gettin...,1


### Checking For the Missing values

In [9]:
df.isnull().sum()

Text    1
Star    0
dtype: int64

In [10]:
## Now that there is 1 missing value in text columns, and since this review were written by users. So removing the missing review is only options.
df.dropna(inplace=True)

In [11]:
df.reset_index(inplace=True)  #reseting the index, since we have remove the missing values


### Removing Stopwords and Apply lemmatization

In [12]:
wordnet = WordNetLemmatizer()
corpus=[]
sent= df["Text"]
for i in range(len(sent)):
    words = re.sub("^a-zA-Z", " ", str(sent[i]))
    words= words.lower()
    words= words.split()
    words= [wordnet.lemmatize(word)  for word in words if not word in set(stopwords.words("english"))]
    words = " ".join(words)
    corpus.append(words)

In [13]:
corpus = pd.DataFrame(corpus, columns=["Text"])

df_txt = pd.concat([corpus,df["Star"]],axis=1)

In [14]:
df_txt.head(5)

Unnamed: 0,Text,Star
0,helpfull aap.,5
1,good,3
2,able update. neither able uninstall.,1
3,nice app,4
4,many unwanted ad,1


In [15]:
TextBlob(df_txt["Text"][2]).sentiment[0]

0.5

In [16]:
senti_list = []
for sent in df_txt["Text"]:
    sentiment_score = TextBlob(sent).sentiment[0]  #it will give the sentiment score based on each reviews
    if (sentiment_score > 0):
        senti_list.append('Positive Review')
        
    elif (sentiment_score < 0):
        senti_list.append('Negative Review')
        
    else:
        senti_list.append('Neutral Review')

In [17]:
senti_list[:5]
df_txt["Sentiment"] = senti_list
df_txt.head()

Unnamed: 0,Text,Star,Sentiment
0,helpfull aap.,5,Neutral Review
1,good,3,Positive Review
2,able update. neither able uninstall.,1,Positive Review
3,nice app,4,Positive Review
4,many unwanted ad,1,Positive Review


##### In the problem statement, it was mentioned that reviews were positive but users put the star rating low. So we will extract those reviews which has positive sentiment.

In [39]:
pos_data = df_txt[df_txt["Sentiment"]== "Positive Review"]
pos_data= pos_data[pos_data["Star"]==1]
pos_data.head()

Unnamed: 0,Text,Star,Sentiment
2,able update. neither able uninstall.,1,Positive Review
4,many unwanted ad,1,Positive Review
9,many time tried update updating. whenever try ...,1,Positive Review
12,app getting update getting open saying u r usi...,1,Positive Review
15,coming real status. thank,1,Positive Review


In [19]:
model_name ="roberta-large-mnli"

In [20]:
import transformers
from transformers import pipeline
generator= pipeline(task="text-classification", model=model_name)

Downloading:   0%|          | 0.00/688 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [36]:

def sentiment_classfier(data):
    sentiment_lists=[]
    for sent in data["Text"]:
        sentiment= generator(sent)
        sentiment_lists.append(sentiment)
        

In [35]:
generator(df_txt["Text"][0])

[{'label': 'NEUTRAL', 'score': 0.5082586407661438}]

In [38]:
sentiment_lists =sentiment_classfier(df_txt)

RuntimeError: The expanded size of the tensor (527) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 527].  Tensor sizes: [1, 514]

In [40]:
model_nam= "finiteautomata/bertweet-base-sentiment-analysis"
sent_classifier = pipeline(task="text-classification", model=model_nam)

Downloading:   0%|          | 0.00/890 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/515M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/295 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/824k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [42]:
def sentiment_classfiers(data):
    sentiment_lists=[]
    for sent in data["Text"]:
        sentiment= sent_classifier(sent)
        sentiment_lists.append(sentiment)
        
sentiment_col =sentiment_classfiers(df_txt)

Token indices sequence length is longer than the specified maximum sequence length for this model (396 > 128). Running this sequence through the model will result in indexing errors


IndexError: index out of range in self