#### Importing the libraries

In [1]:
import pandas as pd
import numpy as np

#### Reading the csv as dataframe

In [2]:
df = pd.read_csv('googleplaystore_user_reviews.csv')

In [3]:
df.head(5)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


*We will perform sentiment analysis on Google Play store user reviews*

#### Renaming the column

In [4]:
df = df.rename(columns={'Translated_Review':'Reviews'})

In [5]:
df.columns

Index(['App', 'Reviews', 'Sentiment', 'Sentiment_Polarity',
       'Sentiment_Subjectivity'],
      dtype='object')

*The column **Translated_Review** has been renamed to **Reviews**.*

#### Checking information of the dataset

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64295 entries, 0 to 64294
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     64295 non-null  object 
 1   Reviews                 37427 non-null  object 
 2   Sentiment               37432 non-null  object 
 3   Sentiment_Polarity      37432 non-null  float64
 4   Sentiment_Subjectivity  37432 non-null  float64
dtypes: float64(2), object(3)
memory usage: 2.5+ MB


*There are a total of 64295 records and 5 columns in the dataset.*

In [7]:
df.isnull().sum()

App                           0
Reviews                   26868
Sentiment                 26863
Sentiment_Polarity        26863
Sentiment_Subjectivity    26863
dtype: int64

*There are 26868 missing values in **Reviews** column and 26863 missing values in **Sentiment**, **Sentiment_Polarity** and **Sentiment_Subjectivity** columns of the dataset.* 

#### Checking for duplicate records

In [8]:
df.duplicated(subset=None, keep='first').sum()

33616

*There are 33616 duplicate rows*

In [9]:
df.shape

(64295, 5)

In [10]:
# Keep the first occurrence of every fully identical row (all columns compared).
df = df.drop_duplicates(keep='first')

In [11]:
df.duplicated(subset=None, keep='first').sum()

0

In [12]:
df.shape

(30679, 5)

*The duplicate rows have been removed*

In [13]:
df.isnull().sum()

App                         0
Reviews                   987
Sentiment                 982
Sentiment_Polarity        982
Sentiment_Subjectivity    982
dtype: int64

#### Handling missing values in  Reviews column

In [14]:
# App name of every row whose review text is missing.
df.loc[df['Reviews'].isnull(), 'App']

2                                   10 Best Foods for You
268                                                  11st
362                       1LINE – One Line with One Touch
405      2018Emoji Keyboard 😂 Emoticons Lite -sticker&gif
539                   2Date Dating App, Love and matching
                               ...                       
64082      Hotspot Shield Free VPN Proxy & Wi-Fi Security
64119                                             Hotstar
64156                      Hotwire Hotel & Car Rental App
64202                      Housing-Real Estate & Property
64236                         Houzz Interior Design Ideas
Name: App, Length: 987, dtype: object

*The above displays the rows for which there are missing values in **Reviews** column*

In [15]:
# Distinct app names that have at least one row with a missing review.
df.loc[df['Reviews'].isnull(), 'App'].unique()

array(['10 Best Foods for You', '11st', '1LINE – One Line with One Touch',
       '2018Emoji Keyboard 😂 Emoticons Lite -sticker&gif',
       '2Date Dating App, Love and matching', '2RedBeans',
       '30 Day Fitness Challenge - Workout at Home',
       '365Scores - Live Scores', '3D Blue Glass Water Keyboard Theme',
       '3D Color Pixel by Number - Sandbox Art Coloring',
       '3D Live Neon Weed Launcher', '4 in a Row',
       '4K Wallpapers and Ultra HD Backgrounds',
       '591房屋交易-租屋、中古屋、新建案、實價登錄、別墅透天、公寓套房、捷運、買房賣房行情、房價房貸查詢', '591房屋交易-香港',
       '7 Cups: Anxiety & Stress Chat', '7 Day Food Journal Challenge',
       '7 Minute Workout', '7 Weeks - Habit & Goal Tracker',
       '8 Ball Pool', '850 Sports News Digest',
       '8fit Workouts & Meal Planner', '95Live -SG#1 Live Streaming App',
       'A Call From Santa Claus!', 'A Manual of Acupuncture',
       'A Word A Day', 'A&E - Watch Full Episodes of TV Shows',
       'A+ Gallery - Photos & Videos', 'A+ Mobile',
       'ABC Kids

*The above displays the unique Apps for which there are missing values in **Reviews** column*

In [16]:
'''
def apps_missing_reviews(dataset):
    for values in dataset['App'].unique().tolist():
        if dataset[dataset['App']==values].App.count() != dataset[dataset['App']==values].Reviews.count():
            print(values, " : ", dataset[dataset['App']==values].App.count(), " : ", dataset[dataset['App']==values].Reviews.count())
            
apps_missing_reviews(df)
'''

'\ndef apps_missing_reviews(dataset):\n    for values in dataset[\'App\'].unique().tolist():\n        if dataset[dataset[\'App\']==values].App.count() != dataset[dataset[\'App\']==values].Reviews.count():\n            print(values, " : ", dataset[dataset[\'App\']==values].App.count(), " : ", dataset[dataset[\'App\']==values].Reviews.count())\n            \napps_missing_reviews(df)\n'

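*When enabled, the helper above prints each App that has 1 or more missing values in the **Reviews** column, together with its row count and its non-missing review count. The cell is currently disabled (wrapped in a string), so only its source text is echoed.*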

In [17]:
'''
def apps_no_missing_reviews(dataset):
    for values in dataset['App'].unique().tolist():
        #print(values)
        if dataset[dataset['App']==values].App.count() == dataset[dataset['App']==values].Reviews.count():
            print(values, " : ", dataset[dataset['App']==values].App.count(), " : ", dataset[dataset['App']==values].Reviews.count())
            
apps_no_missing_reviews(df)
'''

'\ndef apps_no_missing_reviews(dataset):\n    for values in dataset[\'App\'].unique().tolist():\n        #print(values)\n        if dataset[dataset[\'App\']==values].App.count() == dataset[dataset[\'App\']==values].Reviews.count():\n            print(values, " : ", dataset[dataset[\'App\']==values].App.count(), " : ", dataset[dataset[\'App\']==values].Reviews.count())\n            \napps_no_missing_reviews(df)\n'

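*When enabled, the helper above prints each App that has no missing values in the **Reviews** column, together with its row count and its non-missing review count. The cell is currently disabled (wrapped in a string), so only its source text is echoed.*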

#### Dropping rows having missing values in the Reviews column

In [18]:
# Keep only the rows that actually carry review text.
df = df.dropna(subset=['Reviews'])

In [19]:
df

Unnamed: 0,App,Reviews,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.000000,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.250000,0.288462
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.400000,0.875000
4,10 Best Foods for You,Best idea us,Positive,1.000000,0.300000
5,10 Best Foods for You,Best way,Positive,1.000000,0.300000
...,...,...,...,...,...
64222,Housing-Real Estate & Property,Most ads older many agents ..not much owner po...,Positive,0.173333,0.486667
64223,Housing-Real Estate & Property,"If photos posted portal load, fit purpose. I'm...",Positive,0.225000,0.447222
64226,Housing-Real Estate & Property,"Dumb app, I wanted post property rent give opt...",Negative,-0.287500,0.250000
64227,Housing-Real Estate & Property,I property business got link SMS happy perform...,Positive,0.800000,1.000000


In [20]:
df.isnull().sum()

App                       0
Reviews                   0
Sentiment                 0
Sentiment_Polarity        0
Sentiment_Subjectivity    0
dtype: int64

*There are no more missing values in the dataset*

In [21]:
# Write the filtered frame without its index (index=False); the next cell reads this
# file back, which gives the frame a fresh consecutive index.
df.to_csv('file1.csv', index=False)

In [22]:
df = pd.read_csv('file1.csv')

In [23]:
df

Unnamed: 0,App,Reviews,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.000000,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.250000,0.288462
2,10 Best Foods for You,Works great especially going grocery store,Positive,0.400000,0.875000
3,10 Best Foods for You,Best idea us,Positive,1.000000,0.300000
4,10 Best Foods for You,Best way,Positive,1.000000,0.300000
...,...,...,...,...,...
29687,Housing-Real Estate & Property,Most ads older many agents ..not much owner po...,Positive,0.173333,0.486667
29688,Housing-Real Estate & Property,"If photos posted portal load, fit purpose. I'm...",Positive,0.225000,0.447222
29689,Housing-Real Estate & Property,"Dumb app, I wanted post property rent give opt...",Negative,-0.287500,0.250000
29690,Housing-Real Estate & Property,I property business got link SMS happy perform...,Positive,0.800000,1.000000


In [24]:
df.isnull().sum()

App                       0
Reviews                   0
Sentiment                 0
Sentiment_Polarity        0
Sentiment_Subjectivity    0
dtype: int64

#### Cleaning the punctuation marks

In [25]:
import re
import string

In [26]:
#df['Reviews'].unique().tolist()

In [27]:
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    The remaining characters are kept in their original order and joined back
    into a single string.
    """
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch)
    return "".join(kept)
    
# Strip punctuation from every review; the function is passed directly, no wrapper needed.
df['Reviews'] = df['Reviews'].apply(remove_punctuation)

In [28]:
def cleaning(dataset, characters):
    """Normalise the text in ``dataset['Reviews']`` and return ``dataset``.

    The steps are applied in this order:

    1. lower-case the text;
    2. replace every literal ``*`` with ``i``;
    3. delete heart symbols and digits;
    4. collapse each run of whitespace into a single space;
    5. delete every string listed in ``characters`` (treated literally).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a string-valued ``Reviews`` column. The column is
        overwritten on this same object.
    characters : iterable of str
        Literal substrings to delete from each review.

    Returns
    -------
    pandas.DataFrame
        The ``dataset`` object that was passed in.
    """
    # Work only on the frame that was passed in, never on a module-level `df`,
    # so the function also behaves correctly for frames other than the global one.
    reviews = dataset['Reviews'].str.lower()
    # regex=False: '*' is a literal asterisk here, not a regex quantifier.
    reviews = reviews.str.replace('*', 'i', regex=False)
    # Raw strings keep the regex escapes (\d, \s) from being read as string escapes.
    reviews = reviews.str.replace(r'♥️|❤|\d', '', regex=True)
    reviews = reviews.str.replace(r'\s+', ' ', regex=True)
    for ch in characters:
        reviews = reviews.str.replace(ch, '', regex=False)
    dataset['Reviews'] = reviews
    return dataset
        
        
        
# Extra symbols to strip from the reviews; cleaning() deletes each entry from the text.
char=['☆', '✌', '—']
# cleaning() returns the frame it was given, so df is rebound to the cleaned data.
df = cleaning(df, char)

In [29]:
df.Reviews.unique().tolist()

['i like eat delicious food thats im cooking food myself case best foods helps lot also best before shelf life',
 'this help eating healthy exercise regular basis',
 'works great especially going grocery store',
 'best idea us',
 'best way',
 'amazing',
 'looking forward app',
 'it helpful site it help foods get ',
 'good you',
 'useful information the amount spelling errors questions validity information shared once fixed stars given',
 'thank you great app add arthritis eyes immunity kidneyliver detox foods please ',
 'greatest ever completely awesome maintain health this must ppl there love it',
 'good health good health first priority',
 'health its important world either life think ',
 'mrs sunita bhati i thankful developersto make kind app really good healthy food body',
 'very useful in diabetes age i need control sugar thanks',
 'one greatest apps',
 'good nice',
 'healthy really helped',
 'god health',
 'health should always be top priority on mysg',
 'an excellent a useful',


In [30]:
df

Unnamed: 0,App,Reviews,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,i like eat delicious food thats im cooking foo...,Positive,1.000000,0.533333
1,10 Best Foods for You,this help eating healthy exercise regular basis,Positive,0.250000,0.288462
2,10 Best Foods for You,works great especially going grocery store,Positive,0.400000,0.875000
3,10 Best Foods for You,best idea us,Positive,1.000000,0.300000
4,10 Best Foods for You,best way,Positive,1.000000,0.300000
...,...,...,...,...,...
29687,Housing-Real Estate & Property,most ads older many agents not much owner post...,Positive,0.173333,0.486667
29688,Housing-Real Estate & Property,if photos posted portal load fit purpose im su...,Positive,0.225000,0.447222
29689,Housing-Real Estate & Property,dumb app i wanted post property rent give opti...,Negative,-0.287500,0.250000
29690,Housing-Real Estate & Property,i property business got link sms happy perform...,Positive,0.800000,1.000000


*The punctuation marks have been cleaned in the dataset*

#### Tokenizing the Reviews

In [31]:
from nltk.tokenize import word_tokenize
# No dropna() here: rows with missing reviews were already removed earlier, and an
# in-place dropna() on the selected column could not drop rows from df anyway.
df['Reviews'] = df['Reviews'].apply(word_tokenize)

In [32]:
df

Unnamed: 0,App,Reviews,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,"[i, like, eat, delicious, food, thats, im, coo...",Positive,1.000000,0.533333
1,10 Best Foods for You,"[this, help, eating, healthy, exercise, regula...",Positive,0.250000,0.288462
2,10 Best Foods for You,"[works, great, especially, going, grocery, store]",Positive,0.400000,0.875000
3,10 Best Foods for You,"[best, idea, us]",Positive,1.000000,0.300000
4,10 Best Foods for You,"[best, way]",Positive,1.000000,0.300000
...,...,...,...,...,...
29687,Housing-Real Estate & Property,"[most, ads, older, many, agents, not, much, ow...",Positive,0.173333,0.486667
29688,Housing-Real Estate & Property,"[if, photos, posted, portal, load, fit, purpos...",Positive,0.225000,0.447222
29689,Housing-Real Estate & Property,"[dumb, app, i, wanted, post, property, rent, g...",Negative,-0.287500,0.250000
29690,Housing-Real Estate & Property,"[i, property, business, got, link, sms, happy,...",Positive,0.800000,1.000000


*The Reviews have been tokenized i.e. split into tokens/pieces*

#### Removing the stop words from Reviews

In [33]:
#nltk.download('stopwords')
from nltk.corpus import stopwords

In [34]:
# Import the corpus reader under an alias so this cell can be re-run: once the name
# `stopwords` is bound to a list, calling `.words(...)` on it again would fail.
from nltk.corpus import stopwords as _stopwords_corpus

# The list keeps the name `stopwords` because remove_stopwords() looks it up by that name.
stopwords = _stopwords_corpus.words('english')

In [35]:
def remove_stopwords(text):
    """Drop stop words from a tokenised review.

    ``text`` is an iterable of tokens. The result is a new list holding, in their
    original order, the tokens that are not in the module-level ``stopwords``
    collection.
    """
    return list(filter(lambda token: token not in stopwords, text))


df['Reviews'] = df['Reviews'].apply(remove_stopwords)

In [36]:
df

Unnamed: 0,App,Reviews,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,"[like, eat, delicious, food, thats, im, cookin...",Positive,1.000000,0.533333
1,10 Best Foods for You,"[help, eating, healthy, exercise, regular, basis]",Positive,0.250000,0.288462
2,10 Best Foods for You,"[works, great, especially, going, grocery, store]",Positive,0.400000,0.875000
3,10 Best Foods for You,"[best, idea, us]",Positive,1.000000,0.300000
4,10 Best Foods for You,"[best, way]",Positive,1.000000,0.300000
...,...,...,...,...,...
29687,Housing-Real Estate & Property,"[ads, older, many, agents, much, owner, posted...",Positive,0.173333,0.486667
29688,Housing-Real Estate & Property,"[photos, posted, portal, load, fit, purpose, i...",Positive,0.225000,0.447222
29689,Housing-Real Estate & Property,"[dumb, app, wanted, post, property, rent, give...",Negative,-0.287500,0.250000
29690,Housing-Real Estate & Property,"[property, business, got, link, sms, happy, pe...",Positive,0.800000,1.000000


*The stop words have been removed from Reviews*

#### Lemmatization of Reviews

In [37]:
from nltk.stem import WordNetLemmatizer
# One shared lemmatizer instance, reused for every review.
lemmatizer = WordNetLemmatizer()


def word_lemmatizer(text):
    """Lemmatize each token of ``text`` and return the results as a new list.

    ``text`` is an iterable of tokens; every token is passed on its own to
    ``lemmatizer.lemmatize`` and the outputs are collected in the same order.
    """
    return list(map(lemmatizer.lemmatize, text))


df['Reviews'] = df['Reviews'].apply(word_lemmatizer)

In [38]:
df['Reviews'].head(10)

0    [like, eat, delicious, food, thats, im, cookin...
1    [help, eating, healthy, exercise, regular, basis]
2     [work, great, especially, going, grocery, store]
3                                      [best, idea, u]
4                                          [best, way]
5                                            [amazing]
6                              [looking, forward, app]
7                     [helpful, site, help, food, get]
8                                               [good]
9    [useful, information, amount, spelling, error,...
Name: Reviews, dtype: object

#### Determining sentiment polarity and subjectivity using TextBlob library

In [39]:
from textblob import TextBlob

In [40]:
def pol(x):
    """Return the TextBlob sentiment polarity of the text ``x``."""
    return TextBlob(x).sentiment.polarity


def sub(x):
    """Return the TextBlob sentiment subjectivity of the text ``x``."""
    return TextBlob(x).sentiment.subjectivity

In [41]:
df

Unnamed: 0,App,Reviews,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,"[like, eat, delicious, food, thats, im, cookin...",Positive,1.000000,0.533333
1,10 Best Foods for You,"[help, eating, healthy, exercise, regular, basis]",Positive,0.250000,0.288462
2,10 Best Foods for You,"[work, great, especially, going, grocery, store]",Positive,0.400000,0.875000
3,10 Best Foods for You,"[best, idea, u]",Positive,1.000000,0.300000
4,10 Best Foods for You,"[best, way]",Positive,1.000000,0.300000
...,...,...,...,...,...
29687,Housing-Real Estate & Property,"[ad, older, many, agent, much, owner, posted, ...",Positive,0.173333,0.486667
29688,Housing-Real Estate & Property,"[photo, posted, portal, load, fit, purpose, im...",Positive,0.225000,0.447222
29689,Housing-Real Estate & Property,"[dumb, app, wanted, post, property, rent, give...",Negative,-0.287500,0.250000
29690,Housing-Real Estate & Property,"[property, business, got, link, sm, happy, per...",Positive,0.800000,1.000000


In [42]:
# Turn each token list back into one space-separated string.
df['Reviews'] = df['Reviews'].apply(" ".join)

In [43]:
df

Unnamed: 0,App,Reviews,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,like eat delicious food thats im cooking food ...,Positive,1.000000,0.533333
1,10 Best Foods for You,help eating healthy exercise regular basis,Positive,0.250000,0.288462
2,10 Best Foods for You,work great especially going grocery store,Positive,0.400000,0.875000
3,10 Best Foods for You,best idea u,Positive,1.000000,0.300000
4,10 Best Foods for You,best way,Positive,1.000000,0.300000
...,...,...,...,...,...
29687,Housing-Real Estate & Property,ad older many agent much owner posted detail r...,Positive,0.173333,0.486667
29688,Housing-Real Estate & Property,photo posted portal load fit purpose im sure s...,Positive,0.225000,0.447222
29689,Housing-Real Estate & Property,dumb app wanted post property rent give option...,Negative,-0.287500,0.250000
29690,Housing-Real Estate & Property,property business got link sm happy performanc...,Positive,0.800000,1.000000


In [44]:
# Score every review string with the pol / sub helpers and store the results
# next to the sentiment columns that came with the dataset.
df['Predicted_Polarity'] = df['Reviews'].map(pol)
df['Predicted_Subjectivity'] = df['Reviews'].map(sub)

In [45]:
df.head(10)

Unnamed: 0,App,Reviews,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Predicted_Polarity,Predicted_Subjectivity
0,10 Best Foods for You,like eat delicious food thats im cooking food ...,Positive,1.0,0.533333,1.0,0.533333
1,10 Best Foods for You,help eating healthy exercise regular basis,Positive,0.25,0.288462,0.25,0.288462
2,10 Best Foods for You,work great especially going grocery store,Positive,0.4,0.875,0.4,0.875
3,10 Best Foods for You,best idea u,Positive,1.0,0.3,1.0,0.3
4,10 Best Foods for You,best way,Positive,1.0,0.3,1.0,0.3
5,10 Best Foods for You,amazing,Positive,0.6,0.9,0.6,0.9
6,10 Best Foods for You,looking forward app,Neutral,0.0,0.0,0.0,0.0
7,10 Best Foods for You,helpful site help food get,Neutral,0.0,0.0,0.0,0.0
8,10 Best Foods for You,good,Positive,0.7,0.6,0.7,0.6
9,10 Best Foods for You,useful information amount spelling error quest...,Positive,0.2,0.1,0.2,0.1


In [46]:
df[df['App'] == 'Housing-Real Estate & Property']

Unnamed: 0,App,Reviews,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Predicted_Polarity,Predicted_Subjectivity
29671,Housing-Real Estate & Property,incorrect listing agent show property discus p...,Negative,-0.025,0.125,-0.025,0.125
29672,Housing-Real Estate & Property,pathetic appall posted lead owner either old p...,Negative,-0.3625,0.625,-0.3625,0.625
29673,Housing-Real Estate & Property,waste app property want room hyderabad able fi...,Positive,0.266667,0.375,0.15,0.3125
29674,Housing-Real Estate & Property,filter useless posted owner show agent approac...,Negative,-0.5,0.2,-0.5,0.2
29675,Housing-Real Estate & Property,business filter dont work redirect buyer agent...,Neutral,0.0,0.0,0.0,0.0
29676,Housing-Real Estate & Property,filter work searching property useless,Negative,-0.5,0.2,-0.5,0.2
29677,Housing-Real Estate & Property,want list property sale option please connect,Neutral,0.0,0.0,0.0,0.0
29678,Housing-Real Estate & Property,good,Positive,0.7,0.6,0.7,0.6
29679,Housing-Real Estate & Property,awesome app price date awesome app give proper...,Positive,0.509375,0.511458,0.509375,0.511458
29680,Housing-Real Estate & Property,worse ever give graphic picture property like ...,Negative,-0.08125,0.41875,-0.066667,0.433333


#### Summary

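*The actual and the predicted values of sentiment polarity and subjectivity are very close to each other for the respective applications. In the samples shown above most rows match exactly, and a few differ slightly (for example 0.266667 vs 0.15 for polarity).*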