In [1]:
import os 
import pandas as pd
import numpy as np

from IPython.core.display import display

import seaborn as sns
import matplotlib.pyplot as plt

import cufflinks as cf

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
import plotly.offline as offline
import plotly.tools as tls

import nltk as nltk
from nltk import bigrams 
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

import scattertext as st
import spacy
nlp = spacy.load('en')

import textblob
from textblob import Word
from textblob import TextBlob

#from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import emoji
import regex

init_notebook_mode(connected=True)
%matplotlib inline

In [2]:
def avgword(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

In [3]:
print("Current Working Directory " , os.getcwd())

Current Working Directory  C:\Users\Mikayel\Desktop\Python\ies_gplay_scraper


Now, we will load all the data we have scraped. We have dataframes of 3 columns. We have the amount of the **support** the comment received (net of the likes and dislikes to a comment), the actual contents of the **comment** and the final **rating** to the app given by the users (on a scale from 1 to 5; 5 being the highest).

In [4]:
files = [i for i in os.listdir('data') if 'reviews' in i]

# preparation of renaming files
nicks = ['fb', 'snap', 'whatsapp', 'musicaly']
renamer = [[val, nicks[i]] for i,val in enumerate(files)]

# batch loading
load = [pd.read_csv(i) for i in 'data/' + os.listdir('data') if 'reviews' in i]

# additing the file definition
load = [val.assign(app = renamer[i][1]) for i, val in enumerate(load)]

# concat into a DataFrame
dt = pd.concat(load)
dt.head()


fb = dt.query('app == "fb"')
snap = dt.query('app == "snap"')
musicaly = dt.query('app == "musicaly"')
whatsapp = dt.query('app == "whatsapp"')

TypeError: must be str, not list

### Datasets at a glance

In [None]:
print("For Facebook, there are {} observations and {} features in this dataset.".format(dt.query('app == "fb"').shape[0],dt.query('app == "fb"').shape[1]))
print("For Snapchat, there are {} observations and {} features in this dataset.".format(dt.query('app == "snap"').shape[0],dt.query('app == "snap"').shape[1]))
print("For Musicaly, there are {} observations and {} features in this dataset.".format(dt.query('app == "musicaly"').shape[0],dt.query('app == "musicaly"').shape[1]))
print("For Whatsapp, there are {} observations and {} features in this dataset.\n".format(dt.query('app == "whatsapp"').shape[0],dt.query('app == "whatsapp"').shape[1]))


In [None]:
display(
    "Total ratings",
    dt.describe(),
    "Facebook ratings",
    dt.query('app == "fb"').describe()
)

The mean rating for the Facebook sample is 2.58, below of total app average of 2.98.

In [None]:
fb.loc[fb['support'] == fb.support.max()]

The highest support among Facebook reviewswas received by a complaint that had a 2-star rating, mentioning such problems like video or chat bubble crashes.

In [None]:
text = fb.review[8]

# Create and generate a word cloud image:
wordcloud = WordCloud().generate(text)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
#created a dummy variable, that equals unity if the review is positively rated (higher than 3)
dt['Positively_Rated'] = np.where(dt['rating']>3, 1, 0)

# add average word length as a variable
dt['word_count'] = dt['review'].apply(lambda x: len(str(x).split(" ")))
dt['char_count'] = dt['review'].str.len() ## this also includes spaces
dt['avgword'] = dt['review'].apply(lambda x: avgword(x))

#stopwords
stop = stopwords.words('english')
dt['stopwords'] = dt['review'].apply(lambda x: len([x for x in x.split() if x in stop]))

# number of numerics and uppercase words
dt['numerics'] = dt['review'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
dt['upper'] = dt['review'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
dt[['review','numerics','upper']].head()

#remove uppercase and punctuation
dt['review'] = dt['review'].apply(lambda x: " ".join(x.lower() for x in x.split())).str.replace('[^\w\s]','')

#remove stopwords
dt['review'] = dt['review'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
dt['polarity'] = dt['review'].map(lambda text: TextBlob(text).sentiment.polarity)

dt.head()

In [None]:
f, axes = plt.subplots(2,2 , figsize=(7, 7), sharex=True)

_ = sns.distplot(fb["rating"] , color="skyblue", ax=axes[0, 0], axlabel ="Facebook rating")
_ = sns.distplot( snap["rating"] , color="#34495e", ax=axes[0, 1],axlabel ="Snapchat rating")
_ = sns.distplot( musicaly["rating"] , color="gold", ax=axes[1, 0],axlabel ="Musicaly rating")
_ = sns.distplot( whatsapp["rating"] , color="teal", ax=axes[1, 1],axlabel ="Whatsapp rating")

Whilst Snapchat and Facebook have more negative 1-star reviews than positive (Facebook has the highest amount of 1-star reviews - 5752), Musicaly/TikTok seems to be very favourably rated by its users, with the highest amount of favourable 5 stars, 5725.

In [None]:
dt.groupby(['app', 'rating']).count()

To build the classification model, we get rid of the neutral reviews, which are generally regard as 3 stars.

In [None]:
#created a dummy variable, that equals unity if the review is positively rated (higher than 3)
fb['Positively_Rated'] = np.where(fb['rating']>3, 1, 0)
fb.head(15)

In [None]:
fb[['Positively_Rated']].describe()

Out of 14080 reviews in our sample, only 37.3% are positive.

In [None]:
pd.crosstab(index = fb['Positively_Rated'], columns="Total count")

In [None]:
fb['word_count'] = fb['review'].apply(lambda x: len(str(x).split(" ")))
fb['char_count'] = fb['review'].str.len() ## this also includes spaces

fb.head(10)

In [None]:
# add average word length as a variable
fb['avgword'] = fb['review'].apply(lambda x: avgword(x))

Let's also calculate the amount of stopwords, numbers and special characters in the review

In [None]:
#stopwords
fb['stopwords'] = fb['review'].apply(lambda x: len([x for x in x.split() if x in stop]))
fb.head()

In [None]:
# number of numerics and uppercase words
fb['numerics'] = fb['review'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
fb['upper'] = fb['review'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
fb[['review','numerics','upper']].head()


Now, let's prepare the data for text mining and further analysis by pre-processing

In [None]:
#remove uppercase
fb['review'] = fb['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))

#remove punctuation
fb['review'] = fb['review'].str.replace('[^\w\s]','')

#remove stopwords
fb['review'] = fb['review'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
fb.head()

In [None]:
avg_word_count = dt['word_count'].mean()
avg_word_count_snap=dt[dt['app']=="snap"]["word_count"].mean()
avg_word_count_fb=dt[dt['app']=="fb"]["word_count"].mean()
avg_word_count_musicaly=dt[dt['app']=="musicaly"]["word_count"].mean()
avg_word_count_whatsapp=dt[dt['app']=="whatsapp"]["word_count"].mean()

pd.DataFrame([avg_word_count_fb, avg_word_count_snap, avg_word_count_musicaly, avg_word_count_whatsapp,avg_word_count], index = ['Fb', 'Snap','Musicaly','Whatsapp','Total'], columns = ['average']) 

Interestingly, Snapchat's reviewers seem to be the wordiest, with the average word count of the review being the largest - 45.18.

In [None]:
dt.loc[:,['polarity', 'word_count']].head()

In [None]:
avg_word_count =dt['word_count'].mean()

f, axes = plt.subplots(2,2 , figsize=(10, 10), sharex=True)
_ = sns.kdeplot(dt[dt['app']=="fb"]["word_count"] , color="skyblue", ax=axes[0, 0],shade=True)
_ = sns.kdeplot(dt[dt['app']=="snap"]["word_count"] , color="#34495e", ax=axes[0, 1],shade=True)                #,ax ="Snapchat rating")
_ = sns.kdeplot(dt[dt['app']=="musicaly"]["word_count"] , color="gold", ax=axes[1, 0],shade=True)               #,ax ="Musicaly rating")
_ = sns.kdeplot(dt[dt['app']=="whatsapp"]["word_count"] , color="teal", ax=axes[1, 1],shade=True)               #,ax ="Whatsapp rating")

It would also be interesting to see, whether or not the little bump in the amount of word count (between 100 and 150 words) is connected to a higher negative sentiment (one would expect disappointed users to have higher commitment to writing the review)

In [None]:
samp = dt.iloc[np.random.randint(1, 46760, 1000)]

plt.scatter(samp.word_count, samp.polarity)

However, this seems not to be the case - since the dispersion of the sentiment is seemingly random.

In [None]:
_ = sns.heatmap(dt[['rating','support','Positively_Rated','word_count','avgword','stopwords','polarity']].corr().round(3), annot=True, linewidth=0.75)

In [None]:
# let's also lemmatize the reviews
fb['review'] = fb['review'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
fb[['review']].head()

In [None]:
freq = pd.Series(' '.join(fb['review']).split()).value_counts()[:15]
freq

We then use  TextBlob to calculate sentiment polarity (ranging betweeen -1 and 1; where 1 means positive sentiment and -1 means a negative sentiment)


In [None]:
fb['polarity'] = fb['review'].map(lambda text: TextBlob(text).sentiment.polarity)
fb.head()

#### Tokenization

Sentence tokenizer splits paragraph text into sentences, whilst 

In [None]:
# join all rows into one text
text=fb["review"].str.cat(sep=' ')
tokenized_text=sent_tokenize(text)
#print(tokenized_text)

tokenized_word=word_tokenize(text)
#print(tokenized_word)

fdist = FreqDist(tokenized_word)
#print(fdist)
#fdist.most_common(15)



### Visualizations

In [None]:
# Top 30 most frequent words

fdist.plot(30,cumulative=False)
plt.show()

In [None]:
# join all rows into one text
text=dt["review"].str.cat(sep=' ')
tokenized_text=sent_tokenize(text)

fdistall = FreqDist(tokenized_word)
fdistall.plot(30,cumulative=False)
plt.show()


In [None]:
topics = ['app', 'messenger', 'update','message','cant']

for topic in topics:
    print(topic,'more :', ' , '.join([ word.lower() for word, count in fdist.most_common(5)]))
    print(topic,'less :', ' , '.join([ word.lower() for word, count in fdist.most_common()[-5:]]))

In [None]:
dt['parsed'] = dt.review.apply(nlp)

In [None]:
dt.head()

In [None]:
words = dt.query('app == "fb"').review.str.split(' ')
fb_top15 = words.apply(pd.Series).melt().dropna().groupby('value').size().sort_values(ascending = False).iloc[:15]
words = dt.query('app == "whatsapp"').review.str.split(' ')
what_top15 = words.apply(pd.Series).melt().dropna().groupby('value').size().sort_values(ascending = False).iloc[:15]

In [None]:
pd.DataFrame((fb_top15, what_top15), index = ['fb','whatsapp']).T.dropna()