In [1]:
from google_play_scraper import Sort, reviews , app, reviews_all 

import pandas as pd
import datetime as dt

import dash
from dash.exceptions import PreventUpdate
from dash.dependencies import Input, Output
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc

import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from string import punctuation as punc
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords as sw

import plotly.graph_objs as go

In [2]:
# Google Play package id of the app analysed in this notebook.
application_id = "com.namshi.android"

# Store-listing details for that package, requested in English for the US store.
app_info = app(
    application_id,
    lang="en",
    country="us",
)

In [3]:
# Review count reported by the store listing.
total_reviews = app_info["reviews"]

# Never request more than 10000 reviews in a single fetch.
fetch_total = min(total_reviews, 10000)

In [4]:
# 1-star reviews, most relevant first (English, US store).
# The second value returned by reviews() is discarded.
result_1, _ = reviews(
    application_id,
    lang="en",
    country="us",
    sort=Sort.MOST_RELEVANT,
    count=fetch_total,  # capped review count computed earlier
    filter_score_with=1,  # keep only reviews with this star rating
)

In [5]:
# 2-star reviews, most relevant first (English, US store).
# The second value returned by reviews() is discarded.
result_2, _ = reviews(
    application_id,
    lang="en",
    country="us",
    sort=Sort.MOST_RELEVANT,
    count=fetch_total,  # capped review count computed earlier
    filter_score_with=2,  # keep only reviews with this star rating
)

In [6]:
# 3-star reviews, most relevant first (English, US store).
# The second value returned by reviews() is discarded.
result_3, _ = reviews(
    application_id,
    lang="en",
    country="us",
    sort=Sort.MOST_RELEVANT,
    count=fetch_total,  # capped review count computed earlier
    filter_score_with=3,  # keep only reviews with this star rating
)

In [7]:
# One DataFrame per star rating (1, 2, 3), in that order.
ls_results = [result_1, result_2, result_3]
ls_reviews_df = []
for result in ls_results:
    # Keep six scraper fields and give them the column names used in this notebook.
    review_df = pd.DataFrame(
        result,
        columns=['at', 'userName', 'score', 'content', 'reviewCreatedVersion', 'replyContent'],
    )
    review_df = review_df.rename(columns={
        'at': 'date',
        'userName': 'app_user',
        'score': 'rating',
        'content': 'user_review',
        'reviewCreatedVersion': 'for_version',
        'replyContent': 'dev_response',
    })
    # Calendar year and month of each review, used later to group by period.
    review_df['year'] = review_df['date'].map(lambda stamp: stamp.year)
    review_df['month'] = review_df['date'].map(lambda stamp: stamp.month)
    ls_reviews_df.append(review_df)

### Reviews vs Responses

In [8]:
# Stack the per-rating frames in a single call. Growing a frame with
# pd.concat inside a loop (starting from an empty DataFrame) copies the
# accumulated rows on every iteration and is the documented anti-pattern.
reviews_df = pd.concat(ls_reviews_df, axis=0)

# count() tallies non-null cells per rating, so `dev_response` is the number
# of reviews that have a reply while `user_review` is the number with text.
rev_res = (
    reviews_df
    .groupby('rating')[['user_review', 'dev_response']]
    .count()
    .reset_index()
)
rev_res

Unnamed: 0,rating,user_review,dev_response
0,1,953,509
1,2,158,77
2,3,289,130


In [9]:
# Words added on top of NLTK's English stopword list.
extra_stopwords = {
    'sometimes', 'get', "i'm", 'good', 'something', 'give', 'hope', "that's",
    'that', 'well', 'please', 'plz', 'help', 'also', 'u',
}
# Negated contractions that are removed from the stopword set so they stay
# in the analysed text.
kept_negations = {
    "aren't", "couldn't", "doesn't", "don't", "hadn't", "haven't", "isn't",
    "shouldn't", "weren't", "won't", "wouldn't",
}
stopwords = (set(sw.words('english')) | extra_stopwords) - kept_negations

lemmatizer = WordNetLemmatizer()

In [10]:
def create_word_graph(voc, top_n=20):
    """Build a bar chart of the most frequent words in a vocabulary.

    Parameters
    ----------
    voc : counter-like
        Object whose ``most_common(n)`` returns ``(word, count)`` pairs; the
        notebook passes an ``nltk.FreqDist``.
    top_n : int, optional
        How many words to plot. Defaults to 20, the value that used to be
        hard-coded, so existing calls behave as before.

    Returns
    -------
    plotly.graph_objs.Figure
        Bar chart with words on the x axis and their counts on the y axis.
    """
    # Ask for the ranking once instead of once per bar attribute.
    top_words = voc.most_common(top_n)
    words = [pair[0] for pair in top_words]
    counts = [pair[1] for pair in top_words]

    axis_style = dict(
                fixedrange = True,
                showline=True,
                showgrid=False,
                showticklabels=True,
                linecolor='rgb(204, 204, 204)',
                linewidth=2,
                ticks='outside',
                tickfont=dict(family='Arial',size=12,color='rgb(82, 82, 82)',),
        )
    hover_label_style = dict( bgcolor="white",font_size=15)
    margin_style = dict(l=100, r=20, t=100,)
    data = [
        go.Bar(x=words, y=counts,
               name='', hoverinfo = 'skip',marker={"color": "#63C9B5"},
               # The count is printed above each bar.
               text=counts, textposition = 'outside', cliponaxis = False
              )
    ]
    fig = go.Figure(data = data)
    fig.update_layout(
            height = 400,
            title_text = "Top words used",
            hoverlabel=hover_label_style,
            hovermode = 'x',
            xaxis_title="words",
            yaxis_title="frequency",
            xaxis = axis_style,
            yaxis = axis_style,
            margin= margin_style,
            showlegend=False,
            plot_bgcolor='white',
        )
    return fig

### Add keywords to restrict the review analysis to (leave the list empty to use every word)

In [52]:
keywords = []  # e.g. ['otp']; an empty list means "score with every non-stopword"

ls_dic_review = []   # one {period label: sentence DataFrame} dict per rating
ls_fig = []          # one word-frequency figure per rating
tokenizer = RegexpTokenizer("[a-z']{3,}")

# Loop-invariant, so built once here. `new_punc` stays a module-level name
# because a later cell reads it.
new_punc = punc.replace("'", "")
punc_table = str.maketrans("", "", new_punc)


def _lemmatize_word(word):
    """Return the noun lemma of the verb lemma of `word`."""
    return lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos='v'), pos='n')


for review_df in ls_reviews_df:
    dic_review = {}

    # Rows without review text would make ' '.join / .lower() raise below.
    # This rebinds the loop variable only; the frames in ls_reviews_df are untouched.
    review_df = review_df.dropna(subset=['user_review'])

    # Word-frequency chart over every review of this rating.
    review = ' '.join(review_df.user_review).lower()
    token = [_lemmatize_word(word) for word in tokenizer.tokenize(review) if word not in stopwords]
    voc = nltk.FreqDist(token)
    ls_fig.append(create_word_graph(voc))

    # Group once; keys are (year, month) and are walked in reverse key order.
    review_group = review_df.groupby(['year', 'month'])
    ls_group = list(review_group.groups.keys())[::-1]

    for group in ls_group:
        df = review_group.get_group(group)[['date', 'user_review']]

        # Vocabulary of this period: every non-stopword, or only the keywords.
        token = tokenizer.tokenize(' '.join(df.user_review).lower())
        if len(keywords) == 0:
            token = [_lemmatize_word(word) for word in token if word not in stopwords]
        else:
            token = [lemma for lemma in map(_lemmatize_word, token) if lemma in keywords]
        voc = nltk.FreqDist(token)

        # Split each review into sentences without assigning into `df`
        # (it is a slice of review_df, which triggered SettingWithCopyWarning).
        sentences = [sent for text in df.user_review for sent in sent_tokenize(text.lower())]

        # Score of a sentence = sum of the period frequencies of its words.
        # Identical sentences share one dict entry.
        dic = {}
        for sent in sentences:
            dic[sent] = sum(
                voc.get(_lemmatize_word(wrd.translate(punc_table)), 0)
                for wrd in sent.split()
            )

        # Skip periods with no sentences at all (max() of nothing would raise)
        # or where no sentence contains a vocabulary word.
        high_freq = max(dic.values(), default=0)
        if high_freq == 0:
            continue

        df_sent = pd.DataFrame({"sent": list(dic.keys()), "freq": list(dic.values())})
        # Normalise by the best score of the period so weights fall in [0, 1].
        df_sent['weight_freq'] = df_sent.freq.map(lambda x: round(x / high_freq, 2))
        df_sent = df_sent.sort_values(['weight_freq'], ascending=False)
        # Only sentences weighted above 0.13 are kept for display.
        dic_review['Period : ' + str(group[0]) + '-' + str(group[1])] = df_sent[df_sent.weight_freq > 0.13]
    ls_dic_review.append(dic_review)

In [53]:
# One bar colour per row of the chart, top to bottom.
colours = ["#009700", "#009790", "#FFFF5C", "#FF8A33", "#FF3F31"]
axis_style = {
    "ticks": "outside",
    "fixedrange": True,
    "showline": False,
    "showticklabels": True,
    "ticklen": 10,
    "tickcolor": "white",
    "tickfont": {"family": "Arial", "size": 12, "color": "rgb(82, 82, 82)"},
}
hover_label_style = {"bgcolor": "white", "font_size": 15}
margin_style = {"t": 0, "b": 0}

# The store histogram is reversed so its entries pair up with y = 5, 4, 3, 2, 1.
reversed_histogram = app_info['histogram'][::-1]
data = [
    go.Bar(
        x=reversed_histogram,
        y=[5, 4, 3, 2, 1],
        name='',
        orientation='h',
        marker={"color": colours},
        opacity=0.8,
        hoverinfo='skip',
        text=reversed_histogram,  # value printed next to each bar
        textposition='outside',
        cliponaxis=False,
    )
]

fig = go.Figure(data=data)
fig.update_layout(
    height=165,
    width=500,
    xaxis={"showticklabels": False, "ticks": "", "fixedrange": True},
    # Shared axis style plus one tick per integer rating.
    yaxis=dict(axis_style, tick0=0, dtick=1),
    margin=margin_style,
    plot_bgcolor='white',
)

In [54]:
# Newest reviews across all star ratings (no score filter); the second value
# returned by reviews() is discarded.
overall, _ = reviews(
    application_id,
    lang="en",
    country="us",
    sort=Sort.NEWEST,
    count=fetch_total,
)

overall_df = (
    pd.DataFrame(
        overall,
        columns=['at', 'userName', 'score', 'content', 'reviewCreatedVersion', 'replyContent'],
    )
    .rename(columns={
        'at': 'date',
        'userName': 'app_user',
        'score': 'rating',
        'content': 'user_review',
        'reviewCreatedVersion': 'for_version',
        'replyContent': 'dev_response',
    })
    # Reviews without text cannot be tokenised later on.
    .dropna(subset=["user_review"])
    # Keep only the calendar date of each review timestamp.
    .assign(date=lambda frame: frame['date'].map(lambda stamp: stamp.date()))
)
# Every review text joined into one lower-cased string.
overall_review = ' '.join(overall_df['user_review']).lower()
overall_df

Unnamed: 0,date,app_user,rating,user_review,for_version,dev_response
0,2021-03-17,Yuliya Musyk,5,been loyal customer for like 5 yrs and never f...,8.12.1,Thank you for taking the time to write this re...
1,2021-03-16,Saima Khan,5,So happy with the overall experience..they hav...,8.12.1,Thank you so much for your valuable feedback a...
2,2021-03-16,Shumaila Jaffrey,4,"Overall great app, but they should have more p...",8.12.1,"Hello, Thank you for your suggestion, We will ..."
3,2021-03-15,Mohammed Saeed,5,🌹🌹🌹🌹,8.12.1,Thank you so much for your encouraging star ra...
4,2021-03-15,Mohammed Shakir,1,Bad experience.first time i order on this app ...,8.12.1,"Apologies for any inconvenience, we are gratif..."
...,...,...,...,...,...,...
5172,2014-06-13,Alessandro Casuccio,5,Namshi rocks,1.0.1,
5173,2014-06-12,Paul Debahy,5,Finally! Thanks for the great app namshi! Am l...,1.0,
5174,2014-06-12,Ross I.,5,This is amazing. I love the way everything is ...,1.0,
5175,2014-06-12,Megha Mehra,5,This is awesome. Best shopping site,1.0,


In [55]:
# Defined before first use. It used to be assigned further down in this cell,
# so the cell only ran when an earlier cell had already leaked `new_punc`.
new_punc = punc.replace("'", "")
punc_table = str.maketrans("", "", new_punc)

# Raw string: "\w" in a normal string literal is an invalid escape sequence.
tokenizer = RegexpTokenizer(r"[\w']+")

# Vocabulary over all reviews: lower-case, strip punctuation (apostrophes are
# kept), drop stopwords, lemmatise. The lemma is verb -> noun, the same
# transformation applied to sentence words below; with the previous verb-only
# lemma the vocabulary keys and the lookups could disagree.
overall_text = ' '.join(overall_df['user_review'].map(lambda x: x.lower().translate(punc_table)))
overall_token = [
    lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos='v'), pos='n')
    for word in tokenizer.tokenize(overall_text)
    if word not in stopwords
]
overall_voc = nltk.FreqDist(overall_token)

# Each review split into lower-cased sentences, then flattened into one list.
overall_df['user_reviews'] = overall_df.user_review.map(lambda x: sent_tokenize(x.lower()))
ls = overall_df.user_reviews.tolist()
new_ls = [sent for review_sentences in ls for sent in review_sentences]

# Score of a sentence = sum of the overall frequencies of its words.
key_set = set(overall_voc.keys())
dic = {}
for sent in new_ls:
    sumi = 0
    for wrd in sent.split():
        m = lemmatizer.lemmatize(lemmatizer.lemmatize(wrd.translate(punc_table), pos='v'), pos='n')
        if m in key_set:
            sumi += overall_voc[m]
    dic[sent] = sumi

# default=0 covers "no sentences"; a zero maximum would otherwise divide by zero.
high_freq = max(dic.values(), default=0)
df_sent = pd.DataFrame({"sent": list(dic.keys()), "freq": list(dic.values())})
df_sent['weight_freq'] = df_sent.freq.map(lambda x: round(x / high_freq, 2) if high_freq else 0.0)
df_sent = df_sent.sort_values(['weight_freq'], ascending=False)

In [56]:
import os

from monkeylearn import MonkeyLearn

# Text sent for classification: every sentence weighted above 0.13, joined
# into one document.
text = ' '.join(df_sent[df_sent.weight_freq>0.13].sent)

# SECURITY: the API key must not live in the notebook. It is read from the
# environment instead; the key that was previously committed here should be
# treated as leaked and revoked.
monkeylearn_api_key = os.environ.get("MONKEYLEARN_API_KEY")
if not monkeylearn_api_key:
    raise RuntimeError("Set the MONKEYLEARN_API_KEY environment variable before running this cell.")
ml = MonkeyLearn(monkeylearn_api_key)

data = [text]
model_id = 'cl_CsfDyd3m'
result = ml.classifiers.classify(model_id, data)

# One document was sent, so only the first result entry is read; its first
# classification provides the tag and the confidence shown in the dashboard.
top_classification = result.body[0]['classifications'][0]
sentiment = top_classification['tag_name']
confidence = top_classification['confidence']

In [57]:
# Number of reviews with text for each star rating.
overall_rating_df = (
    overall_df
    .groupby('rating')['user_review']
    .count()
    .reset_index()
)
total_reviews = overall_rating_df['user_review'].sum()

# Share of each rating in percent, rounded to two decimals.
overall_rating_df['percentage of total'] = overall_rating_df['user_review'].map(
    lambda n_reviews: round((n_reviews / total_reviews) * 100, 2)
)
overall_rating_df

Unnamed: 0,rating,user_review,percentage of total
0,1,953,18.42
1,2,158,3.05
2,3,289,5.58
3,4,548,10.59
4,5,3227,62.36


In [58]:
axis_style = {
    "fixedrange": True,
    "showline": True,
    "showgrid": False,
    "showticklabels": True,
    "linecolor": 'rgb(204, 204, 204)',
    "linewidth": 2,
    "ticks": 'outside',
    "tickfont": {"family": 'Arial', "size": 12, "color": 'rgb(82, 82, 82)'},
}
hover_label_style = {"bgcolor": "white", "font_size": 15}
margin_style = {"l": 100, "r": 20, "t": 100}

# One line per star rating: number of reviews with that rating on each date.
ls_data = []
for i in range(1, 6):
    per_day = overall_df[overall_df.rating == i].groupby('date').count()
    trace = go.Scatter(
        x=per_day.index.tolist(),
        y=per_day.rating.tolist(),
        name=i,
        mode='lines+markers',
        marker={"size": 1, "line": {'width': 2}},
    )
    ls_data.append(trace)

fig_overall = go.Figure(data=ls_data)
fig_overall.update_layout(
    height=400,
    title_text="Overview of Review Rating",
    hoverlabel=hover_label_style,
    hovermode='x unified',
    xaxis_title="Time Period",
    yaxis_title="Total",
    xaxis=axis_style,
    yaxis=axis_style,
    margin=margin_style,
    showlegend=True,
    plot_bgcolor='white',
)

In [59]:
def generate_reviews(group, dic_review):
    """Render one review period as a Dash block.

    `group` is a key of `dic_review`; the value stored under it must expose a
    `sent` column of strings. The block shows the key followed by all of those
    strings joined with '-----'.
    """
    joined_sentences = '-----'.join(dic_review[group].sent)
    children = [html.Br(), group, html.Br(), joined_sentences]
    return html.Div(children, className="mr-1")

In [60]:
def _month_options(period_map):
    """Dropdown entries "last 2 months" .. "last N months" for the N periods in `period_map`."""
    return [
        {"label": "last {} months".format(months), "value": months}
        for months in range(2, len(period_map.keys()) + 1)
    ]


def _rating_tab(rating, options, wrap_dropdown):
    """Tab for one star rating: word chart, month selector and the first two review periods."""
    periods = ls_dic_review[rating - 1]
    selector = dcc.Dropdown(id="dropdown_{}".format(rating), options=options, searchable=False,
                            placeholder="Select months",
                            style={'width': '90%', **component_width})
    if wrap_dropdown:
        # Only the first tab wraps its dropdown in an extra Div.
        selector = html.Div(id="div_dropdown_{}".format(rating), children=[selector])
    return dcc.Tab(label='Reviews Rating {}'.format(rating), children=[
        dcc.Graph(id='rating_fig{}'.format(rating), figure=ls_fig[rating - 1],
                  style={'width': '90%', **component_width}),
        html.Br(),
        selector,
        html.Br(),
        html.Div(id="reviews_{}".format(rating),
                 children=[generate_reviews(group, periods) for group in list(periods.keys())[:2]],
                 style={'width': '90%', 'text-align': 'justify', **component_width},
                 className="container"),
        html.Br(),
    ])


def _header_section():
    """Top of the page: app title, summary, store facts, response table and rating histogram."""
    facts_row = dbc.Row([
        dbc.Col('Genre : {}'.format(app_info["genre"])),
        dbc.Col('Installs : {}+'.format(app_info["minInstalls"]), style=text_center),
        dbc.Col('Rating : {}'.format(app_info['score']), style=text_center),
        dbc.Col('Released : {}'.format(app_info["released"]), style=text_center),
        dbc.Col('Size : {}'.format(app_info["size"]), style=text_center),
    ])
    contact_row = dbc.Row([
        dbc.Col('Webiste : {}'.format(app_info["developerWebsite"]), style={'text-align': 'left'}),
        dbc.Col('e-mail : {}'.format(app_info["developerEmail"]), style={'text-align': 'right'}),
    ])
    response_column = dbc.Col([
        html.Div(" ", style={'height': '18px'}),
        dbc.Table.from_dataframe(id="reviews_response", df=rev_res, striped=True,
                                 style={'font-size': '13px', **text_center},
                                 className='table table-dark table-hover table-md', header=True),
    ])
    histogram_column = dbc.Col([
        dbc.Row([dcc.Graph(id='fig2_bar_chart', figure=fig, config={'displayModeBar': False})],
                style={"width": "100%"}),
        dbc.Row([
            dbc.Col('Total Ratings : {}'.format(app_info["ratings"]), style=text_center),
            dbc.Col('Total Reviews : {}'.format(app_info["reviews"]), style={'text-align': 'left'}),
        ]),
    ], style={'width': '100%', 'margin': 'auto'})
    return html.Div(className="container", children=[
        html.Br(),
        html.H1(app_info['title'], style=text_center),
        html.Div(app_info["summaryHTML"], style={'font-size': '20px', **text_center}),
        html.Br(),
        facts_row,
        html.Br(),
        contact_row,
        html.Br(),
        dbc.Row([response_column, histogram_column]),
    ], style=component_width)


def _overall_tab():
    """Tab with the sentiment result, the per-rating share table and the rating timeline."""
    return dcc.Tab(label='Overall Reviews', children=[
        html.Br(),
        dbc.Row([
            dbc.Col(html.H4('Overall Sentiment for last {} reviews : {}'.format(fetch_total, sentiment))),
            dbc.Col(html.H4('Confidence in Sentiment Analysis : {}'.format(confidence))),
        ], style={'width': '90%', **component_width}),
        html.Br(),
        dbc.Table.from_dataframe(id="overall_reviews", df=overall_rating_df, striped=True,
                                 style={'font-size': '13px', **text_center, 'width': '90%', **component_width},
                                 className='table table-dark table-hover table-md', header=True),
        html.Br(),
        dcc.Graph(id='rating_fig4', figure=fig_overall,
                  style={'width': '90%', **component_width}),
        html.Br(),
    ])


# Month choices per rating tab; each tab initially shows two periods, so the
# choices start at "last 2 months".
ls_options_1 = _month_options(ls_dic_review[0])
ls_options_2 = _month_options(ls_dic_review[1])
ls_options_3 = _month_options(ls_dic_review[2])

text_center = {'text-align': 'center'}
component_width = {'min-width': '1000px', 'margin': 'auto'}

app_dash = dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP])
app_dash.layout = html.Div([
    _header_section(),
    html.Br(),
    dcc.Tabs(
        [
            _rating_tab(1, ls_options_1, wrap_dropdown=True),
            _rating_tab(2, ls_options_2, wrap_dropdown=False),
            _rating_tab(3, ls_options_3, wrap_dropdown=False),
            _overall_tab(),
        ],
        colors={"border": "white", "primary": "gold", "background": "#63C9B5"},
        style={'width': '100%', **component_width},
    ),
])
@app_dash.callback(Output(component_id='reviews_1', component_property='children'),
        [Input(component_id="dropdown_1",component_property="value")])
def update_on_change(value):
    """Show the first `value` review periods of rating 1 inside the `reviews_1` container.

    `value` is the dropdown selection; while nothing is selected the current
    content is left untouched (PreventUpdate).
    """
    if value is None:
        raise PreventUpdate
    periods = list(ls_dic_review[0].keys())[:value]
    # The wrapper carries no id: it becomes a child of the component that
    # already owns id "reviews_1", and reusing that id here put two
    # components with the same id in the page.
    return html.Div(children=[generate_reviews(group, ls_dic_review[0]) for group in periods],
                    style={'width': '100%', **component_width})
    
@app_dash.callback(Output(component_id='reviews_2', component_property='children'),
        [Input(component_id="dropdown_2",component_property="value")])
def update_on_change(value):
    """Show the first `value` review periods of rating 2 inside the `reviews_2` container.

    `value` is the dropdown selection; while nothing is selected the current
    content is left untouched (PreventUpdate).
    """
    if value is None:
        raise PreventUpdate
    periods = list(ls_dic_review[1].keys())[:value]
    # The wrapper carries no id: it becomes a child of the component that
    # already owns id "reviews_2", and reusing that id here put two
    # components with the same id in the page.
    return html.Div(children=[generate_reviews(group, ls_dic_review[1]) for group in periods],
                    style={'width': '100%', **component_width})
    
@app_dash.callback(Output(component_id='reviews_3', component_property='children'),
        [Input(component_id="dropdown_3",component_property="value")])
def update_on_change(value):
    """Show the first `value` review periods of rating 3 inside the `reviews_3` container.

    `value` is the dropdown selection; while nothing is selected the current
    content is left untouched (PreventUpdate).
    """
    if value is None:
        raise PreventUpdate
    periods = list(ls_dic_review[2].keys())[:value]
    # The wrapper carries no id: it becomes a child of the component that
    # already owns id "reviews_3", and reusing that id here put two
    # components with the same id in the page.
    return html.Div(children=[generate_reviews(group, ls_dic_review[2]) for group in periods],
                    style={'width': '100%', **component_width})


    
# Launch the dashboard on Dash's built-in server. The log printed below shows
# it listening on http://127.0.0.1:8050/ until interrupted (CTRL+C) and warns
# that this server is not meant for production use.
app_dash.run_server()

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 in production, use a production WSGI server like gunicorn instead.

 in production, use a production WSGI server like gunicorn instead.

 in production, use a production WSGI server like gunicorn instead.

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [18/Mar/2021 04:00:28] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [18/Mar/2021 04:00:28] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [18/Mar/2021 04:00:28] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [18/Mar/2021 04:00:28] "[37mPOST /_dash-update-component HTTP/1.1[0m" 204 -
127.0.0.1 - - [18/Mar/2021 04:00:28] "[37mPOST /_dash-update-component HTTP/1.1[0m" 204 -
127.0.0.1 - - [18/Mar/2021 04:00:28] "[37mPOST /_dash-update-component HTTP/1.1[0m" 204 -
127.0.0.1 - - [18/Mar/2021 04:00:54] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [18/Mar/2021 04:01:00] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [18/Mar/2021 04:01:05] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
