In [3]:
import streamlit as st
import numpy as np
import pandas as pd
import json
from tqdm import tqdm

!pip install google_play_scraper
from google_play_scraper import reviews_all, Sort

## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
import wordcloud
## for text processing
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
## for sentiment
!pip install textblob
from textblob import TextBlob
## for vectorizer
from sklearn import feature_extraction, manifold

##Predict
!pip install transformers
from transformers import RobertaForSequenceClassification, AutoTokenizer

from transformers import logging

!pip install torch
import torch




AttributeError: partially initialized module 'nltk' has no attribute 'data' (most likely due to a circular import)

In [None]:
# Google Play package id of the app whose reviews are fetched.
app_package_name = 'com.netflix.ninja'

# Scraper settings, spelled out even where they match the library defaults.
scrape_options = dict(
    lang='en',               # defaults to 'en'
    country='us',            # defaults to 'us'
    sort=Sort.NEWEST,        # defaults to Sort.MOST_RELEVANT
    sleep_milliseconds=2,    # defaults to 0
)

# Fetch the reviews for the app through google_play_scraper.reviews_all.
result = reviews_all(app_package_name, **scrape_options)

In [None]:
# Number of entries returned by the scraper call.
len(result)

In [None]:
# Build the raw DataFrame from the scraped `result` records.
df_raw = pd.DataFrame.from_records(result)

In [None]:
# df_raw = pd.read_excel(r'D:\Portfolio\Dataset\Chat_Team_CaseStudy FINAL.xlsx')

In [None]:
# Work on a copy so `df_raw` is not touched by the later in-place edits made to `df`.
df = df_raw.copy()

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# (rows, columns) of the working frame.
df.shape

In [None]:
# Column dtypes as inferred by pandas.
df.dtypes

In [None]:
# Distinct values present in the `score` column.
df.score.unique()

In [None]:
# Earliest value in the 'at' column (presumably the review timestamp — confirm).
# Series.min() is vectorised and skips missing values, whereas the builtin
# min() iterates the Series in Python and is order-dependent when NaT is present.
df['at'].min()

In [None]:
# `emoji` is used below but is not imported by the setup cell; import it here
# so this cell does not raise NameError on a fresh kernel.
import emoji

# Work on a scratch copy so the emoji stripping never alters the text kept in `df`.
df_ = df.copy()
# Remove every emoji; str() keeps non-string entries from breaking replace_emoji.
df_['content'] = df_['content'].apply(lambda text: emoji.replace_emoji(str(text), ''))
# Whatever is now empty or whitespace-only carried no text: mark it as missing.
df_['content'] = df_['content'].replace(r'^\s*$', np.nan, regex=True)
emoji_review_indices = df_[df_['content'].isna()].index
# set those reviews as NaN in the original df
df.loc[emoji_review_indices, 'content'] = np.nan

print(f"There are {len(emoji_review_indices)} reviews that are just emojis or empty.")

In [None]:
# Missing values per column: shown as a bar chart, then as the raw counts.
nans = df.isna().sum()
nans_x, nans_y = nans.index.to_list(), nans.values
plt.bar(x=nans_x, height=nans_y)
plt.xticks(rotation=60)  # column names are long; tilt them so they stay readable
plt.show()
display(nans)

In [None]:
# Distribution of review length measured in characters.
review_len = df['content'].str.len()
plt.hist(review_len, bins=100, color='blue')
plt.title('characters in reviews')
plt.xlim(-2, 500)  # only the 0-500 character range is displayed
plt.show()

In [None]:
# Distribution of review length measured in number of words.
# .str.len() on the split lists is NaN-safe; the previous map(len) raised
# TypeError for every review whose content is NaN (e.g. the emoji-only reviews
# blanked out earlier). Missing reviews are dropped before plotting.
review_len = df['content'].str.split().str.len().dropna()
plt.hist(review_len, color='blue', bins=100)
plt.title('Word lengths')
plt.show()

In [None]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Normalise one review into a cleaned, space-separated string.

    Steps, in order: lower-case and strip, remove every character that is
    neither a word character nor whitespace, split on whitespace, optionally
    drop stop words, optionally stem, optionally lemmatise, re-join with
    single spaces.

    Parameters
    ----------
    text : object
        The raw review. Non-string values are converted with ``str()``;
        ``None`` and float NaN are treated as "no text".
    flg_stemm : bool, default False
        Apply NLTK's ``PorterStemmer`` to every token.
    flg_lemm : bool, default True
        Apply NLTK's ``WordNetLemmatizer`` to every token.
    lst_stopwords : iterable of str, optional
        Tokens to remove. Matching happens after punctuation removal, so
        entries containing punctuation never match.

    Returns
    -------
    str
        The cleaned text; an empty string for missing input.
    """
    ## missing review -> empty text (str(NaN) would otherwise yield the token "nan")
    if text is None or (isinstance(text, float) and text != text):
        return ""

    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## remove Stopwords (set lookup instead of scanning the list for every word)
    if lst_stopwords is not None:
        stopwords = set(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    return " ".join(lst_text)

In [None]:
# English stop-word list from NLTK, handed to the cleaner below.
lst_stopwords = nltk.corpus.stopwords.words("english")
# Clean every review (lemmatise, no stemming) into the `content_clean` column.
df['content_clean'] = df['content'].apply(
    utils_preprocess_text,
    flg_stemm=False,
    flg_lemm=True,
    lst_stopwords=lst_stopwords,
)

In [None]:
# Preview the frame again, now including the `content_clean` column.
df.head()

In [None]:
def create_corpus(texts=None):
    """Flatten cleaned reviews into one list of lower-cased word tokens.

    Parameters
    ----------
    texts : pandas.Series of str, optional
        Reviews to tokenise. When omitted, the notebook-level
        ``df['content_clean']`` is used, which is what the zero-argument
        call has always done.

    Returns
    -------
    list of str
        Every whitespace-separated token of every review, in row order.
    """
    if texts is None:
        texts = df['content_clean']
    # dropna(): a missing review would split to NaN, which cannot be iterated.
    token_lists = texts.dropna().str.lower().str.split()
    return [token for tokens in token_lists for token in tokens]

# Flat token list over all reviews.
# NOTE(review): `corpus` is reassigned to the `content_clean` Series by the
# later frequency-plot cell, so this value is not what the plots use.
corpus = create_corpus()

In [None]:
# Side-by-side bar charts of the `top` most frequent unigrams and bigrams.
top = 10
corpus = df["content_clean"]
# Tokenise all cleaned reviews as one long string.
lst_tokens = nltk.tokenize.word_tokenize(corpus.str.cat(sep=" "))
fig, ax = plt.subplots(nrows=1, ncols=2)
fig.suptitle("Most frequent words", fontsize=15)

## unigrams
dic_words_freq = nltk.FreqDist(lst_tokens)
dtf_uni = pd.DataFrame(dic_words_freq.most_common(), columns=["Word","Freq"])
# most_common() is already sorted descending: keep the first `top`, then
# re-sort ascending so the largest bar ends up at the top of the barh chart.
uni_top = dtf_uni.set_index("Word").head(top).sort_values(by="Freq")
uni_axes = uni_top.plot(kind="barh", title="Unigrams", ax=ax[0], legend=False)
uni_axes.grid(axis='x')
ax[0].set(ylabel=None)

## bigrams
dic_words_freq = nltk.FreqDist(nltk.ngrams(lst_tokens, 2))
dtf_bi = pd.DataFrame(dic_words_freq.most_common(), columns=["Word","Freq"])
# Each bigram is a tuple of tokens; turn it into a single space-joined label.
dtf_bi["Word"] = dtf_bi["Word"].apply(lambda pair: " ".join(pair))
bi_top = dtf_bi.set_index("Word").head(top).sort_values(by="Freq")
bi_axes = bi_top.plot(kind="barh", title="Bigrams", ax=ax[1], legend=False)
bi_axes.grid(axis='x')
ax[1].set(ylabel=None)
plt.show()

In [None]:
# Word cloud of the cleaned reviews.
wc = wordcloud.WordCloud(background_color='black', max_words=100,
                         max_font_size=35)
# Feed the real review text. str() of a pandas Series is only its truncated
# printout (index numbers, a handful of rows, the "Name/Length/dtype" footer),
# so the cloud would be built from that printout rather than from all reviews.
# Reading the column directly also avoids depending on which cell last set `corpus`.
wc = wc.generate(df["content_clean"].str.cat(sep=" "))
fig = plt.figure(num=1)
plt.axis('off')
plt.imshow(wc, cmap=None)
plt.show()

<h3>Modeling</h3>

In [None]:
# Integer index -> topic label (presumably the classifier's output indices — confirm).
idx_to_label = dict(enumerate([
    'CONTENT',
    'USER_EXPERIENCE',
    'SUBSCRIPTION',
    'INTERFACE',
]))
# Integer index -> sentiment label.
idx_to_label_sentiments = dict(enumerate(['NEGATIVE', 'NEUTRAL', 'POSITIVE']))

In [None]:
# Load a RoBERTa sequence-classification model from a checkpoint directory
# given relative to the notebook's working directory.
model = RobertaForSequenceClassification.from_pretrained('./review-classification-roberta-40-trials')