# Forming Topic Models from GCash Google Play Store Reviews Using Latent Dirichlet Allocation <br>
This notebook generates topic models for 5-star and 1-star GCash app reviews taken from the Google Play Store using Latent Dirichlet Allocation (LDA).

In [None]:
# !pip install emot
# !pip install contractions
# !pip install pyLDAvis

In [1]:
# importing libraries
import pandas as pd
import regex as re
import numpy as np
import tqdm

from emot.emo_unicode import UNICODE_EMO, EMOTICONS

import contractions
from textblob import TextBlob
from nltk.tokenize import word_tokenize

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import models

import spacy

from pprint import pprint

import seaborn as sns

import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis



In [2]:
def import_csv(path):
    """Read a reviews CSV and keep only the columns used in this notebook.

    Parameters
    ----------
    path : anything accepted by ``pd.read_csv`` (file path or file-like object).

    Returns
    -------
    DataFrame with the columns ``at``, ``score`` and ``content``, in that order.
    Indexing raises ``KeyError`` if any of them is missing from the file.
    """
    wanted_columns = ['at', 'score', 'content']
    return pd.read_csv(path)[wanted_columns]

  and should_run_async(code)


In [4]:
# Load the raw Google Play review export into a DataFrame (all columns are kept here).
# NOTE(review): this is a hardcoded absolute path, and the `import_csv` helper defined
# in the previous cell is not used — confirm whether that is intentional.
gcash_reviews = pd.read_csv('/Mynt Capstone/data/gcash_reviews_playstore.csv')

  and should_run_async(code)


In [5]:
# Preview the first five rows of the raw export.
# NOTE(review): the saved output of this cell includes reviewer names and avatar URLs
# (userName, userImage); consider clearing it before sharing the notebook.
gcash_reviews.head()

  and should_run_async(code)


Unnamed: 0.1,Unnamed: 0,_id,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,app_name,app_id
0,0,60b76108c5eccca0e375ce55,gp:AOqpTOHa4x04QC93zb0pxQZ-1Q1GwFeysD-XpcDTW78...,Emmanuel Jim Roldan,https://play-lh.googleusercontent.com/a/AATXAJ...,Good for mobile online deals.,1,0,5.40.0,2021-06-02 18:32:37,,,GCash,com.globe.gcash.android
1,1,60b76108c5eccca0e375ce56,gp:AOqpTOFOvZwSS0iVvmGbcBKkwFyprROh5KqfcJuI5jg...,Harem Tuazon,https://play-lh.googleusercontent.com/a-/AOh14...,Ilang beses nang naulit na nag load ako nag ba...,1,0,5.41.0,2021-06-02 18:32:14,,,GCash,com.globe.gcash.android
2,2,60b76108c5eccca0e375ce57,gp:AOqpTOFb35bOoUsvlsEEPVO0UrtxJ2fut8jq5sdiexC...,Fraxilyn Nael,https://play-lh.googleusercontent.com/a-/AOh14...,Its a great experience and convenient,4,0,5.40.0,2021-06-02 18:32:07,,,GCash,com.globe.gcash.android
3,3,60b76108c5eccca0e375ce58,gp:AOqpTOFRgY4C4LVX_-Cr5D1zxW881WGrlPf01jMWwr4...,Mary rose Manipolo,https://play-lh.googleusercontent.com/a-/AOh14...,Ok na ok sya para sa mga easy transaction lalo...,5,0,5.41.0,2021-06-02 18:31:31,,,GCash,com.globe.gcash.android
4,4,60b76108c5eccca0e375ce59,gp:AOqpTOH8Znzr1cV9K-1A9ci8IBaQOr-fUkvUhC4P7ZU...,Ferritch Vlog,https://play-lh.googleusercontent.com/a-/AOh14...,very helpful and contented,5,0,5.40.0,2021-06-02 18:31:25,,,GCash,com.globe.gcash.android


In [6]:
# Remove emojis and emoticons

# Code block 1: remove emojis
def remove_emoji(text):
    """Return `text` with every run of characters from the ranges below deleted.

    NOTE(review): the last range (U+24C2 to U+1F251) is very wide and also contains
    non-emoji characters such as CJK ideographs (U+4E00 to U+9FFF); harmless for this
    notebook because non-Latin characters are stripped later, but confirm before reuse.
    """
    codepoint_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    # One character class holding all ranges; "+" removes a whole run in one substitution.
    matcher = re.compile("[" + "".join(codepoint_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)

# Example: the two U+1F603 smiley emoji are stripped, leaving the trailing space.
# They are written as escapes so the example survives a re-encoding of the file
# (the previous literal had been corrupted into characters the pattern does not match).
remove_emoji("Have fun with NLP! \U0001F603\U0001F603")

  and should_run_async(code)


'Have fun with NLP! '

In [7]:
# Function for removing emoticons
# Built once at definition time: the alternation over every EMOTICONS key is the same
# for all calls, so rebuilding it for each review is wasted work.
# NOTE(review): keys are joined verbatim (no escaping), so this assumes the installed
# `emot` version ships keys that are already valid regex fragments — confirm.
_EMOTICON_PATTERN = re.compile(u'(' + u'|'.join(EMOTICONS) + u')')


def remove_emoticons(text):
    """Return `text` with every match of an EMOTICONS key removed.

    Parameters
    ----------
    text : str

    Returns
    -------
    str with all matched emoticons replaced by the empty string.
    """
    return _EMOTICON_PATTERN.sub(r'', text)

  and should_run_async(code)


In [8]:
# Function for removing URL's
def remove_urls(vTEXT):
    """Delete URLs from `vTEXT` and return the resulting string.

    A URL is an optional ``http``/``https`` scheme, then ``://``, then a run of URL
    characters ending on a word boundary (so trailing punctuation such as a sentence
    full stop is left in place).

    The character set now also accepts ``-``, ``#``, ``~``, ``+``, ``:`` and ``@``.
    Previously a hyphenated host such as ``https://my-site.com/page`` was only removed
    up to the hyphen, leaving ``-site.com/page`` in the text.
    """
    # No ^/$ anchors are used, so the former MULTILINE flag had no effect and is dropped.
    url_pattern = r'(?:https?)?://[\w./?=&%\-#~+:@]*\b'
    return re.sub(url_pattern, '', vTEXT)

  and should_run_async(code)


### Preprocessing Function

In [9]:
def preprocess(corpus, column):
    """Clean the text in ``corpus[column]`` in place.

    Steps, in order: drop rows with missing text, cast to str, lowercase, remove
    emoji, emoticons and URLs, expand contractions, replace every character other
    than ASCII letters and ``#`` with a space, delete ``\\r``/``\\n``/``\\t``, and
    finally delete remaining punctuation (which removes the ``#`` kept earlier).

    Parameters
    ----------
    corpus : pandas.DataFrame, modified in place (rows may be dropped).
    column : name of the text column to clean.

    Returns
    -------
    None
    """
    # Drop rows on the frame itself. Calling dropna on the column Series does not
    # remove rows from `corpus`, so missing reviews became the literal text "nan"
    # once cast to str.
    corpus.dropna(subset=[column], inplace=True)

    text = corpus[column].astype(str).str.lower()
    text = text.apply(remove_emoji)
    text = text.apply(remove_emoticons)
    text = text.apply(remove_urls)
    text = text.apply(contractions.fix)  # expand contractions

    # regex=True is stated explicitly: relying on the implicit default raised a
    # FutureWarning here, and newer pandas treats the pattern as a literal string.
    text = text.str.replace(r"[^a-zA-Z#]", " ", regex=True)   # non-letters -> space
    text = text.str.replace(r'\r+|\n+|\t+', '', regex=True)   # remove \r \n \t
    text = text.str.replace(r'[^\w\s]', '', regex=True)       # remove punctuation

    corpus[column] = text

  and should_run_async(code)
  corpus[column] = corpus[column].str.replace('[^\w\s]','') # Remove punctuation


In [10]:
# Clean the review text; `preprocess` returns nothing and overwrites gcash_reviews['content'].
preprocess(gcash_reviews, 'content')

  and should_run_async(code)
  corpus[column] = corpus[column].str.replace("[^a-zA-Z#]", " ") # Remove unwanted characters, numbers and symbols
  corpus[column] = corpus[column].str.replace('[^\w\s]','') # Remove punctuation


In [11]:
# Keep only the review timestamp, star rating and cleaned text for the analysis.
gcash_reviews = gcash_reviews[['at', 'score', 'content']]

  and should_run_async(code)


In [12]:
# Inspect the first ten cleaned reviews.
gcash_reviews.head(10)

  and should_run_async(code)


Unnamed: 0,at,score,content
0,2021-06-02 18:32:37,1,good for mobile online deals
1,2021-06-02 18:32:14,1,ilang beses nang naulit na nag load ako nag ba...
2,2021-06-02 18:32:07,4,its a great experience and convenient
3,2021-06-02 18:31:31,5,ok na ok sya para sa mga easy transaction lalo...
4,2021-06-02 18:31:25,5,very helpful and contented
5,2021-06-02 18:31:14,3,do not know
6,2021-06-02 18:31:07,1,disappointed with the new features in buying a...
7,2021-06-02 18:30:58,5,satisfied
8,2021-06-02 18:30:51,5,ok cya
9,2021-06-02 18:29:56,5,this app is already good


### Splitting 5-Star and 1-Star Reviews

In [13]:
# Reviews rated exactly 5 stars (input to section B of the topic modelling).
gcash_rating5 = gcash_reviews[gcash_reviews['score'] == 5]
gcash_rating5.head()

  and should_run_async(code)


Unnamed: 0,at,score,content
3,2021-06-02 18:31:31,5,ok na ok sya para sa mga easy transaction lalo...
4,2021-06-02 18:31:25,5,very helpful and contented
7,2021-06-02 18:30:58,5,satisfied
8,2021-06-02 18:30:51,5,ok cya
9,2021-06-02 18:29:56,5,this app is already good


In [14]:
# Reviews rated exactly 1 star (input to section A of the topic modelling).
gcash_rating1 = gcash_reviews[gcash_reviews['score'] == 1]
gcash_rating1.head()

  and should_run_async(code)


Unnamed: 0,at,score,content
0,2021-06-02 18:32:37,1,good for mobile online deals
1,2021-06-02 18:32:14,1,ilang beses nang naulit na nag load ako nag ba...
6,2021-06-02 18:31:07,1,disappointed with the new features in buying a...
10,2021-06-02 18:29:06,1,the new update has many bugs
11,2021-06-02 18:28:30,1,regular prepaid loading options are gone


### Loading Stopword List

In [15]:
from spacy.lang.tl.stop_words import STOP_WORDS as tl_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

  and should_run_async(code)


In [16]:
# Combined stop-word list: spaCy's Tagalog and English sets followed by fillers
# specific to these reviews.
# NOTE(review): 'g-cash' contains a hyphen, which `preprocess` replaces with a space,
# so this entry probably never matches a token — confirm.
final_stopwords_list = [
    *tl_stop,
    *en_stop,
    'gcash', 'g-cash', 'app', 'po', 'please', 'i', 'thank', 'nyo', 'lang',
    'pag', 'mag', 'nag', 'rin', 'nyu', 'kau', 'naman', 'mo',
]

  and should_run_async(code)


In [17]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
# Add the combined stop-word list to the pipeline's default stop words.
# NOTE(review): `nlp` is not referenced by any later cell visible here; stop words are
# applied through `final_stopwords_list` in remove_stopwords — confirm this cell is still needed.
nlp.Defaults.stop_words.update(final_stopwords_list)

  and should_run_async(code)


---
### LDA Topic Modelling <br>
### A. Gcash 1-Star Reviews

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each element of `sentences`.

    Every element is converted with ``str()`` and passed to gensim's
    ``simple_preprocess``; one token list is yielded per element.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_sentence in sentences:
        yield tokenize(str(raw_sentence))

# Tokenize the cleaned 1-star review texts.
# NOTE(review): `data` and `data_words` are reassigned with the 5-star reviews in section B,
# so the section A cells that read `data_words` must run before that happens.
data = gcash_rating1.content.values.tolist()
data_words = list(sent_to_words(data))
# Show up to the first 30 tokens of the first review.
print(data_words[:1][0][:30])

['good', 'for', 'mobile', 'online', 'deals']


### Phrase Modeling: Bigram Model

In [None]:
# Build the bigram model
# NOTE(review): the model is trained on `data_words` (stop words still present) but is later
# applied to the stop-word-filtered tokens — confirm this is intended.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.

# Faster way to get a sentence clubbed as a bigram
# (`bigram_mod` is the global that make_bigrams reads; section B reassigns it.)
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [None]:
# Helper functions for stop-word removal and bigram merging
def remove_stopwords(texts):
    """Return one stop-word-free token list per document in `texts`.

    Each document is stringified and re-tokenized with ``simple_preprocess``
    before tokens found in the global ``final_stopwords_list`` are dropped.
    """
    # Snapshot the list as a frozenset once per call: membership tests become O(1)
    # instead of scanning the whole list for every token.
    stopwords = frozenset(final_stopwords_list)
    return [[word for word in simple_preprocess(str(doc)) if word not in stopwords] for doc in texts]


def make_bigrams(texts):
    """Apply the global ``bigram_mod`` phrase model to every document in `texts`."""
    return [bigram_mod[doc] for doc in texts]

In [None]:
# Remove Stop Words
data_words_nostops1 = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams1 = make_bigrams(data_words_nostops1)

# NOTE(review): the saved output of this cell shows the first 5-star review rather than a
# 1-star one — it looks like the cell was last run while `data_words` held the section B
# tokens. Re-run the notebook top to bottom to confirm.
print(data_words_bigrams1[:1])

[['ok', 'ok', 'sya', 'easy', 'transaction', 'lalo', 'malalayong', 'lugar', 'thumb']]


In [None]:
# Token <-> integer id mapping built from the 1-star documents
id2word1 = corpora.Dictionary(data_words_bigrams1)

# The tokenized documents that make up the corpus
texts1 = data_words_bigrams1

# Bag-of-words encoding: one list of (token_id, count) pairs per review
corpus1 = list(map(id2word1.doc2bow, texts1))

# Peek at the first encoded review
print(corpus1[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1)]]


### Base Model Creation

In [None]:
# Baseline 1-star model: 10 topics, with alpha and eta left at gensim's defaults
lda_model_rating1 = gensim.models.LdaMulticore(
    corpus=corpus1,
    id2word=id2word1,
    num_topics=10,
    random_state=1,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)

In [None]:
# Print the keywords in the 10 topics
pprint(lda_model_rating1.print_topics())
# Apply the baseline model to the 1-star corpus.
# NOTE(review): `doc_lda` is not read by any later cell visible here, and section B reassigns it.
doc_lda= lda_model_rating1[corpus1]

[(0,
  '0.043*"account" + 0.041*"error" + 0.038*"fix" + 0.033*"problem" + '
  '0.026*"register" + 0.025*"log" + 0.024*"time" + 0.017*"wrong" + '
  '0.016*"bank" + 0.016*"saying"'),
 (1,
  '0.051*"like" + 0.050*"good" + 0.043*"time" + 0.043*"use" + 0.033*"star" + '
  '0.023*"pay" + 0.023*"working" + 0.022*"bank" + 0.015*"said" + 0.014*"link"'),
 (2,
  '0.033*"payment" + 0.029*"money" + 0.027*"number" + 0.024*"received" + '
  '0.022*"got" + 0.022*"account" + 0.021*"hard" + 0.020*"use" + 0.019*"email" '
  '+ 0.017*"pesos"'),
 (3,
  '0.089*"id" + 0.072*"verified" + 0.065*"account" + 0.062*"verify" + '
  '0.039*"fully" + 0.038*"student" + 0.038*"verification" + 0.026*"updating" + '
  '0.015*"need" + 0.013*"picture"'),
 (4,
  '0.087*"money" + 0.065*"send" + 0.049*"code" + 0.035*"transfer" + '
  '0.027*"receive" + 0.024*"help" + 0.022*"want" + 0.021*"process" + '
  '0.018*"work" + 0.018*"valid"'),
 (5,
  '0.056*"di" + 0.037*"wala" + 0.029*"pera" + 0.028*"kayo" + 0.022*"tapos" + '
  '0.019*"ma

In [None]:
# Compute Coherence Score
# c_v coherence of the baseline 1-star model, scored against the tokenized documents.
coherence_model_lda1 = CoherenceModel(model=lda_model_rating1, texts=data_words_bigrams1, dictionary=id2word1, coherence='c_v')
coherence_lda1 = coherence_model_lda1.get_coherence()

print('\nCoherence Score: ', coherence_lda1)


Coherence Score:  0.4128627617734912


### Hyperparameter Tuning <br>
This part searches for the optimal number of topics and the best values of the `alpha` and `eta` priors, scored by c_v coherence.

In [21]:
# supporting function
def compute_coherence_values(data, corpus, dictionary, k, a, eta):
    """Train one LDA model and return its c_v coherence.

    Parameters
    ----------
    data : tokenized documents, passed to ``CoherenceModel`` as ``texts``.
    corpus : bag-of-words corpus the model is trained on.
    dictionary : gensim dictionary, used as ``id2word`` and for the coherence model.
    k : number of topics.
    a : value for the model's ``alpha`` argument.
    eta : value for the model's ``eta`` argument.

    Returns
    -------
    The value returned by ``CoherenceModel.get_coherence()``.
    """
    # random_state, chunksize and passes are fixed so that only k, alpha and eta vary.
    trained_lda = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        alpha=a,
        eta=eta,
        random_state=100,
        chunksize=100,
        passes=10,
    )
    scorer = CoherenceModel(
        model=trained_lda,
        texts=data,
        dictionary=dictionary,
        coherence='c_v',
    )
    return scorer.get_coherence()

  and should_run_async(code)


In [None]:
# NOTE(review): `grid` is not read by any later cell visible here.
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
# range() excludes max_topics, so 2..10 topics are tried.
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
# Four numeric values starting at 0.01 in steps of 0.3; the last one is stored as
# 0.9099999999999999 (floating point), which is how it appears in the results table.
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Eta parameter
eta = list(np.arange(0.01, 1, 0.3))
eta.append('symmetric')

In [None]:
model_results = {'Topics': [],
                 'Alpha': [],
                 'Eta': [],
                 'Coherence': []
                }

# Grid search over (number of topics, alpha, eta): one LDA model is trained and scored
# per combination. The bar advances once per (topics, alpha) pair, so its total is
# derived from the grid instead of being hardcoded (it was 54 = 9 topic counts x 6 alphas).
# The context manager closes the bar even if a model fails part-way through.
with tqdm.tqdm(total=len(topics_range) * len(alpha)) as pbar:
    # iterate through the number of topics
    for k in topics_range:
        # iterate through alpha values
        for a in alpha:
            # iterate through eta values
            for e in eta:
                cv = compute_coherence_values(data_words_bigrams1,
                                              corpus=corpus1,
                                              dictionary=id2word1,
                                              k=k,
                                              a=a,
                                              eta=e)
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Eta'].append(e)
                model_results['Coherence'].append(cv)

            pbar.update(1)

# Persist the results so they can be inspected without repeating the search.
pd.DataFrame(model_results).to_csv('gcash_playstore_lda_tuning_results_rating1.csv', index=False)

In [None]:
# Reload the saved 1-star grid-search results from disk.
gcash_playstore_lda_tuning_results_rating1= pd.read_csv('gcash_playstore_lda_tuning_results_rating1.csv')

In [None]:
# Rank the parameter combinations from highest to lowest coherence.
gcash_playstore_lda_tuning_results_rating1.sort_values(by='Coherence', ascending=False)

Unnamed: 0,Topics,Alpha,Eta,Coherence
123,6,0.01,0.9099999999999999,0.570397
183,8,0.01,0.9099999999999999,0.566391
203,8,symmetric,0.9099999999999999,0.565049
233,9,symmetric,0.9099999999999999,0.561167
173,7,symmetric,0.9099999999999999,0.561051
...,...,...,...,...
5,2,0.31,0.01,0.307047
15,2,0.9099999999999999,0.01,0.290869
10,2,0.61,0.01,0.289438
20,2,symmetric,0.01,0.284998


### Final Model

In [None]:
# Final 1-star model, using the top row of the tuning table:
# 6 topics, alpha=0.01, eta=0.9099999999999999.
final_lda1 = gensim.models.LdaMulticore(
    corpus=corpus1,
    id2word=id2word1,
    num_topics=6,
    alpha=0.01,
    eta=0.9099999999999999,  # the grid's last np.arange value, copied verbatim
    random_state=100,
    chunksize=100,
    passes=10,
)

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
# Prepare the interactive view of the final 1-star model; the last line displays it.
# NOTE(review): section B reuses the name `LDAvis_prepared` for the 5-star view.
LDAvis_prepared = gensimvis.prepare(final_lda1, corpus1, id2word1)
LDAvis_prepared

### B. Gcash 5-Star Reviews

In [18]:
# NOTE(review): this repeats the `sent_to_words` definition from section A.
def sent_to_words(sentences):
    """Lazily tokenize each element of `sentences`.

    Every element is converted with ``str()`` and passed to gensim's
    ``simple_preprocess``; one token list is yielded per element.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_sentence in sentences:
        yield tokenize(str(raw_sentence))

# Tokenize the cleaned 5-star review texts.
# NOTE(review): this overwrites the `data` / `data_words` values that section A built
# from the 1-star reviews.
data = gcash_rating5.content.values.tolist()
data_words = list(sent_to_words(data))
# Show up to the first 30 tokens of the first review.
print(data_words[:1][0][:30])

  and should_run_async(code)


['ok', 'na', 'ok', 'sya', 'para', 'sa', 'mga', 'easy', 'transaction', 'lalo', 'na', 'mga', 'malalayong', 'lugar', 'thumb', 'up', 'ako']


In [19]:
# Build the bigram model
# Trained on the 5-star tokens; this replaces the `bigram` / `bigram_mod` built in section A.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.

# Faster way to get a sentence clubbed as a bigram
# (`bigram_mod` is the global that make_bigrams reads.)
bigram_mod = gensim.models.phrases.Phraser(bigram)

  and should_run_async(code)


In [20]:
# Helper functions for stop-word removal and bigram merging
# NOTE(review): these repeat the definitions from section A.
def remove_stopwords(texts):
    """Return one stop-word-free token list per document in `texts`.

    Each document is stringified and re-tokenized with ``simple_preprocess``
    before tokens found in the global ``final_stopwords_list`` are dropped.
    """
    # Snapshot the list as a frozenset once per call: membership tests become O(1)
    # instead of scanning the whole list for every token.
    stopwords = frozenset(final_stopwords_list)
    return [[word for word in simple_preprocess(str(doc)) if word not in stopwords] for doc in texts]


def make_bigrams(texts):
    """Apply the global ``bigram_mod`` phrase model to every document in `texts`."""
    return [bigram_mod[doc] for doc in texts]

  and should_run_async(code)


In [22]:
# Remove Stop Words
data_words_nostops5 = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams5 = make_bigrams(data_words_nostops5)

# Show the processed tokens of the first 5-star review.
print(data_words_bigrams5[:1])

  and should_run_async(code)


[['ok', 'ok', 'sya', 'easy', 'transaction', 'lalo', 'malalayong', 'lugar', 'thumb']]


In [23]:
# Token <-> integer id mapping built from the 5-star documents
id2word5 = corpora.Dictionary(data_words_bigrams5)

# The tokenized documents that make up the corpus
texts5 = data_words_bigrams5

# Bag-of-words encoding: one list of (token_id, count) pairs per review
corpus5 = list(map(id2word5.doc2bow, texts5))

# Peek at the first encoded review
print(corpus5[:1])

  and should_run_async(code)


[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1)]]


### Base Model Creation

In [None]:
# Baseline 5-star model: 10 topics, with alpha and eta left at gensim's defaults
lda_model_rating5 = gensim.models.LdaMulticore(
    corpus=corpus5,
    id2word=id2word5,
    num_topics=10,
    random_state=1,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)

In [None]:
# Print the keywords in the 10 topics
pprint(lda_model_rating5.print_topics())
# Apply the baseline model to the 5-star corpus.
# NOTE(review): `doc_lda` is not read by any later cell visible here.
doc_lda= lda_model_rating5[corpus5]

[(0,
  '0.283*"love" + 0.234*"great" + 0.136*"convenient" + 0.034*"fast" + '
  '0.033*"transactions" + 0.016*"reliable" + 0.015*"credit" + '
  '0.013*"especially" + 0.012*"helpfull" + 0.010*"di"'),
 (1,
  '0.084*"excellent" + 0.061*"transaction" + 0.048*"think" + '
  '0.035*"application" + 0.033*"try" + 0.029*"want" + 0.026*"new" + '
  '0.025*"wallet" + 0.023*"buying" + 0.022*"perfect"'),
 (2,
  '0.097*"thanks" + 0.061*"globe" + 0.035*"update" + 0.031*"paying" + '
  '0.027*"need" + 0.020*"card" + 0.019*"bills" + 0.019*"way" + 0.018*"loading" '
  '+ 0.015*"makes"'),
 (3,
  '0.095*"ok" + 0.041*"usefull" + 0.041*"paypal" + 0.036*"cool" + 0.024*"work" '
  '+ 0.022*"yung" + 0.021*"helps" + 0.017*"add" + 0.015*"thumbs" + '
  '0.015*"user"'),
 (4,
  '0.279*"useful" + 0.067*"amazing" + 0.058*"service" + 0.037*"wow" + '
  '0.036*"super" + 0.036*"easier" + 0.026*"bank" + 0.019*"code" + 0.017*"tnx" '
  '+ 0.014*"yes"'),
 (5,
  '0.244*"apps" + 0.123*"helpful" + 0.034*"life" + 0.032*"satisfied" + '

In [None]:
# Compute Coherence Score
# c_v coherence of the baseline 5-star model, scored against the tokenized documents.
coherence_model_lda5 = CoherenceModel(model=lda_model_rating5, texts=data_words_bigrams5, dictionary=id2word5, coherence='c_v')
coherence_lda5 = coherence_model_lda5.get_coherence()

print('\nCoherence Score: ', coherence_lda5)


Coherence Score:  0.4221780106989336


### Hyperparameter Tuning

In [24]:
# Same search grid as section A, redefined here.
# NOTE(review): `grid` is not read by any later cell visible here.
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
# range() excludes max_topics, so 2..10 topics are tried.
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
# Four numeric values starting at 0.01 in steps of 0.3; the last one is stored as
# 0.9099999999999999 (floating point), which is how it appears in the results table.
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Eta parameter
eta = list(np.arange(0.01, 1, 0.3))
eta.append('symmetric')

  and should_run_async(code)


In [25]:
model_results = {'Topics': [],
                 'Alpha': [],
                 'Eta': [],
                 'Coherence': []
                }

# Grid search over (number of topics, alpha, eta): one LDA model is trained and scored
# per combination. The saved run of this cell took about 4h12m.
# The bar advances once per (topics, alpha) pair, so its total is derived from the grid
# instead of being hardcoded (it was 54 = 9 topic counts x 6 alphas).
# The context manager closes the bar even if a model fails part-way through.
with tqdm.tqdm(total=len(topics_range) * len(alpha)) as pbar:
    # iterate through the number of topics
    for k in topics_range:
        # iterate through alpha values
        for a in alpha:
            # iterate through eta values
            for e in eta:
                cv = compute_coherence_values(data_words_bigrams5,
                                              corpus=corpus5,
                                              dictionary=id2word5,
                                              k=k,
                                              a=a,
                                              eta=e)
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Eta'].append(e)
                model_results['Coherence'].append(cv)

            pbar.update(1)

# Persist the results so they can be inspected without repeating the search.
pd.DataFrame(model_results).to_csv('gcash_playstore_lda_tuning_results_rating5.csv', index=False)

  and should_run_async(code)
100%|██████████| 54/54 [4:12:08<00:00, 280.16s/it]


In [26]:
# Reload the saved 5-star grid-search results from disk.
gcash_playstore_lda_tuning_results_rating5= pd.read_csv('gcash_playstore_lda_tuning_results_rating5.csv')

  and should_run_async(code)


In [27]:
# Rank the parameter combinations from highest to lowest coherence.
gcash_playstore_lda_tuning_results_rating5.sort_values(by='Coherence', ascending=False)

  and should_run_async(code)


Unnamed: 0,Topics,Alpha,Eta,Coherence
88,4,asymmetric,0.9099999999999999,0.685420
117,5,asymmetric,0.61,0.667871
148,6,asymmetric,0.9099999999999999,0.657207
118,5,asymmetric,0.9099999999999999,0.656828
208,8,asymmetric,0.9099999999999999,0.633532
...,...,...,...,...
21,2,symmetric,0.31,0.369160
22,2,symmetric,0.61,0.369160
23,2,symmetric,0.9099999999999999,0.369160
24,2,symmetric,symmetric,0.369160


### Final Model

In [28]:
# Final 5-star model, using the top row of the tuning table:
# 4 topics, alpha='asymmetric', eta=0.9099999999999999.
final_lda5 = gensim.models.LdaMulticore(
    corpus=corpus5,
    id2word=id2word5,
    num_topics=4,
    alpha='asymmetric',
    eta=0.9099999999999999,  # the grid's last np.arange value, copied verbatim
    random_state=100,
    chunksize=100,
    passes=10,
)

  and should_run_async(code)


In [29]:
# Visualize the topics
pyLDAvis.enable_notebook()
# Prepare the interactive view of the final 5-star model; the last line displays it.
# NOTE(review): this reuses the name `LDAvis_prepared` from the section A view.
LDAvis_prepared = gensimvis.prepare(final_lda5, corpus5, id2word5)
LDAvis_prepared

  and should_run_async(code)
