# Preprocessing + Topic Modeling (Mixed Language Approach)
This notebook serves as a demo for topic modeling GCash App Store Reviews using LDA. The approach of this notebook is to ignore the multi-lingual nature of reviews and move directly to LDA after data cleaning. This notebook will be divided into four parts:

1. Data Cleaning
2. Text Cleaning
3. Modeling (LDA)
4. Exploration of Results

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!ls "/content/drive/My Drive/Data/Reviews/Play Store"

gcash_playstore_reviews_final.csv  paymaya_reviews_playstore.csv
gcash_reviews_playstore.csv


In [None]:
!pip install emot contractions pyLDAvis lda textblob
!python3 -m spacy download en

In [2]:
# importing libraries
import pandas as pd
import numpy as np 
import regex as re
import datetime
import json

from emot.emo_unicode import UNICODE_EMO, EMOTICONS
import contractions
from textblob import TextBlob

from spacy.lang.tl.stop_words import STOP_WORDS as tl_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

#nltk
import nltk
from nltk.corpus import stopwords

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import nltk
import tqdm

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

  from collections import Iterable
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [3]:
# importing the dataset
df = pd.read_csv('/content/drive/My Drive/Data/Reviews/Play Store/gcash_playstore_reviews_final.csv')

In [4]:
df.shape

(199298, 13)

In [5]:
df.dtypes

_id                     object
app_id                  object
app_name                object
at                      object
content                 object
repliedAt               object
replyContent            object
reviewCreatedVersion    object
reviewId                object
score                    int64
thumbsUpCount            int64
userImage               object
userName                object
dtype: object

In [6]:
df.head(2)

Unnamed: 0,_id,app_id,app_name,at,content,repliedAt,replyContent,reviewCreatedVersion,reviewId,score,thumbsUpCount,userImage,userName
0,60b76108c5eccca0e375ce55,com.globe.gcash.android,GCash,2021-06-02T18:32:37.000Z,Good for mobile online deals.,,,5.40.0,gp:AOqpTOHa4x04QC93zb0pxQZ-1Q1GwFeysD-XpcDTW78...,1,0,https://play-lh.googleusercontent.com/a/AATXAJ...,Emmanuel Jim Roldan
1,60b76108c5eccca0e375ce56,com.globe.gcash.android,GCash,2021-06-02T18:32:14.000Z,Ilang beses nang naulit na nag load ako nag ba...,,,5.41.0,gp:AOqpTOFOvZwSS0iVvmGbcBKkwFyprROh5KqfcJuI5jg...,1,0,https://play-lh.googleusercontent.com/a-/AOh14...,Harem Tuazon


In [7]:
df['score'].value_counts()

5    118576
1     42537
4     16187
3     12885
2      9113
Name: score, dtype: int64

## Part 1: Data Exploration + Cleaning

In [8]:
# Checking for null values
df.isnull().sum() 

_id                          0
app_id                       0
app_name                     0
at                           0
content                     14
repliedAt               178901
replyContent            178901
reviewCreatedVersion     48259
reviewId                     0
score                        0
thumbsUpCount                0
userImage                    0
userName                     0
dtype: int64

In [9]:
# Dropping unwanted columns [userName, app_name, app_id, developerResponse]
df = df.drop(['_id', 'app_id', 'app_name', 'replyContent', 'reviewCreatedVersion', 'reviewId', 'thumbsUpCount', 'userImage', 'userName', 'repliedAt'], axis=1)

In [10]:
# Splitting datetime column + Checking date range

# Code block below separated the timestamp [[date]] column into Date and Time column respectively
df['Dates'] = pd.to_datetime(df['at']).dt.date
df['Time'] = pd.to_datetime(df['at']).dt.time
df = df.drop('at', axis=1)

In [11]:
df.sort_values(by='Dates', ascending=False).head(1)

Unnamed: 0,content,score,Dates,Time
0,Good for mobile online deals.,1,2021-06-02,18:32:37


In [12]:
df.sort_values(by='Dates', ascending=True).head(1)

Unnamed: 0,content,score,Dates,Time
198696,"""Unknown error occurred"" always popping up! Ne...",1,2012-03-26,18:49:57


In the section above, we can see that the dataset ranges from 2012-05-23 to 2021-06-11.

In [13]:
df.dtypes

content    object
score       int64
Dates      object
Time       object
dtype: object

In [14]:
df['word_count'] = df['content'].str.split().str.len()

In [15]:
df.head(5)

Unnamed: 0,content,score,Dates,Time,word_count
0,Good for mobile online deals.,1,2021-06-02,18:32:37,5.0
1,Ilang beses nang naulit na nag load ako nag ba...,1,2021-06-02,18:32:14,19.0
2,Its a great experience and convenient,4,2021-06-02,18:32:07,6.0
3,Ok na ok sya para sa mga easy transaction lalo...,5,2021-06-02,18:31:31,16.0
4,very helpful and contented,5,2021-06-02,18:31:25,4.0


In [16]:
df2 = df.copy()

In [17]:
df2 = df2.dropna(subset=['content'])

In [18]:
# Filter pandas df for reviews from january 2020 to may 2021
# import datetime
df2 = df2[(df2['Dates']>datetime.date(2020,1,1)) & (df2['Dates']<datetime.date(2021,5,31))] 

In [19]:
df2.shape

(130233, 5)

In [20]:
df2.sort_values(by='word_count')

Unnamed: 0,content,score,Dates,Time,word_count
102908,😊,5,2020-07-04,17:15:27,1.0
93166,Ok,5,2020-07-27,11:19:36,1.0
93164,good,5,2020-07-27,11:20:27,1.0
93163,good,5,2020-07-27,11:20:38,1.0
29655,good,5,2021-03-15,18:51:20,1.0
...,...,...,...,...,...
26452,You so much and I hope you have a great day an...,5,2021-04-02,16:02:29,114.0
38037,They stole my money. They said they're going t...,1,2021-02-17,15:58:22,117.0
102007,Just got home at the end of the day I love you...,5,2020-07-11,09:43:09,119.0
134248,Mop is a great day and I will be there to help...,5,2020-01-09,06:14:33,122.0


In [21]:
df2['word_count'].value_counts().sort_index().head(10)

1.0     35495
2.0     21638
3.0     10440
4.0      7608
5.0      5841
6.0      4527
7.0      3706
8.0      2985
9.0      2622
10.0     2374
Name: word_count, dtype: int64

In [22]:
df2.loc[df2['word_count'].isin([1])]

Unnamed: 0,content,score,Dates,Time,word_count
4549,Excellent!,5,2021-05-30,23:45:41,1.0
4555,Good,4,2021-05-30,22:17:59,1.0
4558,Good,5,2021-05-30,22:01:46,1.0
4591,Good,5,2021-05-30,17:33:12,1.0
4593,Good,5,2021-05-30,17:28:39,1.0
...,...,...,...,...,...
134754,Good,5,2020-01-02,12:21:38,1.0
134758,Nice,5,2020-01-02,12:12:53,1.0
134759,Good,5,2020-01-02,12:12:27,1.0
134781,nice,5,2020-01-02,05:42:03,1.0


## Part 2: Text Cleaning

Now that we have done the initial cleaning of the df, let us now proceed to text cleaning for topic modeling purposes.

My approach in this notebook would be as follows:
1. Do the necessary text cleaning steps (remove encodings, lowercase, strip punctutation, expand contractions etc)
2. Remove Filipino stop words and possibly lemmatize Filipino words
3. Remove English stop words and lemmatize
4. Tokenization and other methods

In [23]:
df2.head(4)

Unnamed: 0,content,score,Dates,Time,word_count
4547,Very good,5,2021-05-30,23:59:34,2.0
4548,So good app to me,5,2021-05-30,23:57:27,5.0
4549,Excellent!,5,2021-05-30,23:45:41,1.0
4550,It is easy to sending money and load by Gcash.,5,2021-05-30,23:45:05,10.0


In [24]:
# Rename content column to final_review + reset index
df2.rename(columns={'content':'final_review'}, inplace=True)

In [25]:
df2 = df2.reset_index(drop=True)

#### Remove emojis and emoticons

In [26]:
# Remove emojis and emoticons

# Code block 1: remove emojis
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

#Example
remove_emoji("Have fun with NLP! 😃😃")

'Have fun with NLP! '

In [27]:
# apply the remove emoji function to the review column
df2['final_review'] = df2['final_review'].apply(lambda x: remove_emoji(x))

In [28]:
df2.loc[df2['word_count'].isin([1])]

Unnamed: 0,final_review,score,Dates,Time,word_count
2,Excellent!,5,2021-05-30,23:45:41,1.0
8,Good,4,2021-05-30,22:17:59,1.0
11,Good,5,2021-05-30,22:01:46,1.0
44,Good,5,2021-05-30,17:33:12,1.0
46,Good,5,2021-05-30,17:28:39,1.0
...,...,...,...,...,...
130198,Good,5,2020-01-02,12:21:38,1.0
130202,Nice,5,2020-01-02,12:12:53,1.0
130203,Good,5,2020-01-02,12:12:27,1.0
130225,nice,5,2020-01-02,05:42:03,1.0


In [29]:
# Code block 2: remove emoticons

# from emot.emo_unicode import UNICODE_EMO, EMOTICONS

# Function for removing emoticons
def remove_emoticons(text):
    emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in EMOTICONS) + u')')
    return emoticon_pattern.sub(r'', text)


In [30]:
#Example
remove_emoticons("I used to play Super Mario games :)")

'I used to play Super Mario games '

In [31]:
# Applying remove emoticons code
df2['final_review'] = df2['final_review'].apply(lambda x: remove_emoticons(x))

In [32]:
df2['word_count'] = df2['final_review'].str.split().str.len()

In [33]:
df2['word_count'].value_counts().sort_index().head(6)

0     2810
1    33170
2    21874
3    10145
4     7572
5     5751
Name: word_count, dtype: int64

In [34]:
df2.loc[df2['word_count'].isin([2])]

Unnamed: 0,final_review,score,Dates,Time,word_count
0,Very good,5,2021-05-30,23:59:34,2
4,Love it,5,2021-05-30,23:33:04,2
9,Nice app,5,2021-05-30,22:11:04,2
13,Verified now,5,2021-05-30,21:13:50,2
14,Can't online,1,2021-05-30,21:13:42,2
...,...,...,...,...,...
130223,Faster transaction,3,2020-01-02,07:39:58,2
130227,Great app,5,2020-01-02,02:25:38,2
130228,nice app,5,2020-01-02,02:08:09,2
130229,Love it!!!,5,2020-01-02,01:22:27,2


#### Remove \n, \t, \r

In [35]:
df2['final_review'][12]

"How dare you charged us for over the counter cash ins that's insane! You guys are such a ripped off! Just because the app is booming doesn't mean you can ripped us off anytime you want now! Bastards "

In [36]:
# Remove \r \n \t
df2['final_review'] = df2['final_review'].replace(r'\r+|\n+|\t+',' ', regex=True)

In [37]:
df2['final_review'][12]

"How dare you charged us for over the counter cash ins that's insane! You guys are such a ripped off! Just because the app is booming doesn't mean you can ripped us off anytime you want now! Bastards "

#### Convert reviews to lowercase

In [38]:
df2['final_review'] = df2['final_review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df2['final_review'].head()

0                                         very good
1                                 so good app to me
2                                        excellent!
3    it is easy to sending money and load by gcash.
4                                           love it
Name: final_review, dtype: object

#### Removing URLS

In [39]:
# import re

def remove_urls (vTEXT):
    vTEXT = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', vTEXT, flags=re.MULTILINE)
    return(vTEXT)

df2['final_review'] = df2.final_review.apply(remove_urls)
df2.final_review.head()

0                                         very good
1                                 so good app to me
2                                        excellent!
3    it is easy to sending money and load by gcash.
4                                           love it
Name: final_review, dtype: object

#### Expand Contractions

Contractions are words or combinations of words which are shortened by dropping letters and replacing them by an apostrophe. 

Let’s have a look at some examples:
* we’re = we are
* we’ve = we have
* I’d = I would

Note: This step needs to be done before word tokenizer because NLTK word tokenizer has in-built methods for dealing with contractions. However, NLTK word tokenizer's approach separates contractions without expanding. Expanding is a better method than simply separating.

In [40]:
# import contractions

# Example text 
text = ''' She'll be airport in 30 mins. We are supposed to catch the arrival, aren't we?  
          I'd love to welcome her personally. It'll be an awesome vacation.'''
  
# creating an empty list 
expanded_words = []     
for word in text.split(): 
  # using contractions.fix to expand the shotened words 
  expanded_words.append(contractions.fix(word))    
    
expanded_text = ' '.join(expanded_words) 
print('Original text: ' + text) 
print('\n') 
print('Expanded_text: ' + expanded_text)

Original text:  She'll be airport in 30 mins. We are supposed to catch the arrival, aren't we?  
          I'd love to welcome her personally. It'll be an awesome vacation.


Expanded_text: she will be airport in 30 mins. We are supposed to catch the arrival, are not we? I would love to welcome her personally. it will be an awesome vacation.


In [41]:
df2['final_review'] = df2['final_review'].apply(lambda x: contractions.fix(x))
df2.final_review.sample(5)

22621            free cash 2000
113845                exvellent
92873                    useful
29688     good and fast service
41686           very convenient
Name: final_review, dtype: object

#### Remove Punctuation + Correct Spelling using TextBlob

In [42]:
# remove punctuation
df2['final_review'] = df2['final_review'].str.replace('[^\w\s]','') # Remove punctuation

  df2['final_review'] = df2['final_review'].str.replace('[^\w\s]','') # Remove punctuation
  


In [43]:
# Apply TextBlob to correct spelling of words
df2['final_review'] = df2['final_review'].apply(lambda x: str(TextBlob(x))) # Remove punctuation

#### Separating good and bad reviews into separate dataframes

In [44]:
df2 = df2.reset_index()

In [45]:
df_goodreviews = df2.loc[df2['score'].isin([5])].copy()
df_badreviews = df2.loc[df2['score'].isin([1])].copy()

In [46]:
print(f'df_good_reviews: {df_goodreviews.shape}')
print(f'df_bad_reviews: {df_badreviews.shape}')

df_good_reviews: (76657, 6)
df_bad_reviews: (28226, 6)


In [47]:
df2['score'].value_counts()

5    76657
1    28226
4    10625
3     8576
2     6149
Name: score, dtype: int64

## Part 3: Modeling

#### Download nltk English stopwords and spacy model (EN for lemmatization)

In [48]:
# Run in python console
# import nltk
nltk.download('stopwords')

# Run in terminal or command prompt
#!python3 -m spacy download |en

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

#### Prepare Stop Words (English + Tagalog)

Sources for English stopwords: 
1. nltk library

Sources for Tagalog stopwords:
1. Typical Stop Words: https://github.com/stopwords-iso/stopwords-tl/blob/master/stopwords-tl.json
2. Profanity : https://github.com/jromest/filipino-badwords-list/blob/master/src/filipino-badwords-list.js

In [49]:
# from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [50]:
# import json
with open('/content/drive/My Drive/Data/stopwords/tagalog-sw') as file:
    tl_stopwords = json.load(file)

In [51]:
# Extend stopwords to include tagalog + common gcash related words 
stop_words.extend(tl_stopwords)
stop_words.extend(['gcash','g-cash','po', 'app', 'would','can'])
stop_words.extend(['nyo','naman','yung', 'di', 'wala ','mag', 'nag', 'pag', 'kayo', 'lang', 'ung', 'niyo', 'tapos', 'ba', 'mo', 'please', 'pls'])
stop_words.extend(['nga','yan','yun', 'akong', 'inyo', 'ur', 'star', 'nalang', 'kasi', 'talaga'])
stop_words.extend(['nmn', 'nlng','un','blah', 'tru', 'pwede','like','one','globe', 'guys', 'even', 'seems', 'hi', 'guess'])

#### Tokenize and Clean Up Text some more

In [52]:
# Check columns
df_badreviews.columns

Index(['index', 'final_review', 'score', 'Dates', 'Time', 'word_count'], dtype='object')

In [53]:
# Convert the review columns into a list
bad_reviews = df_badreviews.final_review.values.tolist()
good_reviews = df_goodreviews.final_review.values.tolist()

In [54]:
# Define a function to tokenize and remove punctuations
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

br_words = list(sent_to_words(bad_reviews))
gr_words = list(sent_to_words(good_reviews))

#### Create Bigram and Trigram Models

Bigrams are two words frequently occurring together in the document. Trigrams are 3 words frequently occurring.

Gensim’s Phrases model can build and implement the bigrams, trigrams, quadgrams and more. The two important arguments to Phrases are *min_count* and *threshold*. The higher the values of these param, the harder it is for words to be combined to bigrams.

###### Bigram and Trigram for good reviews

In [55]:
# Build the bigram and trigram models
g_bigram = gensim.models.Phrases(gr_words, min_count=5, threshold=80) # higher threshold fewer phrases.
g_trigram = gensim.models.Phrases(g_bigram[gr_words], threshold=80)  

# Faster way to get a sentence clubbed as a trigram/bigram
g_bigram_mod = gensim.models.phrases.Phraser(g_bigram)
g_trigram_mod = gensim.models.phrases.Phraser(g_trigram)

# See trigram example
print(g_trigram_mod[g_bigram_mod[gr_words[0]]])



['very', 'good']


##### Bigram and Trigam for bad reviews

In [56]:
# Build the bigram and trigram models
b_bigram = gensim.models.Phrases(br_words, min_count=5, threshold=80) # higher threshold fewer phrases.
b_trigram = gensim.models.Phrases(b_bigram[br_words], threshold=80)  

# Faster way to get a sentence clubbed as a trigram/bigram
b_bigram_mod = gensim.models.phrases.Phraser(b_bigram)
b_trigram_mod = gensim.models.phrases.Phraser(b_trigram)

# See trigram example
print(b_trigram_mod[b_bigram_mod[br_words[3]]])



['super', 'hustle', 'everytime', 'tried', 'to', 'load', 'my', 'phone', 'its', 'always', 'error', 'what', 'happened', 'to', 'you', 'gcash', 'poor', 'service']


#### Remove Stopwords, Make Bigrams and Lemmatize (Eng. words only)

##### Functionalizing the steps

I have excluded lemmatization as it brought out errant results when added to tagalog corpus.

In [57]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def g_make_bigrams(texts):
    return [g_bigram_mod[doc] for doc in texts]

def g_make_trigrams(texts):
    return [g_trigram_mod[g_bigram_mod[doc]] for doc in texts]

def b_make_bigrams(texts):
    return [b_bigram_mod[doc] for doc in texts]

def b_make_trigrams(texts):
    return [b_trigram_mod[b_bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

##### Applying the functions to tokenized bad and good reviews


In [58]:
# APPLYING TO TOKENIZED GOOD REVIEWS

# Remove Stop Words
gr_words_nostops = remove_stopwords(gr_words)

# ### REMOVED LEMMATIZATION COS IT AFFECTS THE TAGALOG WORDS / TAGALOG WORDS ARE RENDERED INEFFECTIVE
# # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# # python3 -m spacy download en
# nlp = spacy.load('en', disable=['parser', 'ner'])

# # Do lemmatization keeping only noun, adj, vb, adv
# gr_lemmatized = lemmatization(gr_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])


# Form Bigrams
gr_words_bigrams = g_make_bigrams(gr_words_nostops)

# Form Trigrams
gr_words_trigrams = g_make_trigrams(gr_words_bigrams)


In [59]:
gr_words_trigrams[13]

['nice', 'excellent']

In [60]:
# APPLYING TO TOKENIZED BAD REVIEWS

# Remove Stop Words
br_words_nostops = remove_stopwords(br_words)

# ## REMOVED LEMMATIZATION COS IT AFFECTS THE TAGALOG WORDS / TAGALOG WORDS ARE RENDERED INEFFECTIVE
# # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# # python3 -m spacy download en
# nlp = spacy.load('en', disable=['parser', 'ner'])

# # Do lemmatization keeping only noun, adj, vb, adv
# br_lemmatized = lemmatization(br_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Form Bigrams
br_words_bigrams = b_make_bigrams(br_words_nostops)

br_words_trigrams = b_make_trigrams(br_words_bigrams)




In [61]:
br_words_trigrams[3]


['super',
 'hustle',
 'everytime',
 'tried',
 'load',
 'phone',
 'always',
 'error',
 'happened',
 'poor',
 'service']

#### Create the Dictionary and Corpus needed for Topic Modeling

The two main inputs to the LDA topic model are the dictionary(id2word) and the corpus. Let’s create them.

##### Good Reviews -  Dictionary and Corpus

In [62]:
# Create Dictionary
g_id2word = corpora.Dictionary(gr_words_trigrams)
g_id2word.filter_extremes(no_below=10, no_above=0.35)
g_id2word.compactify()

# Create Corpus
g_texts = gr_words_trigrams

# Term Document Frequency
g_corpus = [g_id2word.doc2bow(text) for text in g_texts]

# View
print(g_corpus[:1])

[[(0, 1)]]


##### Bad Reviews -  Dictionary and Corpus

In [63]:
# Create Dictionary
b_id2word = corpora.Dictionary(br_words_trigrams)
b_id2word.filter_extremes(no_below=10, no_above=0.35)
b_id2word.compactify()

# Create Corpus
b_texts = br_words_trigrams

# Term Document Frequency
b_corpus = [b_id2word.doc2bow(text) for text in b_texts]

# View
print(b_corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)]]


#### Building the Topic Model

##### Optizing Number of Topics for Good Reviews

In [64]:
# Build initial LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=g_corpus,
                                           id2word=g_id2word,
                                           num_topics=6, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad 

In [65]:
print(json.dumps(lda_model.print_topics(), indent=1))

[
 [
  0,
  "0.310*\"nice\" + 0.126*\"easy\" + 0.070*\"money\" + 0.055*\"load\" + 0.026*\"transfer\" + 0.024*\"payment\" + 0.018*\"send\" + 0.017*\"lot\" + 0.016*\"reliable\" + 0.015*\"save\""
 ],
 [
  1,
  "0.140*\"convenient\" + 0.117*\"ok\" + 0.086*\"awesome\" + 0.081*\"helpful\" + 0.071*\"bills\" + 0.060*\"pay\" + 0.034*\"cash\" + 0.034*\"bank\" + 0.029*\"paying\" + 0.025*\"online\""
 ],
 [
  2,
  "0.056*\"transactions\" + 0.051*\"hassle\" + 0.042*\"time\" + 0.025*\"okay\" + 0.021*\"thumbs\" + 0.017*\"savings\" + 0.017*\"awsome\" + 0.017*\"ever\" + 0.016*\"want\" + 0.016*\"helpfull\""
 ],
 [
  3,
  "0.164*\"use\" + 0.126*\"excellent\" + 0.081*\"amazing\" + 0.056*\"fast\" + 0.046*\"transaction\" + 0.039*\"super\" + 0.039*\"free\" + 0.036*\"buy\" + 0.035*\"satisfied\" + 0.030*\"far\""
 ],
 [
  4,
  "0.068*\"best\" + 0.058*\"thank\" + 0.042*\"thanks\" + 0.039*\"usefull\" + 0.039*\"really\" + 0.038*\"service\" + 0.028*\"using\" + 0.025*\"need\" + 0.025*\"account\" + 0.025*\"help\""
 ],

In [66]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=gr_words_trigrams, dictionary=g_id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [67]:
# #Optimizing topic models

# import numpy as np
# import tqdm
# grid = {}
# grid['Validation_Set'] = {}
# # Topics range
# min_topics = 2
# max_topics = 8
# step_size = 1
# topics_range = range(min_topics, max_topics, step_size)
# # Alpha parameter
# alpha = list(np.arange(0.01, 1, 0.3))
# alpha.append('symmetric')
# alpha.append('asymmetric')
# # Beta parameter
# beta = list(np.arange(0.01, 1, 0.3))
# beta.append('symmetric')
# # Validation sets
# num_of_docs = len(g_corpus)
# corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
#                # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
#                #gensim.utils.ClippedCorpus(g_corpus, num_of_docs*0.75), 
#                g_corpus]
# corpus_title = ['75% Corpus', '100% Corpus']
# model_results = {'Validation_Set': [],
#                  'Topics': [],
#                  'Alpha': [],
#                  'Beta': [],
#                  'Coherence': []
#                 }
# # Can take a long time to run
# if 1 == 1:
#     pbar = tqdm.tqdm(total=540)
    
#     # iterate through validation corpuses
#     for i in range(len(corpus_sets)):
#         # iterate through number of topics
#         for k in topics_range:
#             # iterate through alpha values
#             for a in alpha:
#                 # iterare through beta values
#                 for b in beta:
#                     # get the coherence score for the given parameters
#                     cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=g_id2word, 
#                                                   k=k, a=a, b=b)
#                     # Save the model results
#                     model_results['Validation_Set'].append(corpus_title[i])
#                     model_results['Topics'].append(k)
#                     model_results['Alpha'].append(a)
#                     model_results['Beta'].append(b)
#                     model_results['Coherence'].append(cv)
                    
#                     pbar.update(1)
#     pd.DataFrame(model_results).to_csv('gcash_gr_lda_tuning_results.csv', index=False)
#     pbar.close()

In [68]:
# model_cv_df_gr = pd.DataFrame(model_results) 
# model_cv_df_gr.sort_values(by='Coherence',ascending=False)

In [69]:
# Build optimized LDA model

lda_model_experimental = gensim.models.LdaMulticore(corpus=g_corpus,
                                           id2word=g_id2word,
                                           num_topics=6, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=0.61,
                                           eta=0.91)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad 

In [70]:
print(json.dumps(lda_model_experimental.print_topics(), indent=1))

[
 [
  0,
  "0.375*\"nice\" + 0.222*\"great\" + 0.157*\"apps\" + 0.065*\"amazing\" + 0.042*\"thanks\" + 0.038*\"service\" + 0.019*\"experience\" + 0.007*\"user_friendly\" + 0.003*\"aps\" + 0.003*\"game\""
 ],
 [
  1,
  "0.130*\"excellent\" + 0.110*\"awesome\" + 0.104*\"helpful\" + 0.040*\"super\" + 0.025*\"convinient\" + 0.019*\"cool\" + 0.015*\"thumbs\" + 0.014*\"maganda\" + 0.013*\"better\" + 0.013*\"sana\""
 ],
 [
  2,
  "0.109*\"easy\" + 0.094*\"use\" + 0.020*\"satisfied\" + 0.020*\"using\" + 0.018*\"account\" + 0.016*\"wow\" + 0.013*\"reliable\" + 0.013*\"cannot\" + 0.012*\"update\" + 0.011*\"stars\""
 ],
 [
  3,
  "0.052*\"money\" + 0.044*\"bills\" + 0.042*\"best\" + 0.041*\"load\" + 0.037*\"pay\" + 0.036*\"thank\" + 0.028*\"fast\" + 0.024*\"really\" + 0.023*\"transaction\" + 0.021*\"bank\""
 ],
 [
  4,
  "0.759*\"good\" + 0.052*\"usefull\" + 0.032*\"far\" + 0.028*\"job\" + 0.005*\"excellence\" + 0.005*\"quality\" + 0.005*\"yes\" + 0.005*\"exellent\" + 0.004*\"trusted\" + 0.003*\

##### Optimizing Number of Topics for Bad Reviews

In [71]:
# Build LDA model
lda_model2 = gensim.models.ldamodel.LdaModel(corpus=b_corpus,
                                           id2word=b_id2word,
                                           num_topics=6, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad 

In [72]:
print(json.dumps(lda_model2.print_topics(), indent=1))

[
 [
  0,
  "0.107*\"update\" + 0.052*\"need\" + 0.051*\"gcredit\" + 0.043*\"problem\" + 0.038*\"issue\" + 0.032*\"code\" + 0.021*\"credit\" + 0.018*\"almost\" + 0.016*\"amount\" + 0.015*\"new\""
 ],
 [
  1,
  "0.189*\"cannot\" + 0.075*\"always\" + 0.050*\"fix\" + 0.043*\"error\" + 0.032*\"open\" + 0.027*\"says\" + 0.026*\"try\" + 0.025*\"working\" + 0.023*\"log\" + 0.020*\"bills\""
 ],
 [
  2,
  "0.052*\"money\" + 0.047*\"use\" + 0.036*\"service\" + 0.027*\"still\" + 0.027*\"time\" + 0.026*\"customer\" + 0.023*\"already\" + 0.016*\"help\" + 0.015*\"since\" + 0.013*\"transaction\""
 ],
 [
  3,
  "0.104*\"account\" + 0.051*\"cash\" + 0.033*\"get\" + 0.029*\"verified\" + 0.026*\"email\" + 0.024*\"bank\" + 0.024*\"tried\" + 0.019*\"fully\" + 0.018*\"verify\" + 0.017*\"id\""
 ],
 [
  4,
  "0.053*\"mpin\" + 0.041*\"wala\" + 0.034*\"pera\" + 0.027*\"receive\" + 0.013*\"forgot\" + 0.013*\"ayaw\" + 0.012*\"sana\" + 0.012*\"anything\" + 0.011*\"lng\" + 0.010*\"costumer\""
 ],
 [
  5,
  "0.051*\

In [73]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=gr_words_trigrams, dictionary=g_id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

#### Visualizing LDA model results

In [74]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, g_corpus, g_id2word)
vis

In [75]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model2, b_corpus, b_id2word)
vis