Reference: https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0?gi=b3b53e8290cd

In [1]:
import warnings
import gensim
import spacy
import joblib
import pickle
import pyLDAvis
import tqdm
import pandas as pd
import numpy as np
import pyLDAvis.gensim_models as gensimvis
import gensim.corpora as corpora

from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from datetime import datetime

# Silence every warning category so library warnings do not clutter cell output.
warnings.filterwarnings(action="ignore")
# Put pyLDAvis in notebook mode so prepared visualisations display in cell output.
pyLDAvis.enable_notebook()



### I. Import Data
---

In [2]:
# Load one preprocessed bundle per app (GCash / PayMaya) and review type (good / bad).
# NOTE(review): joblib.load unpickles the file, so only load .sav files you trust.
# Each bundle is indexed at positions 0-5 by get_data below; presumably it holds
# (data, dictionary, corpus) for one store followed by the same triple for the
# other store -- confirm against the preprocessing notebook.
gcash_good = joblib.load('../../Data/Preprocessed/Mico/P2/gcash_good.sav')
gcash_bad = joblib.load('../../Data/Preprocessed/Mico/P2/gcash_bad.sav')
paymaya_good = joblib.load('../../Data/Preprocessed/Mico/P2/paymaya_good.sav')
paymaya_bad = joblib.load('../../Data/Preprocessed/Mico/P2/paymaya_bad.sav')

In [3]:
def get_data(list, store):
    """Pick the (data, dictionary, corpus) triple for one store out of a bundle.

    Parameters
    ----------
    list : sequence
        Six-element bundle. Positions 0-2 are returned for ``'as'`` and
        positions 3-5 for ``'ps'``. The parameter name shadows the builtin only
        inside this function; it is kept so existing callers keep working.
    store : str
        ``'as'`` or ``'ps'``.

    Returns
    -------
    tuple
        ``(data, dictionary, corpus)`` for the requested store.

    Raises
    ------
    ValueError
        If ``store`` is neither ``'as'`` nor ``'ps'``.
    """
    if store == 'as':
        data, dictionary, corpus = list[0], list[1], list[2]
    elif store == 'ps':
        data, dictionary, corpus = list[3], list[4], list[5]
    else:
        # Without this branch an unknown store reached the return statement
        # with nothing assigned and failed with an opaque UnboundLocalError.
        raise ValueError(f"store must be 'as' or 'ps', got {store!r}")
    return data, dictionary, corpus

In [4]:
# GCash good reviews: split the bundle into its 'as' and 'ps' (data, dictionary, corpus) triples.
gcash_good_as, gcash_good_dict_as, gcash_good_corpus_as = get_data(gcash_good, 'as')
gcash_good_ps, gcash_good_dict_ps, gcash_good_corpus_ps = get_data(gcash_good, 'ps')

In [5]:
# GCash bad reviews: split the bundle into its 'as' and 'ps' (data, dictionary, corpus) triples.
gcash_bad_as, gcash_bad_dict_as, gcash_bad_corpus_as = get_data(gcash_bad, 'as')
gcash_bad_ps, gcash_bad_dict_ps, gcash_bad_corpus_ps = get_data(gcash_bad, 'ps')

In [6]:
# PayMaya good reviews: split the bundle into its 'as' and 'ps' (data, dictionary, corpus) triples.
paymaya_good_as, paymaya_good_dict_as, paymaya_good_corpus_as = get_data(paymaya_good, 'as')
paymaya_good_ps, paymaya_good_dict_ps, paymaya_good_corpus_ps = get_data(paymaya_good, 'ps')

In [7]:
# PayMaya bad reviews: split the bundle into its 'as' and 'ps' (data, dictionary, corpus) triples.
paymaya_bad_as, paymaya_bad_dict_as, paymaya_bad_corpus_as = get_data(paymaya_bad, 'as')
paymaya_bad_ps, paymaya_bad_dict_ps, paymaya_bad_corpus_ps = get_data(paymaya_bad, 'ps')

### II. Topic Modeling
---

#### A. Base Model

In [8]:
def build_model(corpus, dictionary):
    """Train the baseline LDA model for one corpus.

    Parameters
    ----------
    corpus
        Corpus passed straight to ``gensim.models.LdaMulticore``.
    dictionary
        Mapping passed as ``id2word``.

    Returns
    -------
    gensim.models.LdaMulticore
        Model trained with 10 topics; ``alpha`` and ``eta`` are not passed, so
        gensim's own defaults apply.
    """
    baseline_settings = dict(
        num_topics=10,
        random_state=100,      # fixed seed
        chunksize=10000,       # larger chunks train faster as long as they fit in memory
        passes=10,             # epochs over the corpus
        per_word_topics=True,
    )
    return gensim.models.LdaMulticore(corpus=corpus,
                                      id2word=dictionary,
                                      **baseline_settings)

Create different LDA models for each type of review and store.

In [9]:
# Baseline LDA models for the GCash 'as' corpora (good and bad reviews).
gcash_good_lda_as = build_model(gcash_good_corpus_as, gcash_good_dict_as)
gcash_bad_lda_as = build_model(gcash_bad_corpus_as, gcash_bad_dict_as)

In [10]:
# Baseline LDA models for the GCash 'ps' corpora (good and bad reviews).
gcash_good_lda_ps = build_model(gcash_good_corpus_ps, gcash_good_dict_ps)
gcash_bad_lda_ps = build_model(gcash_bad_corpus_ps, gcash_bad_dict_ps)

In [11]:
# Baseline LDA models for the PayMaya 'as' corpora (good and bad reviews).
paymaya_good_lda_as = build_model(paymaya_good_corpus_as, paymaya_good_dict_as)
paymaya_bad_lda_as = build_model(paymaya_bad_corpus_as, paymaya_bad_dict_as)

In [12]:
# Baseline LDA models for the PayMaya 'ps' corpora (good and bad reviews).
paymaya_good_lda_ps = build_model(paymaya_good_corpus_ps, paymaya_good_dict_ps)
paymaya_bad_lda_ps = build_model(paymaya_bad_corpus_ps, paymaya_bad_dict_ps)

#### B. Base Coherence

Check the quality of the topics through topic coherence, which is measured by the "degree of semantic similarity between high scoring words in the topics." We first compute the base coherence of the topics.

In [13]:
def compute_coherence(lda_model, data, dictionary):
    """Return the ``c_v`` coherence score of a trained LDA model.

    Parameters
    ----------
    lda_model
        Model handed to ``CoherenceModel`` as ``model``.
    data
        Documents handed to ``CoherenceModel`` as ``texts``.
    dictionary
        Dictionary handed to ``CoherenceModel`` as ``dictionary``.

    Returns
    -------
    The value of ``CoherenceModel.get_coherence()``.
    """
    scorer = CoherenceModel(model=lda_model, texts=data,
                            dictionary=dictionary, coherence='c_v')
    return scorer.get_coherence()

In [14]:
# Baseline c_v coherence of each GCash model, scored against its own documents and dictionary.
gcash_good_as_base = compute_coherence(gcash_good_lda_as, gcash_good_as, gcash_good_dict_as)
gcash_bad_as_base = compute_coherence(gcash_bad_lda_as, gcash_bad_as, gcash_bad_dict_as)
gcash_good_ps_base = compute_coherence(gcash_good_lda_ps, gcash_good_ps, gcash_good_dict_ps)
gcash_bad_ps_base = compute_coherence(gcash_bad_lda_ps, gcash_bad_ps, gcash_bad_dict_ps)

In [15]:
# Baseline c_v coherence of each PayMaya model, scored against its own documents and dictionary.
paymaya_good_as_base = compute_coherence(paymaya_good_lda_as, paymaya_good_as, paymaya_good_dict_as)
paymaya_bad_as_base = compute_coherence(paymaya_bad_lda_as, paymaya_bad_as, paymaya_bad_dict_as)
paymaya_good_ps_base = compute_coherence(paymaya_good_lda_ps, paymaya_good_ps, paymaya_good_dict_ps)
paymaya_bad_ps_base = compute_coherence(paymaya_bad_lda_ps, paymaya_bad_ps, paymaya_bad_dict_ps)

In [16]:
# One [good, bad] pair per app/store combination; each pair becomes a column of the summary table.
gcash_as = [gcash_good_as_base, gcash_bad_as_base]
gcash_ps = [gcash_good_ps_base, gcash_bad_ps_base]
paymaya_as = [paymaya_good_as_base, paymaya_bad_as_base]
paymaya_ps = [paymaya_good_ps_base, paymaya_bad_ps_base]

In [17]:
# Summary table of baseline coherence: one column per app/store, one row per review type.
df = pd.DataFrame(
    {
        'GCash AS': gcash_as,
        'GCash PS': gcash_ps,
        'PayMaya AS': paymaya_as,
        'PayMaya PS': paymaya_ps,
    },
    index=['Good', 'Bad'],
)

In [18]:
# Display the baseline coherence table.
df

Unnamed: 0,GCash AS,GCash PS,PayMaya AS,PayMaya PS
Good,0.429972,0.461109,0.295934,0.549764
Bad,0.391118,0.422193,0.29937,0.463847


In [19]:
# Keep the index when saving: it carries the 'Good' / 'Bad' row labels, and
# without it the two rows of the CSV cannot be told apart.
df.to_csv('Results/base.csv', index=True)

#### C. Hyperparameter Tuning

In [8]:
# supporting function
def compute_coherence_values(corpus, dictionary, data, k, a, b):
    """Train one LDA model with the given settings and return its ``c_v`` coherence.

    Parameters
    ----------
    corpus, dictionary
        Passed to ``gensim.models.LdaMulticore`` as ``corpus`` and ``id2word``.
    data
        Documents passed to ``CoherenceModel`` as ``texts``.
    k
        Number of topics (``num_topics``).
    a
        Document-topic prior (``alpha``).
    b
        Topic-word prior (``eta``).

    Returns
    -------
    The value of ``CoherenceModel.get_coherence()`` for the trained model.
    """
    candidate = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=10000,
        passes=10,
        alpha=a,
        eta=b,
    )
    scorer = CoherenceModel(model=candidate, texts=data,
                            dictionary=dictionary, coherence='c_v')
    return scorer.get_coherence()

In [9]:
def hyperparameter_tuner(corpus, dictionary, data, temp):
    """Grid-search LDA settings by ``c_v`` coherence and save every score to CSV.

    Every combination of topic count, alpha and beta is scored with
    ``compute_coherence_values``. With the grid below that is
    4 topic counts x 6 alphas x 5 betas = 120 runs.

    Parameters
    ----------
    corpus
        Corpus to train on; used as the single "100% Corpus" validation set.
    dictionary
        Dictionary passed through to ``compute_coherence_values``.
    data
        Documents passed through to ``compute_coherence_values``.
    temp : str
        Base name of the output file; results go to ``Results/{temp}.csv``.

    Returns
    -------
    None
        The CSV is written only after the whole grid has finished, so an
        interrupted run leaves no new file behind.
    """
    # Topics range: 2, 3, 4, 5 (the upper bound is exclusive)
    min_topics = 2
    max_topics = 6
    step_size = 1
    topics_range = range(min_topics, max_topics, step_size)

    # Alpha parameter: numeric priors plus gensim's named priors
    alpha = list(np.arange(0.01, 1, 0.3))
    alpha.append('symmetric')
    alpha.append('asymmetric')

    # Beta parameter
    beta = list(np.arange(0.01, 1, 0.3))
    beta.append('symmetric')

    # Validation sets
    corpus_sets = [corpus]
    corpus_title = ['100% Corpus']
    model_results = {'Validation_Set': [],
                     'Topics': [],
                     'Alpha': [],
                     'Beta': [],
                     'Coherence': []
                    }

    # Derive the progress-bar total from the grid so it stays correct when the
    # ranges above are edited.
    total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)

    # Can take a long time to run. The context manager closes the bar even if
    # the search is interrupted.
    with tqdm.tqdm(total=total_runs) as pbar:
        # iterate through validation corpuses
        for title, corpus_set in zip(corpus_title, corpus_sets):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        cv = compute_coherence_values(corpus_set,
                                                      dictionary,
                                                      data,
                                                      k, a, b)
                        # Save the model results
                        model_results['Validation_Set'].append(title)
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)

                        pbar.update(1)

    filename = f'Results/{temp}.csv'
    pd.DataFrame(model_results).to_csv(filename, index=False)

In [10]:
# Grid search for GCash good 'as' reviews; writes Results/gcash_good_as.csv.
hyperparameter_tuner(gcash_good_corpus_as, gcash_good_dict_as, gcash_good_as, 'gcash_good_as')

100%|██████████| 120/120 [06:06<00:00,  3.05s/it]


In [11]:
# Grid search for GCash bad 'as' reviews; writes Results/gcash_bad_as.csv.
hyperparameter_tuner(gcash_bad_corpus_as, gcash_bad_dict_as, gcash_bad_as, 'gcash_bad_as')

100%|██████████| 120/120 [18:18<00:00,  9.16s/it]


In [12]:
# Grid search for GCash good 'ps' reviews; writes Results/gcash_good_ps.csv.
hyperparameter_tuner(gcash_good_corpus_ps, gcash_good_dict_ps, gcash_good_ps, 'gcash_good_ps')

100%|██████████| 120/120 [56:47<00:00, 28.40s/it]


In [13]:
# Grid search for GCash bad 'ps' reviews; writes Results/gcash_bad_ps.csv.
hyperparameter_tuner(gcash_bad_corpus_ps, gcash_bad_dict_ps, gcash_bad_ps, 'gcash_bad_ps')

100%|██████████| 120/120 [1:25:25<00:00, 42.71s/it]


In [14]:
# Grid search for PayMaya good 'as' reviews; writes Results/paymaya_good_as.csv on completion.
# NOTE(review): the saved output shows this run stopped with a KeyboardInterrupt at 20/120,
# so it did not write its CSV -- re-run it to completion before relying on that file.
hyperparameter_tuner(paymaya_good_corpus_as, paymaya_good_dict_as, paymaya_good_as, 'paymaya_good_as')

 17%|█▋        | 20/120 [01:15<06:23,  3.83s/it]

KeyboardInterrupt: 

In [None]:
# Grid search for PayMaya bad 'as' reviews; writes Results/paymaya_bad_as.csv.
# NOTE(review): this cell has no execution count in the saved notebook.
hyperparameter_tuner(paymaya_bad_corpus_as, paymaya_bad_dict_as, paymaya_bad_as, 'paymaya_bad_as')

In [None]:
# Grid search for PayMaya good 'ps' reviews; writes Results/paymaya_good_ps.csv.
# NOTE(review): this cell has no execution count in the saved notebook.
hyperparameter_tuner(paymaya_good_corpus_ps, paymaya_good_dict_ps, paymaya_good_ps, 'paymaya_good_ps')

In [None]:
# Grid search for PayMaya bad 'ps' reviews; writes Results/paymaya_bad_ps.csv.
# NOTE(review): this cell has no execution count in the saved notebook.
hyperparameter_tuner(paymaya_bad_corpus_ps, paymaya_bad_dict_ps, paymaya_bad_ps, 'paymaya_bad_ps')

#### D. Final Models

In [15]:
def build_final_model(df, corpus, dictionary, chunksize=100):
    """Train an LDA model with the best-scoring settings from a tuning table.

    Parameters
    ----------
    df : pandas.DataFrame
        Tuning results with ``Topics``, ``Alpha``, ``Beta`` and ``Coherence``
        columns. The row with the highest ``Coherence`` is used.
    corpus
        Corpus passed to ``gensim.models.LdaMulticore``.
    dictionary
        Dictionary passed as ``id2word``.
    chunksize : int, optional
        Training chunk size. Defaults to 100, the value this function has
        always used.
        NOTE(review): the grid search in ``compute_coherence_values`` trains
        with ``chunksize=10000``; pass that value here if the final model
        should be trained the same way as the tuned candidates.

    Returns
    -------
    gensim.models.LdaMulticore
        The trained model.
    """
    # idxmax must be *called*; it returns an index label, so look the row up
    # with .loc. The previous `df.iloc[df['Coherence'].idxmax]` passed the
    # bound method itself to .iloc and only worked by accident of pandas'
    # callable-indexer handling.
    best = df.loc[df['Coherence'].idxmax()]
    alpha = best['Alpha']
    beta = best['Beta']
    # Make sure gensim receives a plain int for the topic count.
    topics = int(best['Topics'])

    # Numeric priors may arrive as strings when the column also holds the
    # named priors, so convert everything that is not a named prior to float.
    if alpha not in ('symmetric', 'asymmetric'):
        alpha = float(alpha)

    if beta != 'symmetric':
        beta = float(beta)

    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=topics,
                                           random_state=100,
                                           chunksize=chunksize,
                                           passes=10,
                                           alpha=alpha,
                                           eta=beta)

    return lda_model

In [16]:
# Read back the GCash tuning tables written by hyperparameter_tuner.
df_gcash_good_as = pd.read_csv('Results/gcash_good_as.csv')
df_gcash_bad_as = pd.read_csv('Results/gcash_bad_as.csv')
df_gcash_good_ps = pd.read_csv('Results/gcash_good_ps.csv')
df_gcash_bad_ps = pd.read_csv('Results/gcash_bad_ps.csv')

In [576]:
# Read back the PayMaya tuning tables written by hyperparameter_tuner.
# NOTE(review): in the saved notebook the PayMaya tuning cells were interrupted or not
# executed, so these files presumably come from an earlier run -- confirm they are current.
df_paymaya_good_as = pd.read_csv('Results/paymaya_good_as.csv')
df_paymaya_bad_as = pd.read_csv('Results/paymaya_bad_as.csv')
df_paymaya_good_ps = pd.read_csv('Results/paymaya_good_ps.csv')
df_paymaya_bad_ps = pd.read_csv('Results/paymaya_bad_ps.csv')

In [17]:
# Final GCash models, each trained with the best-coherence settings from its tuning table.
gcash_good_as_final = build_final_model(df_gcash_good_as, gcash_good_corpus_as, gcash_good_dict_as)
gcash_bad_as_final = build_final_model(df_gcash_bad_as, gcash_bad_corpus_as, gcash_bad_dict_as)
gcash_good_ps_final = build_final_model(df_gcash_good_ps, gcash_good_corpus_ps, gcash_good_dict_ps)
gcash_bad_ps_final = build_final_model(df_gcash_bad_ps, gcash_bad_corpus_ps, gcash_bad_dict_ps)

In [578]:
# Final PayMaya models, each trained with the best-coherence settings from its tuning table.
paymaya_good_as_final = build_final_model(df_paymaya_good_as, paymaya_good_corpus_as, paymaya_good_dict_as)
paymaya_bad_as_final = build_final_model(df_paymaya_bad_as, paymaya_bad_corpus_as, paymaya_bad_dict_as)
paymaya_good_ps_final = build_final_model(df_paymaya_good_ps, paymaya_good_corpus_ps, paymaya_good_dict_ps)
paymaya_bad_ps_final = build_final_model(df_paymaya_bad_ps, paymaya_bad_corpus_ps, paymaya_bad_dict_ps)

### III. Topic Visualization 
---

In [18]:
# GCash good 'as': prepare the pyLDAvis data, print the topic terms, then display the visualisation.
gcash_good_as_visual = gensimvis.prepare(gcash_good_as_final, gcash_good_corpus_as, gcash_good_dict_as)
print(gcash_good_as_final.print_topics())
gcash_good_as_visual

[(0, '0.156*"money" + 0.097*"pay" + 0.080*"bills" + 0.078*"transactions" + 0.068*"bank" + 0.068*"cash" + 0.061*"online" + 0.060*"transfer" + 0.060*"use" + 0.049*"hope"'), (1, '0.149*"good" + 0.143*"love" + 0.125*"convenient" + 0.095*"use" + 0.078*"need" + 0.074*"great" + 0.060*"transaction" + 0.059*"time" + 0.049*"useful" + 0.049*"easy"')]


In [19]:
# GCash bad 'as': prepare the pyLDAvis data, print the topic terms, then display the visualisation.
gcash_bad_as_visual = gensimvis.prepare(gcash_bad_as_final, gcash_bad_corpus_as, gcash_bad_dict_as)
print(gcash_bad_as_final.print_topics())
gcash_bad_as_visual

[(0, '0.042*"money" + 0.025*"load" + 0.019*"verify" + 0.015*"tried" + 0.015*"already" + 0.015*"fix" + 0.015*"use" + 0.013*"get" + 0.013*"mpin" + 0.011*"transaction"'), (1, '0.041*"money" + 0.028*"service" + 0.024*"use" + 0.022*"bank" + 0.018*"customer_service" + 0.014*"working" + 0.014*"transaction" + 0.012*"one" + 0.012*"log" + 0.011*"ticket"'), (2, '0.024*"fix" + 0.021*"verification" + 0.020*"verified" + 0.018*"need" + 0.016*"gcredit" + 0.015*"verify" + 0.014*"use" + 0.013*"problem" + 0.013*"issue" + 0.012*"already"'), (3, '0.048*"cash" + 0.048*"update" + 0.024*"always" + 0.019*"pera" + 0.019*"use" + 0.019*"fix" + 0.016*"wala" + 0.015*"email" + 0.014*"error" + 0.013*"need"')]


In [20]:
# GCash good 'ps': prepare the pyLDAvis data, print the topic terms, then display the visualisation.
gcash_good_ps_visual = gensimvis.prepare(gcash_good_ps_final, gcash_good_corpus_ps, gcash_good_dict_ps)
print(gcash_good_ps_final.print_topics())
gcash_good_ps_visual

[(0, '0.258*"good" + 0.166*"nice" + 0.080*"great" + 0.064*"apps" + 0.049*"ok" + 0.048*"useful" + 0.042*"excellent" + 0.030*"helpful" + 0.026*"awesome" + 0.023*"amazing"'), (1, '0.098*"easy" + 0.078*"use" + 0.063*"convenient" + 0.052*"money" + 0.028*"fast" + 0.025*"transaction" + 0.017*"send" + 0.015*"cash" + 0.014*"pay_bills" + 0.014*"payment"'), (2, '0.097*"love" + 0.018*"update" + 0.017*"load" + 0.016*"much" + 0.013*"maganda" + 0.013*"give" + 0.012*"cash" + 0.011*"really" + 0.011*"hope" + 0.009*"need"')]


In [21]:
# GCash bad 'ps': prepare the pyLDAvis data, print the topic terms, then display the visualisation.
gcash_bad_ps_visual = gensimvis.prepare(gcash_bad_ps_final, gcash_bad_corpus_ps, gcash_bad_dict_ps)
print(gcash_bad_ps_final.print_topics())
gcash_bad_ps_visual

[(0, '0.053*"update" + 0.044*"always" + 0.025*"use" + 0.025*"need" + 0.022*"open" + 0.019*"fix" + 0.017*"good" + 0.014*"error" + 0.014*"time" + 0.013*"log"'), (1, '0.023*"email" + 0.023*"code" + 0.018*"number" + 0.016*"send" + 0.013*"register" + 0.013*"already" + 0.012*"worst" + 0.012*"customer_service" + 0.011*"get" + 0.011*"mpin"'), (2, '0.066*"money" + 0.030*"cash" + 0.017*"load" + 0.015*"transaction" + 0.015*"service" + 0.013*"back" + 0.013*"send" + 0.011*"bank" + 0.009*"pesos" + 0.009*"give"'), (3, '0.028*"pera" + 0.028*"update" + 0.027*"wala" + 0.025*"load" + 0.013*"cash" + 0.012*"ayaw" + 0.009*"ok" + 0.008*"bulok" + 0.008*"tagal" + 0.008*"laging"'), (4, '0.077*"verify" + 0.052*"verified" + 0.051*"id" + 0.031*"student_id" + 0.030*"verification" + 0.022*"fully_verified" + 0.019*"get" + 0.015*"option" + 0.014*"student" + 0.013*"picture"')]


In [583]:
# PayMaya good 'as': prepare the pyLDAvis data, print the topic terms, then display the visualisation.
paymaya_good_as_visual = gensimvis.prepare(paymaya_good_as_final, paymaya_good_corpus_as, paymaya_good_dict_as)
print(paymaya_good_as_final.print_topics())
paymaya_good_as_visual

[(0, '0.014*"use" + 0.012*"convenient" + 0.010*"easy" + 0.009*"pay" + 0.007*"great" + 0.007*"good" + 0.007*"money" + 0.007*"need" + 0.007*"really" + 0.006*"love"'), (1, '0.003*"applications" + 0.003*"one" + 0.002*"money" + 0.002*"wallet" + 0.002*"service" + 0.002*"cash" + 0.002*"pay" + 0.002*"never" + 0.002*"email" + 0.002*"tumawag"'), (2, '0.003*"email" + 0.003*"password" + 0.003*"face" + 0.002*"new" + 0.002*"started" + 0.002*"frustrating" + 0.002*"recovery" + 0.002*"since" + 0.002*"sales" + 0.002*"chat"')]


In [584]:
# PayMaya bad 'as': prepare the pyLDAvis data, print the topic terms, then display the visualisation.
paymaya_bad_as_visual = gensimvis.prepare(paymaya_bad_as_final, paymaya_bad_corpus_as, paymaya_bad_dict_as)
print(paymaya_bad_as_final.print_topics())
paymaya_bad_as_visual

[(0, '0.015*"money" + 0.012*"upgrade" + 0.009*"use" + 0.008*"tried" + 0.007*"customer_service" + 0.007*"error" + 0.006*"since" + 0.006*"service" + 0.005*"email" + 0.005*"card"'), (1, '0.008*"money" + 0.004*"get" + 0.004*"customer_service" + 0.004*"service" + 0.004*"card" + 0.003*"use" + 0.003*"balance" + 0.003*"fee" + 0.003*"online" + 0.003*"virtual_card"'), (2, '0.003*"replika" + 0.002*"get" + 0.002*"credentials" + 0.002*"worse" + 0.002*"greedy" + 0.002*"us" + 0.002*"users" + 0.001*"feature" + 0.001*"sorry" + 0.001*"old"'), (3, '0.003*"service" + 0.002*"number" + 0.002*"load" + 0.002*"tsk" + 0.002*"poor" + 0.002*"error" + 0.002*"grabe" + 0.002*"reply" + 0.002*"ayos" + 0.002*"wala"'), (4, '0.002*"name" + 0.002*"passport" + 0.001*"valid" + 0.001*"buffer" + 0.001*"match" + 0.001*"gateway" + 0.001*"needs" + 0.001*"screen" + 0.001*"attempt" + 0.001*"hope"')]


In [585]:
# PayMaya good 'ps': prepare the pyLDAvis data, print the topic terms, then display the visualisation.
paymaya_good_ps_visual = gensimvis.prepare(paymaya_good_ps_final, paymaya_good_corpus_ps, paymaya_good_dict_ps)
print(paymaya_good_ps_final.print_topics())
paymaya_good_ps_visual

[(0, '0.114*"good" + 0.082*"nice" + 0.059*"great" + 0.049*"convenient" + 0.048*"useful" + 0.044*"apps" + 0.031*"love" + 0.029*"awesome" + 0.023*"use" + 0.023*"helpful"'), (1, '0.043*"easy" + 0.041*"use" + 0.023*"pay" + 0.019*"fast" + 0.018*"money" + 0.017*"transaction" + 0.016*"bills" + 0.015*"online" + 0.011*"service" + 0.009*"transactions"'), (2, '0.019*"really" + 0.016*"need" + 0.011*"one" + 0.011*"online" + 0.010*"time" + 0.009*"lot" + 0.009*"since" + 0.009*"ewallet" + 0.008*"load" + 0.008*"bills"'), (3, '0.008*"sobrang" + 0.008*"gamitin" + 0.007*"maganda" + 0.007*"pera" + 0.007*"wala" + 0.006*"mabilis" + 0.006*"bills" + 0.006*"madali" + 0.006*"pwede" + 0.005*"pang"')]


In [586]:
# PayMaya bad 'ps': prepare the pyLDAvis data, print the topic terms, then display the visualisation.
paymaya_bad_ps_visual = gensimvis.prepare(paymaya_bad_ps_final, paymaya_bad_corpus_ps, paymaya_bad_dict_ps)
print(paymaya_bad_ps_final.print_topics())
paymaya_bad_ps_visual

[(0, '0.015*"money" + 0.015*"upgrade" + 0.012*"use" + 0.009*"customer_service" + 0.007*"email" + 0.007*"error" + 0.007*"get" + 0.007*"tried" + 0.006*"already" + 0.006*"service"'), (1, '0.007*"upgrade" + 0.007*"id" + 0.005*"student_id" + 0.003*"secondary_id" + 0.003*"primary_id" + 0.003*"list" + 0.002*"sss" + 0.002*"voters_id" + 0.002*"philhealth" + 0.002*"verify"'), (2, '0.012*"wala" + 0.011*"pera" + 0.008*"load" + 0.006*"upgrade" + 0.005*"hirap" + 0.005*"ayaw" + 0.004*"id" + 0.003*"apps" + 0.003*"ok" + 0.003*"error"')]


In [587]:
# String form of every final model's topic list, GCash models first, then PayMaya.
collated = [
    str(final_model.print_topics())
    for final_model in (
        gcash_good_as_final,
        gcash_bad_as_final,
        gcash_good_ps_final,
        gcash_bad_ps_final,
        paymaya_good_as_final,
        paymaya_bad_as_final,
        paymaya_good_ps_final,
        paymaya_bad_ps_final,
    )
]

In [589]:
# Row labels for the topic table; the order must match the order of `collated`.
app_store = ['GCash Good AS', 'GCash Bad AS',
             'GCash Good PS', 'GCash Bad PS',
             'PayMaya Good AS', 'PayMaya Bad AS',
             'PayMaya Good PS', 'PayMaya Bad PS']

# Pair each label with its model's topic string, one row per model.
df_topics = pd.DataFrame(list(zip(app_store,collated)), columns = ['App Store','Topics'])

In [590]:
# Display the topic table.
df_topics

Unnamed: 0,App Store,Topics
0,GCash Good AS,"[(0, '0.014*""money"" + 0.014*""convenient"" + 0.0..."
1,GCash Bad AS,"[(0, '0.014*""money"" + 0.010*""use"" + 0.009*""fix..."
2,GCash Good PS,"[(0, '0.201*""good"" + 0.128*""nice"" + 0.067*""gre..."
3,GCash Bad PS,"[(0, '0.023*""update"" + 0.020*""always"" + 0.015*..."
4,PayMaya Good AS,"[(0, '0.014*""use"" + 0.012*""convenient"" + 0.010..."
5,PayMaya Bad AS,"[(0, '0.015*""money"" + 0.012*""upgrade"" + 0.009*..."
6,PayMaya Good PS,"[(0, '0.114*""good"" + 0.082*""nice"" + 0.059*""gre..."
7,PayMaya Bad PS,"[(0, '0.015*""money"" + 0.015*""upgrade"" + 0.012*..."


In [591]:
# Timestamp used in the output file names below (12-hour clock with an AM/PM suffix).
date = datetime.now().strftime("%Y_%m_%d-%I-%M-%S_%p")

In [592]:
# Save the topic table under a timestamped name; the row index is written as the first column.
df_topics.to_csv(f"Results/Topics/Topics_{date}.csv", index= True)

In [None]:
# All eight final models in one list, GCash first, then PayMaya.
models = [gcash_good_as_final, gcash_bad_as_final,
          gcash_good_ps_final, gcash_bad_ps_final,
          paymaya_good_as_final, paymaya_bad_as_final,
          paymaya_good_ps_final, paymaya_bad_ps_final,]

In [None]:
# Persist the list of final models under the same timestamp as the topic table.
joblib.dump(models, f"Models/Model_{date}")

### IV. Topic Labeling
---