Reference: https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0?gi=b3b53e8290cd

In [5]:
import warnings
import gensim
import spacy
import joblib
import pickle
import pyLDAvis
import tqdm
import pandas as pd
import numpy as np
import pyLDAvis.gensim_models as gensimvis
import gensim.corpora as corpora

from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from datetime import datetime

# Silence every warning for the rest of the session. This also hides any
# library warnings raised while training or scoring the models below.
warnings.filterwarnings(action="ignore")
# Let pyLDAvis visualisations render inline as notebook cell output.
pyLDAvis.enable_notebook()



### I. Import Data
---

In [6]:
# Load the preprocessed review bundles, one per app and sentiment.
# Each bundle is indexed positionally by get_data(): entries 0-2 are used for
# store 'as' and entries 3-5 for store 'ps'.
# NOTE(review): joblib.load unpickles the file - only load trusted artifacts.
gcash_good = joblib.load('../../Data/Preprocessed/Mico/P2/gcash_good.sav')
gcash_bad = joblib.load('../../Data/Preprocessed/Mico/P2/gcash_bad.sav')
paymaya_good = joblib.load('../../Data/Preprocessed/Mico/P2/paymaya_good.sav')
paymaya_bad = joblib.load('../../Data/Preprocessed/Mico/P2/paymaya_bad.sav')

In [7]:
def get_data(list, store):
    """Pick the (data, dictionary, corpus) triple for one store from a bundle.

    Parameters
    ----------
    list : sequence
        Indexable bundle. Positions 0-2 are returned for ``'as'`` and
        positions 3-5 for ``'ps'``. The parameter name shadows the built-in
        ``list`` inside this function only; it is kept so existing keyword
        callers keep working.
    store : str
        Either ``'as'`` or ``'ps'``.

    Returns
    -------
    tuple
        ``(data, dictionary, corpus)`` taken from the bundle.

    Raises
    ------
    ValueError
        If ``store`` is neither ``'as'`` nor ``'ps'``.
    """
    # First bundle position of the triple belonging to each store.
    offsets = {'as': 0, 'ps': 3}
    if store not in offsets:
        # Without this guard an unknown store fell through both branches and
        # failed later with an opaque UnboundLocalError.
        raise ValueError(f"store must be 'as' or 'ps', got {store!r}")

    start = offsets[store]
    return list[start], list[start + 1], list[start + 2]

In [8]:
gcash_good_as, gcash_good_dict_as, gcash_good_corpus_as = get_data(gcash_good, 'as')
gcash_good_ps, gcash_good_dict_ps, gcash_good_corpus_ps = get_data(gcash_good, 'ps')

In [9]:
gcash_bad_as, gcash_bad_dict_as, gcash_bad_corpus_as = get_data(gcash_bad, 'as')
gcash_bad_ps, gcash_bad_dict_ps, gcash_bad_corpus_ps = get_data(gcash_bad, 'ps')

In [10]:
paymaya_good_as, paymaya_good_dict_as, paymaya_good_corpus_as = get_data(paymaya_good, 'as')
paymaya_good_ps, paymaya_good_dict_ps, paymaya_good_corpus_ps = get_data(paymaya_good, 'ps')

In [11]:
paymaya_bad_as, paymaya_bad_dict_as, paymaya_bad_corpus_as = get_data(paymaya_bad, 'as')
paymaya_bad_ps, paymaya_bad_dict_ps, paymaya_bad_corpus_ps = get_data(paymaya_bad, 'ps')

In [21]:
len(paymaya_good_ps)

40143

### II. Topic Modeling
---

#### A. Base Model

In [8]:
def build_model(corpus, dictionary):
    """Train the baseline 10-topic LDA model for one review set.

    Parameters
    ----------
    corpus
        Corpus passed to ``LdaMulticore`` as ``corpus``.
    dictionary
        Mapping passed to ``LdaMulticore`` as ``id2word``.

    Returns
    -------
    gensim.models.LdaMulticore
        The trained model.
    """
    settings = dict(
        corpus=corpus,
        id2word=dictionary,
        num_topics=10,
        random_state=100,
        # Increasing chunksize will speed up training as long as the
        # documents fit in memory.
        chunksize=10000,
        # Number of passes (epochs) over the corpus.
        passes=10,
        per_word_topics=True,
    )
    return gensim.models.LdaMulticore(**settings)

Create different LDA models for each type of review and store.

In [9]:
gcash_good_lda_as = build_model(gcash_good_corpus_as, gcash_good_dict_as)
gcash_bad_lda_as = build_model(gcash_bad_corpus_as, gcash_bad_dict_as)

In [10]:
gcash_good_lda_ps = build_model(gcash_good_corpus_ps, gcash_good_dict_ps)
gcash_bad_lda_ps = build_model(gcash_bad_corpus_ps, gcash_bad_dict_ps)

In [11]:
paymaya_good_lda_as = build_model(paymaya_good_corpus_as, paymaya_good_dict_as)
paymaya_bad_lda_as = build_model(paymaya_bad_corpus_as, paymaya_bad_dict_as)

In [12]:
paymaya_good_lda_ps = build_model(paymaya_good_corpus_ps, paymaya_good_dict_ps)
paymaya_bad_lda_ps = build_model(paymaya_bad_corpus_ps, paymaya_bad_dict_ps)

#### B. Base Coherence

Check the quality of the topics through topic coherence, which is measured by the "degree of semantic similarity between high scoring words in the topics." We first compute the base coherence of the topics.

In [13]:
def compute_coherence(lda_model, data, dictionary):
    """Return the ``c_v`` coherence score of an already trained LDA model.

    Parameters
    ----------
    lda_model
        Trained model handed to ``CoherenceModel`` as ``model``.
    data
        Documents handed to ``CoherenceModel`` as ``texts``.
    dictionary
        Dictionary handed to ``CoherenceModel`` as ``dictionary``.

    Returns
    -------
    The value of ``CoherenceModel.get_coherence()``.
    """
    scorer_args = {
        'model': lda_model,
        'texts': data,
        'dictionary': dictionary,
        'coherence': 'c_v',
    }
    scorer = CoherenceModel(**scorer_args)
    return scorer.get_coherence()

In [14]:
gcash_good_as_base = compute_coherence(gcash_good_lda_as, gcash_good_as, gcash_good_dict_as)
gcash_bad_as_base = compute_coherence(gcash_bad_lda_as, gcash_bad_as, gcash_bad_dict_as)
gcash_good_ps_base = compute_coherence(gcash_good_lda_ps, gcash_good_ps, gcash_good_dict_ps)
gcash_bad_ps_base = compute_coherence(gcash_bad_lda_ps, gcash_bad_ps, gcash_bad_dict_ps)

In [15]:
paymaya_good_as_base = compute_coherence(paymaya_good_lda_as, paymaya_good_as, paymaya_good_dict_as)
paymaya_bad_as_base = compute_coherence(paymaya_bad_lda_as, paymaya_bad_as, paymaya_bad_dict_as)
paymaya_good_ps_base = compute_coherence(paymaya_good_lda_ps, paymaya_good_ps, paymaya_good_dict_ps)
paymaya_bad_ps_base = compute_coherence(paymaya_bad_lda_ps, paymaya_bad_ps, paymaya_bad_dict_ps)

In [16]:
gcash_as = [gcash_good_as_base, gcash_bad_as_base]
gcash_ps = [gcash_good_ps_base, gcash_bad_ps_base]
paymaya_as = [paymaya_good_as_base, paymaya_bad_as_base]
paymaya_ps = [paymaya_good_ps_base, paymaya_bad_ps_base]

In [17]:
df = pd.DataFrame(list(zip(gcash_as, gcash_ps, paymaya_as, paymaya_ps)), 
                  columns = ['GCash AS', 'GCash PS', 'PayMaya AS', 'PayMaya PS'],
                  index = ['Good', 'Bad'])

In [18]:
df

Unnamed: 0,GCash AS,GCash PS,PayMaya AS,PayMaya PS
Good,0.429972,0.461109,0.295934,0.549764
Bad,0.391118,0.422193,0.29937,0.463847


In [19]:
# Write the index as well: it holds the 'Good'/'Bad' row labels, and with
# index=False the saved table no longer says which row is which.
df.to_csv('Results/base.csv', index=True)

#### C. Hyperparameter Tuning

In [10]:
# Supporting function for the hyperparameter search.
def compute_coherence_values(corpus, dictionary, data, k, a, b):
    """Train one LDA model for a parameter combination and score it.

    Parameters
    ----------
    corpus, dictionary
        Passed to ``LdaMulticore`` as ``corpus`` and ``id2word``.
    data
        Documents passed to ``CoherenceModel`` as ``texts``.
    k
        Number of topics.
    a
        Value passed to ``LdaMulticore`` as ``alpha``.
    b
        Value passed to ``LdaMulticore`` as ``eta``.

    Returns
    -------
    The ``c_v`` coherence of the freshly trained model.
    """
    training_args = dict(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=10000,
        passes=10,
        alpha=a,
        eta=b,
    )
    candidate = gensim.models.LdaMulticore(**training_args)

    scorer = CoherenceModel(
        model=candidate,
        texts=data,
        dictionary=dictionary,
        coherence='c_v',
    )
    return scorer.get_coherence()

In [11]:
def hyperparameter_tuner(corpus, dictionary, data, temp):
    """Grid-search topic count, alpha and beta, and save the coherence scores.

    Every combination is trained and scored with
    ``compute_coherence_values`` and the results are written to
    ``Results/{temp}.csv`` with the columns ``Validation_Set``, ``Topics``,
    ``Alpha``, ``Beta`` and ``Coherence``. This can take a long time to run.

    Parameters
    ----------
    corpus, dictionary, data
        Forwarded unchanged to ``compute_coherence_values``.
    temp : str
        Base name (without extension) of the CSV written under ``Results/``.
        The directory is not created here.

    Returns
    -------
    None
    """
    # Topics range: 3, 4, 5 (the upper bound is exclusive).
    min_topics = 3
    max_topics = 6
    step_size = 1
    topics_range = range(min_topics, max_topics, step_size)

    # Alpha parameter: numeric grid plus the two named priors.
    alpha = list(np.arange(0.01, 1, 0.3))
    alpha.append('symmetric')
    alpha.append('asymmetric')

    # Beta parameter: numeric grid plus the named symmetric prior.
    beta = list(np.arange(0.01, 1, 0.3))
    beta.append('symmetric')

    # Validation sets: only the full corpus is evaluated.
    corpus_sets = [corpus]
    corpus_title = ['100% Corpus']
    model_results = {'Validation_Set': [],
                     'Topics': [],
                     'Alpha': [],
                     'Beta': [],
                     'Coherence': []
                    }

    # Derive the progress total from the grid so it stays correct when any
    # of the ranges above is edited (it was previously hard-coded to 90).
    total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)

    # The context manager closes the progress bar even if a run raises.
    with tqdm.tqdm(total=total_runs) as pbar:
        # iterate through validation corpora
        for title, corpus_set in zip(corpus_title, corpus_sets):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        cv = compute_coherence_values(corpus_set,
                                                      dictionary,
                                                      data,
                                                      k, a, b)
                        # Save the model results
                        model_results['Validation_Set'].append(title)
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)

                        pbar.update(1)

    filename = f'Results/{temp}.csv'
    pd.DataFrame(model_results).to_csv(filename, index=False)

In [12]:
hyperparameter_tuner(gcash_good_corpus_as, gcash_good_dict_as, gcash_good_as, 'gcash_good_as')

100%|██████████| 90/90 [06:00<00:00,  4.00s/it]


In [13]:
hyperparameter_tuner(gcash_bad_corpus_as, gcash_bad_dict_as, gcash_bad_as, 'gcash_bad_as')

100%|██████████| 90/90 [26:56<00:00, 17.97s/it]


In [14]:
hyperparameter_tuner(gcash_good_corpus_ps, gcash_good_dict_ps, gcash_good_ps, 'gcash_good_ps')

100%|██████████| 90/90 [1:04:51<00:00, 43.24s/it]


In [15]:
hyperparameter_tuner(gcash_bad_corpus_ps, gcash_bad_dict_ps, gcash_bad_ps, 'gcash_bad_ps')

100%|██████████| 90/90 [1:33:07<00:00, 62.08s/it]


In [16]:
hyperparameter_tuner(paymaya_good_corpus_as, paymaya_good_dict_as, paymaya_good_as, 'paymaya_good_as')

100%|██████████| 90/90 [08:52<00:00,  5.92s/it]


In [17]:
hyperparameter_tuner(paymaya_bad_corpus_as, paymaya_bad_dict_as, paymaya_bad_as, 'paymaya_bad_as')

100%|██████████| 90/90 [13:24<00:00,  8.94s/it]


In [18]:
hyperparameter_tuner(paymaya_good_corpus_ps, paymaya_good_dict_ps, paymaya_good_ps, 'paymaya_good_ps')

100%|██████████| 90/90 [1:00:30<00:00, 40.33s/it]


In [19]:
hyperparameter_tuner(paymaya_bad_corpus_ps, paymaya_bad_dict_ps, paymaya_bad_ps, 'paymaya_bad_ps')

100%|██████████| 90/90 [56:04<00:00, 37.38s/it]


#### D. Final Models

In [1]:
def build_final_model(df, corpus, dictionary, ntopics=0):
    """Train an LDA model using the best-scoring row of a tuning table.

    Parameters
    ----------
    df : pandas.DataFrame
        Tuning results with the columns ``Coherence``, ``Alpha``, ``Beta``
        and ``Topics``. The row with the highest ``Coherence`` is used.
    corpus, dictionary
        Passed to ``LdaMulticore`` as ``corpus`` and ``id2word``.
    ntopics : int, optional
        Number of topics to train. ``0`` (the default) means "use the
        ``Topics`` value of the best row".

    Returns
    -------
    gensim.models.LdaMulticore
        The trained model.
    """
    # idxmax() must be *called*; it returns an index label, so the row is
    # fetched with .loc. The previous `df.iloc[df['Coherence'].idxmax]` passed
    # the bound method itself and only worked through iloc's callable handling.
    best = df.loc[df['Coherence'].idxmax()]
    alpha = best.Alpha
    beta = best.Beta

    # An explicit topic count overrides the tuned one. Cast to a plain int
    # because a value read from the table may be a numpy integer.
    topics = int(ntopics if ntopics != 0 else best.Topics)

    # Named priors are passed through as-is; anything else is converted to
    # float (values read back from a CSV may arrive as strings).
    if (alpha != 'symmetric') and (alpha != "asymmetric"):
        alpha = float(alpha)

    if (beta != 'symmetric'):
        beta = float(beta)

    # NOTE(review): chunksize is 100 here but 10000 in
    # compute_coherence_values, so this model is not trained under exactly the
    # settings that produced the tuning scores - confirm this is intended.
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=topics,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=alpha,
                                           eta=beta)

    return lda_model

In [5]:
df_gcash_good_as = pd.read_csv('Results/gcash_good_as.csv')
df_gcash_bad_as = pd.read_csv('Results/gcash_bad_as.csv')
df_gcash_good_ps = pd.read_csv('Results/gcash_good_ps.csv')
df_gcash_bad_ps = pd.read_csv('Results/gcash_bad_ps.csv')

In [6]:
df_paymaya_good_as = pd.read_csv('Results/paymaya_good_as.csv')
df_paymaya_bad_as = pd.read_csv('Results/paymaya_bad_as.csv')
df_paymaya_good_ps = pd.read_csv('Results/paymaya_good_ps.csv')
df_paymaya_bad_ps = pd.read_csv('Results/paymaya_bad_ps.csv')

In [14]:
# Show the best-scoring parameter row. idxmax() has to be called and returns
# an index label, so the row is looked up with .loc rather than .iloc.
df_paymaya_bad_ps.loc[df_paymaya_bad_ps['Coherence'].idxmax()]

Validation_Set           100% Corpus
Topics                             5
Alpha                     asymmetric
Beta              0.9099999999999999
Coherence                    0.48342
Name: 88, dtype: object

In [401]:
gcash_good_as_final = build_final_model(df_gcash_good_as, gcash_good_corpus_as, gcash_good_dict_as)
gcash_bad_as_final = build_final_model(df_gcash_bad_as, gcash_bad_corpus_as, gcash_bad_dict_as)
gcash_good_ps_final = build_final_model(df_gcash_good_ps, gcash_good_corpus_ps, gcash_good_dict_ps)
gcash_bad_ps_final = build_final_model(df_gcash_bad_ps, gcash_bad_corpus_ps, gcash_bad_dict_ps)

In [402]:
paymaya_good_as_final = build_final_model(df_paymaya_good_as, paymaya_good_corpus_as, paymaya_good_dict_as)
paymaya_bad_as_final = build_final_model(df_paymaya_bad_as, paymaya_bad_corpus_as, paymaya_bad_dict_as)
paymaya_good_ps_final = build_final_model(df_paymaya_good_ps, paymaya_good_corpus_ps, paymaya_good_dict_ps)
paymaya_bad_ps_final = build_final_model(df_paymaya_bad_ps, paymaya_bad_corpus_ps, paymaya_bad_dict_ps)

### III. Topic Visualization 
---

In [12]:
# NOTE(review): this loads a previously saved model bundle from a hardcoded,
# timestamped path. The next cell assigns its entries to the *_final names,
# replacing the models trained in section D. Confirm this is the intended
# artifact before re-running.
models = joblib.load('Models/Model_2021_06_24-08-17-04_PM')

In [13]:
gcash_good_as_final, gcash_bad_as_final, gcash_good_ps_final, gcash_bad_ps_final = models[0], models[1], models[2], models[3]
paymaya_good_as_final, paymaya_bad_as_final, paymaya_good_ps_final, paymaya_bad_ps_final = models[4], models[5], models[6], models[7]

In [24]:
gcash_good_as_visual = gensimvis.prepare(gcash_good_as_final, gcash_good_corpus_as, gcash_good_dict_as)
print(gcash_good_as_final.print_topics())
gcash_good_as_visual

[(0, '0.034*"good" + 0.033*"love" + 0.031*"convenient" + 0.027*"use" + 0.025*"transactions" + 0.020*"update" + 0.020*"need" + 0.020*"money" + 0.020*"bank" + 0.019*"pay"'), (1, '0.044*"money" + 0.038*"account" + 0.035*"number" + 0.033*"best" + 0.026*"load" + 0.025*"verified" + 0.023*"transaction" + 0.022*"able" + 0.021*"transfer" + 0.020*"get"'), (2, '0.069*"gcredit" + 0.040*"cash" + 0.033*"money" + 0.027*"send" + 0.025*"pay" + 0.025*"use" + 0.023*"available" + 0.019*"time" + 0.018*"years" + 0.017*"save"'), (3, '0.033*"convenient" + 0.033*"money" + 0.029*"online" + 0.026*"feature" + 0.023*"banks" + 0.022*"transfer" + 0.021*"use" + 0.020*"one" + 0.020*"wallet" + 0.013*"transactions"')]


In [15]:
gcash_good_as_topics = ['Convenience (General)', 'Transaction Related', 'App Products', 'Convenience (Bank Transfer)']

In [16]:
gcash_bad_as_visual = gensimvis.prepare(gcash_bad_as_final, gcash_bad_corpus_as, gcash_bad_dict_as)
print(gcash_bad_as_final.print_topics())
gcash_bad_as_visual

[(0, '0.018*"account" + 0.018*"money" + 0.017*"cash" + 0.017*"use" + 0.014*"update" + 0.013*"service" + 0.008*"verification" + 0.008*"get" + 0.008*"need" + 0.008*"time"'), (1, '0.024*"verify" + 0.022*"account" + 0.016*"verified" + 0.014*"id" + 0.011*"load" + 0.010*"need" + 0.010*"get_verified" + 0.010*"fix" + 0.010*"try" + 0.009*"working"'), (2, '0.019*"cash" + 0.013*"bank" + 0.011*"mpin" + 0.010*"payment" + 0.010*"load" + 0.010*"account" + 0.010*"error" + 0.010*"scam" + 0.009*"always" + 0.008*"paid"'), (3, '0.040*"account" + 0.027*"money" + 0.016*"email" + 0.014*"fix" + 0.012*"code" + 0.012*"already" + 0.011*"sent" + 0.010*"tried" + 0.010*"load" + 0.010*"open"'), (4, '0.021*"always" + 0.018*"transaction" + 0.018*"money" + 0.016*"fix" + 0.015*"update" + 0.015*"gcredit" + 0.015*"use" + 0.011*"error" + 0.008*"crashing" + 0.008*"make"')]


In [17]:
gcash_bad_as_topics = ['Customer Service', 'Verification', 'Product Related', 'Transaction Related', 'App Issues']

In [18]:
gcash_good_ps_visual = gensimvis.prepare(gcash_good_ps_final, gcash_good_corpus_ps, gcash_good_dict_ps)
print(gcash_good_ps_final.print_topics())
gcash_good_ps_visual

[(0, '0.274*"good" + 0.176*"nice" + 0.084*"great" + 0.068*"apps" + 0.052*"ok" + 0.045*"useful" + 0.044*"excellent" + 0.028*"awesome" + 0.027*"helpful" + 0.025*"amazing"'), (1, '0.091*"money" + 0.056*"easy" + 0.034*"thanks" + 0.031*"send" + 0.027*"load" + 0.026*"wow" + 0.024*"pay_bills" + 0.023*"pay" + 0.021*"transfer" + 0.020*"much"'), (2, '0.055*"transaction" + 0.031*"cash" + 0.025*"update" + 0.019*"okay" + 0.019*"need" + 0.018*"payment" + 0.018*"time" + 0.016*"transactions" + 0.016*"best" + 0.016*"perfect"'), (3, '0.195*"use" + 0.169*"easy" + 0.165*"love" + 0.161*"convenient" + 0.074*"fast" + 0.032*"super" + 0.021*"reliable" + 0.016*"safe" + 0.015*"helpful" + 0.014*"used"'), (4, '0.045*"really" + 0.032*"account" + 0.028*"convinient" + 0.026*"lot" + 0.025*"maganda" + 0.022*"helps" + 0.016*"open" + 0.015*"verygood" + 0.015*"gamitin" + 0.014*"easier"')]


In [19]:
gcash_good_ps_topics = ['Positive Feedback', 'App Products/Services', 'Transaction Related', 'Ease of Use', 'Convenience']

In [20]:
gcash_bad_ps_visual = gensimvis.prepare(gcash_bad_ps_final, gcash_bad_corpus_ps, gcash_bad_dict_ps)
print(gcash_bad_ps_final.print_topics())
gcash_bad_ps_visual

[(0, '0.049*"load" + 0.024*"good" + 0.020*"apps" + 0.019*"code" + 0.019*"update" + 0.018*"number" + 0.017*"open" + 0.017*"always" + 0.016*"buy_load" + 0.015*"updating"'), (1, '0.046*"account" + 0.037*"update" + 0.026*"always" + 0.022*"fix" + 0.022*"need" + 0.019*"verify" + 0.017*"money" + 0.015*"verified" + 0.014*"get" + 0.014*"use"'), (2, '0.047*"money" + 0.030*"cash" + 0.021*"use" + 0.021*"transaction" + 0.011*"pay" + 0.011*"gcredit" + 0.010*"bank" + 0.009*"deducted" + 0.008*"give" + 0.008*"payment"'), (3, '0.048*"service" + 0.033*"customer_service" + 0.025*"poor" + 0.019*"always" + 0.019*"sucks" + 0.018*"worst" + 0.016*"issue" + 0.014*"unavailable" + 0.013*"support" + 0.012*"ticket"'), (4, '0.027*"update" + 0.025*"pera" + 0.024*"wala" + 0.021*"student_id" + 0.016*"account" + 0.016*"verify" + 0.014*"id" + 0.012*"cash" + 0.010*"ayaw" + 0.010*"student"')]


In [21]:
gcash_bad_ps_topics = ['Product Related', 'Account Related', 'Transaction Related', 'Customer Service', 'Verification']

In [22]:
paymaya_good_as_visual = gensimvis.prepare(paymaya_good_as_final, paymaya_good_corpus_as, paymaya_good_dict_as)
print(paymaya_good_as_final.print_topics())
paymaya_good_as_visual

[(0, '0.028*"use" + 0.027*"convenient" + 0.025*"easy" + 0.020*"really" + 0.020*"good" + 0.017*"great" + 0.016*"payment" + 0.015*"need" + 0.014*"love" + 0.013*"load"'), (1, '0.029*"pay" + 0.025*"use" + 0.023*"convenient" + 0.022*"online" + 0.018*"ewallet" + 0.018*"bills" + 0.018*"best" + 0.017*"one" + 0.016*"money" + 0.015*"great"'), (2, '0.043*"bank" + 0.040*"transfer" + 0.024*"money" + 0.018*"account" + 0.017*"fee" + 0.014*"need" + 0.012*"support" + 0.011*"love" + 0.010*"tried" + 0.010*"transaction"'), (3, '0.024*"use" + 0.023*"customer" + 0.019*"email" + 0.017*"money" + 0.016*"every" + 0.016*"get" + 0.012*"day" + 0.012*"back" + 0.012*"could" + 0.012*"time"')]


In [23]:
paymaya_good_as_topics = ['Ease of Use', 'Convenience', 'Transaction Related', 'Customer Support Related']

In [24]:
paymaya_bad_as_visual = gensimvis.prepare(paymaya_bad_as_final, paymaya_bad_corpus_as, paymaya_bad_dict_as)
print(paymaya_bad_as_final.print_topics())
paymaya_bad_as_visual

[(0, '0.030*"account" + 0.016*"error" + 0.015*"load" + 0.014*"update" + 0.014*"use" + 0.014*"service" + 0.014*"fix" + 0.012*"money" + 0.011*"tried" + 0.010*"always"'), (1, '0.068*"account" + 0.040*"upgrade" + 0.039*"money" + 0.017*"use" + 0.016*"card" + 0.015*"customer_service" + 0.015*"virtual_card" + 0.010*"back" + 0.010*"since" + 0.010*"time"'), (2, '0.023*"tried" + 0.019*"bank" + 0.019*"account" + 0.018*"transaction" + 0.017*"email" + 0.016*"money" + 0.016*"support" + 0.014*"password" + 0.013*"balance" + 0.013*"add_money"')]


In [25]:
paymaya_bad_as_topics = ['Account Related', 'Customer Service', 'Transaction Related']

In [26]:
paymaya_good_ps_visual = gensimvis.prepare(paymaya_good_ps_final, paymaya_good_corpus_ps, paymaya_good_dict_ps)
print(paymaya_good_ps_final.print_topics())
paymaya_good_ps_visual

[(0, '0.071*"nice" + 0.066*"use" + 0.061*"easy" + 0.050*"convenient" + 0.049*"great" + 0.041*"useful" + 0.040*"apps" + 0.025*"awesome" + 0.020*"helpful" + 0.019*"excellent"'), (1, '0.023*"really" + 0.021*"best" + 0.019*"need" + 0.018*"ewallet" + 0.014*"since" + 0.013*"application" + 0.013*"features" + 0.012*"load" + 0.012*"super" + 0.011*"one"'), (2, '0.435*"good" + 0.048*"sobrang" + 0.034*"reliable" + 0.032*"apps" + 0.030*"service" + 0.024*"worth" + 0.021*"think" + 0.015*"keep" + 0.015*"verry" + 0.014*"work"'), (3, '0.038*"online" + 0.034*"money" + 0.022*"transactions" + 0.022*"pay" + 0.017*"way" + 0.017*"time" + 0.016*"bills" + 0.016*"payment" + 0.015*"get" + 0.012*"safe"'), (4, '0.041*"love" + 0.040*"bills" + 0.030*"feature" + 0.028*"pay" + 0.016*"qr_payment" + 0.016*"really" + 0.015*"transaction" + 0.014*"happy" + 0.014*"lot" + 0.014*"need"')]


In [27]:
paymaya_good_ps_topics = ['Convenience', 'General', 'Reliability', 'Transaction Related', 'App Features']

In [28]:
paymaya_bad_ps_visual = gensimvis.prepare(paymaya_bad_ps_final, paymaya_bad_corpus_ps, paymaya_bad_dict_ps)
print(paymaya_bad_ps_final.print_topics())
paymaya_bad_ps_visual

[(0, '0.021*"customer_service" + 0.020*"service" + 0.019*"always" + 0.018*"error" + 0.017*"register" + 0.013*"worst" + 0.013*"support" + 0.012*"load" + 0.012*"good" + 0.011*"response"'), (1, '0.080*"account" + 0.025*"log" + 0.020*"use" + 0.013*"fix" + 0.013*"number" + 0.012*"response" + 0.012*"already" + 0.012*"open" + 0.012*"update" + 0.012*"need"'), (2, '0.032*"pera" + 0.029*"wala" + 0.029*"account" + 0.017*"load" + 0.017*"upgrade" + 0.012*"hirap" + 0.012*"ayaw" + 0.011*"ok" + 0.009*"apps" + 0.008*"pwede"'), (3, '0.079*"upgrade" + 0.067*"account" + 0.030*"id" + 0.015*"upgrading" + 0.015*"ids" + 0.013*"use" + 0.013*"hard" + 0.011*"tried" + 0.010*"verify" + 0.010*"verification"'), (4, '0.092*"money" + 0.031*"account" + 0.022*"send" + 0.022*"cash" + 0.018*"bank" + 0.017*"add_money" + 0.017*"use" + 0.015*"transfer" + 0.014*"back" + 0.014*"transaction"')]


In [29]:
paymaya_bad_ps_topics = ['Customer Service', 'Account Related', 'Transaction Related', 'Account Upgrade', 'Feature Related']

In [30]:
# String form of each final model's printed topics, one entry per model,
# ordered GCash before PayMaya and good/bad App Store before good/bad Play Store.
collated = [
    str(final_model.print_topics())
    for final_model in (
        gcash_good_as_final,
        gcash_bad_as_final,
        gcash_good_ps_final,
        gcash_bad_ps_final,
        paymaya_good_as_final,
        paymaya_bad_as_final,
        paymaya_good_ps_final,
        paymaya_bad_ps_final,
    )
]

In [31]:
app_store = ['GCash Good AS', 'GCash Bad AS',
             'GCash Good PS', 'GCash Bad PS',
             'PayMaya Good AS', 'PayMaya Bad AS',
             'PayMaya Good PS', 'PayMaya Bad PS']

df_topics = pd.DataFrame(list(zip(app_store,collated)), columns = ['App Store','Topics'])

In [32]:
df_topics

Unnamed: 0,App Store,Topics
0,GCash Good AS,"[(0, '0.034*""good"" + 0.033*""love"" + 0.031*""con..."
1,GCash Bad AS,"[(0, '0.018*""account"" + 0.018*""money"" + 0.017*..."
2,GCash Good PS,"[(0, '0.274*""good"" + 0.176*""nice"" + 0.084*""gre..."
3,GCash Bad PS,"[(0, '0.049*""load"" + 0.024*""good"" + 0.020*""app..."
4,PayMaya Good AS,"[(0, '0.028*""use"" + 0.027*""convenient"" + 0.025..."
5,PayMaya Bad AS,"[(0, '0.030*""account"" + 0.016*""error"" + 0.015*..."
6,PayMaya Good PS,"[(0, '0.071*""nice"" + 0.066*""use"" + 0.061*""easy..."
7,PayMaya Bad PS,"[(0, '0.021*""customer_service"" + 0.020*""servic..."


In [33]:
date = datetime.now().strftime("%Y_%m_%d-%I-%M-%S_%p")

In [34]:
df_topics.to_csv(f"Results/Topics/Topics_{date}.csv", index= True)

In [35]:
models = [gcash_good_as_final, gcash_bad_as_final,
          gcash_good_ps_final, gcash_bad_ps_final,
          paymaya_good_as_final, paymaya_bad_as_final,
          paymaya_good_ps_final, paymaya_bad_ps_final,]

In [36]:
joblib.dump(models, f"Models/Model_{date}")

['Models/Model_2021_06_30-03-02-50_PM']

### IV. Topic Labeling
---

In [37]:
gcash_df = joblib.load('../../Data/Preprocessed/Mico/P2/gcash_df.sav')
paymaya_df = joblib.load('../../Data/Preprocessed/Mico/P2/paymaya_df.sav')

In [38]:
gcash_good_as_df, gcash_bad_as_df, gcash_good_ps_df, gcash_bad_ps_df = gcash_df[0], gcash_df[1], gcash_df[2], gcash_df[3]
paymaya_good_as_df, paymaya_bad_as_df, paymaya_good_ps_df, paymaya_bad_ps_df = paymaya_df[0], paymaya_df[1], paymaya_df[2], paymaya_df[3]

In [39]:
def label_rows(df, model, corpus, topic_list):
    """Label every review with the name of its highest-scoring topic.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document, in the same order as ``corpus``. It is not
        modified; a labeled copy is returned.
    model
        Object for which ``model[doc]`` yields ``(topic_id, score)`` pairs.
    corpus : sequence
        Documents to score, indexable by position.
    topic_list : sequence of str
        Human-readable topic names, indexed by topic id.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with a ``Topic`` column added and the ``rating`` and
        ``title_review`` columns removed when present.

    Raises
    ------
    ValueError
        If the model returns no topics for a document, or if the number of
        documents does not match the number of rows in ``df``.
    """
    topics = []

    for i in range(0, len(corpus)):
        doc_topics = model[corpus[i]]
        if len(doc_topics) == 0:
            raise ValueError(f"model returned no topics for document {i}")

        # Use the topic id carried in the pair, not the pair's position in
        # the returned list: the model may leave out low-probability topics,
        # in which case position and topic id no longer line up.
        best_topic_id, _ = max(doc_topics, key=lambda pair: pair[1])
        topics.append(topic_list[best_topic_id])

    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    labeled = df.assign(Topic=topics)
    # errors='ignore' keeps the function re-runnable on a frame from which
    # these columns have already been dropped.
    return labeled.drop(columns=['rating', 'title_review'], errors='ignore')

In [40]:
gcash_good_as_df = label_rows(gcash_good_as_df, gcash_good_as_final, gcash_good_corpus_as, gcash_good_as_topics)
gcash_good_as_df.Topic.value_counts()

Convenience (General)          128
Transaction Related             42
App Products                     9
Convenience (Bank Transfer)      3
Name: Topic, dtype: int64

In [41]:
gcash_bad_as_df = label_rows(gcash_bad_as_df, gcash_bad_as_final, gcash_bad_corpus_as, gcash_bad_as_topics)
gcash_bad_as_df.Topic.value_counts()

Customer Service       669
Verification           334
Product Related        220
Transaction Related    166
App Issues              69
Name: Topic, dtype: int64

In [42]:
gcash_good_ps_df = label_rows(gcash_good_ps_df, gcash_good_ps_final, gcash_good_corpus_ps, gcash_good_ps_topics)
gcash_good_ps_df.Topic.value_counts()

Positive Feedback        52225
App Products/Services     7404
Ease of Use               6828
Transaction Related       5790
Convenience               4442
Name: Topic, dtype: int64

In [43]:
gcash_bad_ps_df = label_rows(gcash_bad_ps_df, gcash_bad_ps_final, gcash_bad_corpus_ps, gcash_bad_ps_topics)
gcash_bad_ps_df.Topic.value_counts()

Account Related        8086
Product Related        8038
Transaction Related    4725
Verification           4504
Customer Service       2893
Name: Topic, dtype: int64

In [44]:
paymaya_good_as_df = label_rows(paymaya_good_as_df, paymaya_good_as_final, paymaya_good_corpus_as, paymaya_good_as_topics)
paymaya_good_as_df.Topic.value_counts()

Ease of Use                 276
Convenience                 108
Transaction Related          36
Customer Support Related     14
Name: Topic, dtype: int64

In [45]:
paymaya_bad_as_df = label_rows(paymaya_bad_as_df, paymaya_bad_as_final, paymaya_bad_corpus_as, paymaya_bad_as_topics)
paymaya_bad_as_df.Topic.value_counts()

Account Related        314
Customer Service       182
Transaction Related     84
Name: Topic, dtype: int64

In [46]:
paymaya_good_ps_df = label_rows(paymaya_good_ps_df, paymaya_good_ps_final, paymaya_good_corpus_ps, paymaya_good_ps_topics)
paymaya_good_ps_df.Topic.value_counts()

Convenience            21389
General                 9033
Reliability             4658
Transaction Related     2825
App Features            2238
Name: Topic, dtype: int64

In [47]:
paymaya_bad_ps_df = label_rows(paymaya_bad_ps_df, paymaya_bad_ps_final, paymaya_bad_corpus_ps, paymaya_bad_ps_topics)
paymaya_bad_ps_df.Topic.value_counts()

Customer Service       3852
Account Related        2381
Transaction Related    1583
Account Upgrade        1276
Feature Related         973
Name: Topic, dtype: int64

In [48]:
gcash_df_eda = [gcash_good_as_df, gcash_bad_as_df, gcash_good_ps_df, gcash_bad_ps_df]
paymaya_df_eda = [paymaya_good_as_df, paymaya_bad_as_df, paymaya_good_ps_df, paymaya_bad_ps_df]

In [49]:
joblib.dump(gcash_df_eda, 'Results/Labeled/gcash_df_eda.sav')
joblib.dump(paymaya_df_eda, 'Results/Labeled/paymaya_df_eda.sav')

['Results/Labeled/paymaya_df_eda.sav']