# Forming Topic Models from PayMaya App Store Reviews Using Latent Dirichlet Allocation <br>
This notebook generates topic models for the 5-star and 1-star PayMaya app reviews taken from the App Store, using Latent Dirichlet Allocation (LDA).

In [1]:
# importing libraries
import pandas as pd
import regex as re
import numpy as np
import tqdm
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
import contractions
from textblob import TextBlob
from nltk.tokenize import word_tokenize

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import models

import spacy

from pprint import pprint

import seaborn as sns

import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis



In [2]:
def import_csv(path):
    """Read a review export and merge each review's title and body into one column.

    Parameters
    ----------
    path : str or file-like
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with a new ``title_review`` column (title, a single
        space, then review) and without the ``userName``, ``title``,
        ``review``, ``isEdited``, ``app_id``, ``developerResponse`` and
        ``app_name`` columns.
    """
    redundant_columns = ['userName', 'title', 'review', 'isEdited',
                         'app_id', 'developerResponse', 'app_name']
    frame = pd.read_csv(path)
    # Plain `+` concatenation: a missing title or review makes the combined value missing too.
    frame['title_review'] = frame['title'] + ' ' + frame['review']
    return frame.drop(columns=redundant_columns)

  and should_run_async(code)


In [26]:
# Load the PayMaya review export. The path is absolute, so point it at the
# local copy of the data before running this cell.
paymaya_reviews = import_csv('/Mynt Capstone/data/paymaya_0613-1402.csv')

  and should_run_async(code)


In [27]:
# Preview the first rows of the loaded reviews (raw, before any cleaning).
paymaya_reviews.head()

  and should_run_async(code)


Unnamed: 0,date,rating,title_review
0,2020-03-30 01:10:54,5,VERY BAD SERVICE I have been contacting them a...
1,2020-08-06 03:55:14,1,Very poor customer service There should be a z...
2,2020-05-02 01:58:12,1,App is great but service is not I really loved...
3,2020-04-09 01:32:02,1,Make your app trustworthy really Comeon this i...
4,2020-12-01 07:59:56,1,The new update... Ok so lets be honest I love ...


In [5]:
# Remove emojis and emoticons

# Code block 1: remove emojis
def remove_emoji(text):
    """Return ``text`` with every run of characters from the ranges below deleted.

    The character class is assembled from fixed Unicode ranges. Note that the
    last range runs from U+24C2 all the way to U+1F251, so it also removes
    whatever non-emoji characters fall between those two code points.
    """
    unicode_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    emoji_pattern = re.compile("[" + "".join(unicode_ranges) + "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Example: the two trailing emoji are stripped; the text and its trailing space remain.
remove_emoji("Have fun with NLP! 😃😃")

  and should_run_async(code)


'Have fun with NLP! '

In [6]:
# Function for removing emoticons
def remove_emoticons(text):
    """Delete every emoticon listed in ``EMOTICONS`` from ``text``.

    The entries of ``EMOTICONS`` are joined with ``|`` into one alternation
    and inserted into the pattern verbatim (no escaping is applied here).
    NOTE(review): this relies on the entries already being valid regex
    fragments -- confirm against the installed ``emot`` version.
    """
    alternation = u'|'.join(EMOTICONS)
    emoticon_pattern = re.compile(u'(' + alternation + u')')
    return emoticon_pattern.sub(r'', text)

  and should_run_async(code)


In [7]:
# Function for removing URL's
def remove_urls(vTEXT):
    """Return ``vTEXT`` with URL-like substrings removed.

    A match is an optional ``http``/``https`` scheme, then ``://``, then any
    run of word characters or ``. / ? = & %``, ending at a word boundary.
    Characters outside that set (a hyphen, for example) end the match.
    """
    url_pattern = r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b'
    return re.sub(url_pattern, '', vTEXT, flags=re.MULTILINE)

  and should_run_async(code)


### Preprocessing Function

In [8]:
def preprocess(corpus, column):
    """Clean the text in ``corpus[column]`` in place.

    Steps, in order: drop rows whose text is missing, lowercase, remove
    emoji, remove emoticons, remove URLs, expand contractions, replace every
    character that is not an ASCII letter or ``#`` with a space, strip
    carriage returns / newlines / tabs, and finally remove any remaining
    non-word, non-space character (which deletes the ``#`` kept earlier).

    Parameters
    ----------
    corpus : pandas.DataFrame
        Frame holding the text; it is modified in place.
    column : str
        Name of the text column to clean.

    Returns
    -------
    None
    """
    # Drop the rows themselves. Calling dropna(inplace=True) on corpus[column]
    # does not remove rows from the frame, so a missing value would reach the
    # lowercasing step and fail there.
    corpus.dropna(subset=[column], inplace=True)

    text = corpus[column].str.lower()
    text = text.apply(remove_emoji)
    text = text.apply(remove_emoticons)
    text = text.apply(remove_urls)
    text = text.apply(contractions.fix)  # expand contractions

    # NOTE: the former "correct spelling" step was `str(TextBlob(x))`, which
    # only round-trips the string and never calls `.correct()`. It is dropped
    # here so the results are unchanged. NOTE(review): if spelling correction
    # is really wanted, add it deliberately -- presumably it would also touch
    # the Tagalog words in these reviews; verify before enabling.

    # regex=True is explicit: recent pandas treats the pattern as a literal
    # string by default, which would silently turn these steps into no-ops.
    text = text.str.replace(r"[^a-zA-Z#]", " ", regex=True)   # keep letters and '#'
    text = text.str.replace(r'\r+|\n+|\t+', '', regex=True)   # remove \n \t \r
    text = text.str.replace(r'[^\w\s]', '', regex=True)       # remove punctuation

    corpus[column] = text

  and should_run_async(code)
  corpus[column] = corpus[column].str.replace('[^\w\s]','') # Remove punctuation


In [28]:
# Clean the combined title/review text; this modifies paymaya_reviews in place.
preprocess(paymaya_reviews, 'title_review')

  and should_run_async(code)
  corpus[column] = corpus[column].str.replace("[^a-zA-Z#]", " ") # Remove unwanted characters, numbers and symbols
  corpus[column] = corpus[column].str.replace('[^\w\s]','') # Remove punctuation


In [29]:
# Inspect the first ten rows after cleaning.
paymaya_reviews.head(10)

  and should_run_async(code)


Unnamed: 0,date,rating,title_review
0,2020-03-30 01:10:54,5,very bad service i have been contacting them a...
1,2020-08-06 03:55:14,1,very poor customer service there should be a z...
2,2020-05-02 01:58:12,1,app is great but service is not i really loved...
3,2020-04-09 01:32:02,1,make your app trustworthy really comeon this i...
4,2020-12-01 07:59:56,1,the new update ok so let us be honest i lov...
5,2020-08-03 05:16:08,1,cannot pay bills and cannot upgrade account i ...
6,2019-10-31 01:28:38,4,bills payment this payment processor digital w...
7,2018-01-06 11:12:59,2,great at first but paymaya is one of the be...
8,2020-05-23 05:39:05,5,frustrating password recovery help forgot my p...
9,2020-05-11 12:37:46,1,not useful i was given this app when i paid my...


### Splitting 5-Star and 1-Star Reviews

In [30]:
# Keep only the reviews rated 5 stars, then preview them.
paymaya_rating5 = paymaya_reviews.loc[paymaya_reviews['rating'].eq(5)]
paymaya_rating5.head()

  and should_run_async(code)


Unnamed: 0,date,rating,title_review
0,2020-03-30 01:10:54,5,very bad service i have been contacting them a...
8,2020-05-23 05:39:05,5,frustrating password recovery help forgot my p...
10,2017-10-15 08:10:23,5,best apps for online shopping this paymaya ap...
12,2019-12-06 10:45:22,5,convenient sleek and secure love the new ap...
17,2019-04-10 05:46:32,5,upgrading account original complaint i want ...


In [31]:
# Keep only the reviews rated 1 star, then preview them.
paymaya_rating1 = paymaya_reviews.loc[paymaya_reviews['rating'].eq(1)]
paymaya_rating1.head()

  and should_run_async(code)


Unnamed: 0,date,rating,title_review
1,2020-08-06 03:55:14,1,very poor customer service there should be a z...
2,2020-05-02 01:58:12,1,app is great but service is not i really loved...
3,2020-04-09 01:32:02,1,make your app trustworthy really comeon this i...
4,2020-12-01 07:59:56,1,the new update ok so let us be honest i lov...
5,2020-08-03 05:16:08,1,cannot pay bills and cannot upgrade account i ...


### Loading Stopword List

In [14]:
from spacy.lang.tl.stop_words import STOP_WORDS as tl_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

  and should_run_async(code)


In [32]:
# Combined stopword list: spaCy's Tagalog and English stopwords, followed by
# extra terms specific to these reviews.
final_stopwords_list = [
    *tl_stop,
    *en_stop,
    'paymaya', 'app', 'po', 'please', 'i', 'thank', 'nyo', 'lang',
    'pag', 'mag', 'nag', 'rin', 'nyu', 'kau', 'naman', 'mo',
]

  and should_run_async(code)


In [33]:
# Load spaCy's small English pipeline with the parser and NER components
# disabled (for efficiency); the remaining components stay active.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
# Add the combined stopword list to the pipeline's default stop-word set.
nlp.Defaults.stop_words.update(final_stopwords_list)

  and should_run_async(code)


---
### LDA Topic Modelling <br>
### A. PayMaya 1-Star Reviews

In [34]:
def sent_to_words(sentences):
    """Lazily tokenize sentences with gensim's ``simple_preprocess``.

    Each item is converted with ``str`` first. Yields one token list per
    input item, in input order.
    """
    yield from (gensim.utils.simple_preprocess(str(item)) for item in sentences)

# Tokenize every 1-star review and show the first 30 tokens of the first one.
data = paymaya_rating1['title_review'].tolist()
data_words = [tokens for tokens in sent_to_words(data)]
print(data_words[0][:30])

['very', 'poor', 'customer', 'service', 'there', 'should', 'be', 'zero', 'rating', 'for', 'this', 'app', 'called', 'today', 'to', 'quickly', 'ask', 'only', 'about', 'the', 'recent', 'updates', 'made', 'in', 'the', 'app', 'and', 'the', 'customer', 'service']


  and should_run_async(code)


### Phrase Modeling: Bigram Model

In [35]:
# Build the bigram model from the tokenized 1-star reviews
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.

# Faster way to get a sentence clubbed as a bigram.
# `bigram_mod` is the module-level name that make_bigrams() reads when called.
bigram_mod = gensim.models.phrases.Phraser(bigram)

  and should_run_async(code)


In [36]:
# Define functions for stopwords, bigrams, and lemmatization
def remove_stopwords(texts):
    """Tokenize each document and drop the tokens found in ``final_stopwords_list``.

    Every document is converted with ``str`` and tokenized with gensim's
    ``simple_preprocess``. Returns one list of surviving tokens per input
    document, in input order.
    """
    # Build a set once per call: a membership test against the list would
    # scan the whole stopword list for every single token.
    stopwords = set(final_stopwords_list)
    return [[word for word in simple_preprocess(str(doc)) if word not in stopwords] for doc in texts]


def make_bigrams(texts):
    """Apply the bigram phraser to every tokenized document.

    ``bigram_mod`` is looked up in module scope when the function is called,
    so whichever phraser was assigned most recently is the one applied.
    Returns one transformed document per input document.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each token list is joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``. A token's lemma is kept only when
    its ``pos_`` is in ``allowed_postags``.

    Returns a list holding one list of lemmas per input document.
    See https://spacy.io/api/annotation for the tag names.
    """
    def _kept_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_kept_lemmas(sent) for sent in texts]

  and should_run_async(code)


In [37]:
# Remove Stop Words (1-star tokens)
data_words_nostops1 = remove_stopwords(data_words)

# Form Bigrams using the phraser currently bound to `bigram_mod`
data_words_bigrams1 = make_bigrams(data_words_nostops1)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized1 = lemmatization(data_words_bigrams1, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
# Show the lemmas of the first review as a sanity check
print(data_lemmatized1[:1])

  and should_run_async(code)


[['poor', 'customer', 'service', 'rating', 'call', 'today', 'quickly', 'ask', 'recent', 'update', 'customer', 'service', 'cs', 'say', 'proceed', 'query', 'place', 'birth', 'provide', 'match', 'record', 'weird', 'place', 'birth', 'sure', 'change', 'submit', 'information', 'account', 'upgrade', 'tell', 'cs', 'information', 'previous', 'call', 'get', 'verify', 'point', 'maybe', 'look', 'correct', 'information', 'place', 'birth', 'glitch', 'database', 'system', 'insist', 'wrong', 'anymore', 'personal', 'information', 'mobile', 'number', 'ask', 'remember', 'correct', 'place', 'birth', 'come', 'able', 'verify', 'account', 'previous', 'call', 'information', 'today', 'tell', 'provide', 'place', 'birth', 'incorrect', 'making', 'sense', 'hope', 'look', 'soon', 'possible', 'poor', 'customer', 'service']]


In [38]:
# Build the token <-> integer id mapping from the lemmatized 1-star reviews
id2word1 = corpora.Dictionary(data_lemmatized1)

# The lemmatized documents are the texts of the corpus
texts1 = data_lemmatized1

# Bag-of-words representation: (token id, count) pairs for each document
corpus1 = list(map(id2word1.doc2bow, texts1))

# Look at the first document's bag of words
print(corpus1[:1])

[[(0, 1), (1, 2), (2, 1), (3, 2), (4, 5), (5, 3), (6, 1), (7, 1), (8, 2), (9, 2), (10, 3), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 5), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 5), (26, 1), (27, 2), (28, 1), (29, 2), (30, 1), (31, 2), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 3), (41, 1), (42, 1), (43, 1), (44, 1), (45, 2), (46, 2), (47, 1), (48, 1), (49, 2), (50, 1), (51, 1)]]


  and should_run_async(code)


### Base Model Creation

In [54]:
# Build the baseline LDA model for the 1-star reviews: 10 topics, fixed seed,
# alpha and eta not passed (left at gensim's defaults).
lda_model_rating1 = gensim.models.LdaMulticore(corpus=corpus1,
                                       id2word=id2word1,
                                       num_topics=10, 
                                       random_state=1,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

  and should_run_async(code)


In [55]:
# Print the keywords in the 10 topics
pprint(lda_model_rating1.print_topics())
# Apply the model to the bag-of-words corpus it was trained on. The name used
# here before, `corpus_tfidf1`, is not defined anywhere in this notebook, so
# the cell fails with a NameError on a fresh kernel.
doc_lda = lda_model_rating1[corpus1]

[(0,
  '0.039*"register" + 0.025*"slow" + 0.024*"say" + 0.023*"money" + 0.022*"try" '
  '+ 0.020*"internet" + 0.019*"error" + 0.018*"internet_connection" + '
  '0.017*"connection" + 0.017*"use"'),
 (1,
  '0.065*"account" + 0.019*"email" + 0.017*"need" + 0.015*"number" + '
  '0.014*"upgrade" + 0.013*"receive" + 0.013*"say" + 0.012*"send" + '
  '0.010*"fix" + 0.010*"try"'),
 (2,
  '0.024*"account" + 0.019*"upgrade" + 0.016*"yung" + 0.013*"use" + '
  '0.012*"wala" + 0.010*"load" + 0.008*"pera" + 0.007*"scam" + 0.007*"money" + '
  '0.006*"update"'),
 (3,
  '0.063*"service" + 0.057*"customer" + 0.028*"support" + 0.026*"bad" + '
  '0.026*"account" + 0.023*"poor" + 0.019*"email" + 0.017*"reply" + '
  '0.012*"response" + 0.011*"day"'),
 (4,
  '0.081*"update" + 0.080*"crash" + 0.058*"open" + 0.043*"fix" + 0.025*"work" '
  '+ 0.024*"keep" + 0.022*"iphone" + 0.018*"io" + 0.017*"password" + '
  '0.015*"anymore"'),
 (5,
  '0.027*"try" + 0.019*"upgrade" + 0.019*"account" + 0.016*"use" + '
  '0.014*"

  and should_run_async(code)


In [56]:
# Compute the c_v coherence score of the baseline 1-star model
coherence_model_lda1 = CoherenceModel(model=lda_model_rating1, texts=data_lemmatized1, dictionary=id2word1, coherence='c_v')
coherence_lda1 = coherence_model_lda1.get_coherence()

print('\nCoherence Score: ', coherence_lda1)

  and should_run_async(code)



Coherence Score:  0.4508473190043252


### Hyperparameter Tuning <br>
This part searches for the optimum number of topics and the best values for eta and alpha.

In [43]:
# supporting function
def compute_coherence_values(data_lemmatized, corpus, dictionary, k, a, eta):
    """Train one LDA model with the given hyperparameters and return its c_v coherence.

    Parameters
    ----------
    data_lemmatized : tokenized documents handed to ``CoherenceModel`` as ``texts``.
    corpus : corpus the LDA model is trained on.
    dictionary : used as the model's ``id2word`` and as the coherence dictionary.
    k : number of topics.
    a : value passed as the model's ``alpha``.
    eta : value passed as the model's ``eta``.

    The seed (100), chunk size (100) and number of passes (10) are fixed.
    """
    lda_settings = dict(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        alpha=a,
        eta=eta,
        random_state=100,
        chunksize=100,
        passes=10,
    )
    candidate = gensim.models.LdaMulticore(**lda_settings)
    scorer = CoherenceModel(model=candidate, texts=data_lemmatized,
                            dictionary=dictionary, coherence='c_v')
    return scorer.get_coherence()

  and should_run_async(code)


In [57]:
grid = {'Validation_Set': {}}

# Candidate numbers of topics: 2 up to (but not including) 11
min_topics, max_topics, step_size = 2, 11, 1
topics_range = range(min_topics, max_topics, step_size)

# Candidate alpha values: evenly spaced numbers plus the two named options
alpha = [*np.arange(0.01, 1, 0.3), 'symmetric', 'asymmetric']

# Candidate eta values: the same numbers plus 'symmetric'
eta = [*np.arange(0.01, 1, 0.3), 'symmetric']

  and should_run_async(code)


In [58]:
# Collects one row per trained model: its hyperparameters and c_v coherence.
model_results = {'Topics': [],
                 'Alpha': [],
                 'Eta': [],
                 'Coherence': []
                }

# Grid search over number of topics, alpha and eta for the 1-star reviews.
# The bar ticks once per (number of topics, alpha) pair, so its total is
# derived from the grid instead of being hard-coded; the `with` block closes
# the bar even if a model fails to train.
with tqdm.tqdm(total=len(topics_range) * len(alpha)) as pbar:
    # iterate through the number of topics
    for k in topics_range:
        # iterate through alpha values
        for a in alpha:
            # iterate through eta values
            for e in eta:
                cv = compute_coherence_values(data_lemmatized1,
                                              corpus=corpus1,
                                              dictionary=id2word1,
                                              k=k,
                                              a=a,
                                              eta=e)
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Eta'].append(e)
                model_results['Coherence'].append(cv)

            pbar.update(1)

# Save the whole grid so the search does not have to be repeated.
pd.DataFrame(model_results).to_csv('paymaya_lda_tuning_results_rating1.csv', index=False)

  and should_run_async(code)
100%|██████████| 54/54 [45:23<00:00, 50.44s/it]


In [59]:
# Reload the saved tuning grid for the 1-star reviews from disk.
paymaya_lda_tuning_results_rating1= pd.read_csv('paymaya_lda_tuning_results_rating1.csv')

  and should_run_async(code)


In [60]:
# Rank the hyperparameter combinations from highest to lowest coherence.
paymaya_lda_tuning_results_rating1.sort_values(by='Coherence', ascending=False)

  and should_run_async(code)


Unnamed: 0,Topics,Alpha,Eta,Coherence
268,10,asymmetric,0.9099999999999999,0.499126
267,10,asymmetric,0.61,0.466680
28,2,asymmetric,0.9099999999999999,0.459466
229,9,0.9099999999999999,symmetric,0.459188
225,9,0.9099999999999999,0.01,0.458338
...,...,...,...,...
56,3,asymmetric,0.31,0.331566
55,3,asymmetric,0.01,0.331491
49,3,0.9099999999999999,symmetric,0.330912
46,3,0.9099999999999999,0.31,0.330912


### Final Model

In [61]:
# Final 1-star model. num_topics, alpha and eta are the values of the
# top-ranked row of the tuning table (10 topics, 'asymmetric' alpha,
# eta 0.9099999999999999); the seed matches the one used during tuning.
final_lda1 = gensim.models.LdaMulticore(corpus=corpus1,
                                           id2word=id2word1,
                                           num_topics=10, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha='asymmetric',
                                           eta=0.9099999999999999)

  and should_run_async(code)


In [62]:
# Visualize the topics of the final 1-star model with pyLDAvis
pyLDAvis.enable_notebook()
LDAvis_prepared = gensimvis.prepare(final_lda1, corpus1, id2word1)
# Last expression of the cell, so the prepared visualization is displayed
LDAvis_prepared

  and should_run_async(code)


NOTE: The number of topics can be reduced to only 4.

---
### B. PayMaya 5-Star Reviews
This part creates a topic model for the PayMaya reviews that have 5-star ratings. <br>

In [63]:
def sent_to_words(sentences):
    """Lazily tokenize sentences with gensim's ``simple_preprocess``.

    Each item is converted with ``str`` first. Yields one token list per
    input item, in input order. This repeats the definition given in the
    1-star section of the notebook.
    """
    yield from (gensim.utils.simple_preprocess(str(item)) for item in sentences)

# Tokenize every 5-star review and show the first 30 tokens of the first one.
# `data` and `data_words` are rebound here and now hold the 5-star reviews.
data = paymaya_rating5['title_review'].tolist()
data_words = [tokens for tokens in sent_to_words(data)]
print(data_words[0][:30])

['very', 'bad', 'service', 'have', 'been', 'contacting', 'them', 'about', 'payment', 'made', 'via', 'this', 'app', 'because', 'such', 'payment', 'did', 'not', 'reach', 'the', 'biller', 'it', 'has', 'been', 'two', 'weeks', 'since', 'the', 'payment', 'was']


  and should_run_async(code)


### Phrase Modeling: Bigram Model

In [64]:
# Build the bigram model from the tokenized 5-star reviews
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.

# Faster way to get a sentence clubbed as a bigram.
# Rebinding `bigram_mod` changes which phraser make_bigrams() applies from here on.
bigram_mod = gensim.models.phrases.Phraser(bigram)

  and should_run_async(code)


In [65]:
# Remove Stop Words (5-star tokens)
data_words_nostops5 = remove_stopwords(data_words)

# Form Bigrams using the phraser currently bound to `bigram_mod`
data_words_bigrams5 = make_bigrams(data_words_nostops5)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized5 = lemmatization(data_words_bigrams5, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
# Show the lemmas of the first review as a sanity check
print(data_lemmatized5[:1])

  and should_run_async(code)


[['bad', 'service', 'contact', 'payment', 'payment', 'reach', 'bill', 'week', 'payment', 'writing', 'contact', 'email', 'messenger', 'avail', 'respond', 'resolution', 'offer', 'help', 'line', 'cut', 'extra', 'pay', 'bill', 'twice', 'disappointing', 'update', 'ticket', 'provide', 'check', 'email', 'miyuki', 'reply', 'ask', 'detail', 'march', 'reference', 'isent', 'screenshot', 'email', 'care', 'messenger', 'provide', 'day', 'get', 'response', 'communicate', 'care', 'get', 'response', 'follow', 'march', 'email', 'messenger', 'response']]


### Data Transformation: Corpus and Dictionary

In [66]:
# Build the token <-> integer id mapping from the lemmatized 5-star reviews
id2word5 = corpora.Dictionary(data_lemmatized5)

# The lemmatized documents are the texts of the corpus
texts5 = data_lemmatized5

# Bag-of-words representation: (token id, count) pairs for each document
corpus5 = list(map(id2word5.doc2bow, texts5))

# Look at the first document's bag of words
print(corpus5[:1])

[[(0, 1), (1, 1), (2, 1), (3, 2), (4, 2), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 4), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 2), (20, 3), (21, 1), (22, 1), (23, 1), (24, 3), (25, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 3), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)]]


  and should_run_async(code)


### Base Model Creation

In [67]:
# Build the baseline LDA model for the 5-star reviews: 10 topics, fixed seed,
# alpha and eta not passed (left at gensim's defaults).
lda_model_rating5 = gensim.models.LdaMulticore(corpus=corpus5,
                                       id2word=id2word5,
                                       num_topics=10, 
                                       random_state=1,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

  and should_run_async(code)


In [69]:
# Print the keywords in the 10 topics of the baseline 5-star model
pprint(lda_model_rating5.print_topics())

[(0,
  '0.035*"good" + 0.020*"use" + 0.017*"card" + 0.016*"great" + 0.012*"easy" + '
  '0.012*"well" + 0.011*"application" + 0.010*"payment" + 0.010*"load" + '
  '0.009*"pay"'),
 (1,
  '0.030*"pay" + 0.022*"bill" + 0.018*"money" + 0.017*"easy" + 0.015*"good" + '
  '0.013*"card" + 0.012*"send" + 0.012*"convenient" + 0.012*"online" + '
  '0.012*"load"'),
 (2,
  '0.019*"fee" + 0.016*"easy" + 0.012*"feature" + 0.011*"charge" + '
  '0.010*"need" + 0.009*"face_id" + 0.009*"convenient" + 0.009*"thank" + '
  '0.009*"love" + 0.009*"cash"'),
 (3,
  '0.017*"payment" + 0.015*"need" + 0.013*"account" + 0.013*"pay" + '
  '0.012*"card" + 0.012*"online" + 0.010*"bill" + 0.010*"convenient" + '
  '0.009*"purchase" + 0.009*"email"'),
 (4,
  '0.024*"money" + 0.019*"bill" + 0.011*"pay" + 0.010*"cashback" + '
  '0.009*"application" + 0.009*"customer" + 0.009*"life" + 0.009*"feature" + '
  '0.008*"love" + 0.008*"service"'),
 (5,
  '0.016*"pay" + 0.014*"convenient" + 0.012*"online" + 0.011*"feature" + '
  '0.

  and should_run_async(code)


### Compute Model Coherence Score

In [70]:
# Compute the c_v coherence score of the baseline 5-star model
coherence_model_lda5 = CoherenceModel(model=lda_model_rating5, texts=data_lemmatized5, dictionary=id2word5, coherence='c_v')
coherence_lda5 = coherence_model_lda5.get_coherence()

print('\nCoherence Score: ', coherence_lda5)

  and should_run_async(code)



Coherence Score:  0.37909678899669147


### Hyperparameter Tuning <br>
This part searches for the optimum number of topics and the best values for eta and alpha.

In [71]:
grid = {'Validation_Set': {}}

# Candidate numbers of topics: 2 up to (but not including) 11
min_topics, max_topics, step_size = 2, 11, 1
topics_range = range(min_topics, max_topics, step_size)

# Candidate alpha values: evenly spaced numbers plus the two named options
alpha = [*np.arange(0.01, 1, 0.3), 'symmetric', 'asymmetric']

# Candidate eta values: the same numbers plus 'symmetric'
eta = [*np.arange(0.01, 1, 0.3), 'symmetric']

  and should_run_async(code)


In [72]:
# Collects one row per trained model: its hyperparameters and c_v coherence.
model_results = {'Topics': [],
                 'Alpha': [],
                 'Eta': [],
                 'Coherence': []
                }

# Grid search over number of topics, alpha and eta for the 5-star reviews.
# The bar ticks once per (number of topics, alpha) pair, so its total is
# derived from the grid instead of being hard-coded; the `with` block closes
# the bar even if a model fails to train.
with tqdm.tqdm(total=len(topics_range) * len(alpha)) as pbar:
    # iterate through the number of topics
    for k in topics_range:
        # iterate through alpha values
        for a in alpha:
            # iterate through eta values
            for e in eta:
                cv = compute_coherence_values(data_lemmatized5,
                                              corpus=corpus5,
                                              dictionary=id2word5,
                                              k=k,
                                              a=a,
                                              eta=e)
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Eta'].append(e)
                model_results['Coherence'].append(cv)

            pbar.update(1)

# Save the whole grid so the search does not have to be repeated.
pd.DataFrame(model_results).to_csv('paymaya_lda_tuning_results_rating5.csv', index=False)

  and should_run_async(code)
100%|██████████| 54/54 [40:56<00:00, 45.50s/it]


In [73]:
# Reload the saved tuning grid for the 5-star reviews from disk.
paymaya_lda_tuning_results_rating5 = pd.read_csv('paymaya_lda_tuning_results_rating5.csv')

  and should_run_async(code)


In [74]:
# Rank the hyperparameter combinations from highest to lowest coherence.
paymaya_lda_tuning_results_rating5.sort_values(by='Coherence', ascending=False)

  and should_run_async(code)


Unnamed: 0,Topics,Alpha,Eta,Coherence
148,6,asymmetric,0.9099999999999999,0.482399
16,2,0.9099999999999999,0.31,0.461113
11,2,0.61,0.31,0.461113
15,2,0.9099999999999999,0.01,0.461113
10,2,0.61,0.01,0.461113
...,...,...,...,...
186,8,0.31,0.31,0.347111
267,10,asymmetric,0.61,0.341789
192,8,0.61,0.61,0.339654
205,8,asymmetric,0.01,0.334416


### Final Model

In [75]:
# Final 5-star model. num_topics, alpha and eta are the values of the
# top-ranked row of the tuning table (6 topics, 'asymmetric' alpha,
# eta 0.9099999999999999); the seed matches the one used during tuning.
final_lda5 = gensim.models.LdaMulticore(corpus=corpus5,
                                           id2word=id2word5,
                                           num_topics=6, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha='asymmetric',
                                           eta=0.9099999999999999)

  and should_run_async(code)


In [76]:
# Visualize the topics of the final 5-star model with pyLDAvis
pyLDAvis.enable_notebook()
LDAvis_prepared = gensimvis.prepare(final_lda5, corpus5, id2word5)
# Last expression of the cell, so the prepared visualization is displayed
LDAvis_prepared

  and should_run_async(code)
