## Topic Modelling

In [54]:
import nltk 
import pandas as pd
from nltk import sent_tokenize,word_tokenize,pos_tag
import gensim
from nltk.stem import WordNetLemmatizer
import datetime
from gensim.models import CoherenceModel
import pyLDAvis.gensim
from nltk.corpus import wordnet
from gensim import models
import numpy as np
from gensim.models import Phrases
from nltk.corpus import stopwords
from itertools import chain
import json
np.random.seed(123456)

lemmatizer = WordNetLemmatizer()
my_stopwords = stopwords.words('English') + ['photo','credit','one','make','like','us','country','use']+['new','well','would',
                                             'take','include','need','go','help','may','look']+['could','get','million','many',
                                             'platform','however']+['provide','see','product','still','service','market','people'
                                             ,'even']+['time','user','year','firm','add','work']+['investor','investment',
                                             'technology','come','consumer','customer','online']+['billion','offer','good','asia'
                                             'india','singapore','china','indonesia']+['want','find','first','start','tell',
                                             'tech','ceo','app','big','last','number']+['another','give','two','since']+['team',
                                             'fund','buid','founder','grow','growth','back','lead','part','become','data','data',
                                             'allow','already','build','focus','plan','create']+['employee','share','support',
                                             'industry','digital','financial','launch','solution','large','continue','way','mean',
                                             'example','content'] + ['opportunity','asia','india','report','global','currently',
                                             'world','end','month','leave','around','process','experience','accord']+['southeast',
                                             'call','think','today','early','source','close','reach','set','according','move','across',
                                             'reach','expect','base','point','move']+['far','access','system','city','note','right','now',
                                            'pay','show','less','game','order','change','much','though','know','believe','late','always'
                                            'round','player','keep','major','run','three','ecommerce','next','thing','value','statement',
                                            'sale','local','region','partner','capital','invest']+['payment','revenue','week','operate',
                                            'increase','raise','different']

In [55]:
df = pd.read_csv('./Data/posts.csv')

In [56]:
df['sentences'] = df['content'].apply(sent_tokenize)
print(df['sentences'].head(1).tolist()[0])

df['tokens_sentences'] = df['sentences'].apply(lambda sentences: [word_tokenize(sentence) for sentence in sentences])
print(df['tokens_sentences'].head(1).tolist()[0][:3])

df['POS_tokens'] = df['tokens_sentences'].apply(lambda tokens_sentences: [pos_tag(tokens) for tokens in tokens_sentences])
print(df['POS_tokens'].head(1).tolist()[0][:3])



['  Every day, 100k+ smart people read our newsletter.', 'You can  sign up here .', 'Hello readers, \n Two familiar names were spotted making investments this week: Tencent and Alibaba.', '👀 \n The intense rivalry between these Chinese giants isn’t dying down anytime soon.', 'This week, the two unknowingly brought their clash to the companies they invested in, which were both the most-funded in China this week: \n \n Yipin Shengxian, a Chinese fresh food discount chain supermarket, raised US$362 million in a round led by Tencent.', 'Xpeng Motors, a Chinese electric vehicle and technology company, which Alibaba invested in, raised US$300 million.', 'You can find all other important investment deals that happened in the last few days in our  weekly funding news wrap-up .', 'Let’s dive into the biggest deals and M&amp;As that recently took place.', 'The biggest deals by country \n 🇨🇳  Yipin Shengxian : A Chinese fresh food discount chain supermarket that raised US$362 million from Capital

In [57]:
def pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

df['tokens_sentences_lemmatized'] = df['POS_tokens'].apply(lambda list_POS: 
[
[lemmatizer.lemmatize(el[0], pos(el[1])) if pos(el[1]) != '' else el[0] for el in tokens_POS] 
        for tokens_POS in list_POS
    ]
)
print(df['tokens_sentences_lemmatized'].head(1).tolist()[0][:3])

[['Every', 'day', ',', '100k+', 'smart', 'people', 'read', 'our', 'newsletter', '.'], ['You', 'can', 'sign', 'up', 'here', '.'], ['Hello', 'reader', ',', 'Two', 'familiar', 'name', 'be', 'spot', 'make', 'investment', 'this', 'week', ':', 'Tencent', 'and', 'Alibaba', '.']]


In [58]:
df['tokens'] = df['tokens_sentences_lemmatized'].map(lambda sentences: list(chain.from_iterable(sentences)))
df['tokens'] = df['tokens'].map(lambda tokens: [token.lower() for token in tokens if token.isalpha() 
                                                    and token.lower() not in my_stopwords and len(token)>1])

In [59]:
tokens = df['tokens'].tolist()
bigram_model = Phrases(tokens)
trigram_model = Phrases(bigram_model[tokens], min_count=1)
tokens = list(trigram_model[bigram_model[tokens]])

In [60]:
dictionary_LDA = gensim.corpora.Dictionary(tokens)
dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(token) for token in tokens]

In [61]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [62]:
print(datetime.datetime.now())
    
#model_list = []
coherence_values = []
perplexity_values = []
model_topics = []

for num_topics in range(2,10):
    lda_x = models.LdaModel(corpus_tfidf, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))
    coherencemodel = CoherenceModel(model=lda_x, texts=tokens, dictionary=dictionary_LDA, coherence='c_v')
    model_topics.append(num_topics)
    #model_list.append(lda_x)
    coherence_values.append(coherencemodel.get_coherence())
    perplexity_values.append(lda_x.log_perplexity(corpus))
    print("#Topics: " + str(num_topics) + " Coherence Score: " 
              , str(coherencemodel.get_coherence())+ ' Perplexity score : '+ str(lda_x.log_perplexity(corpus)))

2020-08-15 13:27:51.301200
#Topics: 2 Coherence Score:  0.22480611985630067 Perplexity score : -9.643941609917055
#Topics: 3 Coherence Score:  0.25987751227725836 Perplexity score : -10.054247572926341
#Topics: 4 Coherence Score:  0.28025181193377163 Perplexity score : -10.363273669324922
#Topics: 5 Coherence Score:  0.285067293094602 Perplexity score : -10.616437897782935
#Topics: 6 Coherence Score:  0.2976168643959622 Perplexity score : -10.862093282367773
#Topics: 7 Coherence Score:  0.3264702668592852 Perplexity score : -11.050975791356192
#Topics: 8 Coherence Score:  0.35082924425591233 Perplexity score : -11.235138156684622
#Topics: 9 Coherence Score:  0.36534897067967437 Perplexity score : -11.393944929021592


In [63]:
num_topics = 5
lda_x = models.LdaModel(corpus, num_topics=num_topics, 
                                  id2word=dictionary_LDA,
                                  alpha=[0.01]*num_topics,
                                  eta=[0.01]*len(dictionary_LDA.keys()))
for i,topic in lda_x.show_topics(formatted=True, num_topics=num_topics, num_words=20):
    print(str(i)+":\n"+ topic)
    print()

0:
0.003*"bank" + 0.003*"aim" + 0.002*"brand" + 0.002*"develop" + 0.002*"deal" + 0.002*"strategy" + 0.002*"expand" + 0.002*"sell" + 0.002*"gojek" + 0.002*"transaction" + 0.002*"bring" + 0.002*"pandemic" + 0.002*"claim" + 0.002*"serve" + 0.002*"development" + 0.002*"funding" + 0.002*"group" + 0.002*"tiktok" + 0.002*"money" + 0.002*"round"

1:
0.003*"pandemic" + 0.003*"cost" + 0.003*"network" + 0.002*"expand" + 0.002*"food_delivery" + 0.002*"grab" + 0.002*"economy" + 0.002*"feature" + 0.002*"brand" + 0.002*"delivery" + 0.002*"shopee" + 0.002*"government" + 0.002*"home" + 0.002*"merchant" + 0.002*"gojek" + 0.002*"aim" + 0.002*"space" + 0.002*"operation" + 0.002*"bring" + 0.002*"serve"

2:
0.004*"grab" + 0.003*"tiktok" + 0.003*"chinese" + 0.003*"deal" + 0.003*"delivery" + 0.002*"oyo" + 0.002*"develop" + 0.002*"gojek" + 0.002*"apps" + 0.002*"serve" + 0.002*"pandemic" + 0.002*"round" + 0.002*"space" + 0.002*"brand" + 0.002*"operation" + 0.002*"group" + 0.002*"announce" + 0.002*"mobile" + 0.0

#### Possible Topics from the Model

0:
0.003*"bank" + 0.003*"aim" + 0.002*"brand" + 0.002*"develop" + 0.002*"deal" + 0.002*"strategy" + 0.002*"expand" + 0.002*"sell" + 0.002*"gojek" + 0.002*"transaction" + 0.002*"bring" + 0.002*"pandemic" + 0.002*"claim" + 0.002*"serve" + 0.002*"development" + 0.002*"funding" + 0.002*"group" + 0.002*"tiktok" + 0.002*"money" + 0.002*"round"

--> Topic about strategies for cost saving and funding during the pandemic

1:
0.003*"pandemic" + 0.003*"cost" + 0.003*"network" + 0.002*"expand" + 0.002*"food_delivery" + 0.002*"grab" + 0.002*"economy" + 0.002*"feature" + 0.002*"brand" + 0.002*"delivery" + 0.002*"shopee" + 0.002*"government" + 0.002*"home" + 0.002*"merchant" + 0.002*"gojek" + 0.002*"aim" + 0.002*"space" + 0.002*"operation" + 0.002*"bring" + 0.002*"serve"

-->Topic about Food and Goods delivery by Grab, Gojek and Shopee during pandemic

2:
0.004*"grab" + 0.003*"tiktok" + 0.003*"chinese" + 0.003*"deal" + 0.003*"delivery" + 0.002*"oyo" + 0.002*"develop" + 0.002*"gojek" + 0.002*"apps" + 0.002*"serve" + 0.002*"pandemic" + 0.002*"round" + 0.002*"space" + 0.002*"brand" + 0.002*"operation" + 0.002*"group" + 0.002*"announce" + 0.002*"mobile" + 0.002*"expand" + 0.002*"enable"

--> Unclear topic involving mobile apps development of Grab, Gojek, Tiktok and Oyo.

3:
0.005*"grab" + 0.002*"deal" + 0.002*"fintech" + 0.002*"space" + 0.002*"develop" + 0.002*"money" + 0.002*"place" + 0.002*"recently" + 0.002*"logistics" + 0.002*"feature" + 0.002*"claim" + 0.002*"expand" + 0.002*"home" + 0.002*"operation" + 0.002*"aim" + 0.002*"merchant" + 0.002*"scale" + 0.002*"future" + 0.002*"bank" + 0.002*"enable"

--> Topic about Grab's entry into fintech space by applying banking liscence

4:
0.002*"expand" + 0.002*"space" + 0.002*"claim" + 0.002*"program" + 0.002*"group" + 0.002*"gojek" + 0.002*"sell" + 0.002*"travel" + 0.002*"announce" + 0.002*"aim" + 0.002*"deal" + 0.002*"pandemic" + 0.002*"grab" + 0.002*"scale" + 0.002*"develop" + 0.002*"sector" + 0.002*"economy" + 0.002*"brand" + 0.002*"venture" + 0.002*"tencent"

--> Expansion and scaling up programs for Gojek,Grab and Tencent

In [77]:
num_topics = 6
lda_x = models.LdaModel(corpus, num_topics=num_topics, 
                                  id2word=dictionary_LDA,
                                  alpha=[0.01]*num_topics,
                                  eta=[0.01]*len(dictionary_LDA.keys()))
for i,topic in lda_x.show_topics(formatted=True, num_topics=num_topics, num_words=20):
    print(str(i)+":\n"+ topic)
    print()

0:
0.003*"grab" + 0.003*"bank" + 0.002*"space" + 0.002*"enable" + 0.002*"cost" + 0.002*"brand" + 0.002*"aim" + 0.002*"develop" + 0.002*"fintech" + 0.002*"government" + 0.002*"sell" + 0.002*"recently" + 0.002*"merchant" + 0.002*"manage" + 0.002*"pandemic" + 0.002*"within" + 0.002*"scale" + 0.002*"announce" + 0.002*"enterprise" + 0.002*"explain"

1:
0.003*"deal" + 0.003*"develop" + 0.003*"economy" + 0.003*"expand" + 0.003*"aim" + 0.002*"rise" + 0.002*"space" + 0.002*"cost" + 0.002*"chinese" + 0.002*"feature" + 0.002*"development" + 0.002*"serve" + 0.002*"group" + 0.002*"total" + 0.002*"high" + 0.002*"pandemic" + 0.002*"drive" + 0.002*"operation" + 0.002*"announce" + 0.002*"network"

2:
0.005*"grab" + 0.003*"pandemic" + 0.002*"expand" + 0.002*"develop" + 0.002*"food_delivery" + 0.002*"deal" + 0.002*"travel" + 0.002*"program" + 0.002*"able" + 0.002*"place" + 0.002*"uber" + 0.002*"space" + 0.002*"group" + 0.002*"model" + 0.002*"merchant" + 0.002*"challenge" + 0.002*"food" + 0.002*"bring" + 

#### Possible Topics for the Model

0:
0.003*"grab" + 0.003*"bank" + 0.002*"space" + 0.002*"enable" + 0.002*"cost" + 0.002*"brand" + 0.002*"aim" + 0.002*"develop" + 0.002*"fintech" + 0.002*"government" + 0.002*"sell" + 0.002*"recently" + 0.002*"merchant" + 0.002*"manage" + 0.002*"pandemic" + 0.002*"within" + 0.002*"scale" + 0.002*"announce" + 0.002*"enterprise" + 0.002*"explain"

-> Grab entering the fintech space by applying for a banking liscence

1:
0.003*"deal" + 0.003*"develop" + 0.003*"economy" + 0.003*"expand" + 0.003*"aim" + 0.002*"rise" + 0.002*"space" + 0.002*"cost" + 0.002*"chinese" + 0.002*"feature" + 0.002*"development" + 0.002*"serve" + 0.002*"group" + 0.002*"total" + 0.002*"high" + 0.002*"pandemic" + 0.002*"drive" + 0.002*"operation" + 0.002*"announce" + 0.002*"network"

-> Chinese strategies to push the economy

2:
0.005*"grab" + 0.003*"pandemic" + 0.002*"expand" + 0.002*"develop" + 0.002*"food_delivery" + 0.002*"deal" + 0.002*"travel" + 0.002*"program" + 0.002*"able" + 0.002*"place" + 0.002*"uber" + 0.002*"space" + 0.002*"group" + 0.002*"model" + 0.002*"merchant" + 0.002*"challenge" + 0.002*"food" + 0.002*"bring" + 0.002*"brand" + 0.002*"drive"

-> Challenges faced by Grab and Uber in the food delivery during the pandemic

3:
0.004*"delivery" + 0.003*"grab" + 0.003*"brand" + 0.003*"gojek" + 0.002*"aim" + 0.002*"deal" + 0.002*"pandemic" + 0.002*"recently" + 0.002*"claim" + 0.002*"travel" + 0.002*"enable" + 0.002*"client" + 0.002*"operation" + 0.002*"group" + 0.002*"provider" + 0.002*"spend" + 0.002*"expand" + 0.002*"oyo" + 0.002*"venture" + 0.002*"space"

-> Unclear topic about expeniture Grab, Gojek and Oyo during the pandemic

4:
0.003*"tiktok" + 0.002*"group" + 0.002*"money" + 0.002*"fintech" + 0.002*"claim" + 0.002*"deal" + 0.002*"expand" + 0.002*"chinese" + 0.002*"software" + 0.002*"pandemic" + 0.002*"management" + 0.002*"aim" + 0.002*"funding" + 0.002*"delivery" + 0.002*"bank" + 0.002*"feature" + 0.002*"space" + 0.002*"meanwhile" + 0.002*"office" + 0.002*"round"

-> News about Tiktok and Chinese management during the pandemic

5:
0.004*"gojek" + 0.003*"grab" + 0.003*"food_delivery" + 0.002*"merchant" + 0.002*"tencent" + 0.002*"network" + 0.002*"april" + 0.002*"expand" + 0.002*"develop" + 0.002*"brand" + 0.002*"delivery" + 0.002*"operation" + 0.002*"feature" + 0.002*"chinese" + 0.002*"transaction" + 0.002*"follow" + 0.002*"claim" + 0.002*"recently" + 0.002*"program" + 0.002*"deal"

-> Unclear topic about expansion of Grab, Gojek and Tencent during the pandemic

In [65]:
def visualise(model,vecs,dictionary):
    visual= pyLDAvis.gensim.prepare(model, vecs, dictionary)
    pyLDAvis.save_html(visual, "./Models/TFIDF/my_topics_tfidf_corpus_tfidf.html")
    #pyLDAvis.save_html(visual, "my_topics_tfidf_2020_08_15_4.html")
    return True

x = visualise(model = lda_x, vecs = corpus_tfidf,dictionary = dictionary_LDA)

In [66]:
lda_x.save('./Models/TFIDF/Topicmodelling_tfidf.h5')

In [67]:
lda_x[corpus[1]]

[(3, 0.99954545)]

In [69]:
mix = []
for i in range(900):
    #print('Doc :',i,' Topic',lda_x[corpus[i]])
    mix.append(lda_x[corpus[i]])
df['Topic_Mixture'] = mix

In [75]:
dct = dict(dictionary_LDA)
json.dump( dct, open( "./Models/TFIDF/dictionary_lda.json", 'w' ) )

In [76]:
final_bow = df[['id', 'title', 'content', 'excerpt', 'comments_count', 'read_time',
       'author.display_name','tokens','Topic_Mixture']]
final_bow.to_csv('./Models/TFIDF/TFIDF_Topic_Modelling.csv',index = False)