In [14]:
import nltk 
import pandas as pd
from nltk import sent_tokenize,word_tokenize,pos_tag
import spacy
import gensim
from nltk.stem import WordNetLemmatizer
import datetime
from gensim.models import CoherenceModel
import pyLDAvis.gensim
from nltk.corpus import wordnet
from tqdm import tqdm_notebook
tqdm_notebook().pandas()
from gensim.models import Phrases
from nltk.corpus import stopwords
from itertools import chain

lemmatizer = WordNetLemmatizer()
my_stopwords = stopwords.words('English') + ['photo','credit']

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm_notebook().pandas()


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [35]:
data = pd.read_csv('./Data/posts.csv')

In [36]:
data['sentences'] = data.content.apply(sent_tokenize)

data['sentences'].head(1).tolist()[0][:3]

'''
['  Every day, 100k+ smart people read our newsletter.',
 'You can  sign up here .',
 'Hello readers, \n Two familiar names were spotted making investments this week: Tencent and Alibaba.']
'''

"\n['  Every day, 100k+ smart people read our newsletter.',\n 'You can  sign up here .',\n 'Hello readers, \n Two familiar names were spotted making investments this week: Tencent and Alibaba.']\n"

In [37]:
data['tokens_sentences'] = data['sentences'].progress_map(lambda sentences: [word_tokenize(sentence) for sentence in sentences])
print(data['tokens_sentences'].head(1).tolist()[0][:3])

HBox(children=(FloatProgress(value=0.0, max=900.0), HTML(value='')))


[['Every', 'day', ',', '100k+', 'smart', 'people', 'read', 'our', 'newsletter', '.'], ['You', 'can', 'sign', 'up', 'here', '.'], ['Hello', 'readers', ',', 'Two', 'familiar', 'names', 'were', 'spotted', 'making', 'investments', 'this', 'week', ':', 'Tencent', 'and', 'Alibaba', '.']]


In [5]:
print(data['tokens_sentences'].head(1).tolist()[0][:3])

[['Every', 'day', ',', '100k+', 'smart', 'people', 'read', 'our', 'newsletter', '.'], ['You', 'can', 'sign', 'up', 'here', '.'], ['Hello', 'readers', ',', 'Two', 'familiar', 'names', 'were', 'spotted', 'making', 'investments', 'this', 'week', ':', 'Tencent', 'and', 'Alibaba', '.']]


In [7]:
data['POS_tokens'] = data['tokens_sentences'].progress_map(lambda tokens_sentences: [pos_tag(tokens) for tokens in tokens_sentences])
print(data['POS_tokens'].head(1).tolist()[0][:3])

HBox(children=(FloatProgress(value=0.0, max=900.0), HTML(value='')))



[[('Every', 'DT'), ('day', 'NN'), (',', ','), ('100k+', 'CD'), ('smart', 'JJ'), ('people', 'NNS'), ('read', 'VBP'), ('our', 'PRP$'), ('newsletter', 'NN'), ('.', '.')], [('You', 'PRP'), ('can', 'MD'), ('sign', 'VB'), ('up', 'RP'), ('here', 'RB'), ('.', '.')], [('Hello', 'NNP'), ('readers', 'NNS'), (',', ','), ('Two', 'CD'), ('familiar', 'JJ'), ('names', 'NNS'), ('were', 'VBD'), ('spotted', 'VBN'), ('making', 'VBG'), ('investments', 'NNS'), ('this', 'DT'), ('week', 'NN'), (':', ':'), ('Tencent', 'NN'), ('and', 'CC'), ('Alibaba', 'NNP'), ('.', '.')]]


In [8]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''


In [11]:
data['tokens_sentences_lemmatized'] = data['POS_tokens'].progress_map(
    lambda list_tokens_POS: [
        [
            lemmatizer.lemmatize(el[0], get_wordnet_pos(el[1])) 
            if get_wordnet_pos(el[1]) != '' else el[0] for el in tokens_POS
        ] 
        for tokens_POS in list_tokens_POS
    ]
)

HBox(children=(FloatProgress(value=0.0, max=900.0), HTML(value='')))





In [12]:
data['tokens_sentences_lemmatized'].head(1).tolist()[0][:3]

[['Every',
  'day',
  ',',
  '100k+',
  'smart',
  'people',
  'read',
  'our',
  'newsletter',
  '.'],
 ['You', 'can', 'sign', 'up', 'here', '.'],
 ['Hello',
  'reader',
  ',',
  'Two',
  'familiar',
  'name',
  'be',
  'spot',
  'make',
  'investment',
  'this',
  'week',
  ':',
  'Tencent',
  'and',
  'Alibaba',
  '.']]

In [15]:
data['tokens'] = data['tokens_sentences_lemmatized'].map(lambda sentences: list(chain.from_iterable(sentences)))
data['tokens'] = data['tokens'].map(lambda tokens: [token.lower() for token in tokens if token.isalpha() 
                                                    and token.lower() not in my_stopwords and len(token)>1])

In [16]:
data['tokens'].head(1).tolist()[0][:30]

['every',
 'day',
 'smart',
 'people',
 'read',
 'newsletter',
 'sign',
 'hello',
 'reader',
 'two',
 'familiar',
 'name',
 'spot',
 'make',
 'investment',
 'week',
 'tencent',
 'alibaba',
 'intense',
 'rivalry',
 'chinese',
 'giant',
 'die',
 'anytime',
 'soon',
 'week',
 'two',
 'unknowingly',
 'bring',
 'clash']

In [18]:
tokens = data['tokens'].tolist()
bigram_model = Phrases(tokens)
trigram_model = Phrases(bigram_model[tokens], min_count=1)
tokens = list(trigram_model[bigram_model[tokens]])

In [19]:
dictionary_LDA = gensim.corpora.Dictionary(tokens)
dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]

In [20]:
from gensim import models
import numpy as np

In [21]:
np.random.seed(123456)
num_topics = 20
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=4, alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))

Wall time: 26.5 s


In [30]:
print(datetime.datetime.now())
    
model_list = []
coherence_values = []
perplexity_values = []
model_topics = []

for num_topics in range(2,10):
    lda_x = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=4, alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))
    coherencemodel = CoherenceModel(model=lda_x, texts=tokens, dictionary=dictionary_LDA, coherence='c_v')
    model_topics.append(num_topics)
    model_list.append(lda_x)
    coherence_values.append(coherencemodel.get_coherence())
    perplexity_values.append(lda_x.log_perplexity(corpus))
    print("#Topics: " + str(num_topics) + " Coherence Score: " 
              , str(coherencemodel.get_coherence())+ ' Perplexity score : '+ str(lda_x.log_perplexity(corpus)))

2020-08-13 15:53:24.213222
#Topics: 2 Coherence Score:  0.24925239320766918 Perplexity score : -8.508362460276315
#Topics: 3 Coherence Score:  0.25794003730491794 Perplexity score : -8.624460907788004
#Topics: 4 Coherence Score:  0.23564634553708685 Perplexity score : -8.734755334518242
#Topics: 5 Coherence Score:  0.2490754466003894 Perplexity score : -8.796507822619725
#Topics: 6 Coherence Score:  0.2540845448986197 Perplexity score : -8.858655273513795
#Topics: 7 Coherence Score:  0.2455929439936954 Perplexity score : -8.948534697869487
#Topics: 8 Coherence Score:  0.2445575689123854 Perplexity score : -9.023336902832995
#Topics: 9 Coherence Score:  0.26292386484893165 Perplexity score : -9.05561824789073


In [22]:
for i,topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20):
    print(str(i)+": "+ topic)
    print()

0: 0.010*"gudangada" + 0.008*"market" + 0.008*"user" + 0.008*"app" + 0.007*"agent" + 0.007*"product" + 0.006*"firm" + 0.006*"new" + 0.005*"like" + 0.005*"china" + 0.005*"apps" + 0.005*"use" + 0.005*"invest" + 0.005*"super" + 0.004*"platform" + 0.004*"indonesia" + 0.004*"people" + 0.004*"social_commerce_startup" + 0.003*"however" + 0.003*"offer"

1: 0.009*"gojek" + 0.007*"product" + 0.007*"tiktok" + 0.007*"market" + 0.006*"india" + 0.005*"country" + 0.005*"indonesia" + 0.005*"include" + 0.004*"make" + 0.004*"photo_credit" + 0.004*"help" + 0.004*"platform" + 0.004*"zenius" + 0.004*"investment" + 0.004*"service" + 0.004*"ceo" + 0.004*"new" + 0.004*"user" + 0.004*"year" + 0.003*"work"

2: 0.008*"market" + 0.005*"online" + 0.005*"platform" + 0.005*"user" + 0.005*"new" + 0.004*"one" + 0.004*"service" + 0.004*"include" + 0.004*"firm" + 0.004*"southeast_asia" + 0.004*"make" + 0.004*"product" + 0.004*"like" + 0.003*"content" + 0.003*"however" + 0.003*"use" + 0.003*"india" + 0.003*"china" + 0.00

In [26]:
lda_model[corpus[0]]

[(7, 0.13615821), (10, 0.25062823), (13, 0.61264855)]