## Topic Modelling

In [1]:
import nltk 
import pandas as pd
from nltk import sent_tokenize,word_tokenize,pos_tag
import gensim
from nltk.stem import WordNetLemmatizer
import datetime
from gensim.models import CoherenceModel
import pyLDAvis.gensim
from nltk.corpus import wordnet
from gensim import models
import numpy as np
from gensim.models import Phrases
from nltk.corpus import stopwords
from itertools import chain

# Seed NumPy's global random number generator so repeated runs are comparable.
np.random.seed(123456)

lemmatizer = WordNetLemmatizer()

# Corpus-specific filler words that are removed in addition to NLTK's English
# stop words. Keep one entry per string literal and always separate entries
# with a comma: adjacent string literals are concatenated silently by Python.
custom_stopwords = [
    'photo', 'credit', 'one', 'make', 'like', 'us', 'country', 'use',
    'new', 'well', 'would', 'take', 'include', 'need', 'go', 'help', 'may', 'look',
    'could', 'get', 'million', 'many', 'platform', 'however',
    'provide', 'see', 'product', 'still', 'service', 'market', 'people', 'even',
    'time', 'user', 'year', 'firm', 'add', 'work',
    'investor', 'investment', 'technology', 'come', 'consumer', 'customer', 'online',
    'billion', 'offer', 'good', 'asia', 'india', 'singapore', 'china', 'indonesia',
    'want', 'find', 'first', 'start', 'tell', 'tech', 'ceo', 'app', 'big', 'last', 'number',
    'another', 'give', 'two', 'since',
    'team', 'fund', 'build', 'founder', 'grow', 'growth', 'back', 'lead', 'part', 'become',
    'data', 'allow',
]

# The NLTK stop-word file id is lower-case; 'English' only resolves on
# case-insensitive file systems.
my_stopwords = stopwords.words('english') + custom_stopwords

In [2]:
# Read the posts CSV into the working DataFrame used by every later cell.
posts_csv_path = './Data/posts.csv'
df = pd.read_csv(posts_csv_path)

In [None]:
def _tokenize_sentences(sentences):
    """Return one list of word tokens per sentence in ``sentences``."""
    return list(map(word_tokenize, sentences))


def _tag_sentences(tokenized_sentences):
    """Return one list of (token, POS tag) pairs per tokenized sentence."""
    return list(map(pos_tag, tokenized_sentences))


# Step 1: split the text of every post into sentences.
df['sentences'] = df['content'].apply(sent_tokenize)
print(df['sentences'].iloc[0])

# Step 2: split every sentence into word tokens.
df['tokens_sentences'] = df['sentences'].apply(_tokenize_sentences)
print(df['tokens_sentences'].iloc[0][:3])

# Step 3: attach a part-of-speech tag to every token.
df['POS_tokens'] = df['tokens_sentences'].apply(_tag_sentences)
print(df['POS_tokens'].iloc[0][:3])



['  Every day, 100k+ smart people read our newsletter.', 'You can  sign up here .', 'Hello readers, \n Two familiar names were spotted making investments this week: Tencent and Alibaba.', '👀 \n The intense rivalry between these Chinese giants isn’t dying down anytime soon.', 'This week, the two unknowingly brought their clash to the companies they invested in, which were both the most-funded in China this week: \n \n Yipin Shengxian, a Chinese fresh food discount chain supermarket, raised US$362 million in a round led by Tencent.', 'Xpeng Motors, a Chinese electric vehicle and technology company, which Alibaba invested in, raised US$300 million.', 'You can find all other important investment deals that happened in the last few days in our  weekly funding news wrap-up .', 'Let’s dive into the biggest deals and M&amp;As that recently took place.', 'The biggest deals by country \n 🇨🇳  Yipin Shengxian : A Chinese fresh food discount chain supermarket that raised US$362 million from Capital

In [None]:
def get_wordnet_pos(treebank_tag):
    """Map a Treebank-style POS tag to the corresponding WordNet POS constant.

    The decision is made on the first letter of the tag:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.

    Args:
        treebank_tag: POS tag string as produced by ``pos_tag``.

    Returns:
        The matching ``wordnet`` constant, or an empty string when the tag
        does not start with any of the four handled letters.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return ''

def _lemmatize_tagged_token(tagged_token):
    """Lemmatize one (token, treebank_tag) pair.

    The WordNet POS is looked up once per token. Tokens whose tag has no
    WordNet equivalent (``get_wordnet_pos`` returns '') are returned unchanged.
    """
    word, treebank_tag = tagged_token[0], tagged_token[1]
    wordnet_pos = get_wordnet_pos(treebank_tag)
    if wordnet_pos == '':
        return word
    return lemmatizer.lemmatize(word, wordnet_pos)


# Lemmatize every token of every sentence, keeping the sentence structure.
# NOTE(review): tokens are lemmatized in their original case and only
# lower-cased in a later cell; capitalised forms may therefore not be reduced
# to their lemma -- confirm whether that is intended.
df['tokens_sentences_lemmatized'] = df['POS_tokens'].apply(
    lambda list_POS: [
        [_lemmatize_tagged_token(el) for el in tokens_POS]
        for tokens_POS in list_POS
    ]
)
print(df['tokens_sentences_lemmatized'].head(1).tolist()[0][:3])

In [None]:
# Set copy of the stop-word list so each membership test is a hash lookup
# instead of a scan over the whole list.
stopword_set = set(my_stopwords)


def _clean_tokens(sentences):
    """Flatten a post's sentences into one list of filtered, lower-cased tokens.

    A token is kept only if it is purely alphabetic, longer than one
    character, and its lower-cased form is not a stop word.
    """
    cleaned = []
    for token in chain.from_iterable(sentences):
        if not token.isalpha() or len(token) <= 1:
            continue
        lowered = token.lower()  # computed once, used for both test and output
        if lowered not in stopword_set:
            cleaned.append(lowered)
    return cleaned


df['tokens'] = df['tokens_sentences_lemmatized'].map(_clean_tokens)

In [None]:
# One token list per post, taken from the cleaned 'tokens' column.
tokens = df['tokens'].tolist()

# First pass: learn two-word phrases from the raw token lists.
bigram_model = Phrases(tokens)
bigram_tokens = bigram_model[tokens]

# Second pass: learn phrases on top of the bigram-transformed documents,
# then materialise the fully transformed documents as a list.
trigram_model = Phrases(bigram_tokens, min_count=1)
tokens = list(trigram_model[bigram_tokens])

In [None]:
# Build the token <-> id mapping from the phrase-merged documents.
dictionary_LDA = gensim.corpora.Dictionary(tokens)

# Drop tokens that appear in fewer than 3 documents (other limits keep
# filter_extremes' defaults).
dictionary_LDA.filter_extremes(no_below=3)

# Bag-of-words representation of every document.
corpus = list(map(dictionary_LDA.doc2bow, tokens))

In [None]:
# Timestamp printed before the sweep starts.
print(datetime.datetime.now())

# Parallel lists: entry i of each list belongs to the same trained model.
model_list = []
coherence_values = []
perplexity_values = []
model_topics = []

# Train one LDA model per candidate topic count (2..9) and record its metrics.
for num_topics in range(2, 10):
    lda_x = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary_LDA,
        passes=4,
        alpha=[0.01] * num_topics,
        eta=[0.01] * len(dictionary_LDA.keys()),
        # Fixed seed (same value as the NumPy seed at the top of the notebook)
        # so re-running this cell on its own reproduces the same models.
        random_state=123456,
    )
    coherencemodel = CoherenceModel(model=lda_x, texts=tokens, dictionary=dictionary_LDA, coherence='c_v')

    # Evaluate each metric exactly once: the stored and the printed values are
    # then the same number and the costly computations are not repeated.
    coherence = coherencemodel.get_coherence()
    log_perplexity = lda_x.log_perplexity(corpus)

    model_topics.append(num_topics)
    model_list.append(lda_x)
    coherence_values.append(coherence)
    perplexity_values.append(log_perplexity)
    print("#Topics: " + str(num_topics) + " Coherence Score: "
              , str(coherence) + ' Perplexity score : ' + str(log_perplexity))

In [None]:
# Train the final model with the chosen number of topics.
num_topics = 5
lda_x = models.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary_LDA,
    # Same number of passes as in the topic-count search above, so the final
    # model is trained under the settings that were evaluated there.
    passes=4,
    alpha=[0.01] * num_topics,
    eta=[0.01] * len(dictionary_LDA.keys()),
    # Fixed seed (same value as the NumPy seed at the top of the notebook)
    # so re-running this cell on its own reproduces the same model.
    random_state=123456,
)

# Print the 20 highest-weighted words of every topic.
for i, topic in lda_x.show_topics(formatted=True, num_topics=num_topics, num_words=20):
    print(str(i) + ":\n" + topic)
    print()

In [None]:
def visualise(model, vecs, dictionary,
              output_path="./Models/BOW/my_topics_bow_2020_08_14_2.html"):
    """Render a pyLDAvis view of ``model`` and save it as an HTML file.

    Args:
        model: Trained topic model passed to ``pyLDAvis.gensim.prepare``.
        vecs: Corpus passed to ``pyLDAvis.gensim.prepare``.
        dictionary: Dictionary passed to ``pyLDAvis.gensim.prepare``.
        output_path: Destination HTML file. Defaults to the path this
            notebook has always written to.

    Returns:
        True once the HTML file has been written.
    """
    import os  # local import keeps this cell self-contained

    # Create the target directory up front so saving does not fail on a
    # fresh checkout where it does not exist yet.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    visual = pyLDAvis.gensim.prepare(model, vecs, dictionary)
    pyLDAvis.save_html(visual, output_path)
    return True

# Write the interactive topic visualisation for the final model.
x = visualise(
    model=lda_x,
    vecs=corpus,
    dictionary=dictionary_LDA,
)

In [None]:
# Persist the trained LDA model to disk.
model_path = './Models/BOW/Topicmodelling_bow_2.h5'
lda_x.save(model_path)

In [None]:
# Topic distribution the model assigns to the second document in the corpus.
second_doc_bow = corpus[1]
lda_x[second_doc_bow]

In [None]:
# Show the available column names before selecting the ones to export.
df.columns

In [None]:
# Columns exported alongside the cleaned token list of each post.
export_columns = [
    'id',
    'title',
    'content',
    'excerpt',
    'comments_count',
    'read_time',
    'author.display_name',
    'tokens',
]
final_bow = df[export_columns]

# Write the selection to CSV (the DataFrame index is written as well).
final_bow.to_csv('./Models/BOW/BOW_Topic_Modelling_2.csv')