# Topic Modeling

In this section, we use the preprocessed dataset produced by `feature_engineering.ipynb` to fit an LDA topic model and check how coherent the resulting topics are.

In [None]:
#importing necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models

In [2]:
#reading the preprocessed tweets (output of feature engineering) into a dataframe
trigram = pd.read_csv(filepath_or_buffer='data/trigram.csv')

In [3]:
#tokenizing tweets: each stored tweet string is split on whitespace into a list of tokens
trigram_sentences = trigram['lemma_words_trigram'].apply(lambda tweet: tweet.split())

#plain python list of token lists (also passed as `texts` to the coherence model later)
texts_trigrams = trigram_sentences.tolist()

#dictionary for the lda model: maps every distinct token to an integer id
id2word_trigrams = corpora.Dictionary(texts_trigrams)

#corpus for the lda model: one bag-of-words (token id, count) list per tweet
corpus_trigram = [id2word_trigrams.doc2bow(tokens) for tokens in texts_trigrams]

In [4]:
#fitting a three-topic LDA model on the trigram bag-of-words corpus
lda_model_trigram_three_topics = gensim.models.ldamodel.LdaModel(
    corpus=corpus_trigram,
    id2word=id2word_trigrams,
    num_topics=3,
    random_state=100,  #fixed seed so the fitted topics are reproducible
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [5]:
#visualizing model
#pyLDAvis and pyLDAvis.gensim_models come from the import cell at the top of the notebook,
#so they are not re-imported here
pyLDAvis.enable_notebook()
vis_3 = pyLDAvis.gensim_models.prepare(lda_model_trigram_three_topics, corpus_trigram, id2word_trigrams)

#last expression of the cell, so the interactive visualization is rendered as its output
vis_3

  default_term_info = default_term_info.sort_values(


In [6]:
#scoring the three-topic LDA model with the c_v coherence measure
coherence_model_trigram_three_topics = CoherenceModel(
    model=lda_model_trigram_three_topics,
    texts=texts_trigrams,
    dictionary=id2word_trigrams,
    coherence='c_v',
)

#last expression of the cell, so the coherence score is displayed as its output
coherence_model_trigram_three_topics.get_coherence()

0.21741394603184763

### The coherence score (≈0.22) is still low, but it is substantially better than the baseline score of 9% (0.09).

In [7]:
#appending probabilities of each tweet
vecs_0 = []
vecs_1 = []
vecs_2 = []
for i in range(len(trigram)):
    top_topics = lda_model_trigram_three_topics.get_document_topics(corpus_trigram[i], minimum_probability=0.0)
    vecs_0.append(top_topics[0][1])
    vecs_1.append(top_topics[1][1])
    vecs_2.append(top_topics[2][1])


#creating dataframe
topic_0 = pd.Series(vecs_0, index=trigram.index)
topic_1 = pd.Series(vecs_1, index=trigram.index)
topic_2 = pd.Series(vecs_2, index=trigram.index)

topic_model_df = pd.concat([trigram, topic_0, topic_1, topic_2], axis=1)

#saving dataframe for model testing
topic_model_df.to_csv('data/topic_model_dataframe.csv')