**Topic Analysis using LDA**



In [1]:
# Import Libraries 
import pandas as pd
import snscrape.modules.twitter as sntwitter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
from gensim.utils import simple_preprocess
import gensim.downloader as api
import gensim.corpora as corpora

import nltk
nltk.download('stopwords') 
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('words')

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

import string
import re
import textblob
from textblob import TextBlob

from wordcloud import WordCloud, STOPWORDS
from wordcloud import ImageColorGenerator
from PIL import Image
from emot.emo_unicode import UNICODE_EMOJI

porter = PorterStemmer()
lemmatizer = WordNetLemmatizer() 

import pyLDAvis.gensim_models 
import pickle 
import pyLDAvis
import os

import warnings
from pprint import pprint
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pdhiman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pdhiman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pdhiman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\pdhiman\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\pdhiman\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


**Using pickle to load the NLP-preprocessed tweets**

In [2]:
# Load the NLP-preprocessed tweets from a pickled DataFrame and preview the first rows.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.

df = pd.read_pickle('source/cleaned_df.pkl')
df.head()

Unnamed: 0.1,Unnamed: 0,Date,ID,location,tweet,num_of_likes,num_of_retweet,language,cleaned_tweets
0,0,2022-11-07 23:59:59+00:00,1589769667765469186,"California, USA",Taking into account personal contributions &am...,2,1,en,taking account personal bad everyone better tr...
1,1,2022-11-07 23:59:59+00:00,1589769667652235267,@jlo follows ♡ 01.29.21,whats your fav song?\n\n❥ I’m voting #Jennifer...,0,10,en,whats song voting
2,2,2022-11-07 23:59:59+00:00,1589769667127934977,Unknown,@MayoIsSpicyy He is allowed to speak his opini...,0,0,en,speak opinion like rest u opinion vote republi...
3,3,2022-11-07 23:59:59+00:00,1589769666918244352,USA,HEY NY DISTRICT 10! PLEASE VOTE FOR @danielsgo...,1,1,en,hey district please vote
4,4,2022-11-07 23:59:59+00:00,1589769666679144448,DMV,@YDanasmithdutra @BaddCompani @politicalblond ...,3,0,en,vote blue matter


Preparing for LDA Analysis


In [4]:
# Tokenize documents (here: tweets) into word lists, the input that the
# gensim Dictionary / doc2bow steps below use to build the bag-of-words corpus.

def sent2words(sentences):
    """Yield one token list per item of ``sentences``.

    Each item is cast to ``str`` and passed through
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. This is a
    generator function, so nothing is tokenized until the result is iterated.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )
# Pull the cleaned tweets out of the DataFrame as a plain Python list,
# then tokenize all of them up front (one token list per tweet).
data = df["cleaned_tweets"].values.tolist()
data_words = [*sent2words(data)]

In [5]:
# Dictionary built from the tokenized tweets (token <-> integer id mapping).
id2word = corpora.Dictionary(data_words)

# The tokenized tweets are the texts the corpus is built from.
texts = data_words

# Bag-of-words corpus: one doc2bow vector per tweet.
corpus = list(map(id2word.doc2bow, texts))

# Peek at (at most) the first 30 entries of the first tweet's vector.
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)]


In [6]:
# Print the dictionary summary (unique-token count and a few sample tokens).
print(id2word)

Dictionary(8264 unique tokens: ['account', 'bad', 'best', 'better', 'blue']...)


Building base LDA Model

In [7]:
# Train the baseline LDA model on the bag-of-words corpus.
num_topics = 10  # number of topics to fit
lda_model = gensim.models.LdaMulticore(
    # what to train on
    corpus=corpus,
    id2word=id2word,
    # model size and prior hyperparameters
    num_topics=num_topics,
    alpha=0.01,
    eta=0.9,
    # training schedule
    chunksize=100,
    passes=10,
    random_state=100,  # fixed seed
    per_word_topics=True,
)

# Show the top keywords (with their weights) for each topic.
pprint(lda_model.print_topics())

# Apply the trained model to the corpus.
doc_lda = lda_model[corpus]

[(0,
  '0.092*"election" + 0.035*"day" + 0.013*"voting" + 0.011*"tomorrow" + '
  '0.009*"night" + 0.008*"time" + 0.007*"early" + 0.006*"watch" + '
  '0.006*"ballot" + 0.006*"fraud"'),
 (1,
  '0.083*"vote" + 0.022*"democrat" + 0.016*"voting" + 0.014*"people" + '
  '0.013*"tomorrow" + 0.012*"red" + 0.012*"get" + 0.010*"blue" + '
  '0.010*"republican" + 0.010*"want"'),
 (2,
  '0.183*"vote" + 0.077*"year" + 0.041*"choice" + 0.029*"people" + '
  '0.020*"celebrity" + 0.003*"coming" + 0.002*"music" + 0.002*"award" + '
  '0.002*"fan" + 0.002*"kim"'),
 (3,
  '0.132*"voting" + 0.070*"favorite" + 0.029*"pop" + 0.027*"group" + '
  '0.021*"duo" + 0.018*"year" + 0.011*"harry" + 0.009*"swift" + 0.005*"male" + '
  '0.004*"best"'),
 (4,
  '0.025*"vote" + 0.019*"voting" + 0.015*"democrat" + 0.015*"election" + '
  '0.013*"win" + 0.009*"people" + 0.009*"would" + 0.009*"make" + 0.008*"think" '
  '+ 0.007*"person"'),
 (5,
  '0.013*"voice" + 0.005*"michigan" + 0.004*"weather" + 0.003*"storm" + '
  '0.002*"ob

* Calculating Model perplexity and Coherence score
    * Multiple simulations were run with different parameters, and we found that num_topics = 10 gives the best coherence score for our model.

In [8]:
from gensim.models import CoherenceModel

# Score the trained model with the 'c_v' coherence measure,
# evaluated against the tokenized tweets and their dictionary.
coherence_model_lda = CoherenceModel(
    coherence='c_v',
    dictionary=id2word,
    texts=data_words,
    model=lda_model,
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.41535095106713504


Analyzing LDA Model Results with pyLDAvis
- Top 10 Topics


In [9]:
# Visualize the topics with pyLDAvis, and persist both the prepared data
# (pickle) and a standalone HTML export under visualization/.
pyLDAvis.enable_notebook()

# Set to False to reuse the pickle written by a previous run instead of
# recomputing. Caution: the cache file name only encodes num_topics, so a
# cached copy is stale if the model or corpus changed in any other way.
REBUILD_LDAVIS = True

LDAvis_data_filepath = 'visualization/ldavis_prepared_' + str(num_topics)

# Make sure the output folder exists; open(..., 'wb') and save_html would
# otherwise fail on a fresh checkout.
os.makedirs(os.path.dirname(LDAvis_data_filepath), exist_ok=True)

if REBUILD_LDAVIS:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
else:
    # SECURITY: pickle.load can run arbitrary code; only load files that
    # this notebook wrote itself.
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')
LDAvis_prepared

  default_term_info = default_term_info.sort_values(
