## Topic Modelling

In this notebook, we apply Latent Dirichlet Allocation (LDA), one of the most widely used topic modelling algorithms, to extract the most prominent topics in the senators' tweets.

In [1]:
pip install matplotlib --upgrade

Collecting matplotlib
  Downloading matplotlib-3.6.2-cp39-cp39-macosx_10_12_x86_64.whl (7.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting contourpy>=1.0.1
  Downloading contourpy-1.0.6-cp39-cp39-macosx_10_9_x86_64.whl (240 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.8/240.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: contourpy, matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.5.2
    Uninstalling matplotlib-3.5.2:
      Successfully uninstalled matplotlib-3.5.2
Successfully installed contourpy-1.0.6 matplotlib-3.6.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
## Importing the libraries

import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import gensim
from gensim import corpora, models, similarities
import re
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
import matplotlib.colors as mcolors
import seaborn as sns


  if LooseVersion(mpl.__version__) >= "3.0":
  other = LooseVersion(other)


In [1]:
# English stop-word list from NLTK; tokenize() drops every token found in it.
stop_words = stopwords.words('english')

In [6]:
# Load the tweet dataset; the following cells read its 'text' column.
df = pd.read_excel("tweets.xlsx")

In [7]:
# Keep a copy of the raw tweet text: the 'text' column is overwritten with
# token lists by the preprocessing cells below.
df['tweets'] = df['text']

In [8]:
## Function to obtain tokens from tweets

def tokenize(text):
    """Turn one raw tweet into a list of cleaned, lowercase tokens.

    URLs, @-mentions and apostrophes are stripped and runs of whitespace are
    collapsed; the remainder is tokenised with
    ``gensim.utils.simple_preprocess`` (``deacc=True``). Tokens that appear in
    the notebook-level ``stop_words`` list or that have three or fewer
    characters are dropped.

    Parameters
    ----------
    text : str
        Raw tweet text. ``None`` and float NaN (an empty spreadsheet cell) are
        treated as an empty tweet; any other non-string value is coerced with
        ``str()``.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Missing cells would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return []
    text = str(text)

    # Raw-string patterns avoid the "invalid escape sequence" warnings.
    text = re.sub(r'http\S+', '', text)   # remove URLs
    text = re.sub(r'@\S*\s?', '', text)   # remove @-mentions
    text = re.sub(r'\s+', ' ', text)      # collapse newlines / repeated whitespace
    text = text.replace("'", "")          # remove single quotes

    # simple_preprocess already lowercases, so no further .lower() is needed.
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    return [token for token in tokens if len(token) > 3 and token not in stop_words]


# Tokenise from the untouched 'tweets' copy of the raw text (identical to the
# initial 'text' column) so this cell can be re-run without feeding token
# lists back into tokenize().
df['text'] = df['tweets'].apply(tokenize)


  text = re.sub('@\S*\s?', '', text)  # remove tags
  text = re.sub('\s+', ' ', text)  # remove newline chars


In [9]:
## function to apply lemmatization to the obtained tokens

lemmatizer = WordNetLemmatizer()


def lemmatize_text(df_text):
    """Lemmatize a token list and filter out extra stop words.

    Every token in ``df_text`` is passed through the module-level WordNet
    ``lemmatizer``. Lemmas that appear in the extra stop-word list below, or
    that have three or fewer characters, are discarded.

    Parameters
    ----------
    df_text : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The surviving lemmas, in input order.
    """
    more_stopwords = ['cant', 'dont', 'im', 'wont', 'youre', 'le', 'say', 'sure', 'way', 'help', 'need', 'america', 'many', 'back', 'u.s.', '&amp' ]

    lemmas = (lemmatizer.lemmatize(token) for token in df_text)
    return [lemma for lemma in lemmas if len(lemma) > 3 and lemma not in more_stopwords]


df['text'] = df['text'].apply(lemmatize_text)


In [10]:
## Creation of dictionary and corpus
## Creation of embedding using TF-IDF

# One token list per tweet.
texts = df['text'].tolist()

# Map every distinct token to an integer id, then encode each tweet as a
# bag-of-words vector.
dictionary = corpora.Dictionary(documents=texts)
corpus = list(map(dictionary.doc2bow, texts))

# Re-weight the bag-of-words corpus with TF-IDF.
tfidf = models.TfidfModel(corpus=corpus)
corpus_tfidf = tfidf[corpus]

In [11]:
# Number of topics the LDA model is fitted with; also used when listing topics.
total_topics = 20

In [12]:
## Fitting of the LDA Model

# Fit LDA on the TF-IDF corpus; the fixed random_state keeps runs repeatable.
lda = models.LdaModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=total_topics,
    random_state=100,
)

# Topic distribution of every document under the fitted model.
corpus_lda = lda[corpus_tfidf]

In [13]:
# List the top keywords (with weights) of every fitted topic.
lda.show_topics(num_topics=total_topics)

[(0,
  '0.013*"county" + 0.010*"beautiful" + 0.010*"look" + 0.009*"soon" + 0.008*"leave" + 0.008*"looking" + 0.008*"washington" + 0.008*"constitution" + 0.007*"event" + 0.007*"hope"'),
 (1,
  '0.009*"question" + 0.009*"tomorrow" + 0.007*"answer" + 0.007*"word" + 0.007*"icymi" + 0.007*"else" + 0.006*"debate" + 0.006*"vision" + 0.006*"kind" + 0.006*"released"'),
 (2,
  '0.009*"vote" + 0.008*"know" + 0.008*"mean" + 0.007*"left" + 0.007*"behind" + 0.007*"october" + 0.006*"career" + 0.006*"make" + 0.005*"well" + 0.005*"thing"'),
 (3,
  '0.010*"asking" + 0.006*"serving" + 0.006*"border" + 0.006*"crisis" + 0.006*"stock" + 0.006*"represent" + 0.005*"candidate" + 0.005*"great" + 0.005*"southern" + 0.005*"miss"'),
 (4,
  '0.015*"energy" + 0.008*"inflation" + 0.007*"american" + 0.006*"price" + 0.006*"january" + 0.006*"feel" + 0.006*"corporation" + 0.005*"production" + 0.005*"climate" + 0.005*"domestic"'),
 (5,
  '0.016*"sign" + 0.014*"love" + 0.009*"exactly" + 0.008*"pushing" + 0.008*"yard" + 0.0

In [14]:
## This function creates a dataframe listing every tweet together with its dominant topic, that topic's keywords, and the contribution of the dominant topic to the tweet according to LDA

def format_topics_sentences(ldamodel, corpus, texts, originals=None):
    """Build a per-document table of the dominant LDA topic.

    Parameters
    ----------
    ldamodel : fitted topic model
        Must support ``ldamodel[corpus]``, expose ``per_word_topics`` and
        provide ``show_topic(topic_id)`` returning ``(word, weight)`` pairs.
    corpus : iterable
        Documents in the representation the model is queried with.
    texts : sequence
        Token list of every document, in the same order as ``corpus``.
    originals : sequence or pandas.Series, optional
        Raw text of every document. When omitted, the notebook-level
        ``df['tweets']`` column is used, which is what this function read
        unconditionally before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``Dominant_Topic``,
        ``Perc_Contribution`` (rounded to 4 decimals), ``Topic_Keywords``,
        ``0`` (the document's tokens) and ``original``.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic id -> "word, word, ..." (computed once per topic)
    records = []

    for row_list in ldamodel[corpus]:
        # With per_word_topics the model returns a tuple whose first item is
        # the document-topic distribution.
        topic_weights = row_list[0] if ldamodel.per_word_topics else row_list

        if len(topic_weights) == 0:
            # Keep a placeholder so later rows stay aligned with `texts`.
            records.append((float("nan"), float("nan"), None))
            continue

        # Dominant topic = highest weight (first one wins on ties).
        topic_num, prop_topic = max(topic_weights, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame in one go; DataFrame.append was removed in pandas 2.0.
    sent_topics_df = pd.DataFrame(records, columns=column_names)

    # Add the tokens (column 0) and the original text to the end of the output.
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    if originals is None:
        originals = df['tweets']
    sent_topics_df["original"] = originals
    return sent_topics_df


# Per-tweet dominant-topic table produced from the fitted model.
df_topic_sents_keywords = format_topics_sentences(lda, corpus_tfidf, texts)

# Turn the row index into a document-number column and apply display names.
display_columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text', 'Original']
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = display_columns

# Preview the first 40 tweets without the token and document-number columns.
preview_columns = ['Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Original']
df_dominant_topic.loc[:, preview_columns].head(40)


  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)


Unnamed: 0,Dominant_Topic,Topic_Perc_Contrib,Keywords,Original
0,2,0.3499,"vote, know, mean, left, behind, october, caree...",Elijah Cummings was an unstoppable force for j...
1,4,0.4522,"energy, inflation, american, price, january, f...","For millions of Americans, hearing loss is a m..."
2,4,0.6956,"energy, inflation, american, price, january, f...",The student loan forgiveness application is no...
3,16,0.3075,"south, registered, nazi, there, campaign, poll...","When Putin began his invasion, brave Ukrainian..."
4,5,0.3748,"sign, love, exactly, pushing, yard, maybe, min...",This &amp; every month I'm thankful for strong...
5,3,0.4142,"asking, serving, border, crisis, stock, repres...","Whether it’s tidal marshes at Martin, the brac..."
6,6,0.6456,"difference, follow, interview, campaign, washi...",It’s a simple principle that should not be con...
7,13,0.5086,"tune, live, tell, black, violence, lawsuit, de...","Trump loves to talk, but never after raising h..."
8,18,0.4133,"care, cost, thats, health, drug, republican, p...",Our elections &amp; courts are being corrupted...
9,5,0.3158,"sign, love, exactly, pushing, yard, maybe, min...",Weaponizing DOJ. Armed vigilantes at voting si...


In [75]:
## Here we create a dataframe in which for each topic we identify a representative tweet and the tokens
## This will help us "interpret" the topics provided by the LDA Model, which are currently in keyword format

pd.set_option("display.max_colwidth", 100)

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# For every topic keep only the tweet with the highest topic contribution.
top_rows = [pd.DataFrame()]
for _, topic_rows in sent_topics_outdf_grpd:
    ranked = topic_rows.sort_values(['Perc_Contribution'], ascending=False)
    top_rows.append(ranked.head(1))

# Stack the per-topic winners and renumber the rows from zero.
sent_topics_sorteddf_mallet = pd.concat(top_rows, axis=0).reset_index(drop=True)

# Display names for the result columns.
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text", "Original"]

# Show
sent_topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text,Original
0,0,0.8151,"county, beautiful, look, soon, leave, looking, washington, constitution, event, hope","[left, election, thing, know, defend, democracy, everything, weve, work, together, november, mis...","There are only 48 days left until the election, and one thing I know for sure is that we have to..."
1,1,0.8064,"question, tomorrow, answer, word, icymi, else, debate, vision, kind, released","[thank, warm, welcome, escambia, county, democrat, enjoyed, spending, time, tonight, sharing, pl...",Thank you for the warm welcome Escambia County Democrats! I enjoyed spending time with you toni...
2,2,0.8127,"vote, know, mean, left, behind, october, career, make, well, thing","[american, soldier, sacrificed, life, freedom, hero, street, monument, silvis, must, stop, trave...",American soldiers who sacrificed their lives for our freedom. The Hero Street Monument in Silvi...
3,3,0.7589,"asking, serving, border, crisis, stock, represent, candidate, great, southern, miss","[ryan, done, drug, cartel, congressional, district, time, send, home]",Tim Ryan has done far more for the drug cartels than for his own congressional district. Time to...
4,4,0.8202,"energy, inflation, american, price, january, feel, corporation, production, climate, domestic","[price, rise, continue, threaten, economic, health, american, family, business, support, return,...",Gas prices are again on the rise. This will only continue to threaten the economic health of Ame...
5,5,0.7886,"sign, love, exactly, pushing, yard, maybe, minute, catch, line, praying","[tonight, city, louis, people, neighborhood, came, together, celebrate, annual, enjoyed, taking,...","Tonight, all over the city of St. Louis, people and neighborhoods came together to celebrate the..."
6,6,0.8223,"difference, follow, interview, campaign, washington, rsvp, bless, federal, investment, keep","[fought, tooth, nail, level, playing, field, clean, energy, inflation, reduction, already, payin...",I fought tooth and nail to level the playing field for clean energy under the Inflation Reductio...
7,7,0.8172,"failed, ballot, believe, election, biden, abortion, policy, woman, control, change","[freedom, make, medical, decision, getting, abortion, doctor, eric, schmitt, want, every, doctor...",The freedom to make your own medical decision about getting an abortion should be between you an...
8,8,0.767,"john, something, truth, soft, remember, front, alert, california, progressive, president","[national, recognition, honor, remember, american, prisoner, still, missing, action, forgotten]","On National POW/MIA Recognition Day, we honor and remember Americans who were prisoners of war a..."
9,9,0.8149,"killed, tackle, stopped, actually, quick, tailgate, capitol, mind, cannabis, fantastic","[alex, jones, perpetuated, sickening, conspiracy, theory, make, fortune, even, caused, irreparab...",Alex Jones perpetuated sickening conspiracy theories to make a fortune – even as it caused irrep...


In [15]:
## Interactive pyLDAvis visualization of topics (topics projected onto two principal coordinates)

# Enable pyLDAvis' notebook integration before the panel is built in the next cell.
pyLDAvis.enable_notebook()

In [16]:
# prepare() needs the corpus in bag-of-words form (the documents the model
# was trained on). `corpus_lda` holds (topic_id, probability) pairs, so
# passing it would make pyLDAvis read topic ids as term ids and compute
# wrong term frequencies and document lengths.
panel = pyLDAvis.gensim_models.prepare(
    topic_model=lda,
    corpus=corpus_tfidf,
    dictionary=dictionary,
)
panel


  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  if LooseVe

Sources: 

`LDA`
- https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/
- https://www.kaggle.com/code/errearanhas/topic-modelling-lda-on-elon-tweets
- https://towardsdatascience.com/lda-topic-modeling-with-tweets-deff37c0e131

`Election Issues`
- https://www.washingtonpost.com/politics/2022/10/03/midterms-issues-choices/