# **Latent Dirichlet Allocation (LDA) for Topic Modeling Monkeypox Twitter Discourse**
By Minal Mishra

In [None]:
# installing libraries
!python -m pip install nltk
!python -m pip install pyLDAvis

In [None]:
# importing modules
import numpy as np
import json
import glob

#gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models import CoherenceModel

#spacy and nltk
import spacy
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

#pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models

# Dataset Loading and Preprocessing

In [None]:
import re
def clean_data(line, stop_words=None):
  """Normalise one raw tweet into a space-separated string of kept tokens.

  The text is lower-cased, then quotes, ``&amp`` entities, @mentions,
  #hashtags, hyperlinks, literal backslash-n sequences and every character
  outside ``[a-z0-9]`` are replaced or removed. Finally stop words are dropped.

  Args:
    line: Raw tweet text.
    stop_words: Optional container of lower-case words to discard. When
      omitted, NLTK's English stop-word list is used (loaded once and cached
      on the function object).

  Returns:
    The cleaned tweet, or an empty string when no token survives.
  """
  if stop_words is None:
    # `stopwords` is NLTK's corpus reader object; it does not support the
    # `in` operator. The actual word list is stopwords.words("english").
    stop_words = getattr(clean_data, "_english_stop_words", None)
    if stop_words is None:
      stop_words = frozenset(stopwords.words("english"))
      clean_data._english_stop_words = stop_words

  line = line.lower()   # convert all characters to lower case
  line = re.sub(r'[\'"]', " ", line) # quotes/apostrophes become spaces
  line = re.sub(r'\s+&amp[^A-Za-z0-9]', ' ', line) # removes &amp
  line = re.sub(r"@[A-Za-z0-9_]+","", line) # removes name mentions
  line = re.sub(r"#[A-Za-z0-9_]+","", line) # removes hashtags
  line = re.sub(r'http\S+', '', line) # removes all hyperlinks
  line = re.sub(r'\\n', ' ', line) # removes literal backslash-n sequences
  line = re.sub(r'[()!*?\[\]]', ' ', line) # remove punctuation marks
  line = re.sub(r"[^a-z0-9]"," ", line) # removes all characters except [a-z] and [0-9]

  # split() never yields empty or padded tokens, so no strip() is needed
  # after re-joining.
  kept = [w for w in line.split() if w not in stop_words]
  return " ".join(kept)

# Load the raw tweets (one per line), clean each of them and keep only the
# ones that are non-empty after cleaning.
# replace 40k_all_tweets.txt with your dataset file name before execution
# The `with` block guarantees the file is closed even if cleaning raises;
# the encoding is pinned so the result does not depend on the platform default.
with open("40k_all_tweets.txt", encoding="utf-8") as fr:
  tweets = [clean_data(line) for line in fr if not line.isspace()]
tweets = [tweet for tweet in tweets if tweet]

print(tweets)

data = tweets

In [None]:
#lemmatisation
def lemmatization(texts, allowed_postags=("NOUN", "ADJ", "VERB", "PROPN", "ADV")):
  """Lemmatize texts with spaCy, keeping only selected parts of speech.

  Args:
    texts: Iterable of strings to process.
    allowed_postags: Coarse spaCy POS tags (``token.pos_``) to keep. An
      immutable tuple is used as the default so it cannot be shared and
      mutated across calls.

  Returns:
    A list with one string per input text: the lemmas of the tokens whose
    POS tag is allowed and which spaCy does not flag as stop words, joined
    by single spaces (possibly an empty string).
  """
  nlp = spacy.load("en_core_web_sm", disable=['parser','ner'])
  texts_out = []

  # nlp.pipe processes the texts in batches and yields the Docs in input
  # order, which is considerably faster than calling nlp() once per text.
  for doc in nlp.pipe(texts):
    lemmas = [
      token.lemma_
      for token in doc
      if token.pos_ in allowed_postags and not token.is_stop
    ]
    texts_out.append(" ".join(lemmas))
  return texts_out

# Lemmatize every cleaned tweet; the result has one string per entry of `data`.
lemmatized_texts=lemmatization(data)


In [None]:
#removing stopwords
from gensim.parsing.preprocessing import remove_stopwords

# Run gensim's remove_stopwords over every lemmatized tweet and
# rebind lemmatized_texts to the filtered result.
new_lemmatized_texts = [remove_stopwords(text) for text in lemmatized_texts]
lemmatized_texts = new_lemmatized_texts


# preprocessing text
def gen_words(texts):
  """Tokenize each text with gensim's ``simple_preprocess``.

  The call uses ``deacc=True`` and ``min_len=3``. Returns a list holding
  one token list per input text, in input order.
  """
  return [
    gensim.utils.simple_preprocess(document, deacc= True, min_len= 3)
    for document in texts
  ]

# Tokenize the stop-word-filtered, lemmatized tweets: one token list per tweet.
data_words=gen_words(lemmatized_texts)

In [None]:
# removing custom stopwords

custom_stopwords = ['wtf','shit','nit','fuck','lol','oh']

# Rebuild every token list without the corpus-specific noise words above,
# then make the filtered lists the working data set.
new_data_words = [
  [token for token in tokens if token not in custom_stopwords]
  for tokens in data_words
]
data_words = new_data_words

In [None]:
# Adding Bigrams to Vocabulary

from gensim.models import Phrases

# Fit a phrase (bigram) detector on the tokenized tweets.
bigrams = Phrases(data_words, min_count = 5)

# Append every detected bigram (a token containing "_") to its own tweet,
# leaving the original unigrams in place. The detector output is collected
# into a separate list before the tweet's token list is extended.
for doc_tokens in data_words:
  detected = [tok for tok in bigrams[doc_tokens] if '_' in tok]
  doc_tokens.extend(detected)

# Creating TFIDF Embeddings

In [None]:
# Creating the tfidf word embeddings for corpus of tweets

from gensim import corpora

from gensim.models import TfidfModel

# Vocabulary built from the tokenized tweets.
dct = corpora.Dictionary(data_words)

# Bag-of-words vector for each tweet.
BoW_corpus = [dct.doc2bow(tokens) for tokens in data_words]
print("Bow Corpus:", BoW_corpus)

# TF-IDF model fitted on the full bag-of-words corpus, with explicit
# pivot/slope settings (the plain alternative is TfidfModel(BoW_corpus)).
tfidf_corpus = TfidfModel(BoW_corpus, normalize=True, pivot=10, slope=0.25)

# Discard tweets whose bag-of-words vector is empty, then apply the fitted
# model to the remaining ones.
BoW_corpus = [bow for bow in BoW_corpus if bow]
tfidf_corpus_final = tfidf_corpus[BoW_corpus]

# **Application of LDA Topic Model**

In [None]:
# applying the LDA model
from gensim.models import LdaModel

num_topics = 16

# Train LDA on the TF-IDF weighted corpus; random_state is fixed so that
# repeated runs use the same seed.
lda_model = LdaModel(
  tfidf_corpus_final,
  num_topics=num_topics,
  id2word=dct,
  chunksize=2000,
  passes=1,
  update_every=1,
  alpha='symmetric',
  eta=None,
  eval_every=10,
  iterations=50,
  random_state=42,
)


In [None]:
print("Topic Word Distribution (word_probability*word pairs per topic):")
print(lda_model.get_topics())      # topic-term matrix for all topics

# Creating topics (list of lists of words) for evaluation.
# get_topic_terms yields (word_id, probability) pairs. The ids are resolved
# with dct[word_id] rather than dct.id2token, because gensim fills id2token
# lazily and it can still be empty when it is read directly.
topics_org = [
  [dct[word_id] for word_id, _ in lda_model.get_topic_terms(topicid, topn=10)]
  for topicid in range(num_topics)
]
print("\nTopics:\n",topics_org)

# Split bigram tokens ("a_b") back into their component words; a token
# without "_" splits into just itself, so one extend() covers both cases.
topics = []
for words_per_topic in topics_org:
  topic = []
  for word in words_per_topic:
    topic.extend(word.split('_'))
  topics.append(topic)

print("\nTopics:\n",topics)


# **Evaluation**

In [None]:
# Evaluating the topics - Topic Coherence
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel


#creating BoW corpus for evaluation
# spaCy pipeline used to lemmatize the evaluation texts.
nlp_eval = spacy.load("en_core_web_sm", disable=['parser','ner'])

def clean_data_for_eval(line, stop_words=None):
  """Clean one raw tweet and lemmatize it for the coherence evaluation.

  The text is lower-cased, then quotes, ``&amp`` entities, @mentions,
  #hashtags, hyperlinks, literal backslash-n sequences and every character
  outside ``[a-z0-9]`` are replaced or removed. Stop words are dropped and
  each remaining token is replaced by its spaCy lemma (no POS filtering).

  Args:
    line: Raw tweet text.
    stop_words: Optional container of lower-case words to discard. When
      omitted, NLTK's English stop-word list is used (loaded once and cached
      on the function object).

  Returns:
    The lemmas joined by single spaces.
  """
  if stop_words is None:
    # `stopwords` is NLTK's corpus reader object; it does not support the
    # `in` operator. The actual word list is stopwords.words("english").
    stop_words = getattr(clean_data_for_eval, "_english_stop_words", None)
    if stop_words is None:
      stop_words = frozenset(stopwords.words("english"))
      clean_data_for_eval._english_stop_words = stop_words

  line = line.lower()   # convert all characters to lower case
  line = re.sub(r'[\'"]', " ", line) # quotes/apostrophes become spaces
  line = re.sub(r'\s+&amp[^A-Za-z0-9]', ' ', line) # removes  &amp
  line = re.sub(r"@[A-Za-z0-9_]+","", line) # removes name mentions
  line = re.sub(r"#[A-Za-z0-9_]+","", line)  # remove hashtags
  line = re.sub(r'http\S+', '', line) # removes hyperlinks
  line = re.sub(r'\\n', ' ', line) # removes literal backslash-n sequences
  line = re.sub(r'[()!*?\[\]]', ' ', line) # removes punctuation
  line = re.sub(r"[^a-z0-9]"," ", line) # removes all characters except [a-z] and [0-9]

  kept = [w for w in line.split() if w not in stop_words]
  doc = nlp_eval(" ".join(kept))
  return " ".join(token.lemma_ for token in doc)

# Re-read the dataset and preprocess it for the coherence computation.
# The `with` block guarantees the file is closed even if cleaning raises;
# the encoding is pinned so the result does not depend on the platform default.
# NOTE(review): this absolute path differs from the relative
# "40k_all_tweets.txt" used when the dataset is first loaded; keep them in sync.
with open("/content/40k_all_tweets.txt", encoding="utf-8") as fr_eval:
  tweets = [clean_data_for_eval(line) for line in fr_eval if not line.isspace()]
tweets = [tweet for tweet in tweets if tweet]
print(tweets)

data = tweets

# Reference texts for coherence: one whitespace-split token list per tweet,
# plus the matching dictionary and bag-of-words corpus.
texts = [document.split() for document in tweets]
print(texts)
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

# c_v coherence of the extracted topic word lists against the reference texts.
coherence_model = CoherenceModel(topics=topics, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_v')
coherence = coherence_model.get_coherence()
print("\nCoherence Score:",coherence)

# **Visualization of Topics**

In [None]:
# Visualization of LDA Model

# Switch pyLDAvis to notebook mode.
pyLDAvis.enable_notebook()

# Prepare the visualization from the trained model, the TF-IDF corpus it was
# trained on, and the dictionary that maps ids to words.
viz = pyLDAvis.gensim_models.prepare(lda_model, tfidf_corpus_final, dct)

# Bare expression as the last statement of the cell so the notebook displays it.
viz
