# **BERTopic for Topic Modeling Monkeypox Twitter Discourse**
By Minal Mishra

In [None]:
# installing libraries
# `!` hands each line to the shell, so these installs run when the cell executes.
!pip install bertopic
#!pip install joblib==1.1.0 # uncomment for second and subsequent runs
# NOTE(review): cleantext is not imported by any cell visible in this notebook - confirm it is still needed.
!pip install cleantext

In [None]:
# importing modules
#!pip install --upgrade joblib==1.1.0   # uncomment for second and subsequent runs
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

# Data Loading and Preprocessing

In [None]:
# Data Preprocessing
import re
# Characters blanked out in the final cleaning pass: double quotes plus the
# emoji / pictograph ranges. The quote is listed on purpose so that quotes left
# anywhere in the tweet are removed, not only the leading and trailing ones.
_QUOTES_AND_EMOJI = re.compile(
  r'["'
  r'\U0001F600-\U0001F64F'  # emoticons
  r'\U0001F300-\U0001F5FF'  # miscellaneous symbols and pictographs
  r'\U0001F680-\U0001F6FF'  # transport and map symbols
  r'\U0001F1E0-\U0001F1FF'  # regional indicators (flags)
  # NOTE(review): U+24C2..U+1F251 is a very wide range that also covers
  # non-emoji code points (e.g. CJK ideographs); narrow it if non-Latin text
  # has to survive cleaning.
  r'\U000024C2-\U0001F251'
  r'\U00002700-\U000027BF'  # dingbats
  r'\U00002600-\U000026FF'  # miscellaneous symbols
  r'\U0001F900-\U0001F9FF'  # supplemental symbols and pictographs
  r'\U0001FA70-\U0001FAFF'  # symbols and pictographs extended-A
  r']+'
)

def clean_data(line):
  r"""Clean one raw tweet line before it is handed to the topic model.

  Steps, in order: trim surrounding whitespace and double quotes, blank out
  ``&amp`` entities, drop @mentions, #hashtags and hyperlinks, replace literal
  ``\n`` escape sequences (backslash followed by ``n``), blank out remaining
  double quotes and emoji, and finally collapse runs of whitespace.

  Args:
    line: one tweet as read from the input file (may end with a newline).

  Returns:
    The cleaned tweet, or an empty string when nothing is left.
  """
  # Trim whitespace first: lines read from a file end with a newline, which
  # would otherwise shield the trailing quote from strip('"').
  line = line.strip().strip('"')
  # Match the entity wherever it occurs (start of line, inside "R&amp;D", ...),
  # but not when it is merely the prefix of a longer alphanumeric run.
  line = re.sub(r'&amp(?![A-Za-z0-9]);?', ' ', line)
  line = re.sub(r"@[A-Za-z0-9_]+", "", line)  # removes name mentions
  line = re.sub(r"#[A-Za-z0-9_]+", "", line)  # removes hashtags
  line = re.sub(r'http\S+', '', line)  # removes hyperlinks
  line = re.sub(r'\\n', ' ', line)  # removes escaped newline sequences
  line = _QUOTES_AND_EMOJI.sub(' ', line)  # removes quotes, emojis and pictographs
  # split()/join collapses every whitespace run and trims both ends.
  return " ".join(line.split())
  
# Read the raw tweet dump, one tweet per line. The context manager closes the
# handle even if cleaning raises, and the explicit encoding keeps emoji
# decoding independent of the platform default.
with open("/content/40k_all_tweets.txt", encoding="utf-8") as fr:
  # Whitespace-only lines are skipped before cleaning.
  tweets = [clean_data(line) for line in fr if not line.isspace()]

# Drop tweets that became empty after cleaning (e.g. only a link or a mention).
tweets = [tweet for tweet in tweets if tweet]

# Show the corpus size and a short sample instead of dumping every tweet.
print("Number of tweets after cleaning:", len(tweets))
print(tweets[:10])


# Dimensionality Reduction and Clustering Models

In [None]:
# Dimensionality reduction: UMAP instance later passed to BERTopic as `umap_model`
from umap import UMAP
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,   # size of the reduced embedding space
    min_dist=0,
    random_state=42,  # fixed seed
)

# Clustering: HDBSCAN instance later passed to BERTopic as `hdbscan_model`
from hdbscan import HDBSCAN
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    min_samples=10,
    prediction_data=True,
    gen_min_span_tree=False,
)

In [None]:
# Stop-word list: NLTK's English stop words extended with a few slang / filler terms
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# After this line `stopwords` is a plain list (it no longer refers to the NLTK corpus object).
stopwords = [*stopwords.words('english'), 'wtf', 'shit', 'nit', 'fuck', 'lol', 'oh']

# Bag-of-words vectorizer covering unigrams and bigrams, with the stop words filtered out
vectorizer_model = CountVectorizer(
    stop_words=stopwords,
    ngram_range=(1,2),
)

# Embedding Model

In [None]:
# embedding model - all-MiniLM-L6-v2
# The model is loaded by name through sentence-transformers; the resulting
# object is what gets passed to BERTopic as `embedding_model` further down.
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


# **Application of BERTopic Topic Model**

In [None]:
# applying the BERTopic model
# Builds BERTopic from the UMAP, HDBSCAN, vectorizer and embedding models
# configured in the earlier cells and fits it on the cleaned tweets.

nr_topics=50  # topic count requested from BERTopic
model = BERTopic(umap_model=umap_model,
                 hdbscan_model=hdbscan_model,
                 embedding_model=embedding_model,
                 vectorizer_model=vectorizer_model,
                 top_n_words=10,
                 # NOTE(review): per the BERTopic docs, n_gram_range and
                 # min_topic_size are not used when a custom vectorizer_model /
                 # hdbscan_model is supplied; kept to mirror the settings above.
                 n_gram_range=(1,2),
                 min_topic_size=10,
                 nr_topics=nr_topics,
                 language='english',
                 # Real booleans: the string 'False' is truthy, so it switched the
                 # per-topic probability computation ON instead of off.
                 calculate_probabilities=False,
                 verbose=True)


# topics: one topic id per tweet. `probs` is not read by any later cell in this notebook.
topics, probs = model.fit_transform(tweets)

In [None]:
# listing documents and topic ids
# Show the first 50 (topic id, tweet) pairs. Zipping the two slices stops at
# the shorter one, so a corpus with fewer than 50 tweets no longer raises
# IndexError.
for topic_id, tweet in zip(topics[:50], tweets[:50]):
  print(topic_id,":",tweet)

In [None]:
# topic representation
print("Topic Information:")
# returns topic information including - topic id, number of documents that the
# topic occurs in and name of the topic
topic_information = model.get_topic_info()
print(topic_information)

# generated topics: {topic id: [(word, c-TF-IDF score), ...]}
model_topics=model.get_topics()

# Count only the real topics. Topic -1 (the bucket skipped everywhere below) is
# not guaranteed to exist, so filter on the id instead of subtracting one.
no_of_topics = sum(1 for topic_id in model_topics if topic_id != -1)
print("\nNo. of topics:")
print(no_of_topics)

print("\n\n Topics and words per topic with c-TFIDF scores for every word:",model_topics)

# Words of every topic exactly as BERTopic produced them (n-grams kept intact).
topics_org=[]
for topic_id, topic in model_topics.items():
  if topic_id == -1:
    continue
  topics_org.append([entry[0] for entry in topic])

print("\n\n Topics and words per topic:\n", topics_org)


def _ascii_tokens(term):
  """Split a topic term on whitespace and keep the tokens made only of [a-z0-9]."""
  return [token for token in term.split() if re.fullmatch(r"[a-z0-9]+", token)]


# topics for evaluation
# N-grams are broken into single words and anything containing characters
# outside [a-z0-9] is dropped, matching the tokens produced by the evaluation
# pre-processing.
# NOTE: this rebinds `topics` (until now the per-tweet topic ids returned by
# fit_transform) to a list of word lists; the evaluation cell reads this name.
print("\n\n Topics and words per topic (For Evaluation):")
topics=[]
for topic_id, topic in model_topics.items():
  if topic_id == -1:
    continue
  words_in_topic=[]
  for entry in topic:
    for token in _ascii_tokens(entry[0]):
      # Splitting bigrams re-introduces words already present as unigrams;
      # keep each word once so a topic is not scored against itself.
      if token not in words_in_topic:
        words_in_topic.append(token)
  if words_in_topic:
    topics.append(words_in_topic)

print(topics)
        

# **Evaluation**

In [None]:
# evaluating the topics using topic coherence
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

# creating BoW corpus for evaluation
def clean_data_for_eval(line, stop_words=None):
  r"""Normalise one raw tweet line into space-separated tokens for coherence evaluation.

  Steps, in order: lower-case, replace quotes and apostrophes with spaces,
  blank out ``&amp`` entities, drop @mentions, #hashtags and hyperlinks,
  replace literal ``\n`` escape sequences (backslash followed by ``n``), blank
  out every character outside ``[a-z0-9]``, and remove stop words.

  Args:
    line: one tweet as read from the input file.
    stop_words: iterable of words to drop. When omitted, the notebook-level
      ``stopwords`` list is used.

  Returns:
    The remaining tokens joined by single spaces; an empty string when no
    token is left.
  """
  if stop_words is None:
    stop_words = stopwords
  # A set gives constant-time membership tests instead of scanning the list per word.
  excluded = set(stop_words)

  line = line.lower()   # convert all characters to lower case
  # Split contractions at the apostrophe ("don't" -> "don t") instead of gluing them together.
  line = re.sub(r'[\'"]', " ", line)
  # Remove the entity wherever it appears; otherwise "r&amp;d" would later
  # degrade into the bogus token "amp" once punctuation is blanked out.
  line = re.sub(r'&amp(?![A-Za-z0-9]);?', ' ', line)
  line = re.sub(r"@[A-Za-z0-9_]+","", line) # removes name mentions
  line = re.sub(r"#[A-Za-z0-9_]+","", line) # removes hashtags
  line = re.sub(r'http\S+', '', line) # removes hyperlinks
  # Must run before the catch-all below, which would otherwise leave a stray "n".
  line = re.sub(r'\\n', ' ', line)
  line = re.sub(r"[^a-z0-9]"," ", line) # removes punctuation and everything else outside [a-z0-9]
  return " ".join(w for w in line.split() if w not in excluded)
  
# Re-read the raw tweets with the evaluation-specific cleaning. The context
# manager closes the file even on error; the explicit encoding keeps decoding
# independent of the platform default.
with open("/content/40k_all_tweets.txt", encoding="utf-8") as fr_eval:
  # Whitespace-only lines are skipped before cleaning.
  tweets = [clean_data_for_eval(line) for line in fr_eval if not line.isspace()]

# NOTE: `tweets` now holds the evaluation-cleaned documents and replaces the
# list that was used to fit the model.
tweets = [tweet for tweet in tweets if tweet]
# Show the size and a short sample instead of dumping the whole corpus.
print("Number of documents for evaluation:", len(tweets))
print(tweets[:10])

data = tweets

# Tokenised documents, the dictionary built from them and the bag-of-words corpus.
texts = [document.split() for document in tweets]
print(texts[:10])
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

# c_v coherence of the evaluation topic word lists against the tokenised tweets.
coherence_model = CoherenceModel(topics=topics, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_v')
coherence = coherence_model.get_coherence()
print("\nCoherence Score:",coherence)

# **Visualization of Topics**

In [None]:
#visualizing topics
# The call is the last expression of the cell, so whatever visualize_topics()
# returns is what the notebook renders as the cell output.

model.visualize_topics()