## BERTopic with optimal setting
Source: https://maartengr.github.io/BERTopic/index.html#installation

------------------------


In [None]:
import pandas as pd

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP

from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from hdbscan import HDBSCAN

import re
from collections import Counter
import nltk
#nltk.download("stopwords")
from nltk.corpus import stopwords

import gensim.corpora as corpora
from gensim.models import CoherenceModel 

### Get data

In [None]:
# Load the speech corpus; its "speech" and "tokenized_speech" columns are read below
data = pd.read_csv(filepath_or_buffer="orban_speeches_en_thesis.csv")

In [None]:
# Keep only the non-missing entries of the "speech" column, as a plain Python list
docs = data["speech"].dropna().to_list()

In [None]:
# Keep only the non-missing entries of the "tokenized_speech" column, as a plain Python list
# NOTE(review): missing values are dropped per column, so this list is only row-aligned
# with `docs` (and the embeddings computed from it) if both columns are missing in the
# same rows -- confirm against the CSV.
docs_tokenized = data["tokenized_speech"].dropna().to_list()

### Preprocess functions

In [None]:
# remove default stopwords

def preprocess_data(documents, stop_words):
    """Split each document on whitespace and drop the stop words.

    Args:
        documents: Iterable of strings; each one is split with ``str.split()``.
        stop_words: Iterable of tokens to remove. Matching is exact and
            case-sensitive.

    Returns:
        A list with one list of remaining tokens per document, in input order.
    """
    # Materialise the stop words once: set membership is O(1) per token (a list is
    # scanned for every token), and a one-shot iterator is no longer exhausted by
    # the first lookup.
    stop_word_set = frozenset(stop_words)

    return [
        [word for word in doc.split() if word not in stop_word_set]
        for doc in documents
    ]

### Corpus-specific stopwords
based on the topic representations

In [None]:
stopwords_spec = [
    # filler words and auxiliaries
    "also", "well", "like", "would", "must",
    # contractions whose apostrophe is missing
    "weve", "theres", "theyre", "dont", "isnt",
    # meta tokens
    "sz00f6vegtestchar",
    "sz00f6vegtest",
    # "normalchar",  (meta token, currently not filtered)
    "span",
    # speaker monograms
    "pcs",  # interviewer Péter Csermely
    "gik",  # interviewer Gábor István Kiss
    "vo",  # prime minister Viktor Orbán
]

In [None]:
# Full stop-word list: the corpus-specific words followed by NLTK's English list.
# (The spelling `sopwords_custom` is kept because later cells refer to this name.)
sopwords_custom = [*stopwords_spec, *stopwords.words("english")]

### Embeddings

In [None]:
# Encode the documents once up front so the embeddings can be handed to BERTopic
# instead of being recomputed on every fit
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(
    docs,
    show_progress_bar=True,
)

### Removing stopwords
It is important to include all the words when embedding! The methods below let the model remove the stopwords later, when it creates the topic representations.

In [None]:
# Option 1: CountVectorizer -- drops the listed stop words from the vocabulary
vectorizer_model = CountVectorizer(
    stop_words=sopwords_custom,
)

In [None]:
# Option 2: ClassTfidfTransformer -- reduces the weight of frequent words in the representations
ctfidf_model = ClassTfidfTransformer(
    reduce_frequent_words=True,
)

In [None]:
# 3. option: KeyBERT-Inspired model -- in theory reduces the occurrence of frequent words and improves the representations

#representation_model = KeyBERTInspired()

### Dimensionality Reduction

In [None]:
# Tuned UMAP settings for the dimensionality-reduction step
umap_model = UMAP(
    # fixed seed, for the sake of reproducibility
    random_state=42,
    metric="cosine",
    # neighbouring sample points used for the manifold approximation
    # (higher --> larger clusters)
    n_neighbors=15,
    # dimensionality of the reduced embeddings
    # (too high --> hard to cluster, too low --> too little information)
    n_components=5,
)

### Clustering

In [None]:
# Tuned HDBSCAN: the minimum cluster size is used to indirectly reduce the number of topics
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
)

### Topic model

In [None]:
# Assemble the BERTopic pipeline from the components configured in the cells above.
topic_model = BERTopic(
    # pipeline models
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    #embedding_model=embedding_model, #uncomment if representation_model is used
    
    # remove stop words
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    #representation_model=representation_model,
    
    # hyperparameters
    top_n_words = 10, # number of returned topic words
    # NOTE(review): per the BERTopic docs, n_gram_range is not used when a custom
    # CountVectorizer is passed; vectorizer_model above is created without an
    # ngram_range, so two-word phrases are presumably not produced -- confirm.
    n_gram_range = (1, 2), # intended: possibility to return two-word phrases
    # NOTE(review): per the BERTopic docs, min_topic_size is not used when a custom
    # hdbscan_model is passed (its min_cluster_size applies instead) -- confirm.
    min_topic_size = 5, # minimum size of a topic, not a number of topics (default = 10)
    nr_topics = 'auto', # presumably lets BERTopic reduce the number of topics automatically -- confirm
    low_memory = False, # set to True if your computer needs it
)

In [None]:
# Fit the model on the tokenized documents, reusing the pre-computed embeddings
topics, probs = topic_model.fit_transform(
    documents=docs_tokenized,
    embeddings=embeddings,
)

In [None]:
# Display the summary table of the generated topics (output of get_topic_info())

topic_model.get_topic_info()

In [None]:
# Topic words per topic (list of list of str), outlier topic excluded.
# The outlier topic is filtered by its id (-1) rather than by position, so topic 0
# is not dropped by mistake when the fit produced no outlier topic.
topic_list = (
    topic_model.get_topic_info()
    .query("Topic != -1")["Representation"]
    .to_list()
)

### Saving output

In [None]:
# Topic representations keyed by topic id; each value is iterated below as (word, score) pairs
all_topics = topic_model.get_topics()

In [None]:
# Build a wide table with one "<topic id>_word" / "<topic id>_prob" column pair per topic.
# The columns are collected in a dict first and turned into a DataFrame in one step,
# and the loop uses its own names so that the `probs` returned by fit_transform
# is not overwritten.
topic_columns = {}
for topic_id, word_scores in all_topics.items():
    topic_columns[f"{topic_id}_word"] = [word for word, _ in word_scores]
    topic_columns[f"{topic_id}_prob"] = [score for _, score in word_scores]

all_topics_df = pd.DataFrame(topic_columns)

print(all_topics_df)

In [None]:
# Write the topic table to an Excel workbook
all_topics_df.to_excel(excel_writer="Topic_outputs/BERTopic_opt_best_v2.xlsx")

### Coherence score
**<span style="color: crimson"> Important! For this part, top_n_words must be set to 25 in the model!</span>**

In [None]:
# Tokenize the documents and strip the custom stop words
processed_texts = preprocess_data(
    documents=docs_tokenized,
    stop_words=sopwords_custom,
)

In [None]:
# gensim Dictionary: maps every word of the processed texts to an integer ID
id2word = corpora.Dictionary(documents=processed_texts)

In [None]:
# Score the topic words against the processed texts with the "c_npmi" coherence measure
coherence_model = CoherenceModel(
    coherence="c_npmi",
    dictionary=id2word,
    texts=processed_texts,
    topics=topic_list,
)
coherence = coherence_model.get_coherence()

print("Coherence Score: ", coherence)

### Topic diversity
**<span style="color: crimson"> Important! For this part, top_n_words must be set to 25 in the model!</span>**

In [None]:
# Flatten the per-topic word lists into one list of words (duplicates kept)
topic_list_all = []
for topic_words in topic_list:
    topic_list_all.extend(topic_words)

In [None]:
# Topic diversity: share of distinct words among all top words of the topics.
# Turning the list into a set collapses the duplicates.
unique_word_count = len(set(topic_list_all))
total_word_count = len(topic_list_all)

topic_diversity = unique_word_count / total_word_count
print(topic_diversity)

### Set topic labels

In [None]:
# Custom display labels; the position in the list is the topic id (0-15).
# NOTE(review): the labels presumably describe the topics of one specific fit --
# re-check them whenever the model is refitted.
topic_model.set_topic_labels(
    dict(
        enumerate(
            [
                "Általános",
                "Covid",
                "Törökország & Egyiptom",
                "Egyház",
                "Igazságszolgáltatás",
                "Választás",
                "Kína",
                "Európa",
                "Ukrajna",
                "Olimpia",
                "Orosz-ukrán háború",
                "Orosz együttműködés",
                "Sport & egyetem",
                "Általános lakossági",
                "Nyugdíj",
                "Településfejlesztés",
            ]
        )
    )
)

### Topic visualizations

In [None]:
# Topic plot without a title; custom_labels=True is left disabled here
topic_model.visualize_topics(title="", width=600, height=700)

In [None]:
# Topic heatmap, using the custom topic labels
topic_model.visualize_heatmap(
    title="",
    width=700,
    height=600,
    custom_labels=True,
)

In [None]:
# Document plot built from the pre-computed embeddings, using the custom topic labels
topic_model.visualize_documents(
    docs_tokenized,
    embeddings=embeddings,
    custom_labels=True,
    title="",
    width=900,
    height=800,
    # hide_annotations=True,
)

In [None]:
# Topic hierarchy plot, using the custom topic labels
topic_model.visualize_hierarchy(
    title="",
    width=800,
    custom_labels=True,
)

### Saving model (serialization)
https://maartengr.github.io/BERTopic/getting_started/serialization/serialization.html

In [None]:
# Persist the fitted model with pickle serialization.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
topic_model.save(
    path="Modellek/bertopic_opt",
    serialization="pickle",
)