## BERTopic with "LDA setting"
Source: https://maartengr.github.io/BERTopic/index.html#installation

------------------------


In [None]:
import pandas as pd

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP

from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired

import re
from collections import Counter
import nltk
#nltk.download("stopwords")
from nltk.corpus import stopwords

import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess

### Get data

In [None]:
# Load the speech corpus; the path is relative to the current working directory.
data = pd.read_csv(filepath_or_buffer="orban_speeches_en_thesis.csv")

In [None]:
# Raw speeches as a plain Python list; rows whose 'speech' cell is empty (NaN)
# are skipped.

docs = data.loc[:, "speech"].dropna().to_list()

In [None]:
# Keep only the rows where BOTH text columns are present and rebuild the two
# parallel lists from those same rows. Dropping NaNs from each column on its
# own can leave `docs` and `docs_tokenized` with different lengths or shifted
# positions; the model is later fitted on `docs_tokenized` together with
# embeddings computed from `docs`, so the two lists must match row for row.

paired_rows = data.dropna(subset=['speech', 'tokenized_speech'])
docs = paired_rows['speech'].tolist()
docs_tokenized = paired_rows['tokenized_speech'].tolist()

### Split docs to sentences

In [None]:
# uncomment if needed

#docs_sentences = []

#for doc in docs:
    #docs_sentences.append(doc.split(". "))
    
#print(docs_sentences[0])

### Preprocess functions

In [None]:
# function to make a list of the most common n words
## if n parameter isn't specified by the user, it returns all of the words from the list

def freq_topn(str_list, n=None):
    """Return the items of ``str_list`` ordered from most to least frequent.

    Args:
        str_list: iterable of hashable items (e.g. word tokens) to count.
        n: how many of the most common items to keep; ``None`` keeps all.

    Returns:
        A list with the items only (the counts are discarded), most common
        first.
    """
    ranked = Counter(str_list).most_common(n)
    return [item for item, _count in ranked]

In [None]:
# function to find tokens ending in ".hu" and make a list of them
    
def get_hu(str_list):
    """Collect the distinct tokens of ``str_list`` that end in ".hu".

    Args:
        str_list: iterable of string tokens.

    Returns:
        A list holding each matching token once. The tokens pass through a
        set, so the order of the result is not meaningful.
    """
    unique_hu = set()
    for token in str_list:
        if re.search(r'\.hu$', token):
            unique_hu.add(token)
    return list(unique_hu)

In [None]:
# function to remove specified stopwords

def preprocess_data(documents, stop_words):
    """Tokenize every document and drop the given stop words.

    Args:
        documents: iterable of documents; each one is converted with ``str()``
            before being tokenized with gensim's ``simple_preprocess``.
        stop_words: collection of hashable tokens (strings) to leave out.

    Returns:
        A list with one token list per document, in input order.
    """
    # Build the lookup once: membership tests on a set are O(1), whereas a
    # list argument would be scanned from the start for every single token.
    excluded = frozenset(stop_words)

    return [
        [word for word in simple_preprocess(str(doc)) if word not in excluded]
        for doc in documents
    ]

### Preprocess: discover custom stop words

In [None]:
# Join all tokenized speeches into one long, space-separated string.
# Every entry is passed through str() first, so non-string cells cannot break the join.

speeches_longstring = ' '.join(str(speech) for speech in docs_tokenized)

In [None]:
# Remove the default English stop words so that the most frequent remaining
# words (candidates for custom stop words) become visible.

stop_words_default = stopwords.words("english")
tokenize_words = speeches_longstring.split()

# Look the words up in a set: the corpus is checked token by token, and a set
# answers each membership test in O(1) instead of scanning the whole list.
stop_words_default_set = frozenset(stop_words_default)

filtered_speeches_lst = [w for w in tokenize_words if w not in stop_words_default_set]

In [None]:
# Custom stop-word list = default English stop words
#                       + the 95 most frequent remaining corpus words
#                       + every distinct token ending in ".hu"
# NOTE(review): tokens such as "index.hu" contain a dot; if the vectorizer
# splits tokens on punctuation these entries may never match - confirm.

stop_words_custom = [
    *stopwords.words("english"),
    *freq_topn(filtered_speeches_lst, 95),
    *get_hu(filtered_speeches_lst),
]

### Embeddings

In [None]:
# Compute the document embeddings once up front and hand them to BERTopic,
# so they do not have to be recalculated on every model fit.

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(
    docs,
    show_progress_bar=True,
)

### Removing stopwords
It is important to keep all the words when embedding! The vectorizer below lets the model remove the stop words later, when it creates the topic representations.

In [None]:
# Bag-of-words vectorizer that ignores the custom stop words
vectorizer_model = CountVectorizer(
    stop_words=stop_words_custom,
)

### Dimensionality Reduction

In [None]:
# UMAP with a fixed random state, for the sake of reproducibility.
# NOTE(review): only random_state is set here, so every other UMAP parameter
# takes the umap library default - confirm this is the intended configuration.

umap_model = UMAP(random_state=42)

### Topic model

In [None]:
topic_model = BERTopic(
    # hyperparameters
    top_n_words=25,    # number of returned topic words
    nr_topics=10,      # number of topics to return
    low_memory=False,  # set to True if your computer needs it

    # pipeline models
    umap_model=umap_model,

    # remove stop words
    vectorizer_model=vectorizer_model,
)

In [None]:
# Fit the model on the tokenized speeches, reusing the pre-computed embeddings

topics, probs = topic_model.fit_transform(
    documents=docs_tokenized,
    embeddings=embeddings,
)

In [None]:
# Return generated topics

#topic_model.get_topic_info()

In [None]:
# list of the topic words, outlier topic excluded (list of list of str)
# The outlier topic is removed by its id (-1) rather than by row position:
# slicing off the first row would silently drop a real topic whenever the
# model produced no outliers.

topic_info = topic_model.get_topic_info()
topic_list = topic_info.loc[topic_info["Topic"] != -1, "Representation"].to_list()

### Saving output

In [None]:
# Mapping of topic id -> sequence of (word, score) pairs for every topic
all_topics = topic_model.get_topics()

In [None]:
# Build one pair of columns per topic: "<topic>_word" holds the topic's words
# and "<topic>_prob" the matching scores, in the order they appear in all_topics.
# The scores are collected without reusing the name `probs`, so the
# document probabilities returned by fit_transform are not overwritten.
topic_columns = {}
for topic_id, word_scores in all_topics.items():
    topic_columns[f"{topic_id}_word"] = [pair[0] for pair in word_scores]
    topic_columns[f"{topic_id}_prob"] = [pair[1] for pair in word_scores]

# Create the frame in one step instead of growing it column by column
all_topics_df = pd.DataFrame(topic_columns)

#print(all_topics_df)

In [None]:
# Save the df
from pathlib import Path

output_path = Path("Topic_outputs/Final/BERT_LDA_25words.xlsx")
# The output folders are created first so the export cannot fail on a missing
# directory (e.g. on a fresh checkout).
output_path.parent.mkdir(parents=True, exist_ok=True)
all_topics_df.to_excel(output_path)

### Coherence score
**<span style="color: crimson"> Important! For this part, top_n_words must be set to 10 in the model!</span>**

In [None]:
# Tokenize the speeches and strip the custom stop words (one token list per speech)
processed_texts = preprocess_data(
    documents=docs_tokenized,
    stop_words=stop_words_custom,
)

In [None]:
# Build the gensim Dictionary, i.e. the mapping between words and their integer IDs
id2word = corpora.Dictionary(documents=processed_texts)

In [None]:
# Evaluate the model using the NPMI coherence score of the topic word lists
coherence_model = CoherenceModel(
    topics=topic_list,
    texts=processed_texts,
    dictionary=id2word,
    coherence="c_npmi",
)

coherence = coherence_model.get_coherence()
print("Coherence Score: ", coherence)

### Topic diversity
**<span style="color: crimson"> Important! For this part, top_n_words must be set to 25 in the model!</span>**

In [None]:
# Flatten the per-topic word lists into a single list (duplicates are kept)
topic_list_all = []
for topic in topic_list:
    topic_list_all.extend(topic)

In [None]:
# topic diversity = share of unique words among all the top words of the topics
# (turning the list into a set removes the duplicates)

unique_word_count = len(set(topic_list_all))
total_word_count = len(topic_list_all)
topic_diversity = unique_word_count / total_word_count
print(topic_diversity)

### Set topic labels

In [None]:
# Hand-written labels for topics 0-8, keyed by topic id
topic_model.set_topic_labels({
    0: "Sikeres fejlődés",
    1: "Európa",
    2: "Orosz-ukrán háború",
    3: "Hazafiság",
    4: "Covid",
    5: "Egyház",
    6: "Sport",
    7: "Kína",
    8: "Ipar",
})

### Topic visualizations

In [None]:
# Topic overview plot; kept as a bare expression so the figure is shown as cell output
topic_model.visualize_topics(
    custom_labels=True,
    title="",
    width=600,
    height=600,
)

In [None]:
# Topic heatmap; kept as a bare expression so the figure is shown as cell output
topic_model.visualize_heatmap(
    custom_labels=True,
    title="",
    width=550,
    height=450,
)

In [None]:
# Document plot built from the pre-computed embeddings; kept as a bare
# expression so the figure is shown as cell output
topic_model.visualize_documents(
    docs_tokenized,
    embeddings=embeddings,
    custom_labels=True,
    #hide_annotations=True,
    title="",
    width=800,
    height=800,
)

In [None]:
# Topic hierarchy plot; kept as a bare expression so the figure is shown as cell output
topic_model.visualize_hierarchy(
    custom_labels=True,
    title="",
    width=700,
)

### Saving model (serialization)
https://maartengr.github.io/BERTopic/getting_started/serialization/serialization.html

In [None]:
from pathlib import Path

model_path = "Modellek/bertopic_lda"
# Create the target folder first so saving cannot fail on a missing directory.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
# NOTE(review): pickle files are tied to the library versions used when saving;
# consider the other serialization options described on the linked page - confirm.
topic_model.save(model_path, serialization="pickle")