# LDA model

The code is partially based on: https://bennett-holiday.medium.com/a-step-by-step-guide-to-writing-an-lda-program-in-python-690aa99119ea

### Packages

In [None]:
import re
import numpy as np
import pandas as pd
from collections import Counter
from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel 
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim

import nltk
#nltk.download("stopwords") #uncomment if needed
from nltk.corpus import stopwords

### Get data

In [None]:
# Load the speech dataset from the CSV file into a DataFrame.
data = pd.read_csv("orban_speeches_en_thesis.csv")

In [None]:
# Take the 'tokenized_speech' column and drop rows where it is missing.
documents = data['tokenized_speech'].dropna()

### Preprocess

In [None]:
# Join all speeches into one space-separated string
# (each entry is passed through str() first, so non-string values do not break the join).

speeches_longstring = ' '.join(str(speech) for speech in documents.values)

In [None]:
# Remove the default English stopwords so that the frequency count
# can surface other frequent words beyond them.

stop_words_default = stopwords.words("english")
tokenize_words = speeches_longstring.split()
# Look tokens up in a set: the list would be scanned from the start for every token.
stop_words_default_set = set(stop_words_default)
filtered_speeches_lst = [w for w in tokenize_words if w not in stop_words_default_set]

In [None]:
# function to make a list of the most common n words
## if n parameter isn't specified by the user, it returns all of the words from the list

def freq_topn(str_list, n = None):
    """Return the `n` most common items of `str_list`, most frequent first.

    Only the items are returned; their counts are discarded.
    When `n` is None, every distinct item is returned.
    """
    ranked = Counter(str_list).most_common(n)
    return [item for item, _count in ranked]

In [None]:
# function to find tokens ending in ".hu" and make a list of them
    
def get_hu(str_list):
    """Return the distinct tokens of `str_list` that end in ".hu".

    Duplicates are removed. The result is sorted so that it is the same on
    every run; a plain list(set(...)) has no guaranteed order.
    """
    hu_set = {x for x in str_list if re.search(r'\.hu$', x)}
    return sorted(hu_set)

In [None]:
# function to remove specified stopwords

def preprocess_data(documents, stop_words):
    """Tokenize every document and drop the given stopwords.

    Args:
        documents: iterable of documents; each one is converted with str()
            and tokenized with gensim's simple_preprocess.
        stop_words: iterable of tokens to remove from the result.

    Returns:
        A list with one list of kept tokens per document.
    """
    # Build the lookup once; testing against a list rescans it for every token.
    stop_set = frozenset(stop_words)

    # Tokenize and remove stopwords
    texts = [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in documents
    ]

    return texts

# "doc.split()" and "re.split(" |'", doc)" were also tried instead of "simple_preprocess(str(doc))";
# "simple_preprocess(str(doc))" gave the best coherence score

In [None]:
# Define custom stopwords: the default English stopwords, plus the 95 most
# frequent words left after default-stopword removal, plus every token ending in ".hu".

stop_words_custom = stopwords.words("english") + freq_topn(filtered_speeches_lst, 95) + get_hu(filtered_speeches_lst)

In [None]:
# Tokenize every speech and strip the custom stopwords (one token list per speech).
processed_texts = preprocess_data(documents, stop_words_custom)

### LDA

In [None]:
# Alias for the preprocessed token lists. The bag-of-words (BoW) corpus itself
# is built from `texts` in the doc2bow step, once the dictionary exists.
texts = processed_texts

In [None]:
# Create Dictionary (a mapping between words and their integer IDs),
# built from the preprocessed token lists.
id2word = corpora.Dictionary(processed_texts)

In [None]:
# Build the corpus: convert each document's token list to its bag-of-words
# representation using the dictionary (term frequencies per document).
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
# Set number of topics.
# NOTE: the custom `labels` list used for the heatmap has 9 entries; keep the two in sync.
num_topics = 9

In [None]:
# Build LDA model (fixed random_state so the run can be repeated with the same seed)
lda_model = LdaModel(corpus=corpus, 
                     id2word=id2word, 
                     num_topics=num_topics, 
                     random_state=42) 
                     # Options that were considered but are currently left at their defaults:
                     #passes=10, # Number of passes through the corpus during training. Open question: what is the default? (gensim documents passes=1 - confirm for the installed version)
                     #alpha="auto", # prior selection strategy - "auto": learns an asymmetric prior from the corpus
                     #per_word_topics=True) # If True, the model also computes a list of topics, sorted in descending order of most likely topics for each word, along with their phi values multiplied by the feature length (i.e. word count).

In [None]:
# Pretty-print the keywords of the topics
# (print_topics is called without arguments, so gensim's defaults decide how many topics and words are shown).
pprint(lda_model.print_topics())

### Saving output

In [None]:
# Top 25 (word, probability) pairs for every topic of the fitted model.
# Requesting lda_model.num_topics instead of a hard-coded 20 keeps the output
# complete if the model is ever trained with more than 20 topics.
all_topics=lda_model.show_topics(num_topics=lda_model.num_topics, num_words=25, formatted=False)

In [None]:
# Iterate over the topics and collect two columns per topic:
# "<topic id>_word" (the top words) and "<topic id>_prob" (their probabilities).
topic_columns = {}
for topic_id, word_probs in all_topics:
    topic_columns[f"{topic_id}_word"] = [word for word, _ in word_probs]
    topic_columns[f"{topic_id}_prob"] = [prob for _, prob in word_probs]

# Build the DataFrame in one step instead of inserting columns into an empty frame.
all_topics_df = pd.DataFrame(topic_columns)

#print(all_topics_df)

In [None]:
# Save the df
from pathlib import Path

output_path = Path("Topic_outputs/Final/LDA_25words.xlsx")
# Make sure the nested output folder exists so the save does not fail on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
all_topics_df.to_excel(output_path)

### Coherence score

In [None]:
# For coherence score, the top 10 topic words are needed.
# lda_model.num_topics (rather than a hard-coded 20) guarantees every topic is included.
x=lda_model.show_topics(num_topics=lda_model.num_topics, num_words=10, formatted=False)
# Keep only the words; the probabilities are not needed here.
topic_list_10 = [[word[0] for word in topic[1]] for topic in x]

In [None]:
# Evaluate the model using the coherence score.
# The topics are passed explicitly as lists of their top words (not as the model itself),
# together with the tokenized texts and the dictionary; the measure used is "c_npmi".
coherence_model = CoherenceModel(topics=topic_list_10,
                                 texts=processed_texts,
                                 dictionary=id2word,
                                 coherence="c_npmi")

coherence = coherence_model.get_coherence()
print("Coherence Score: ", coherence)

### Topic diversity

In [None]:
# For topic diversity, the top 25 topic words are needed.
# lda_model.num_topics (rather than a hard-coded 20) guarantees every topic is included.
x=lda_model.show_topics(num_topics=lda_model.num_topics, num_words=25, formatted=False)
# Keep only the words; the probabilities are not needed here.
topic_list_25 = [[word[0] for word in topic[1]] for topic in x]

In [None]:
# Flatten the per-topic word lists into one list (duplicates across topics are kept).
topic_list_all = [word for topic in topic_list_25 for word in topic]

In [None]:
# Topic diversity = share of unique words among the top 25 words of all topics.
# Converting the list to a set removes the duplicates, so the ratio is 1.0
# when no word is shared between topics and smaller when topics overlap.

topic_diversity = len(set(topic_list_all))/len(topic_list_all)
print(topic_diversity)

### Custom labels

In [None]:
# Define the list of custom topic labels.
# These are used as the x/y tick labels of the topic-difference heatmap,
# so there should be one label per topic, in topic order.

labels = ["Támogatás & EU",
         "Kereszténység & siker",
         "Migráció & krízis",
         "Támogatás & választás",
         "Szerbia & nehézség",
         "Brüsszel & külföld",
         "Hosszútávú & növekedés",
         "Energia & fejlődés",
         "Nyugat & problémák"]

### Topic visualization

In [None]:
# Interactive topic visualization with pyLDAvis.
# Description of the parameters: https://pyldavis.readthedocs.io/en/latest/modules/API.html
# NOTE(review): R presumably limits the number of terms shown per topic and
# sort_topics=False presumably keeps the model's own topic order - confirm in the API description.

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, 
                              corpus, 
                              dictionary=lda_model.id2word,
                              R = 10,
                              plot_opts={'xlab': 'D1', 'ylab': 'D2'},
                              sort_topics=False)
# A bare expression as the last line of a cell makes the notebook render it.
vis

In [None]:
# Source: https://radimrehurek.com/gensim/auto_examples/howtos/run_compare_lda.html

def plot_difference(mdiff, labels=None, title="", annotation=None): # modification: labels parameter
    """Plot the difference between models as a heatmap.

    Uses plotly as the backend.

    Args:
        mdiff: matrix of differences; used as the heatmap's z values.
        labels: optional tick labels, applied to both the x and the y axis.
        title: title of the figure (empty by default).
        annotation: optional nested list with one (int_tokens, diff_tokens)
            pair per cell; when given, each pair is rendered as the hover
            text "+++ <int_tokens><br>--- <diff_tokens>".
    """
    import plotly.graph_objs as go
    import plotly.offline as py

    annotation_html = None
    if annotation is not None:
        annotation_html = [
            [
                "+++ {}<br>--- {}".format(", ".join(int_tokens), ", ".join(diff_tokens))
                for (int_tokens, diff_tokens) in row
            ]
            for row in annotation
        ]

    # modification: color scale and x=labels, y=labels
    heatmap = go.Heatmap(z=mdiff, colorscale= "viridis", text=annotation_html, x=labels, y=labels)
    # modification: axis labels are removed; the title parameter is now honoured
    # instead of always being replaced by an empty string
    layout = go.Layout(width=500, height=500, title=title)
    py.iplot(dict(data=[heatmap], layout=layout))


In [None]:
# Heatmap of the topics based on Kullback-Leibler Distance.
# The model is compared with itself, so the matrix holds the distance between
# every pair of its own topics; the annotation is shown as hover text.

mdiff, annotation = lda_model.diff(lda_model, distance='kullback_leibler')
plot_difference(mdiff, annotation=annotation, labels = labels)

## Archive

In [None]:
# This loop was used to optimize the model: it trains one LDA model for every
# topic count from 1 to 20 and prints the c_npmi coherence score of each.
# NOTE: running this cell rebinds `num_topics`, `lda_model`, `x` and `topic_list_10`,
# so afterwards they refer to the last (20-topic) model, not the final 9-topic one.

for i in range(1,21, 1):
    
    num_topics = i
    
    # Same settings as the final model; only the number of topics varies.
    lda_model = LdaModel(corpus=corpus, 
                         id2word=id2word, 
                         num_topics=num_topics, 
                         random_state=42) 
    
    # Top 10 words of each topic, as required for the coherence score.
    x=lda_model.show_topics(num_topics=20, num_words=10, formatted=False)
    topic_list_10 = [[word[0] for word in topic[1]] for topic in x]

    coherence_model_lda = CoherenceModel(topics=topic_list_10,
                                         #model=lda_model, 
                                         texts=processed_texts, 
                                         dictionary=id2word, 
                                         coherence="c_npmi")
    
    coherence_lda = coherence_model_lda.get_coherence()
    print(f"Number of topics: {i} -- Coherence Score: {coherence_lda}")