Installing libraries

In [None]:
!pip install gensim
!pip install bertopic

Loading libraries

In [None]:
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import numpy  as np
import string
import re
import pickle

#LDA
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim import corpora, models
from gensim.models import Phrases
from gensim.corpora import Dictionary

import matplotlib.pyplot as plt

# BERTopic
from bertopic import BERTopic


In [None]:
# Loading preprocessed data in.
# The later cells read the 'clean_text' column of this frame.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
df_clean = pd.read_pickle("data_randomized_complete_cleaned.pkl")

NameError: name 'pd' is not defined

## LDA topic modelling

Feature selection

In [None]:
# Vectorization tf_idf
# Terms must occur in at least 2 documents and in at most 40% of them; rows are L2-normalised.
vectorizer = TfidfVectorizer(min_df=2, max_df=0.4, norm='l2')
X = vectorizer.fit_transform(df_clean['clean_text'])

# Keep the document-term matrix sparse: X.toarray() would allocate a dense
# (n_documents x n_terms) array, which does not scale to a large corpus.
tf_idf = pd.DataFrame.sparse.from_spmatrix(X, columns=vectorizer.get_feature_names_out())

In [None]:
# Take an independent copy of the cleaned text and turn the original index into
# a regular column, so `responses` is a DataFrame with a fresh 0..n-1 index.
responses = df_clean['clean_text'].copy().reset_index()
responses

In [None]:
  # Tokenization of the text
# Replace every text in the 'clean_text' column by the token list returned by word_tokenize.
# NOTE(review): word_tokenize is called with its default settings, while the BERTopic
# cell declares the texts as Dutch — confirm the default tokenizer language is appropriate.
# NOTE(review): the column is overwritten in place, so re-running this cell would pass
# token lists (not strings) to word_tokenize; re-run the copy cell first.
responses['clean_text'] =  responses['clean_text'].apply(word_tokenize)
responses['clean_text'].head()

LDA with a maximum of 100 topics, without grid search

In [None]:
#Function for calculating coherence scores
def coherence_values(dictionary, corpus, texts, limit, start, step, random_state=100):
    """Train one LDA model per candidate topic count and score it with c_v coherence.

    Parameters
    ----------
    dictionary : gensim Dictionary mapping token ids to tokens.
    corpus : bag-of-words corpus built with ``dictionary``.
    texts : tokenized documents used as the coherence reference.
    limit, start, step : the topic counts tried are ``range(start, limit, step)``
        (``limit`` is exclusive).
    random_state : seed handed to every LdaModel so repeated runs give the same
        models; defaults to the seed used by the grid-search variant.

    Returns
    -------
    (model_list, scores) : the trained models and their coherence scores, both in
        the order of ``range(start, limit, step)``. Both lists are empty when the
        range is empty.
    """
    model_list = []
    # Named `scores` rather than `coherence_values` so the function name is not shadowed.
    scores = []
    for num_topics in range(start, limit, step):
        print('Calculating for',num_topics,'topics')
        # id2word lets the returned models render topics as words instead of ids
        # and keeps this call consistent with the grid-search variant.
        model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                         alpha=0.01, eta=0.1, random_state=random_state)
        model_list.append(model)

        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        scores.append(coherencemodel.get_coherence())
    return model_list, scores

    #create dictionary
id2word = corpora.Dictionary(responses['clean_text'])


# Drop tokens that occur in fewer than 10 documents or in more than 40% of the documents.
# You can experiment with different values here.
id2word.filter_extremes(no_below=10, no_above=0.4)

# Create the bag-of-words corpus from the tokenized responses
corpus = [id2word.doc2bow(doc) for doc in responses['clean_text']]

# Candidate topic counts are range(start, limit, step), i.e. 1, 3, 5, ..., 99
start = 1
limit = 100
step = 2

# NOTE: this assignment rebinds the name `coherence_values` from the function to the
# returned list of scores; the function has to be defined again before another call.
model_list, coherence_values = coherence_values(dictionary=id2word, corpus=corpus, texts=responses['clean_text'], start=start, limit=limit, step=step)

In [None]:
# Plotting the coherence values against the number of topics
x = range(start, limit, step)
plt.figure(figsize=(10,10))
# Label the line itself: passing the bare string ("coherence_values") to legend()
# is not a one-element tuple, so the legend was built from its individual characters.
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

In [None]:
# Report the coherence score obtained for every candidate number of topics
score_by_topic_count = zip(x, coherence_values)
for n_topics, score in score_by_topic_count:
    rounded_score = round(score, 4)
    print("Num Topics =", n_topics, " has Coherence Value of", rounded_score)

In [None]:
# Pair every candidate number of topics with its coherence value,
# ordered from the highest to the lowest coherence
topics_coherence = sorted(zip(x, coherence_values), key=lambda pair: pair[1], reverse=True)

# Number of best-scoring configurations to report. The slice has always taken six
# entries, although the earlier comments and variable name spoke of a "top three".
top_n = 6
top_results = topics_coherence[:top_n]

# Print the best results
for m, cv in top_results:
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Generate model output for 7 topics.
# alpha and eta are the priors the coherence scan trained with, so the topics shown
# belong to the configuration that was scored; the fixed seed makes the output
# reproducible across runs.
lda_model = LdaModel(corpus=corpus, num_topics=7, id2word=id2word,
                     alpha=0.01, eta=0.1, random_state=100)
lda_model.print_topics()

In [None]:
# Generate model output for 11 topics.
# alpha and eta are the priors the coherence scan trained with, so the topics shown
# belong to the configuration that was scored; the fixed seed makes the output
# reproducible across runs.
lda_model = LdaModel(corpus=corpus, num_topics=11, id2word=id2word,
                     alpha=0.01, eta=0.1, random_state=100)
lda_model.print_topics()

In [None]:
# Taking subset for gridsearch, due to computational constraints.
# Draws 50,000 rows of `responses`; random_state=42 makes the draw repeatable.
# NOTE(review): presumably `responses` holds at least 50,000 rows — verify, since
# sampling more rows than exist without replacement is expected to fail.
sampled_responses_fixed = responses.sample(n=50000, random_state=42)

In [None]:
# Grid-search helper: c_v coherence for every (num_topics, alpha, eta) combination
def coherence_values(dictionary, corpus, texts, start, limit, step, alpha_values, eta_values):
    """Train and score an LDA model for each point of the hyper-parameter grid.

    The grid is the product of ``range(start, limit, step)`` (topic counts),
    ``alpha_values`` and ``eta_values``, visited in that nesting order. Every
    model is trained with ``random_state=100`` and scored with c_v coherence
    against ``texts``.

    Returns a 3-tuple of parallel lists: the trained models, their coherence
    scores, and ``(num_topics, alpha, eta, coherence)`` records.
    """
    trained_models = []
    scores = []
    records = []

    # Lazily enumerate the grid in the same order as three nested loops would.
    grid = (
        (n_topics, alpha, eta)
        for n_topics in range(start, limit, step)
        for alpha in alpha_values
        for eta in eta_values
    )

    for n_topics, alpha, eta in grid:
        print(f'Calculating for {n_topics} topics, alpha={alpha}, eta={eta}')

        # Fit the LDA model for this grid point
        lda = LdaModel(
            corpus=corpus,
            num_topics=n_topics,
            id2word=dictionary,
            alpha=alpha,
            eta=eta,
            random_state=100,
        )
        trained_models.append(lda)

        # Score it
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        score = scorer.get_coherence()
        scores.append(score)
        records.append((n_topics, alpha, eta, score))

    return trained_models, scores, records

# Dictionary and corpus over the full set of tokenized responses
# (no extreme-frequency filtering is applied here).
id2word = corpora.Dictionary(responses['clean_text'])

#Create corpus
corpus = [id2word.doc2bow(doc) for doc in responses['clean_text']]

# Run the grid search on the 50,000-response sample that was drawn for exactly this
# purpose; it was previously created but never used, so the search ran on all data.
# A plain list avoids any dependence on the sample's shuffled index.
grid_texts = sampled_responses_fixed['clean_text'].tolist()
grid_corpus = [id2word.doc2bow(doc) for doc in grid_texts]

# Specified alpha and eta values for the grid search
alpha_values = [0.01, 0.1, 0.5, 'asymmetric']
eta_values = [0.01, 0.5, 0.1, 'auto']

# Defined the range of topic numbers: 1, 2, ..., 19
start, limit, step = 1, 20, 1

# Call the coherence_values function with alpha and eta grid search
model_list, coherence_scores, configurations = coherence_values(
    dictionary=id2word, corpus=grid_corpus, texts=grid_texts,
    start=start, limit=limit, step=step, alpha_values=alpha_values, eta_values=eta_values
)

# Print out the coherence scores and parameter configurations
for config in configurations:
    print("Num Topics:", config[0], "Alpha:", config[1], "Eta:", config[2], "Coherence:", round(config[3], 4))


In [None]:
# Getting the topics for the best coherence value.
# random_state=100 is the seed the grid search trains with; without it this model is
# initialised differently on every run and its topics are not reproducible.
lda_model = LdaModel(corpus=corpus, num_topics=3, id2word=id2word,
                     alpha=0.1, eta=0.01, random_state=100)
lda_model.print_topics()

## BERTopic modelling

In [None]:
# Converting the cleaned texts into a plain list of strings for BERTopic
documents = df_clean['clean_text'].tolist()

# Show only a short preview; echoing the whole list would flood the cell output
documents[:5]

In [None]:
# Create a BERTopic model with a multilingual sentence-embedding model.
# min_topic_size=40 and nr_topics='auto' are passed straight to BERTopic.
# NOTE(review): no seed is set anywhere in this cell — presumably the resulting
# topics can differ between runs; confirm whether reproducibility is needed.
topic_model = BERTopic(embedding_model='paraphrase-Multilingual-MiniLM-L12-v2', language='Dutch', min_topic_size=40, nr_topics='auto')


# Alternative embedding model that was considered: 'paraphrase-MiniLM-L6-v2'
# Fit the model to the documents; returns one topic assignment per document
# together with the associated probabilities
topics, probabilities = topic_model.fit_transform(documents)

# Get an overview of the topics
topic_info = topic_model.get_topic_info()

In [None]:
# Print the topics found by the model
print(topic_model.get_topic_info())

# Retrieve words for each topic.
# Iterate over the real topic ids instead of range(len(...)): the topic mapping can
# contain the outlier id -1, in which case counting from 0 skips -1 and asks for one
# id past the last existing topic.
for i in sorted(topic_model.get_topics()):
    print(f"Topic {i}'s top words: {topic_model.get_topic(i)}")

In [None]:
# Visualize the topics to judge how much they overlap before deciding
# whether (and how far) to reduce their number
topic_model.visualize_topics()

In [None]:
# After inspecting the visualization we decided to reduce to 49 topics, because many
# topics were located in the same area of the plot.
# The call is made for its effect on topic_model; its return value is not kept.
# NOTE(review): `topics` from fit_transform is not refreshed here — presumably it
# still holds the pre-reduction assignments; verify before reusing it.
topic_model.reduce_topics(documents, nr_topics=49)

# Examine the new topic information
bertopic_info_df = topic_model.get_topic_info()
bertopic_info_df

In [None]:
# Extracting words from topics: {topic id: [(word, score), ...]} -> {topic id: [word, ...]}
topics = topic_model.get_topics()
words_per_topic = {topic: [word for word, _ in word_scores] for topic, word_scores in topics.items()}

# Word lists of the real topics only.
# Exclude -1 because it represents outlier class; the same filtered lists feed both
# the dictionary and the corpus so the two stay consistent.
topic_word_lists = [words for topic, words in words_per_topic.items() if topic != -1]

# Create a dictionary from the topic words
dictionary = corpora.Dictionary(topic_word_lists)

# Convert words to BOW format.
# Stored under its own name so the document corpus used by the LDA cells
# (`corpus`) is not overwritten.
bertopic_corpus = [dictionary.doc2bow(words) for words in topic_word_lists if words]

In [None]:
# Extract the top words for each topic, skipping the -1 outlier topic,
# empty placeholder words and topics left without any word
topic_words = [[word for word, _ in topic_model.get_topic(topic) if word] for topic in topics if topic != -1]
topic_words = [words for words in topic_words if words]

# Coherence has to be measured against the documents, not against the topic word
# lists themselves (the previous texts=<topic words> only compared topics with topics).
# Tokenize with the analyzer of the model's own vectorizer so the tokens line up
# with the vocabulary the topic words were drawn from.
analyzer = topic_model.vectorizer_model.build_analyzer()
tokenized_docs = [analyzer(doc) for doc in documents]
coherence_dictionary = corpora.Dictionary(tokenized_docs)

# Retrieving coherence values
coherence_model = CoherenceModel(topics=topic_words, texts=tokenized_docs,
                                 dictionary=coherence_dictionary, coherence='c_v')
coherence_values = coherence_model.get_coherence()

In [None]:
# Display the coherence value of the BERTopic model.
# At this point `coherence_values` holds the result of get_coherence() from the
# previous cell, not the list of LDA scores the name referred to earlier.
coherence_values