# LDA

In [None]:
import logging
# Configure root logging at INFO level; each record is rendered as
# "<timestamp> : <level name> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#### Documenten voorbereiden
De chunks worden aan het model gegeven

In [None]:
import pickle

# Absolute, machine-specific location of the pickled chunk collection.
chunks_file = "/home/nena-meijer/PyCharmMiscProject/topic_modelling/BERTopic/chunks/chunks_decoded.pkl"

# NOTE(review): unpickling can execute arbitrary code embedded in the file,
# so this must only ever be pointed at a pickle from a trusted source.
with open(chunks_file, mode='rb') as pickle_stream:
    docs = pickle.load(pickle_stream)

# Sanity check: how many documents were loaded ...
print(len(docs))
# ... and the first 500 characters of the first one.
print(docs[0][:500])

### Preprocess en vectorize de chunks

In [None]:
# Turn every raw document string into a filtered list of lowercase tokens.
from nltk.tokenize import RegexpTokenizer

# A token is any maximal run of word characters (letters, digits, underscore).
tokenizer = RegexpTokenizer(r'\w+')

# Lowercase and tokenize each document, replacing the string in place.
for idx, raw_text in enumerate(docs):
    docs[idx] = tokenizer.tokenize(raw_text.lower())

# Keep a token only if it is longer than one character and is not purely
# numeric (tokens that merely contain digits are kept).
docs = [
    [token for token in doc if len(token) > 1 and not token.isnumeric()]
    for doc in docs
]

In [None]:
# Replace every token by its lemma.
from nltk.stem.wordnet import WordNetLemmatizer

# NOTE(review): WordNet is an English lexical resource; confirm the chunks are
# English text, otherwise this step will leave most tokens unchanged.
lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

Bigrams zijn paren van twee aangrenzende woorden. Door bigrams te gebruiken krijgen we representaties zoals "machine_learning" in onze output, in plaats van alleen "machine" en "learning".

In [None]:
# Detect frequent bigrams and append them to each document.
from gensim.models import Phrases

# Learn bigrams from the tokenized documents, ignoring candidates seen fewer
# than 20 times. Only a single Phrases pass is run, so no trigrams are built.
bigram = Phrases(docs, min_count=20)

for doc in docs:
    # Phrases joins the two words of a detected bigram with '_'. The joined
    # tokens are added on top of the original unigrams, which stay in place.
    # The list is fully built before the document is extended.
    doc.extend([token for token in bigram[doc] if '_' in token])

In [None]:
# Build the vocabulary and prune rare and very common tokens.
from gensim.corpora import Dictionary

# Map every distinct token in the documents to an integer id.
dictionary = Dictionary(documents=docs)

# Drop tokens that occur in fewer than 20 documents or in more than 50% of
# all documents.
dictionary.filter_extremes(no_above=0.5, no_below=20)

In [None]:
# Convert every tokenized document into its bag-of-words vector
# (token id / count pairs) using the pruned dictionary.
corpus = list(map(dictionary.doc2bow, docs))

In [None]:
# Report the vocabulary size and the number of documents in the corpus.
print(f'Number of unique tokens: {len(dictionary)}')
print(f'Number of documents: {len(corpus)}')

In [None]:
# Train the LDA topic model on the bag-of-words corpus.
from gensim.models import LdaModel

# Training hyperparameters.
num_topics = 175
passes = 20
iterations = 400
chunksize = 2000
eval_every = None  # Skip perplexity evaluation during training; it is too slow.

# Looking up a single id is only there to "load" the dictionary, so that the
# id -> token mapping read on the next line is populated.
_first_token = dictionary[0]
id2word = dictionary.id2token

# NOTE(review): no random_state is passed, so repeated runs are not guaranteed
# to produce the same topics.
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    passes=passes,
    iterations=iterations,
    chunksize=chunksize,
    alpha=0.8,  # prior on the document-topic distribution
    eta=0.5,    # prior on the topic-word distribution
    eval_every=eval_every,
)

In [None]:
# Save the trained model to disk (absolute, machine-specific path).
# NOTE(review): presumably the target directory must already exist -- confirm.
model.save("/home/nena-meijer/PyCharmMiscProject/topic_modelling/LDA/models/training_5/lda_model.gensim")

In [None]:
# Print the topics returned by print_topics(), one per line.
# NOTE(review): called without arguments, gensim's print_topics() returns only
# a limited number of topics (20 by default, per the gensim docs), not all 175
# -- confirm that a subset is what is wanted here.
for topic in model.print_topics():
    print(topic)

In [None]:
from gensim.models.coherencemodel import CoherenceModel

# One coherence evaluator per measure. u_mass is built from the bag-of-words
# corpus only; c_v and c_npmi are additionally given the tokenized texts.
coherence_model_umass = CoherenceModel(
    model=model,
    corpus=corpus,
    dictionary=dictionary,
    coherence='u_mass',
)
coherence_model_cv = CoherenceModel(
    model=model,
    texts=docs,
    corpus=corpus,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_model_npmi = CoherenceModel(
    model=model,
    texts=docs,
    corpus=corpus,
    dictionary=dictionary,
    coherence='c_npmi',
)

# Each score is computed when get_coherence() is called, in this order.
print(f"u_mass Coherence: {coherence_model_umass.get_coherence()}")
print(f"c_v Coherence: {coherence_model_cv.get_coherence()}")
print(f"c_npmi Coherence: {coherence_model_npmi.get_coherence()}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Number of topics in the trained model and how many top words to show for each.
num_topics = model.num_topics
top_words_per_topic = 10

# First pass: print the top words of every topic with their weights.
for topic_num in range(num_topics):
    print(f"Topic #{topic_num}:")
    for word, weight in model.show_topic(topic_num, top_words_per_topic):
        print(f"  {word}: {weight:.4f}")

# Second pass (optional): one horizontal bar chart per topic.
for topic_num in range(num_topics):
    top_words = model.show_topic(topic_num, top_words_per_topic)
    words = [pair[0] for pair in top_words]
    weights = [pair[1] for pair in top_words]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(words, weights, color='skyblue')
    ax.set_title(f"Topic #{topic_num} - Top {top_words_per_topic} Words")
    ax.set_xlabel('Weight')
    plt.show()


In [None]:
# Compute the topic distribution of every document as a dense matrix.
#
# get_document_topics() yields, per document, a sparse list of
# (topic_id, probability) pairs and leaves out topics whose probability is
# below its threshold. Collecting only the probabilities therefore produces
# rows of different lengths whose positions do not correspond to topic ids,
# so treating column i as "topic i" is wrong and fails on ragged rows.
# Instead, every probability is written into the column given by its topic
# id; topics that gensim omits for a document stay at 0.
document_topics = model.get_document_topics(corpus, minimum_probability=0.0)

# Rows are documents (in corpus order), columns are topic ids.
topic_proportions = np.zeros((len(corpus), model.num_topics))
for doc_idx, doc_topics in enumerate(document_topics):
    for topic_id, probability in doc_topics:
        topic_proportions[doc_idx, topic_id] = probability

# Plot one line per topic: its proportion across all documents.
plt.figure(figsize=(10, 6))
for topic_id in range(topic_proportions.shape[1]):
    plt.plot(topic_proportions[:, topic_id], label=f"Topic #{topic_id}")

plt.title("Topic Distribution per Document")
plt.xlabel("Document Index")
plt.ylabel("Topic Proportion")
plt.legend()
plt.show()


In [None]:
# Build a one-line summary per topic: its top words joined by commas.
topic_summaries = []
for topic_num in range(num_topics):
    topic_summary = ', '.join(
        word for word, _weight in model.show_topic(topic_num, top_words_per_topic)
    )
    topic_summaries.append(f"Topic #{topic_num}: {topic_summary}")

# Print the summaries, one topic per line.
for summary in topic_summaries:
    print(summary)
