In [None]:
from bertopic import BERTopic
import pandas as pd
import torch
from thinc.api import set_gpu_allocator, require_gpu

# Choose the compute device; the GPU branch also configures thinc before GPU 0 is required.
if not torch.cuda.is_available():
    device = "cpu"
    print("CPU wordt gebruikt")  # No GPU visible to PyTorch: fall back to the CPU
else:
    set_gpu_allocator("pytorch")  # Configure thinc's GPU allocator with the "pytorch" setting
    require_gpu(0)  # Require GPU 0
    device = "cuda"
    print("GPU wordt gebruikt!")  # GPU is used

# Release memory cached by PyTorch's CUDA allocator.
torch.cuda.empty_cache()

import cupy as cp

# Force CuPy's default memory pool to free all of its blocks.
cp.get_default_memory_pool().free_all_blocks()

## Documenten voorbereiden

In [None]:
import pandas as pd

# Read the CSV with the preprocessed results.
df = pd.read_csv("/home/nena-meijer/PyCharmMiscProject/event_extraction/GEMINI_RESULTS_preprocessed.csv")

# Keep only the rows whose 'summary' is an actual string.
filtered_df = df[df['summary'].apply(lambda x: isinstance(x, str))]

# Parse the dates; values that cannot be parsed become NaT instead of raising.
parsed_dates = pd.to_datetime(filtered_df['document_date_y'], errors='coerce')

# Drop the rows without a valid date. The same mask is applied to the ids,
# the texts and the dates so the three lists stay aligned element by element
# (previously `document_ids` was taken before this filter and kept the
# dropped rows).
valid_mask = parsed_dates.notna()
valid_df = filtered_df[valid_mask]

document_ids = valid_df['document_id'].tolist()
document_text = valid_df['summary'].tolist()
document_dates = parsed_dates[valid_mask].tolist()

# Sanity check: all three lists must describe the same documents.
assert len(document_ids) == len(document_text) == len(document_dates)
print(len(document_text), len(document_dates))

## Chunk de documenten

In [None]:
from transformers import AutoTokenizer
import pickle

# Load the tokenizer (the same checkpoint name is used for the embedding model later in this notebook)
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large-instruct")

# Functie om tekst te chunkeren
def chunk_text(text, tokenizer, max_length=512, overlap=50):
    """Split ``text`` into overlapping windows of token ids.

    The text is tokenized in full (no truncation) and then cut into windows
    of at most ``max_length`` ids; consecutive windows share ``overlap`` ids.
    Whatever special tokens ``tokenizer.encode`` adds by default are part of
    the windowed sequence.

    Args:
        text: The document to split.
        tokenizer: Object exposing ``encode(text, truncation=..., padding=...)``
            that returns a list of token ids.
        max_length: Maximum number of token ids per chunk.
        overlap: Number of token ids shared by two consecutive chunks.

    Returns:
        A list of chunks, each a list of token ids. Empty when the tokenizer
        returns no ids.

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than
            ``max_length`` (the window could not advance, or would skip text).
    """
    if not 0 <= overlap < max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length")

    # Tokenize WITHOUT truncation: truncating to max_length here would discard
    # everything after the first window, which defeats the chunking below.
    tokens = tokenizer.encode(text, truncation=False, padding=False)

    stride = max_length - overlap
    chunks = []
    for start in range(0, len(tokens), stride):
        chunks.append(tokens[start:start + max_length])
        # This window already reaches the end of the text; another one would
        # only repeat ids from the overlap region.
        if start + max_length >= len(tokens):
            break

    return chunks


# Destination of the pickled chunks
chunks_file = '/home/nena-meijer/PyCharmMiscProject/topic_modelling/BERTopic/chunks/chunks_512_extra_preprocessing.pkl'

# Tokenize and chunk every document in document_text; each entry records the
# document's position in that list so its chunks can be traced back.
all_chunks = []
for idx, text in enumerate(document_text):
    print(f"Tokenizing document {idx + 1}/{len(document_text)}...")
    document_chunks = chunk_text(text, tokenizer, max_length=512, overlap=50)
    all_chunks.append({'document_id': idx, 'chunks': document_chunks})

# Store the chunks in a pickle file
with open(chunks_file, 'wb') as chunk_handle:
    pickle.dump(all_chunks, chunk_handle)

print(f"Chunks zijn opgeslagen in: {chunks_file}")

## Embeddings genereren van de chunks

In [None]:
# Pre-calculate de embeddings
import pickle
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Pick CUDA when PyTorch reports it as available, otherwise the CPU.
# NOTE(review): this block reports the detected device but does not move
# `model` to it, so the message is not proof of where the model runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Het model wordt uitgevoerd op: {device}")

# Load the model and the tokenizer
model_name = "intfloat/multilingual-e5-large-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Functie voor het genereren van embeddings
def generate_embeddings(chunks, tokenizer, model, device=None):
    """Embed token-id chunks one at a time with masked mean pooling.

    Each chunk is decoded back to text, re-tokenized (truncated to 512
    tokens), run through ``model``, mean-pooled over the positions marked in
    the attention mask and L2-normalized.

    Args:
        chunks: Iterable of token-id sequences, as produced by ``chunk_text``.
        tokenizer: Tokenizer providing ``decode`` and ``__call__``.
        model: Module whose output exposes ``last_hidden_state``.
        device: Optional device to move ``model`` to before embedding. When
            omitted, the device the model's parameters already live on is
            used (CPU if the model has no parameters).

    Returns:
        A list with one 1-D numpy array per chunk.
    """
    if device is not None:
        device = torch.device(device)
        model.to(device)
    else:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    embeddings = []

    for chunk in chunks:
        text = tokenizer.decode(chunk, skip_special_tokens=True)
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        # The tokenizer returns CPU tensors; they must sit on the same device
        # as the model or the forward pass fails for a model on CUDA.
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            last_hidden_state = model(**inputs).last_hidden_state

        # Mean pooling: zero out masked positions, then divide the sum by the
        # number of unmasked tokens.
        attention_mask = inputs["attention_mask"]
        last_hidden_state = last_hidden_state.masked_fill(~attention_mask.unsqueeze(-1).bool(), 0)
        chunk_embedding = last_hidden_state.sum(dim=1) / attention_mask.sum(dim=1).unsqueeze(-1)

        # Scale the embedding to unit L2 norm
        chunk_embedding = F.normalize(chunk_embedding, p=2, dim=1)

        # Drop the batch dimension only (dim 0) and convert to a numpy array
        embeddings.append(chunk_embedding.squeeze(0).cpu().numpy())

    return embeddings

# Input: the chunks pickled earlier; output: one embeddings entry per document
chunks_file = '/home/nena-meijer/PyCharmMiscProject/topic_modelling/BERTopic/chunks/chunks_512_extra_preprocessing.pkl'
embeddings_file = '/home/nena-meijer/PyCharmMiscProject/topic_modelling/BERTopic/embeddings/embeddings_chunks_512_extra_preprocessing.pkl'

with open(chunks_file, 'rb') as chunk_handle:
    loaded_chunks = pickle.load(chunk_handle)

# Generate the embeddings document by document, keeping the document key
# next to its list of chunk embeddings.
all_embeddings = []
for doc in loaded_chunks:
    document_id = doc['document_id']
    print(f"Genereer embeddings voor document {document_id + 1}...")
    all_embeddings.append({
        'document_id': document_id,
        'embeddings': generate_embeddings(doc['chunks'], tokenizer, model),
    })

# Store the generated embeddings
with open(embeddings_file, 'wb') as embedding_handle:
    pickle.dump(all_embeddings, embedding_handle)

print(f"Embeddings zijn opgeslagen in: {embeddings_file}")

In [None]:
# Pre-calculate de embeddings voor de samenvattingen
import pickle
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Pick CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Het model wordt uitgevoerd op: {device}")

# Load the model and the tokenizer
# NOTE(review): the same checkpoint is also loaded in the chunk-embedding
# cell of this notebook — confirm the second load is needed when both run.
model_name = "intfloat/multilingual-e5-large-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

from tqdm import tqdm  # Voor voortgangsbalk
import numpy as np

def generate_embeddings_batch(docs, tokenizer, model, device, batch_size=32):
    """Embed ``docs`` in batches with masked mean pooling and L2 normalization.

    Args:
        docs: Sliceable sequence of texts.
        tokenizer: Callable tokenizer returning tensors, including an
            ``attention_mask`` entry.
        model: Module whose output exposes ``last_hidden_state``; it is moved
            to ``device`` before the first batch.
        device: Device the model and the inputs are placed on.
        batch_size: Number of texts per forward pass.

    Returns:
        A list with one numpy vector per input text, in input order.
    """
    model.to(device)
    collected = []

    for start in tqdm(range(0, len(docs), batch_size)):
        encoded = tokenizer(
            docs[start:start + batch_size],
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state

        # Zero the positions the attention mask marks as padding, then average
        # over the remaining tokens of each text.
        mask = encoded["attention_mask"]
        padding_positions = ~mask.unsqueeze(-1).bool()
        summed = hidden.masked_fill(padding_positions, 0).sum(dim=1)
        pooled = summed / mask.sum(dim=1).unsqueeze(-1)
        unit_vectors = F.normalize(pooled, p=2, dim=1)

        # Extending with a 2-D array appends one row per text.
        collected.extend(unit_vectors.cpu().numpy())

    return collected

# Usage: embed every summary in `document_text` in batches of 32.
all_embeddings = generate_embeddings_batch(document_text, tokenizer, model, device, batch_size=32)

# Store the generated embeddings
# NOTE(review): other cells in this notebook load `embeddings_summaries.pkl`
# and `embeddings_summaries_preprocessed.pkl`, not the file written here —
# confirm which file each step is meant to use.
embeddings_file = '/home/nena-meijer/PyCharmMiscProject/topic_modelling/BERTopic/embeddings/embeddings_summaries_overtime.pkl'

with open(embeddings_file, 'wb') as f:
    pickle.dump(all_embeddings, f)

print(f"Embeddings zijn opgeslagen in: {embeddings_file}")

## Embeddings normaliseren

In [None]:
import pickle
from sklearn.preprocessing import normalize
import numpy as np

print("📂 Embeddings worden geladen...")
with open('/home/nena-meijer/PyCharmMiscProject/topic_modelling/BERTopic/embeddings/embeddings_summaries.pkl', 'rb') as f:
    embeddings_data = pickle.load(f)

# Flatten the per-document lists of vectors into a single 2-D matrix.
# Each entry is read through its 'embeddings' key.
print("🔄 Embeddings worden samengevoegd in één matrix...")
all_vectors = np.vstack([
    vector
    for doc in embeddings_data
    for vector in doc['embeddings']
])
print(f"✅ Totaal aantal embeddings: {all_vectors.shape[0]}")

# L2-normalize the rows in batches, reporting progress after every batch.
batch_size = 1000  # Adjust the batch size if needed
normalized_batches = []

print("📏 Embeddings worden genormaliseerd...")
for i in range(0, len(all_vectors), batch_size):
    batch = all_vectors[i:i + batch_size]
    normalized_batches.append(normalize(batch, norm='l2'))

    # Progress per batch
    print(f"✅ Batch {i // batch_size + 1} genormaliseerd: {i + len(batch)} / {len(all_vectors)} embeddings")

# Stack the normalized batches back into one matrix
normalized_embeddings = np.vstack(normalized_batches)
print("✅ Normalisatie voltooid.")

## Decodeer de chunks voor BERTopic invoer

In [None]:
import pickle
from transformers import AutoTokenizer

# Tokenizer used to turn the stored token ids back into text
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large-instruct")

# Chunks pickled earlier in this notebook
chunks_file = '/home/nena-meijer/PyCharmMiscProject/topic_modelling/BERTopic/chunks/chunks_512_extra_preprocessing.pkl'

with open(chunks_file, 'rb') as chunk_handle:
    loaded_chunks = pickle.load(chunk_handle)

# One decoded text per chunk, in document order and then chunk order;
# special tokens are skipped while decoding.
decoded_chunks = [
    tokenizer.decode(chunk, skip_special_tokens=True)
    for doc in loaded_chunks
    for chunk in doc['chunks']
]

# Show the first 5 decoded chunks as a check
print(decoded_chunks[:5])

## BERTopic configuratie pipeline modellen

In [None]:
import cuml
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
import cupy as cp
import numpy as np
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech
from sklearn.feature_extraction.text import CountVectorizer

# Dimensionality reduction step (passed to BERTopic as `umap_model`):
# 5 output components, cosine metric, fixed random_state.
umap_model = UMAP(
    n_neighbors=20,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Clustering step (passed to BERTopic as `hdbscan_model`).
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    min_samples=10,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Bag-of-words step (passed to BERTopic as `vectorizer_model`): English stop
# words removed, terms must occur in at least 2 documents, 1- to 4-grams.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 4),
)

## Train BERTopic

In [None]:
# Load the pre-computed embeddings that are passed to BERTopic below.
# NOTE(review): the embedding cells in this notebook write
# `embeddings_summaries_overtime.pkl` and
# `embeddings_chunks_512_extra_preprocessing.pkl`; confirm that this file
# holds one vector per entry of `document_text`, in the same order.
embeddings_file = "/home/nena-meijer/PyCharmMiscProject/topic_modelling/BERTopic/embeddings/embeddings_summaries_preprocessed.pkl"
with open(embeddings_file, 'rb') as f:
    embeddings_data = pickle.load(f)

all_embeddings = np.array(embeddings_data)

# Training & saving the model
topic_model = BERTopic(
    # Pipeline models
    # No embedding model is given here; embeddings are passed to fit_transform below.
    embedding_model=None,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,

    # Hyperparameters
    # NOTE(review): a custom `vectorizer_model` is passed above; presumably
    # BERTopic then ignores `n_gram_range` — confirm against the BERTopic docs.
    top_n_words=10,
    verbose=True,
    language="multilingual",
    n_gram_range=(1, 4)
)

# Fit on the summaries together with their pre-computed embeddings.
topics, probs = topic_model.fit_transform(document_text, all_embeddings)

# NOTE(review): the target ends in `.pkl` although the serialization is
# "safetensors" — confirm the resulting on-disk layout is the intended one.
topic_model.save("/home/nena-meijer/PyCharmMiscProject/topic_modelling/BERTopic/models/summaries/training_3.pkl",
                 serialization="safetensors", save_ctfidf=True)
# Display the topic overview as the cell output
topic_model.get_topic_info()

## Topics over time

In [None]:
# Compute the topic representation over time from the summaries and their dates.
# NOTE(review): no `nr_bins` is passed, so presumably every distinct timestamp
# is handled separately — confirm this is intended and fast enough.
topics_over_time = topic_model.topics_over_time(document_text, document_dates)

### Reduce outliers

In [None]:
# Reassign topics with the "embeddings" strategy, reusing the pre-computed
# embeddings; the result is stored separately in `new_topics`.
# Presumably only documents in the outlier topic are reassigned — verify.
new_topics = topic_model.reduce_outliers(document_text, topics, strategy="embeddings", embeddings=all_embeddings)

In [None]:
# Recompute the topic representations for the reassigned topics.
# The custom vectorizer is passed explicitly: without it, update_topics
# rebuilds the representations with a default CountVectorizer, dropping the
# stop-word removal and min_df filter configured for this model.
topic_model.update_topics(document_text, topics=new_topics, vectorizer_model=vectorizer_model)

In [None]:
# Display the topic overview again, after the update_topics call above.
topic_model.get_topic_info()

In [None]:
# Save the model again (safetensors serialization, c-TF-IDF included).
# NOTE(review): this writes under `models/extra_preprocessing/512/`, whereas
# the training cell saves under `models/summaries/` — confirm the location.
topic_model.save("/home/nena-meijer/PyCharmMiscProject/topic_modelling/BERTopic/models/extra_preprocessing/512/training_3_reduced_outliers_embedddings.pkl",
                 serialization="safetensors", save_ctfidf=True)

## Topic Coherence Measures

In [None]:
import pandas as pd
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import gc

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Free cached GPU memory before the coherence computation
torch.cuda.empty_cache()
cp.get_default_memory_pool().free_all_blocks()

# Concatenate all documents that share a topic into one long document per topic
documents = pd.DataFrame({"Document": document_text,
                          "ID": range(len(document_text)),
                          "Topic": new_topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})

# Build a sparse document-term matrix directly with the model's vectorizer
# instead of preprocessing the documents first (only non-zero entries are stored)
sparse_matrix = topic_model.vectorizer_model.transform(documents_per_topic.Document.values)

# Build the bag-of-words corpus from the sparse matrix: for every row, pair
# the column indices with their counts (each row is fetched once)
corpus = []
for row_index in range(sparse_matrix.shape[0]):
    row = sparse_matrix.getrow(row_index)
    corpus.append(list(zip(row.indices, row.data)))

# Build the gensim dictionary from the vectorizer's vocabulary.
# vectorizer.vocabulary_ maps token -> index; id2token is the inverse mapping
id2word = {v: k for k, v in topic_model.vectorizer_model.vocabulary_.items()}
dictionary = corpora.Dictionary()
dictionary.id2token = id2word
dictionary.token2id = topic_model.vectorizer_model.vocabulary_

# For the 'texts' parameter, tokenize the per-topic documents with the vectorizer's analyzer
analyzer = topic_model.vectorizer_model.build_analyzer()
texts = [analyzer(doc) for doc in documents_per_topic.Document.values]

# Collect the topic words for every topic that actually occurs in `new_topics`.
# The outlier topic -1 is skipped when present; taking the ids from the data
# (instead of range(len(set(new_topics)) - 1)) keeps the last topic when no
# outlier topic is left and keeps the labels correct for non-contiguous ids.
topic_ids = sorted(topic for topic in set(new_topics) if topic != -1)
topic_words = [
    # Empty strings are dropped so they are never handed to gensim.
    [word for word, _ in topic_model.get_topic(topic) if word]
    for topic in topic_ids
]

# Optional: release intermediate objects that are no longer needed
del documents_per_topic
gc.collect()

# Compute the topic coherence.
# NOTE: despite the `_umass` suffix in the variable names, the measure
# requested here is 'c_npmi'.
coherence_model_umass = CoherenceModel(
    topics=topic_words,
    texts=texts,
    corpus=corpus,
    dictionary=dictionary,
    coherence='c_npmi'
)
coherence_umass = coherence_model_umass.get_coherence()
print("Coherence score algemeen:", coherence_umass)

# Report each score next to the real topic id it belongs to
coherence_per_topic = coherence_model_umass.get_coherence_per_topic()
for topic_id, score, words in zip(topic_ids, coherence_per_topic, topic_words):
    print(f"Topic {topic_id}: {words[:5]} → Coherence: {score:.4f}")

del corpus
del dictionary
gc.collect()