In [13]:
#!pip install bertopic datasets accelerate bitsandbytes xformers adjustText

# imports
import pandas as pd
import ipywidgets as widgets
import math
import os
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from octis.dataset.dataset import Dataset
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import time
import en_core_web_sm


# Shared helpers: the en_core_web_sm language pipeline and an OCTIS Optimizer instance.
nlp = en_core_web_sm.load()
optimizer = Optimizer()

In [None]:
from sentence_transformers import SentenceTransformer, models
import torch
import os

# Load the pre-trained token-level transformer that the custom pooling layer is stacked on.
# `model_name` is a model identifier passed straight to `models.Transformer`.
model_name = 'emanjavacas/GysBERT'
word_embedding_model = models.Transformer(model_name)

# Define the custom pooling layer with mean and max pooling combined
class CustomPoolingLayer(models.Pooling):
    """Pooling layer that concatenates masked mean pooling and masked max pooling.

    ``forward`` writes a ``sentence_embedding`` whose width is twice the
    word-embedding dimension: the first half is the attention-masked mean over
    the token axis, the second half the attention-masked element-wise maximum.
    """

    def __init__(self, word_embedding_dimension):
        """Initialise the parent pooling layer and remember the concatenated width."""
        super(CustomPoolingLayer, self).__init__(word_embedding_dimension)
        # forward() concatenates two pooled vectors, so the output is 2x as wide.
        self._concat_dimension = 2 * word_embedding_dimension

    def get_sentence_embedding_dimension(self):
        """Return the width of the vector produced by ``forward`` (mean + max halves)."""
        return self._concat_dimension

    def mean_pooling(self, token_embeddings, attention_mask):
        """Average the token embeddings over axis 1, ignoring masked-out positions.

        The denominator is clamped to 1e-9 so a fully masked row does not divide by zero.
        """
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def max_pooling(self, token_embeddings, attention_mask):
        """Take the element-wise maximum over axis 1, ignoring masked-out positions.

        Masked positions are replaced by -1e9 on a copy; the caller's
        ``token_embeddings`` tensor is left untouched.
        """
        padding_mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()) == 0
        # masked_fill is out-of-place, so features['token_embeddings'] is not corrupted.
        masked_embeddings = token_embeddings.masked_fill(padding_mask, -1e9)
        return torch.max(masked_embeddings, 1)[0]

    def forward(self, features):
        """Add ``sentence_embedding`` (mean and max pooled, concatenated) to ``features``.

        Reads ``features['token_embeddings']`` and ``features['attention_mask']``
        and returns the same dict with the new key set.
        """
        token_embeddings = features['token_embeddings']
        attention_mask = features['attention_mask']

        mean_pooled = self.mean_pooling(token_embeddings, attention_mask)
        max_pooled = self.max_pooling(token_embeddings, attention_mask)

        # Concatenate along the feature axis: [mean | max].
        sentence_embedding = torch.cat((mean_pooled, max_pooled), 1)

        features.update({'sentence_embedding': sentence_embedding})
        return features

# Width of the token vectors produced by the transformer backbone.
embedding_dim = word_embedding_model.get_word_embedding_dimension()

# Pooling stage that turns the token vectors into a single sentence vector.
custom_pooling = CustomPoolingLayer(word_embedding_dimension=embedding_dim)

# Chain backbone and pooling into one SentenceTransformer pipeline.
pipeline_modules = [word_embedding_model, custom_pooling]
sentence_transformer_model = SentenceTransformer(modules=pipeline_modules)


In [20]:
# SETUP: dataset folder names and embedding-model identifiers used by the cells below.
DATA, dDATA = "NL", "NL_dtm"
EM, EM2 = "emanjavacas/GysBERT", "paraphrase-multilingual-MiniLM-L12-v2"
# EM3 = sentence_transformer_model

In [21]:
# Load the custom OCTIS dataset stored in the DATA folder.
dataset = Dataset()
dataset.load_custom_dataset_from_folder(DATA)

# Each corpus entry is a sequence of tokens; re-join them into one string per document.
docs = list(map(" ".join, dataset.get_corpus()))

In [22]:
from sentence_transformers import SentenceTransformer
# Sentence encoder for the static topic-model runs; the trailing note marks EM3 as the alternative.
embedding_model = SentenceTransformer(EM2) #EM3
# Embed every document once; the result is later handed to the trainer as `bt_embeddings`.
embeddings = embedding_model.encode(docs, show_progress_bar=True)


Batches:   0%|          | 0/596 [00:00<?, ?it/s]

In [23]:
# From here on `dataset` holds the dataset folder name (a string), not the OCTIS Dataset object.
dataset = DATA
custom = True

In [24]:
from evaluation3 import Trainer

# BERTopic

In [25]:
#TC & TD CALCULATION
# Trains BERTopic over a grid of topic counts and saves the scores.
# The captured run of this cell ended with a FileNotFoundError while writing the
# results file, so the output directory is created up front.
BERTOPIC_SCORES_DIR = "results/NL/scores"
os.makedirs(BERTOPIC_SCORES_DIR, exist_ok=True)

for i in range(1):
    custom = True
    params = {
        "embedding_model": embedding_model,
        # Topic counts 10, 20, ..., 50; `k` avoids shadowing the run index `i`.
        "nr_topics": [(k + 1) * 10 for k in range(5)],
        "min_topic_size": 10,
        #"diversity": None,
        "verbose": True
    }

    trainer = Trainer(dataset=dataset,
                        model_name="BERTopic",
                        params=params,
                        bt_embeddings=embeddings,
                        custom_dataset=custom,
                        verbose=True)
    # One results file per run index, numbered from 1.
    results = trainer.train(save=f"{BERTOPIC_SCORES_DIR}/bertopic_{i+1}")

2024-06-17 14:53:58,530 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-17 14:54:09,054 - BERTopic - Dimensionality - Completed ✓
2024-06-17 14:54:09,055 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-17 14:54:09,188 - BERTopic - Cluster - Completed ✓
2024-06-17 14:54:09,189 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-17 14:54:11,929 - BERTopic - Representation - Completed ✓
2024-06-17 14:54:11,931 - BERTopic - Topic reduction - Reducing number of topics
2024-06-17 14:54:11,932 - BERTopic - Topic reduction - Reduced number of topics from 8 to 8
2024-06-17 14:54:23,466 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Results
npmi: 0.03958452182863852
diversity: 0.6428571428571429
 


2024-06-17 14:54:38,908 - BERTopic - Dimensionality - Completed ✓
2024-06-17 14:54:38,909 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-17 14:54:39,058 - BERTopic - Cluster - Completed ✓
2024-06-17 14:54:39,059 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-17 14:54:41,813 - BERTopic - Representation - Completed ✓
2024-06-17 14:54:41,815 - BERTopic - Topic reduction - Reducing number of topics
2024-06-17 14:54:41,816 - BERTopic - Topic reduction - Reduced number of topics from 8 to 8
2024-06-17 14:54:53,413 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Results
npmi: 0.020784525555494347
diversity: 0.6714285714285714
 


2024-06-17 14:55:07,249 - BERTopic - Dimensionality - Completed ✓
2024-06-17 14:55:07,250 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-17 14:55:12,590 - BERTopic - Cluster - Completed ✓
2024-06-17 14:55:12,590 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-17 14:55:15,413 - BERTopic - Representation - Completed ✓
2024-06-17 14:55:15,415 - BERTopic - Topic reduction - Reducing number of topics
2024-06-17 14:55:15,416 - BERTopic - Topic reduction - Reduced number of topics from 8 to 8
2024-06-17 14:55:27,405 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Results
npmi: 0.039438096127898316
diversity: 0.7285714285714285
 


2024-06-17 14:55:41,215 - BERTopic - Dimensionality - Completed ✓
2024-06-17 14:55:41,217 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-17 14:55:44,871 - BERTopic - Cluster - Completed ✓
2024-06-17 14:55:44,872 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-17 14:55:47,668 - BERTopic - Representation - Completed ✓
2024-06-17 14:55:47,670 - BERTopic - Topic reduction - Reducing number of topics
2024-06-17 14:55:47,671 - BERTopic - Topic reduction - Reduced number of topics from 8 to 8
2024-06-17 14:55:59,604 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Results
npmi: 0.02691926249087866
diversity: 0.6857142857142857
 


2024-06-17 14:56:12,463 - BERTopic - Dimensionality - Completed ✓
2024-06-17 14:56:12,465 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-17 14:56:17,726 - BERTopic - Cluster - Completed ✓
2024-06-17 14:56:17,727 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-17 14:56:20,505 - BERTopic - Representation - Completed ✓
2024-06-17 14:56:20,507 - BERTopic - Topic reduction - Reducing number of topics
2024-06-17 14:56:20,508 - BERTopic - Topic reduction - Reduced number of topics from 8 to 8


Results
npmi: 0.024355885605067136
diversity: 0.7
 


FileNotFoundError: [Errno 2] No such file or directory: 'results/kmeans/scores/bertopicNL_1.json'

In [None]:
#TOPIC EXTRACTION
# Single BERTopic run with a fixed topic count; the "diversity" option stays disabled.
custom = True
params = dict(
    embedding_model=embedding_model,
    nr_topics=15,
    min_topic_size=10,
    verbose=True,
)

trainer = Trainer(
    dataset=dataset,
    model_name="BERTopic",
    params=params,
    bt_embeddings=embeddings,
    custom_dataset=custom,
    verbose=True,
)
results = trainer.train(save="results/NL/topics/bertopic")

# LDA

In [None]:
# LDA scoring: one run per random seed, each over topic counts 10, 20, ..., 50.
dataset, custom = DATA, True
for i, random_state in enumerate([0, 21, 42]):
    params = dict(num_topics=list(range(10, 60, 10)), random_state=random_state)

    trainer = Trainer(
        dataset=dataset,
        model_name="LDA",
        params=params,
        custom_dataset=custom,
        verbose=True,
    )
    results = trainer.train(save=f"results/NL/scores/lda_{i+1}")

In [None]:
#TOPIC CREATION:
# Single LDA run with a fixed topic count and seed (no grid over topic counts here).
dataset = DATA
custom = True
params = dict(num_topics=15, random_state=42)

trainer = Trainer(
    dataset=dataset,
    model_name="LDA",
    params=params,
    custom_dataset=custom,
    verbose=True,
)
results = trainer.train(save="results/NL/topics/lda")

# NMF

In [None]:
# NMF scoring: one run per random seed, each over topic counts 10, 20, ..., 50.
dataset, custom = DATA, True
for i, random_state in enumerate([0, 21, 42]):
    params = dict(num_topics=list(range(10, 60, 10)), random_state=random_state)

    trainer = Trainer(
        dataset=dataset,
        model_name="NMF",
        params=params,
        custom_dataset=custom,
        verbose=True,
    )
    results = trainer.train(save=f"results/NL/scores/nmf_{i+1}")

In [None]:
#TOPIC CREATION:
# Single NMF run with a fixed topic count and seed (no grid over topic counts here).
dataset = DATA
custom = True
params = dict(num_topics=15, random_state=42)

trainer = Trainer(
    dataset=dataset,
    model_name="NMF",
    params=params,
    custom_dataset=custom,
    verbose=True,
)
results = trainer.train(save="results/NL/topics/nmf")

# CTM

In [None]:
import nltk

#nltk.download("stopwords")
from nltk.corpus import stopwords

In [None]:
# TC & TD
# Set TOKENIZERS_PARALLELISM to 'false' in the environment before the CTM run.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#!pip install contextualized_topic_models
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation

dataset, custom = DATA, True
for i in range(1):
    # Component counts 10, 20, ..., 50 with a fixed contextual embedding size.
    params = dict(n_components=list(range(10, 60, 10)), contextual_size=768)

    trainer = Trainer(
        dataset=dataset,
        model_name="CTM_CUSTOM",
        params=params,
        custom_dataset=custom,
        verbose=True,
    )
    results = trainer.train(save=f"results/NL/scores/ctm_{i+1}")

In [None]:
# TOPIC CREATION
# Single CTM run with a fixed number of components (no grid over component counts here).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#!pip install contextualized_topic_models
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation

dataset = DATA
custom = True
params = dict(n_components=15, contextual_size=768)

trainer = Trainer(
    dataset=dataset,
    model_name="CTM_CUSTOM",
    params=params,
    custom_dataset=custom,
    verbose=True,
)
results = trainer.train(save="results/NL/topics/ctm")

# Dynamic TM

## HISTORICAL

In [None]:
#SETUP
# Folder name of the dataset used for the dynamic topic-model runs in this section.
dDATA = "NL_dtm"

In [None]:
# Load the custom OCTIS dataset stored in the dDATA folder.
dataset = Dataset()
dataset.load_custom_dataset_from_folder(dDATA)

# Each corpus entry is a sequence of tokens; re-join them into one string per document.
data = list(map(" ".join, dataset.get_corpus()))

In [None]:
# `dataset` becomes the DTM folder name (a string) for the loader and trainer below.
dataset, custom = dDATA, True
from data_NL import DataLoader
data_loader = DataLoader(dataset)
# Only the second value returned by load_docs() (the timestamps) is used here.
_, timestamps = data_loader.load_docs()
octis_data = data_loader.load_octis(custom)
# One whitespace-joined string per document of the DTM corpus.
data = [" ".join(words) for words in octis_data.get_corpus()]

embedding_model = SentenceTransformer(EM) #EM3
# Embed the DTM documents loaded above (`data`), not `docs` from the static "NL"
# corpus, so the embeddings come from the same corpus as `data` and `timestamps`.
embeddings = embedding_model.encode(data, show_progress_bar=True)

In [None]:
# Match indices
# Keep only the timestamps whose position is listed in <dataset>/indexes.txt.
# NOTE: this cell overwrites `timestamps`, so re-running it without re-running
# the loading cell above filters an already-filtered list.
import os
with open(f"./{dataset}/indexes.txt") as f:
    # One integer per line; blank lines are skipped instead of crashing int().
    indices = [int(line) for line in f if line.strip()]

# Set membership keeps the filter linear in the number of timestamps.
kept_indices = set(indices)
timestamps = [timestamp for index, timestamp in enumerate(timestamps) if index in kept_indices]
len(data), len(timestamps)

In [None]:
from evaluation import Trainer
#for i in range(3):
# BERTopic over time: topic counts 10, 20, ..., 50; the "evolution_tuning" and
# "global_tuning" options stay disabled.
params = dict(
    nr_topics=list(range(10, 60, 10)),
    min_topic_size=5,
    verbose=True,
)

trainer = Trainer(
    dataset=dataset,
    model_name="BERTopic",
    params=params,
    bt_embeddings=embeddings,
    custom_dataset=custom,
    bt_timestamps=timestamps,
    topk=5,
    bt_nr_bins=5,
    verbose=True,
)
results = trainer.train("D_NL_all-H")

## MODERN

In [None]:
#SETUP
# Folder name of the dataset used for the dynamic topic-model runs in this section.
dDATA = "NL_dtm"

In [None]:
# Load the custom OCTIS dataset stored in the dDATA folder.
dataset = Dataset()
dataset.load_custom_dataset_from_folder(dDATA)

# Each corpus entry is a sequence of tokens; re-join them into one string per document.
data = list(map(" ".join, dataset.get_corpus()))

In [None]:
# `dataset` becomes the DTM folder name (a string) for the loader and trainer below.
dataset, custom = dDATA, True
from data_NL import DataLoader
data_loader = DataLoader(dataset)
# Only the second value returned by load_docs() (the timestamps) is used here.
_, timestamps = data_loader.load_docs()
octis_data = data_loader.load_octis(custom)
# One whitespace-joined string per document of the DTM corpus.
data = [" ".join(words) for words in octis_data.get_corpus()]

embedding_model = SentenceTransformer(EM2) #EM3
# Embed the DTM documents loaded above (`data`), not `docs` from the static "NL"
# corpus, so the embeddings come from the same corpus as `data` and `timestamps`.
embeddings = embedding_model.encode(data, show_progress_bar=True)

In [None]:
# Match indices
# Keep only the timestamps whose position is listed in <dataset>/indexes.txt.
# NOTE: this cell overwrites `timestamps`, so re-running it without re-running
# the loading cell above filters an already-filtered list.
import os
with open(f"./{dataset}/indexes.txt") as f:
    # One integer per line; blank lines are skipped instead of crashing int().
    indices = [int(line) for line in f if line.strip()]

# Set membership keeps the filter linear in the number of timestamps.
kept_indices = set(indices)
timestamps = [timestamp for index, timestamp in enumerate(timestamps) if index in kept_indices]
len(data), len(timestamps)

In [None]:
from evaluation import Trainer
#for i in range(3):
# BERTopic over time: topic counts 10, 20, ..., 50; the "evolution_tuning" and
# "global_tuning" options stay disabled.
params = dict(
    nr_topics=list(range(10, 60, 10)),
    min_topic_size=5,
    verbose=True,
)

trainer = Trainer(
    dataset=dataset,
    model_name="BERTopic",
    params=params,
    bt_embeddings=embeddings,
    custom_dataset=custom,
    bt_timestamps=timestamps,
    topk=5,
    bt_nr_bins=5,
    verbose=True,
)
results = trainer.train("D_NL_all_M")