In [5]:
#!pip install bertopic datasets accelerate bitsandbytes xformers adjustText

# imports
import pandas as pd
import ipywidgets as widgets
import math
import os
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from octis.dataset.dataset import Dataset
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import time
import en_core_web_sm


optimizer=Optimizer()
nlp = en_core_web_sm.load()

  "class": algorithms.Blowfish,


In [6]:
from sentence_transformers import SentenceTransformer, models
import torch
import os

# Load your pre-trained model
model_name = 'emanjavacas/MacBERTh'
word_embedding_model = models.Transformer(model_name)

# Define the custom pooling layer with mean and max pooling combined
class CustomPoolingLayer(models.Pooling):
    def __init__(self, word_embedding_dimension):
        super(CustomPoolingLayer, self).__init__(word_embedding_dimension)
    
    def mean_pooling(self, token_embeddings, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask
    
    def max_pooling(self, token_embeddings, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        token_embeddings[input_mask_expanded == 0] = -1e9  # Set padding tokens to large negative value
        max_embeddings = torch.max(token_embeddings, 1)[0]
        return max_embeddings

    def forward(self, features):
        token_embeddings = features['token_embeddings']
        attention_mask = features['attention_mask']
        
        # Mean pooling
        mean_pooled = self.mean_pooling(token_embeddings, attention_mask)
        
        # Max pooling
        max_pooled = self.max_pooling(token_embeddings, attention_mask)
        
        # Concatenate mean and max pooled embeddings
        sentence_embedding = torch.cat((mean_pooled, max_pooled), 1)
        
        features.update({'sentence_embedding': sentence_embedding})
        return features

# Get the embedding dimension
embedding_dim = word_embedding_model.get_word_embedding_dimension()

# Create the custom pooling layer
custom_pooling = CustomPoolingLayer(embedding_dim)

# Define the SentenceTransformer model with the custom pooling
sentence_transformer_model = SentenceTransformer(modules=[word_embedding_model, custom_pooling])

In [7]:
#SETUP
DATA = "EN"
EM = "emanjavacas/MacBERTh"
EM2 = "all-mpnet-base-v2"
#EM3 = sentence_transformer_model

In [8]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder(DATA)
docs = [" ".join(words) for words in dataset.get_corpus()]

In [9]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(EM2) #EM3
embeddings = embedding_model.encode(docs, show_progress_bar=True)


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [10]:
dataset, custom = DATA, True

In [11]:
from evaluation_kmeans import Trainer

2024-06-17 15:19:42.422360: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX512_VNNI
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-17 15:19:42.808512: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pspaargaren/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# BERTopic

In [12]:
#TC & TD CALCULATION
for i in range(1):
    custom = True
    params = {
        "embedding_model": embedding_model,
        "nr_topics": [(i+1)*10 for i in range(5)],
        "min_topic_size": 10,
        #"diversity": None,
        "verbose": True
    }

    trainer = Trainer(dataset=dataset,
                        model_name="BERTopic",
                        params=params,
                        bt_embeddings=embeddings,
                        custom_dataset=custom,
                        verbose=True)
    #results = trainer.train(save=f"results/EN/scores/bertopic_{i+1}")
    results = trainer.train(save=f"results/kmeans/scores/bertopicNL_{i+1}")

2024-06-17 15:19:52,445 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-17 15:20:24,071 - BERTopic - Dimensionality - Completed ✓
2024-06-17 15:20:24,072 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-17 15:20:24,263 - BERTopic - Cluster - Completed ✓
2024-06-17 15:20:24,265 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-17 15:20:25,249 - BERTopic - Representation - Completed ✓
2024-06-17 15:20:25,250 - BERTopic - Topic reduction - Reducing number of topics
2024-06-17 15:20:25,251 - BERTopic - Topic reduction - Reduced number of topics from 8 to 8
2024-06-17 15:20:32,079 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Results
npmi: 0.05982443598337828
diversity: 0.6142857142857143
 


2024-06-17 15:20:50,189 - BERTopic - Dimensionality - Completed ✓
2024-06-17 15:20:50,190 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-17 15:20:50,282 - BERTopic - Cluster - Completed ✓
2024-06-17 15:20:50,283 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-17 15:20:51,294 - BERTopic - Representation - Completed ✓
2024-06-17 15:20:51,294 - BERTopic - Topic reduction - Reducing number of topics
2024-06-17 15:20:51,296 - BERTopic - Topic reduction - Reduced number of topics from 8 to 8
2024-06-17 15:20:58,172 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Results
npmi: 0.054036331518551105
diversity: 0.6714285714285714
 


2024-06-17 15:21:10,011 - BERTopic - Dimensionality - Completed ✓
2024-06-17 15:21:10,012 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-17 15:21:10,114 - BERTopic - Cluster - Completed ✓
2024-06-17 15:21:10,116 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-17 15:21:11,107 - BERTopic - Representation - Completed ✓
2024-06-17 15:21:11,108 - BERTopic - Topic reduction - Reducing number of topics
2024-06-17 15:21:11,109 - BERTopic - Topic reduction - Reduced number of topics from 8 to 8
2024-06-17 15:21:18,015 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Results
npmi: 0.05556631870419089
diversity: 0.6428571428571429
 


2024-06-17 15:21:30,899 - BERTopic - Dimensionality - Completed ✓
2024-06-17 15:21:30,900 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-17 15:21:30,996 - BERTopic - Cluster - Completed ✓
2024-06-17 15:21:30,998 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-17 15:21:31,993 - BERTopic - Representation - Completed ✓
2024-06-17 15:21:31,994 - BERTopic - Topic reduction - Reducing number of topics
2024-06-17 15:21:31,995 - BERTopic - Topic reduction - Reduced number of topics from 8 to 8
2024-06-17 15:21:38,840 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Results
npmi: 0.038774305763555464
diversity: 0.6428571428571429
 


2024-06-17 15:21:51,580 - BERTopic - Dimensionality - Completed ✓
2024-06-17 15:21:51,582 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-17 15:21:51,680 - BERTopic - Cluster - Completed ✓
2024-06-17 15:21:51,682 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-17 15:21:52,668 - BERTopic - Representation - Completed ✓
2024-06-17 15:21:52,669 - BERTopic - Topic reduction - Reducing number of topics
2024-06-17 15:21:52,670 - BERTopic - Topic reduction - Reduced number of topics from 8 to 8


Results
npmi: 0.05770491383436944
diversity: 0.6714285714285714
 


FileNotFoundError: [Errno 2] No such file or directory: 'results/kmeans/scores/bertopicNL_1.json'

In [None]:
#TOPIC EXTRACTION
custom = True
params = {
    "embedding_model": embedding_model,
    "nr_topics": 15,
    "min_topic_size": 10,
    #"diversity": None,
    "verbose": True
}

trainer = Trainer(dataset=dataset,
                    model_name="BERTopic",
                    params=params,
                    bt_embeddings=embeddings,
                    custom_dataset=custom,
                    verbose=True)
results = trainer.train(save=f"results/EN/topics/bertopic")

# LDA

In [None]:
for i, random_state in enumerate([0, 21, 42]):
    dataset, custom = DATA, True
    params = {"num_topics": [(i+1)*10 for i in range(5)], "random_state": random_state}

    trainer = Trainer(dataset=dataset,
                      model_name="LDA",
                      params=params,
                      custom_dataset=custom,
                      verbose=True)
    results = trainer.train(save=f"results/EN/scores/lda_{i+1}")

In [None]:
#TOPIC CREATION:
dataset, custom = DATA, True
params = {"num_topics": 15, "random_state": 42}#[(i+1)*10 for i in range(5)], "random_state": random_state}

trainer = Trainer(dataset=dataset,
                  model_name="LDA",
                  params=params,
                  custom_dataset=custom,
                  verbose=True)
results = trainer.train(save=f"results/EN/topics/lda")

# NMF

In [None]:
for i, random_state in enumerate([0, 21, 42]):
    dataset, custom = DATA, True
    params = {"num_topics": [(i+1)*10 for i in range(5)], "random_state": random_state}

    trainer = Trainer(dataset=dataset,
                      model_name="NMF",
                      params=params,
                      custom_dataset=custom,
                      verbose=True)
    results = trainer.train(save=f"results/EN/scores/nmf_{i+1}")

In [None]:
#TOPIC CREATION:
dataset, custom = DATA, True
params = {"num_topics": 15, "random_state": 42}#[(i+1)*10 for i in range(5)], "random_state": random_state}

trainer = Trainer(dataset=dataset,
                  model_name="NMF",
                  params=params,
                  custom_dataset=custom,
                  verbose=True)
results = trainer.train(save=f"results/EN/topics/nmf")

# CTM

In [None]:
import nltk

#nltk.download("stopwords")
from nltk.corpus import stopwords

In [None]:
# TC & TD
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
#!pip install contextualized_topic_models
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
for i in range(1):
    dataset, custom = DATA, True
    params = {
        "n_components": [(i+1)*10 for i in range(5)],
        "contextual_size":768
    }

    trainer = Trainer(dataset=dataset,
                      model_name="CTM_CUSTOM",
                      params=params,
                      custom_dataset=custom,
                      verbose=True)
    results = trainer.train(save=f"results/EN/scores/ctm_{i+1}")

In [None]:
# TOPIC CREATION
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
#!pip install contextualized_topic_models
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
#for i in range(3):
dataset, custom = DATA, True
params = {
    "n_components": 15,#[(i+1)*10 for i in range(5)],
    "contextual_size":768
}

trainer = Trainer(dataset=dataset,
                  model_name="CTM_CUSTOM",
                  params=params,
                  custom_dataset=custom,
                  verbose=True)
results = trainer.train(save=f"results/EN/topics/ctm")

# Dynamic TM

In [8]:
#SETUP
dDATA = "EN_dtm"

In [9]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder(dDATA)

In [10]:
# Prepare data
from data_flower import DataLoader
dataset, custom = dDATA, True
data_loader = DataLoader(dataset)
_, timestamps = data_loader.load_docs()
data = data_loader.load_octis(custom)
data = [" ".join(words) for words in data.get_corpus()]

embedding_model = SentenceTransformer(EM) #EM3
embeddings = embedding_model.encode(docs, show_progress_bar=True)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/pspaargaren/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
No sentence-transformers model found with name emanjavacas/MacBERTh. Creating a new one with MEAN pooling.


Batches:   0%|          | 0/1445 [00:00<?, ?it/s]

In [11]:
# Match indices
import os
os.listdir(f"./{dataset}")
with open(f"./{dataset}/indexes.txt") as f:
    indices = f.readlines()
    
indices = [int(index.split("\n")[0]) for index in indices]
timestamps = [timestamp for index, timestamp in enumerate(timestamps) if index in indices]
len(data), len(timestamps)

(46236, 46236)

In [12]:
from evaluation import Trainer

params = {
        "nr_topics": [(i+1)*10 for i in range(5)],
        "min_topic_size": 5,
        "verbose": True,
    }

trainer = Trainer(dataset=dataset,
                      model_name="BERTopic",
                      params=params,
                      bt_embeddings=embeddings,
                      custom_dataset=custom,
                      bt_timestamps=timestamps,
                      topk=5,
                      bt_nr_bins=5,
                      verbose=True)
results = trainer.train(f"DynamicBERTopic_EN")

2024-06-03 16:04:46,721 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-03 16:05:31,135 - BERTopic - Dimensionality - Completed ✓
2024-06-03 16:05:31,136 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

marjo
bcautify


2024-06-03 16:11:00,431 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-03 16:11:24,226 - BERTopic - Dimensionality - Completed ✓
2024-06-03 16:11:24,228 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

regula
baiga
stepe
bcautify
orlanois


2024-06-03 16:16:25,809 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-03 16:17:01,500 - BERTopic - Dimensionality - Completed ✓
2024-06-03 16:17:01,503 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

regula
marjo
baiga
stepe
bcautify
orlanois
apcides


2024-06-03 16:21:42,645 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-03 16:22:06,393 - BERTopic - Dimensionality - Completed ✓
2024-06-03 16:22:06,395 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

regula
marjo
baiga
stepe
semelinesse
swere
sicle
apcides
bcautify
orlanois
prd


2024-06-03 16:27:27,129 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-03 16:27:52,652 - BERTopic - Dimensionality - Completed ✓
2024-06-03 16:27:52,654 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

regula
marjo
sepul
baiga
stepe
mazus
ernor
bcautify
orlanois
apcides
prd


# MODERN

# MODERN

In [8]:
#SETUP
dDATA = "EN_dtm"

In [9]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder(dDATA)

In [10]:
# Prepare data
from data_flower import DataLoader
dataset, custom = dDATA, True
data_loader = DataLoader(dataset)
_, timestamps = data_loader.load_docs()
data = data_loader.load_octis(custom)
data = [" ".join(words) for words in data.get_corpus()]

embedding_model = SentenceTransformer(EM2) #EM3
embeddings = embedding_model.encode(docs, show_progress_bar=True)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/pspaargaren/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Batches:   0%|          | 0/1445 [00:00<?, ?it/s]

In [11]:
# Match indices
import os
os.listdir(f"./{dataset}")
with open(f"./{dataset}/indexes.txt") as f:
    indices = f.readlines()
    
indices = [int(index.split("\n")[0]) for index in indices]
timestamps = [timestamp for index, timestamp in enumerate(timestamps) if index in indices]
len(data), len(timestamps)

(46236, 46236)

In [12]:
from evaluation import Trainer
#for i in range(3):
params = {
        "nr_topics": [(i+1)*10 for i in range(5)],
        "min_topic_size": 5,
        "verbose": True #,
        # "evolution_tuning": True,
        # "global_tuning": False
    }

trainer = Trainer(dataset=dataset,
                      model_name="BERTopic",
                      params=params,
                      bt_embeddings=embeddings,
                      custom_dataset=custom,
                      bt_timestamps=timestamps,
                      topk=5,
                      bt_nr_bins=5,
                      verbose=True)
results = trainer.train(f"D_EN_NO_M")

2024-06-04 08:18:57,095 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-04 08:19:46,755 - BERTopic - Dimensionality - Completed ✓
2024-06-04 08:19:46,758 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

marjo
baiga
embellifli


2024-06-04 08:24:06,663 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-04 08:24:33,236 - BERTopic - Dimensionality - Completed ✓
2024-06-04 08:24:33,239 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-04 08:24:36,462 - BERTopic - Cluster - Completed ✓
2024-06-04 08:24:36,463 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-04 08:24:43,775 - BERTopic - Representation - Completed ✓
2024-06-04 08:24:43,778 - BERTopic - Topic reduction - Reducing number of topics
2024-06-04 08:24:49,735 - BERTopic - Topic reduction - Reduced number of topics from 1345 to 20
5it [00:05,  1.08s/it]


regula
baiga
longitudi
embellifli


2024-06-04 08:28:54,694 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-04 08:29:20,922 - BERTopic - Dimensionality - Completed ✓
2024-06-04 08:29:20,925 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-04 08:29:24,190 - BERTopic - Cluster - Completed ✓
2024-06-04 08:29:24,191 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-04 08:29:31,548 - BERTopic - Representation - Completed ✓
2024-06-04 08:29:31,551 - BERTopic - Topic reduction - Reducing number of topics
2024-06-04 08:29:37,694 - BERTopic - Topic reduction - Reduced number of topics from 1349 to 30
5it [00:06,  1.31s/it]


regula
marjo
baiga
stepe


2024-06-04 08:34:22,503 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-04 08:34:58,959 - BERTopic - Dimensionality - Completed ✓
2024-06-04 08:34:58,963 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

regula
baiga
stepe
marjo
thof
encyclopdia


2024-06-04 08:40:18,315 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-04 08:40:46,224 - BERTopic - Dimensionality - Completed ✓
2024-06-04 08:40:46,227 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

regula
marjo
sepul
baiga
stepe


In [13]:
print("Het is klaar")

Het is klaar
