In [1]:
#!pip install bertopic datasets accelerate bitsandbytes xformers adjustText

# imports
import pandas as pd
import ipywidgets as widgets
import math
import os
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from octis.dataset.dataset import Dataset
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import time
import en_core_web_sm


optimizer=Optimizer()
nlp = en_core_web_sm.load()

  "class": algorithms.Blowfish,


In [2]:
#SETUP
DATA = "EN"
EM = "all-mpnet-base-v2"

In [3]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder(DATA)
docs = [" ".join(words) for words in dataset.get_corpus()]

In [4]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(EM)
embeddings = embedding_model.encode(docs, show_progress_bar=True)


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [5]:
dataset, custom = DATA, True

In [6]:
from evaluation3 import Trainer

2024-06-07 08:10:33.926067: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX512_VNNI
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-07 08:10:34.137285: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pspaargaren/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# BERTopic

In [None]:
#TC & TD CALCULATION
for i in range(3):
    custom = True
    params = {
        "embedding_model": embedding_model,
        "nr_topics": [(i+1)*10 for i in range(5)],
        "min_topic_size": 10,
        #"diversity": None,
        "verbose": True
    }

    trainer = Trainer(dataset=dataset,
                        model_name="BERTopic",
                        params=params,
                        bt_embeddings=embeddings,
                        custom_dataset=custom,
                        verbose=True)
    results = trainer.train(save=f"results/EN_mod/scores/bertopic_{i+1}")

In [None]:
#TOPIC EXTRACTION
custom = True
params = {
    "embedding_model": embedding_model,
    "nr_topics": 15,
    "min_topic_size": 10,
    #"diversity": None,
    "verbose": True
}

trainer = Trainer(dataset=dataset,
                    model_name="BERTopic",
                    params=params,
                    bt_embeddings=embeddings,
                    custom_dataset=custom,
                    verbose=True)
results = trainer.train(save=f"results/EN_mod/topics/bertopic")

# LDA

In [None]:
for i, random_state in enumerate([0, 21, 42]):
    dataset, custom = DATA, True
    params = {"num_topics": [(i+1)*10 for i in range(5)], "random_state": random_state}

    trainer = Trainer(dataset=dataset,
                      model_name="LDA",
                      params=params,
                      custom_dataset=custom,
                      verbose=True)
    results = trainer.train(save=f"results/EN_mod/scores/lda_{i+1}")

In [None]:
#TOPIC CREATION:
dataset, custom = DATA, True
params = {"num_topics": 15, "random_state": 42}#[(i+1)*10 for i in range(5)], "random_state": random_state}

trainer = Trainer(dataset=dataset,
                  model_name="LDA",
                  params=params,
                  custom_dataset=custom,
                  verbose=True)
results = trainer.train(save=f"results/EN_mod/topics/lda")

# NMF

In [None]:
for i, random_state in enumerate([0, 21, 42]):
    dataset, custom = DATA, True
    params = {"num_topics": [(i+1)*10 for i in range(5)], "random_state": random_state}

    trainer = Trainer(dataset=dataset,
                      model_name="NMF",
                      params=params,
                      custom_dataset=custom,
                      verbose=True)
    results = trainer.train(save=f"results/EN_mod/scores/nmf_{i+1}")

In [None]:
#TOPIC CREATION:
dataset, custom = DATA, True
params = {"num_topics": 15, "random_state": 42}#[(i+1)*10 for i in range(5)], "random_state": random_state}

trainer = Trainer(dataset=dataset,
                  model_name="NMF",
                  params=params,
                  custom_dataset=custom,
                  verbose=True)
results = trainer.train(save=f"results/EN_mod/topics/nmf")

# CTM

In [None]:
import nltk

#nltk.download("stopwords")
from nltk.corpus import stopwords

In [None]:
# TC & TD
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
#!pip install contextualized_topic_models
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
for i in range(1):
    dataset, custom = DATA, True
    params = {
        "n_components": [(i+1)*10 for i in range(5)],
        "contextual_size":768
    }

    trainer = Trainer(dataset=dataset,
                      model_name="CTM_CUSTOM",
                      params=params,
                      custom_dataset=custom,
                      verbose=True)
    results = trainer.train(save=f"results/EN_mod/scores/ctm_{i+1}")

In [None]:
# TOPIC CREATION
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
#!pip install contextualized_topic_models
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
#for i in range(3):
dataset, custom = DATA, True
params = {
    "n_components": 15,#[(i+1)*10 for i in range(5)],
    "contextual_size":768
}

trainer = Trainer(dataset=dataset,
                  model_name="CTM_CUSTOM",
                  params=params,
                  custom_dataset=custom,
                  verbose=True)
results = trainer.train(save=f"results/EN_mod/topics/ctm")

# Dynamic TM

In [7]:
#SETUP
dDATA = "EN_dtm"

In [8]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder(dDATA)

In [9]:
#prepare data
from sentence_transformers import SentenceTransformer
from data_t_flower import DataLoader
dataset, custom = dDATA, True
data_loader = DataLoader(dataset)
_, timestamps = data_loader.load_docs()

data = data_loader.load_octis(custom)
data = [" ".join(words) for words in data.get_corpus()]

embedding_model = SentenceTransformer(EM) #EM3
embeddings = embedding_model.encode(docs, show_progress_bar=True)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/pspaargaren/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [10]:
# Match indices
import os
os.listdir(f"./{dataset}")
with open(f"./{dataset}/indexes.txt") as f:
    indices = f.readlines()
    
indices = [int(index.split("\n")[0]) for index in indices]
timestamps = [timestamp for index, timestamp in enumerate(timestamps) if index in indices]
#print(timestamps)
len(data), len(timestamps)

(10000, 10000)

In [11]:
from evaluation import Trainer
params = {
        "nr_topics": [(i+1)*10 for i in range(5)],
        "min_topic_size": 5,
        "verbose": True,
    }

trainer = Trainer(dataset=dataset,
                      model_name="BERTopic",
                      params=params,
                      bt_embeddings=embeddings,
                      custom_dataset=custom,
                      bt_timestamps=timestamps,
                      topk=5,
                      bt_nr_bins=5,
                      verbose=True)
results = trainer.train(f"D_EN_T_EVO_M")

2024-06-05 12:17:27,918 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-05 12:18:05,343 - BERTopic - Dimensionality - Completed ✓
2024-06-05 12:18:05,345 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-05 12:18:05,783 - BERTopic - Cluster - Completed ✓
2024-06-05 12:18:05,783 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-05 12:18:07,040 - BERTopic - Representation - Completed ✓
2024-06-05 12:18:07,041 - BERTopic - Topic reduction - Reducing number of topics
2024-06-05 12:18:08,116 - BERTopic - Topic reduction - Reduced number of topics from 230 to 10
5it [00:02,  2.25it/s]
2024-06-05 12:19:13,546 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-05 12:19:25,496 - BERTopic - Dimensionality - Completed ✓
2024-06-05 12:19:25,498 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-05 12:19:25,937 - BERTopic - Cluster - Comple

wrttemberg
tiar
nymjih
lry


2024-06-05 12:20:34,726 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-05 12:20:47,463 - BERTopic - Dimensionality - Completed ✓
2024-06-05 12:20:47,464 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-05 12:20:47,899 - BERTopic - Cluster - Completed ✓
2024-06-05 12:20:47,899 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-05 12:20:49,176 - BERTopic - Representation - Completed ✓
2024-06-05 12:20:49,177 - BERTopic - Topic reduction - Reducing number of topics
2024-06-05 12:20:50,218 - BERTopic - Topic reduction - Reduced number of topics from 270 to 30
5it [00:02,  2.48it/s]


blachandrika
lry
rasnjana


2024-06-05 12:21:51,413 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-05 12:22:10,897 - BERTopic - Dimensionality - Completed ✓
2024-06-05 12:22:10,899 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-05 12:22:11,345 - BERTopic - Cluster - Completed ✓
2024-06-05 12:22:11,346 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-05 12:22:12,634 - BERTopic - Representation - Completed ✓
2024-06-05 12:22:12,635 - BERTopic - Topic reduction - Reducing number of topics
2024-06-05 12:22:13,695 - BERTopic - Topic reduction - Reduced number of topics from 259 to 40
5it [00:01,  2.57it/s]


abb
lry


2024-06-05 12:23:13,305 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-05 12:23:25,402 - BERTopic - Dimensionality - Completed ✓
2024-06-05 12:23:25,403 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-05 12:23:25,850 - BERTopic - Cluster - Completed ✓
2024-06-05 12:23:25,850 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-05 12:23:27,143 - BERTopic - Representation - Completed ✓
2024-06-05 12:23:27,144 - BERTopic - Topic reduction - Reducing number of topics
2024-06-05 12:23:28,228 - BERTopic - Topic reduction - Reduced number of topics from 248 to 50
5it [00:02,  2.32it/s]


abb
blachandrika
thas
lry
