In [1]:
import pandas as pd
import os
import numpy as np
import json
import matplotlib.pyplot as plt
import re
import pickle

import octis
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.models.CTM import CTM
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create an OCTIS Dataset object and populate it from the project-relative
# folder "content/corpus".
# NOTE(review): the folder is expected to follow the OCTIS custom-dataset
# layout (presumably corpus.tsv plus a vocabulary file) — confirm on disk.
dataset = Dataset()
dataset.load_custom_dataset_from_folder("content/corpus")

In [3]:
# Evaluation metrics used later to score the trained topic model.
# Fetch the corpus once and share it across every coherence metric instead of
# calling dataset.get_corpus() separately for each one.
corpus_texts = dataset.get_corpus()

cv = Coherence(texts=corpus_texts, measure='c_v')
umass = Coherence(texts=corpus_texts, measure='u_mass')
uci = Coherence(texts=corpus_texts, measure='c_uci')
# Name the measure explicitly (it was previously left to the Coherence default)
# so this metric reads like its siblings and cannot silently change if the
# library default ever does.
npmi = Coherence(texts=corpus_texts, measure='c_npmi')
# Topic diversity configured with topk=10.
topic_diversity = TopicDiversity(topk=10)

In [4]:
# CTM topic model configuration, one keyword argument per line for readability.
model = CTM(
    num_topics=7,
    inference_type="combined",
    reduce_on_plateau=True,
    learn_priors=True,
    num_epochs=50,
    use_partitions=False,
    bert_model='sentence-transformers/nli-bert-base',
    bert_path='content/bert_path/nli-bert-base/_sbertbase',
)

In [5]:
# Train the model on the loaded dataset and keep the returned result in `output`.
# The model was created with use_partitions=False, so partitioning follows that
# explicit setting rather than any library default.
# NOTE(review): the recorded output of this cell shows the sentence-transformers
# files (including a 438M pytorch_model.bin) being downloaded during this call,
# so a first run needs network access and takes a couple of minutes.
output = model.train_model(dataset)

Downloading (…)53a76/.gitattributes: 100%|██████████| 690/690 [00:00<?, ?B/s] 
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<?, ?B/s] 
Downloading (…)6dae053a76/README.md: 100%|██████████| 3.89k/3.89k [00:00<00:00, 250kB/s]
Downloading (…)ae053a76/config.json: 100%|██████████| 613/613 [00:00<00:00, 120kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<?, ?B/s] 
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [02:05<00:00, 3.50MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 27.5kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<?, ?B/s] 
Downloading (…)53a76/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 570kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 375/375 [00:00<?, ?B/s] 
Downloading (…)6dae053a76/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 303kB/s]
Downloading (…)e053a76/modules.json: 100%|██████████| 229/229 [00:00<00:00, 15.3kB/s]


In [6]:
# Show each topic on its own line as a space-separated list of its words.
for topic_words in output["topics"]:
    topic_line = " ".join(topic_words)
    print(topic_line)

game roblox fun star really play glitch character lag love
easy capcut editing tiktok template edit learn platform effect friendly
account cash money card support customer help receive service never
openable pleaseeeee erasis curse gltich pony apear pighead scrape reajoin
work phone reinstall issue restart open uninstall load fix crash
song ad playlist music spotify youtube watch listen premium video
ai chat snapchat post top see remove view photo option


In [7]:
# Score the training output with each metric, evaluated in the same order as
# the names on the left-hand side.
topic_diversity_score, cv_score, npmi_score, uci_score = (
    metric.score(output) for metric in (topic_diversity, cv, npmi, uci)
)

# Report every score, one per line, from a single print call.
print(
    f"topic diversity: {topic_diversity_score}",
    f"coherence CV: {cv_score}",
    f"coherence NPMI: {npmi_score}",
    f"coherence UCI: {uci_score}",
    sep="\n",
)
# NOTE(review): a `umass` (u_mass) coherence metric is constructed earlier in
# the notebook but is never scored here — confirm whether that is intentional.

topic diversity: 1.0
coherence CV: 0.6332211608330759
coherence NPMI: 0.05662020069112566
coherence UCI: -0.4819006677865613
