In [1]:
import pandas as pd
import os
import numpy as np
import json
import matplotlib.pyplot as plt
import re
import pickle

import octis
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.models.CTM import CTM
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Corpus location, kept in one place so it is easy to retarget.
CORPUS_DIR = "content/corpus"

# Create an empty OCTIS Dataset and populate it from the custom corpus folder.
# NOTE(review): the folder is expected to follow the OCTIS custom-dataset
# layout (e.g. corpus.tsv / vocabulary.txt) — confirm against the OCTIS docs.
dataset = Dataset()
dataset.load_custom_dataset_from_folder(CORPUS_DIR)

In [3]:
# Evaluation metrics used to score the trained topic model.
# Fetch the corpus once and share it between all coherence metrics instead of
# calling dataset.get_corpus() separately for each one.
corpus_texts = dataset.get_corpus()

cv = Coherence(texts=corpus_texts, measure='c_v')
umass = Coherence(texts=corpus_texts, measure='u_mass')
uci = Coherence(texts=corpus_texts, measure='c_uci')
# Name the measure explicitly, like the sibling metrics, rather than relying
# on the constructor default (expected to be 'c_npmi' — confirm against the
# installed OCTIS version). This keeps the metric NPMI even if the library
# default ever changes.
npmi = Coherence(texts=corpus_texts, measure='c_npmi')
# Topic diversity is evaluated over the top 10 words of each topic.
topic_diversity = TopicDiversity(topk=10)

In [8]:
# Contextualized Topic Model (CTM) configuration, one argument per line.
model = CTM(
    num_topics=7,                   # number of topics to extract
    inference_type="combined",      # NOTE(review): presumably BoW + contextual embeddings — confirm in OCTIS docs
    num_epochs=50,                  # training epochs
    use_partitions=False,           # do not ask OCTIS to use dataset partitions
    bert_model='all-mpnet-base-v2',  # sentence-embedding model name
    # NOTE(review): looks like the path/prefix where document embeddings are
    # stored or cached — confirm against the OCTIS CTM documentation.
    bert_path='content/bert_path/all-mpnet-base-v2/_allmpnet',
)

In [9]:
# Train the CTM on the loaded dataset. The model was constructed with
# use_partitions=False, so no train/validation/test partitioning is requested
# (presumably the whole corpus is used for training — confirm in OCTIS docs).
# The returned `output` is indexed by 'topics' in the next cell and is passed
# to each metric's score() afterwards.
output = model.train_model(dataset)

In [10]:
# Show each topic on its own line as a space-separated list of its top words.
for topic_words in output["topics"]:
    print(*topic_words)

exiting remake dump mediocre partiality flush combat gam extend obsurde
work open reinstall phone crash issue notification restart connection uninstall
account cash money card customer support help email bank service
chat ai snapchat feature split user option group top remove
game fun roblox character good really love great sometimes overall
edit post video reel instagram tiktok capcut photo effect story
song playlist ad spotify music listen premium watch youtube skip


In [11]:
# Score the trained model's output with each metric, in the same order as
# before: topic diversity, then C_V, NPMI and UCI coherence.
# NOTE(review): `umass` is constructed in an earlier cell but is not scored
# here — confirm whether its omission from the report is intentional.
topic_diversity_score, cv_score, npmi_score, uci_score = (
    metric.score(output) for metric in (topic_diversity, cv, npmi, uci)
)

# Emit the four-line report with a single print call (one metric per line).
print(
    f"topic diversity: {topic_diversity_score}",
    f"coherence CV: {cv_score}",
    f"coherence NPMI: {npmi_score}",
    f"coherence UCI: {uci_score}",
    sep="\n",
)

topic diversity: 1.0
coherence CV: 0.6571189515555153
coherence NPMI: 0.05571620402671863
coherence UCI: -0.7102445989597391
