In [1]:
import pandas as pd
import os
import numpy as np
import json
import matplotlib.pyplot as plt
import re
import pickle

import octis
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.models.CTM import CTM
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the corpus from a local folder into an OCTIS Dataset object.
CORPUS_DIR = "content/corpus"

dataset = Dataset()
dataset.load_custom_dataset_from_folder(CORPUS_DIR)

In [3]:
# Build every evaluation metric up front so the scoring cell only has to call
# .score() on each of them.
corpus_texts = dataset.get_corpus()  # fetched once and shared by all coherence metrics

# One Coherence instance per measure. Each measure is named explicitly so the
# variable name and the measure actually computed cannot drift apart.
cv = Coherence(texts=corpus_texts, measure='c_v')
umass = Coherence(texts=corpus_texts, measure='u_mass')
uci = Coherence(texts=corpus_texts, measure='c_uci')
# Previously this relied on the library's default measure; it is pinned here
# because the score is reported as NPMI. (OCTIS documents the default as
# 'c_npmi' -- confirm for the installed version.)
npmi = Coherence(texts=corpus_texts, measure='c_npmi')

# Topic diversity configured with topk=10.
topic_diversity = TopicDiversity(topk=10)

In [4]:
# CTM configuration, one keyword per line so individual settings are easy to
# read and to change.
model = CTM(
    num_topics=7,
    inference_type="combined",
    num_epochs=50,
    learn_priors=True,
    reduce_on_plateau=True,
    use_partitions=False,  # partitioning is explicitly switched off for this run
    bert_model='sentence-transformers/bert-large-nli-mean-tokens',
    bert_path='content/bert_path/nli-bert-large/_sbertlarge',
)

In [6]:
# Train the model on the loaded dataset and keep the returned result in `output`.
# NOTE: the model is constructed with an explicit use_partitions=False, so this
# run does NOT use a default partitioning choice -- presumably it trains on the
# whole corpus; confirm against the OCTIS CTM documentation.
# This cell is slow: the recorded progress bar shows 150 batches taking roughly
# 52 minutes.
output = model.train_model(dataset)

Batches: 100%|██████████| 150/150 [52:35<00:00, 21.04s/it] 


In [7]:
# Show each topic on its own line, with its words separated by single spaces.
for topic_words in output['topics']:
    print(*topic_words, sep=" ")

ai feature chat user split option snapchat social top remove
game fun roblox really good love sometimes play star character
song playlist music ad spotify youtube listen premium watch skip
work connection phone wifi reinstall cache restart crash device download
managable faire edtion ufixit oufit hatch callback scrape accidentaly undesirable
post photo story video edit message reel notification picture show
account money cash support card customer help email number receive


In [8]:
# Score the training output with each metric object built earlier in the
# notebook; every metric receives the same `output`.
topic_diversity_score = topic_diversity.score(output)
cv_score = cv.score(output)
npmi_score = npmi.score(output)
uci_score = uci.score(output)

# Report all scores in a single call, one metric per line.
print(
    f"topic diversity: {topic_diversity_score}",
    f"coherence CV: {cv_score}",
    f"coherence NPMI: {npmi_score}",
    f"coherence UCI: {uci_score}",
    sep="\n",
)

topic diversity: 1.0
coherence CV: 0.640557372159874
coherence NPMI: 0.06737627276062373
coherence UCI: -0.3060009117314425
