In [1]:
import pandas as pd
import os
import numpy as np
import json
import matplotlib.pyplot as plt
import re
import pickle

import octis
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.models.HDP import HDP
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [2]:
import inspect
print(inspect.getsource(octis.models.HDP.HDP))

class HDP(AbstractModel):

    id2word = None
    id_corpus = None
    use_partitions = True
    update_with_test = False

    def __init__(self, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1,
                 gamma=1, eta=0.01, scale=1.0, var_converge=0.0001):
        """
        Initialize HDP model

        Parameters
        ----------
        max_chunks (int, optional) – Upper bound on how many chunks to process.
        It wraps around corpus beginning in another corpus pass,
        if there are not enough chunks in the corpus.

        max_time (int, optional) – Upper bound on time (in seconds)
        for which model will be trained.

        chunksize (int, optional) – Number of documents in one chuck.

        kappa (float,optional) – Learning parameter which acts as exponential
        decay factor to influence extent of learning from each batch.

        tau (float, optional) – Learning parameter which down-weights
        early i

In [3]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder("content/corpus")

In [4]:
cv = Coherence(texts=dataset.get_corpus(), measure='c_v')
umass = Coherence(texts=dataset.get_corpus(), measure='u_mass')
uci = Coherence(texts=dataset.get_corpus(), measure='c_uci')
npmi = Coherence(texts=dataset.get_corpus())
topic_diversity = TopicDiversity(topk=10)

In [11]:
# Create Model
model = HDP(T=7)

In [12]:
# Train the model using default partitioning choice 
output = model.train_model(dataset)

In [13]:
len(output['topics'])

7

In [14]:
for t in output['topics']:
  print(" ".join(t))

app game use get video like update time play good
app use video get like game update time good fix
app video use get like time good update work even
app use get video time update like make fix good
app video use like get time update game really work
app get video use update like even work fix time
app use video get time update like good try make


In [15]:
topic_diversity_score = topic_diversity.score(output)
cv_score = cv.score(output)
npmi_score = npmi.score(output)
uci_score = uci.score(output)

print(f"topic diversity: {topic_diversity_score}")
print(f"coherence CV: {cv_score}")
print(f"coherence NPMI: {npmi_score}")
print(f"coherence UCI: {uci_score}")

topic diversity: 0.22857142857142856
coherence CV: 0.3302866233704863
coherence NPMI: -0.018825262517980217
coherence UCI: -0.12049337193272949
