In [2]:
import pandas as pd
import os
import numpy as np
import json
import matplotlib.pyplot as plt
import re
import pickle

import octis
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.models.NMF import NMF
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [3]:
import inspect
print(inspect.getsource(octis.models.NMF.NMF))

class NMF(AbstractModel):

    def __init__(
        self, num_topics=100, chunksize=2000, passes=1, kappa=1.0,
        minimum_probability=0.01, w_max_iter=200,
        w_stop_condition=0.0001, h_max_iter=50, h_stop_condition=0.001,
        eval_every=10, normalize=True, random_state=None,
            use_partitions=True):
        """
        Initialize NMF model

        Parameters
        ----------
        num_topics (int, optional) – Number of topics to extract.

        chunksize (int, optional) – Number of documents to be used in each
        training chunk.

        passes (int, optional) – Number of full passes over the
        training corpus. Leave at default passes=1 if your input
        is an iterator.

        kappa (float, optional) – Gradient descent step size.
        Larger value makes the model train faster, but could
        lead to non-convergence if set too large.

        minimum_probability – If normalize is True, topics with
        smaller probabilities are f

In [4]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder("content/corpus")

In [5]:
cv = Coherence(texts=dataset.get_corpus(), measure='c_v')
umass = Coherence(texts=dataset.get_corpus(), measure='u_mass')
uci = Coherence(texts=dataset.get_corpus(), measure='c_uci')
npmi = Coherence(texts=dataset.get_corpus())
topic_diversity = TopicDiversity(topk=10)

In [6]:
# Create Model
model = NMF(num_topics=7)

In [7]:
# Train the model using default partitioning choice 
output = model.train_model(dataset)

In [8]:
for t in output['topics']:
  print(" ".join(t))

app fix make great please cash issue bug good keep
get back post go try game fix notification also message
like game play make really good one will message video
update work time app send message account issue keep even
video app watch ad work post play youtube edit go
use app ad get even good see update go bad
app try use feature time want phone work also give


In [9]:
topic_diversity_score = topic_diversity.score(output)
cv_score = cv.score(output)
npmi_score = npmi.score(output)
uci_score = uci.score(output)

print(f"topic diversity: {topic_diversity_score}")
print(f"coherence CV: {cv_score}")
print(f"coherence NPMI: {npmi_score}")
print(f"coherence UCI: {uci_score}")

topic diversity: 0.6
coherence CV: 0.3883118638039832
coherence NPMI: -0.0010630785564898792
coherence UCI: -0.0564292041172054
