In [1]:
import pandas as pd
import os
import numpy as np
import json
import matplotlib.pyplot as plt
import re
import pickle

import octis
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.models.LDA import LDA
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [2]:
import inspect
print(inspect.getsource(octis.models.LDA.LDA))

class LDA(AbstractModel):

    id2word = None
    id_corpus = None
    use_partitions = True
    update_with_test = False

    def __init__(
        self, num_topics=100, distributed=False, chunksize=2000,
        passes=1, update_every=1, alpha="symmetric", eta=None, decay=0.5,
        offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001,
            random_state=None):
        """
        Initialize LDA model

        Parameters
        ----------
        num_topics (int, optional) – The number of requested latent topics to
        be extracted from the training corpus.

        distributed (bool, optional) – Whether distributed computing should be
        used to accelerate training.

        chunksize (int, optional) – Number of documents to be used in each
        training chunk.

        passes (int, optional) – Number of passes through the corpus during
        training.

        update_every (int, optional) – Number of documents to be iterated
        through for eac

In [3]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder("content/corpus")

In [4]:
cv = Coherence(texts=dataset.get_corpus(), measure='c_v')
umass = Coherence(texts=dataset.get_corpus(), measure='u_mass')
uci = Coherence(texts=dataset.get_corpus(), measure='c_uci')
npmi = Coherence(texts=dataset.get_corpus())
topic_diversity = TopicDiversity(topk=10)

In [5]:
# Create Model
model = LDA(num_topics=7)

In [6]:
# Train the model using default partitioning choice 
output = model.train_model(dataset)

In [7]:
for t in output['topics']:
  print(" ".join(t))

game good roblox like really make play love get add
app fix work problem nt try even say update crash
game app get use make like good time money great
app account use good one download online time can phone
screen app go fix back update get time use bug
play fix game app please download video character update sometimes
playlist video ad app want youtube screen watch like use


In [8]:
topic_diversity_score = topic_diversity.score(output)
cv_score = cv.score(output)
npmi_score = npmi.score(output)
uci_score = uci.score(output)

print(f"topic diversity: {topic_diversity_score}")
print(f"coherence CV: {cv_score}")
print(f"coherence NPMI: {npmi_score}")
print(f"coherence UCI: {uci_score}")

topic diversity: 0.6142857142857143
coherence CV: 0.4195004506279526
coherence NPMI: 0.009623189261826785
coherence UCI: 0.02821437644993594
