In [1]:
import pandas as pd
import os
import numpy as np
import json
import matplotlib.pyplot as plt
import re
import pickle

import octis
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.models.LSI import LSI
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [2]:
import inspect

# Development aid: dump the source of the OCTIS LSI wrapper so its constructor
# arguments and class-level defaults can be read without leaving the notebook.
# `LSI` is the class already imported in the first cell.
print(inspect.getsource(LSI))

class LSI(AbstractModel):

    id2word = None
    id_corpus = None
    hyperparameters = {}
    use_partitions = True
    update_with_test = False

    def __init__(self, num_topics=200, chunksize=20000, decay=1.0,
                 distributed=False, onepass=True, power_iters=2,
                 extra_samples=100):
        """
        Initialize LSI model

        Parameters
        ----------
        num_topics (int, optional) – Number of requested factors

        chunksize (int, optional) – Number of documents to be used in each
        training chunk.

        decay (float, optional) – Weight of existing observations relatively
        to new ones.

        distributed (bool, optional) – If True - distributed mode (parallel
        execution on several machines) will be used.

        onepass (bool, optional) – Whether the one-pass algorithm should be
        used for training. Pass False to force a multi-pass stochastic
        algorithm.

        power_iters (int, optional) – Num

In [3]:
# Folder that holds the custom corpus (relative to the working directory).
CORPUS_DIR = "content/corpus"

# Create an empty dataset container, then populate it from the folder.
dataset = Dataset()
dataset.load_custom_dataset_from_folder(CORPUS_DIR)

In [4]:
# Build the evaluation metrics used to score the trained topics.
# The corpus is fetched once and shared by every coherence metric instead of
# calling `dataset.get_corpus()` separately for each one.
corpus_texts = dataset.get_corpus()

cv = Coherence(texts=corpus_texts, measure='c_v')
umass = Coherence(texts=corpus_texts, measure='u_mass')
uci = Coherence(texts=corpus_texts, measure='c_uci')
# State the measure explicitly (it is the library default) so this metric is
# declared the same way as its siblings and cannot silently change meaning if
# the default ever does.
npmi = Coherence(texts=corpus_texts, measure='c_npmi')

# Diversity is computed over the top 10 words of each topic.
topic_diversity = TopicDiversity(topk=10)

In [5]:
# Number of latent topics (factors) requested from LSI.
NUM_TOPICS = 7

# Instantiate the LSI wrapper; every other constructor argument keeps the
# default shown in the class signature.
model = LSI(num_topics=NUM_TOPICS)

In [6]:
# Train the model on the loaded dataset using the default partitioning choice
# (the class source printed earlier shows `use_partitions = True`).
# NOTE(review): how the partitions are used during training is not visible
# here — confirm against the OCTIS documentation.
# `output` is indexed by 'topics' in the next cell, where each topic is joined
# as a sequence of word strings; it is also what the metrics score.
output = model.train_model(dataset)

In [7]:
# Show each topic on its own line as a space-separated list of its top words.
for topic_words in output["topics"]:
    topic_line = " ".join(topic_words)
    print(topic_line)

app use get video like update time game even good
app use message send open cash work video reinstall notification
app game cash money love fun roblox open good great
video app game play ad watch good youtube amazing edit
song use like ad music playlist want spotify listen play
ad get song play time even watch every app listen
update song play fix music playlist please work new spotify


In [8]:
# Score the trained topics with every metric before reporting anything.
# Evaluation order: diversity first, then CV, NPMI and UCI coherence.
# NOTE(review): the `umass` metric built earlier is never scored here —
# confirm whether leaving it out is intentional.
topic_diversity_score = topic_diversity.score(output)
cv_score, npmi_score, uci_score = (
    metric.score(output) for metric in (cv, npmi, uci)
)

# Report one metric per line.
print(
    f"topic diversity: {topic_diversity_score}",
    f"coherence CV: {cv_score}",
    f"coherence NPMI: {npmi_score}",
    f"coherence UCI: {uci_score}",
    sep="\n",
)

topic diversity: 0.5428571428571428
coherence CV: 0.45886019613441015
coherence NPMI: 0.02907225588259218
coherence UCI: 0.07320196452086689
