In [1]:
import pandas as pd
import os
import numpy as np
import json
import matplotlib.pyplot as plt
import re
import pickle

import octis
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.models.ETM import ETM
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [2]:
import inspect
print(inspect.getsource(octis.models.ETM.ETM))

class ETM(BaseETM):

    def __init__(
        self, num_topics=10, num_epochs=100, t_hidden_size=800, rho_size=300,
        embedding_size=300, activation='relu', dropout=0.5, lr=0.005,
        optimizer='adam', batch_size=128, clip=0.0, wdecay=1.2e-6, bow_norm=1,
        device='cpu', train_embeddings=True, embeddings_path=None,
            embeddings_type='pickle', binary_embeddings=True,
            headerless_embeddings=False, use_partitions=True):
        """
        initialization of ETM

        :param embeddings_path: string, path to embeddings file.
            Can be a binary file for the 'pickle', 'keyedvectors' and
            'word2vec' types or a text file for 'word2vec'.
            This parameter is only used if 'train_embeddings' is set to False
        :param embeddings_type: string, defines the format of the embeddings
            file. Possible values are 'pickle', 'keyedvectors' or 'word2vec'.
            If set to 'pickle', you must provide a file created with 'p

In [3]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder("content/corpus")

In [4]:
cv = Coherence(texts=dataset.get_corpus(), measure='c_v')
umass = Coherence(texts=dataset.get_corpus(), measure='u_mass')
uci = Coherence(texts=dataset.get_corpus(), measure='c_uci')
npmi = Coherence(texts=dataset.get_corpus())
topic_diversity = TopicDiversity(topk=10)

In [5]:
# Create Model
model = ETM(num_topics=7,use_partitions=False)

In [6]:
# Train the model using default partitioning choice 
output = model.train_model(dataset)

model: ETM(
  (t_drop): Dropout(p=0.5, inplace=False)
  (theta_act): ReLU()
  (rho): Linear(in_features=300, out_features=12147, bias=False)
  (alphas): Linear(in_features=300, out_features=7, bias=False)
  (q_theta): Sequential(
    (0): Linear(in_features=12147, out_features=800, bias=True)
    (1): ReLU()
    (2): Linear(in_features=800, out_features=800, bias=True)
    (3): ReLU()
  )
  (mu_q_theta): Linear(in_features=800, out_features=7, bias=True)
  (logsigma_q_theta): Linear(in_features=800, out_features=7, bias=True)
)
Epoch: 1 .. batch: 20/118 .. LR: 0.005 .. KL_theta: 0.01 .. Rec_loss: 281.24 .. NELBO: 281.25
Epoch: 1 .. batch: 40/118 .. LR: 0.005 .. KL_theta: 0.01 .. Rec_loss: 246.0 .. NELBO: 246.01
Epoch: 1 .. batch: 60/118 .. LR: 0.005 .. KL_theta: 0.01 .. Rec_loss: 233.26 .. NELBO: 233.27
Epoch: 1 .. batch: 80/118 .. LR: 0.005 .. KL_theta: 0.01 .. Rec_loss: 235.72 .. NELBO: 235.73
Epoch: 1 .. batch: 100/118 .. LR: 0.005 .. KL_theta: 0.01 .. Rec_loss: 232.31 .. NELBO: 232

In [7]:
for t in output['topics']:
  print(" ".join(t))

app game like fix video make even also good get
play game good like love thing get also make really
game play app get fun like love try fix video
game play like good fix get app make really love
game play fun account app get thing good like character
game play app get ad like money good make also
app get good like time fix make one really use


In [8]:
topic_diversity_score = topic_diversity.score(output)
cv_score = cv.score(output)
npmi_score = npmi.score(output)
uci_score = uci.score(output)

print(f"topic diversity: {topic_diversity_score}")
print(f"coherence CV: {cv_score}")
print(f"coherence NPMI: {npmi_score}")
print(f"coherence UCI: {uci_score}")

topic diversity: 0.32857142857142857
coherence CV: 0.4210142766974854
coherence NPMI: 0.005855890315829429
coherence UCI: 0.012371004642485176
