In [1]:
import pandas as pd
import os
import numpy as np
import json
import matplotlib.pyplot as plt
import re
import pickle

import octis
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.models.NeuralLDA import NeuralLDA
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [2]:
import inspect
print(inspect.getsource(octis.models.NeuralLDA.NeuralLDA))

class NeuralLDA(AVITM):
    def __init__(
        self, num_topics=10, activation='softplus', dropout=0.2,
        learn_priors=True, batch_size=64, lr=2e-3, momentum=0.99,
        solver='adam', num_epochs=100, reduce_on_plateau=False, prior_mean=0.0,
        prior_variance=None, num_layers=2, num_neurons=100, num_samples=10,
            use_partitions=True):
        super().__init__(
            num_topics=num_topics, model_type='LDA', activation=activation,
            dropout=dropout, learn_priors=learn_priors, batch_size=batch_size,
            lr=lr, momentum=momentum, solver=solver, num_epochs=num_epochs,
            reduce_on_plateau=reduce_on_plateau, prior_mean=prior_mean,
            prior_variance=prior_variance, num_layers=num_layers,
            num_neurons=num_neurons, num_samples=num_samples,
            use_partitions=use_partitions)

    def train_model(self, dataset, hyperparameters=None, top_words=10):
        return super().train_model(
            dataset=dataset,

In [3]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder("content/corpus")

In [4]:
cv = Coherence(texts=dataset.get_corpus(), measure='c_v')
umass = Coherence(texts=dataset.get_corpus(), measure='u_mass')
uci = Coherence(texts=dataset.get_corpus(), measure='c_uci')
npmi = Coherence(texts=dataset.get_corpus())
topic_diversity = TopicDiversity(topk=10)

In [5]:
# Create Model
model = NeuralLDA(num_topics=7,use_partitions=False,num_epochs=50)

In [6]:
# Train the model using default partitioning choice 
output = model.train_model(dataset)

Epoch: [1/50]	Samples: [15000/750000]	Train Loss: 284.3912905598958	Time: 0:00:09.729613
Epoch: [2/50]	Samples: [30000/750000]	Train Loss: 272.96326803385415	Time: 0:00:08.712471
Epoch: [3/50]	Samples: [45000/750000]	Train Loss: 269.8860184244792	Time: 0:00:08.070113
Epoch: [4/50]	Samples: [60000/750000]	Train Loss: 269.4373930013021	Time: 0:00:08.595411
Epoch: [5/50]	Samples: [75000/750000]	Train Loss: 268.38486487630206	Time: 0:00:10.598162
Epoch: [6/50]	Samples: [90000/750000]	Train Loss: 268.0924031575521	Time: 0:00:10.385163
Epoch: [7/50]	Samples: [105000/750000]	Train Loss: 267.91443076171873	Time: 0:00:10.513545
Epoch: [8/50]	Samples: [120000/750000]	Train Loss: 268.3832941080729	Time: 0:00:10.267352
Epoch: [9/50]	Samples: [135000/750000]	Train Loss: 268.33092294921875	Time: 0:00:07.676441
Epoch: [10/50]	Samples: [150000/750000]	Train Loss: 267.95362682291665	Time: 0:00:07.320500
Epoch: [11/50]	Samples: [165000/750000]	Train Loss: 267.9925103190104	Time: 0:00:07.555464
Epoch: [1

In [7]:
for t in output['topics']:
  print(" ".join(t))

progress idk recommendation tho library stuck jump avatar pick wanna
mind bore glitchy repeat begin dislike gets anyway appreciate robux
perfect learn min forward coin highly episode middle awesome cast
disconnect computer album artist reopen happy choice tok trouble shut
telegram card receive customer support number link file cash group
game play good app like use fix video really time
offline exit barely laptop caption easily accidentally four hold bluetooth


In [8]:
topic_diversity_score = topic_diversity.score(output)
cv_score = cv.score(output)
npmi_score = npmi.score(output)
uci_score = uci.score(output)

print(f"topic diversity: {topic_diversity_score}")
print(f"coherence CV: {cv_score}")
print(f"coherence NPMI: {npmi_score}")
print(f"coherence UCI: {uci_score}")

topic diversity: 1.0
coherence CV: 0.3314369671187864
coherence NPMI: -0.18592447144901714
coherence UCI: -5.799499560821853
