In [1]:
import pandas as pd
import os
import numpy as np
import json
import matplotlib.pyplot as plt
import re
import pickle

import octis
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.models.ProdLDA import ProdLDA
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [2]:
import inspect

# Dump the source of the OCTIS ProdLDA wrapper so its constructor arguments
# and their defaults can be read directly in the notebook.
prodlda_source = inspect.getsource(octis.models.ProdLDA.ProdLDA)
print(prodlda_source)

class ProdLDA(AVITM):
    def __init__(
        self, num_topics=10, activation='softplus', dropout=0.2,
        learn_priors=True, batch_size=64, lr=2e-3, momentum=0.99,
        solver='adam', num_epochs=100, reduce_on_plateau=False, prior_mean=0.0,
        prior_variance=None, num_layers=2, num_neurons=100, num_samples=10,
            use_partitions=True):
        super().__init__(
            num_topics=num_topics, model_type='prodLDA', activation=activation,
            dropout=dropout, learn_priors=learn_priors, batch_size=batch_size,
            lr=lr, momentum=momentum, solver=solver, num_epochs=num_epochs,
            reduce_on_plateau=reduce_on_plateau, prior_mean=prior_mean,
            prior_variance=prior_variance, num_layers=num_layers,
            num_neurons=num_neurons, num_samples=num_samples,
            use_partitions=use_partitions)

    def train_model(self, dataset, hyperparameters=None, top_words=10):
        return super().train_model(dataset, hyperparameters, t

In [3]:
# Load the corpus from a local folder (path is relative to the working
# directory the notebook is run from).
corpus_dir = "content/corpus"
dataset = Dataset()
dataset.load_custom_dataset_from_folder(corpus_dir)

In [4]:
# Build the evaluation metrics, all over the same reference corpus.
# The corpus is fetched once and shared instead of calling
# dataset.get_corpus() separately for every metric.
reference_corpus = dataset.get_corpus()

cv = Coherence(texts=reference_corpus, measure='c_v')
umass = Coherence(texts=reference_corpus, measure='u_mass')
uci = Coherence(texts=reference_corpus, measure='c_uci')
# The measure is spelled out rather than left to the library default, so the
# variable name and the metric it computes cannot silently drift apart.
npmi = Coherence(texts=reference_corpus, measure='c_npmi')
# Topic diversity is evaluated on the top-10 words of each topic.
topic_diversity = TopicDiversity(topk=10)

In [5]:
# Create the ProdLDA model.
# use_partitions=False overrides the constructor default (True); presumably
# this trains on the whole corpus instead of a split -- confirm in OCTIS docs.
NUM_TOPICS = 7
NUM_EPOCHS = 50
model = ProdLDA(
    num_topics=NUM_TOPICS,
    num_epochs=NUM_EPOCHS,
    use_partitions=False,
)

In [6]:
# Train the model on the loaded dataset with the constructor settings only
# (no per-call hyperparameter overrides are passed).
# NOTE: the model was created with use_partitions=False, which is NOT the
# ProdLDA default (use_partitions=True), so this run does not use the
# default partitioning behaviour.
output = model.train_model(dataset)

Epoch: [1/50]	Samples: [15000/750000]	Train Loss: 300.5341977213542	Time: 0:00:06.151467
Epoch: [2/50]	Samples: [30000/750000]	Train Loss: 288.8199872721354	Time: 0:00:05.721865
Epoch: [3/50]	Samples: [45000/750000]	Train Loss: 285.86564563802085	Time: 0:00:06.161407
Epoch: [4/50]	Samples: [60000/750000]	Train Loss: 284.77738372395834	Time: 0:00:06.114523
Epoch: [5/50]	Samples: [75000/750000]	Train Loss: 283.9300390299479	Time: 0:00:06.216062
Epoch: [6/50]	Samples: [90000/750000]	Train Loss: 283.54364661458334	Time: 0:00:06.944169
Epoch: [7/50]	Samples: [105000/750000]	Train Loss: 283.4546566080729	Time: 0:00:06.554561
Epoch: [8/50]	Samples: [120000/750000]	Train Loss: 283.2999038411458	Time: 0:00:06.774831
Epoch: [9/50]	Samples: [135000/750000]	Train Loss: 283.1027125	Time: 0:00:06.394689
Epoch: [10/50]	Samples: [150000/750000]	Train Loss: 282.93064964192706	Time: 0:00:06.329443
Epoch: [11/50]	Samples: [165000/750000]	Train Loss: 282.8739964518229	Time: 0:00:06.093482
Epoch: [12/50]	S

In [7]:
# Print one line per topic: its top words separated by single spaces.
# NOTE(review): in the saved output two of the seven topics are near-identical
# lists of rare tokens; worth checking preprocessing / the number of topics.
for topic_words in output['topics']:
    print(*topic_words, sep=" ")

song playlist music spotify ad listen video youtube premium watch
chat ai feature snapchat top option see split post remove
expecially fave aka whichever norma gives evan wether irreversible crown
account money cash support card help email customer never service
work open issue reinstall app notification even phone update try
expecially fave wether norma aka evan milk whichever crown gives
game really roblox fun sometimes love good star thing glitch


In [8]:
# Score the trained model's output with each metric object, in the same
# order as they are reported below.
# NOTE(review): the `umass` metric is constructed earlier in the notebook
# but is not scored in this cell.
topic_diversity_score = topic_diversity.score(output)
cv_score = cv.score(output)
npmi_score = npmi.score(output)
uci_score = uci.score(output)

# Single print call; sep="\n" puts each metric on its own line.
print(
    f"topic diversity: {topic_diversity_score}",
    f"coherence CV: {cv_score}",
    f"coherence NPMI: {npmi_score}",
    f"coherence UCI: {uci_score}",
    sep="\n",
)

topic diversity: 0.8714285714285714
coherence CV: 0.6074554369776356
coherence NPMI: -0.014789812363450953
coherence UCI: -1.9399664665362988
