In [None]:
import ast
import csv
import os
import time

import pandas as pd
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.models.LDA import LDA
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer

In [None]:
#import cleaned data

def list_converter(text):
    """Turn the string form of a Python literal back into the object it spells.

    Used as a ``pd.read_csv`` converter: a column that held lists is stored in
    the CSV as text such as ``"['a', 'b']"``, and this parses it back.

    Args:
        text: the raw cell content, a string containing a Python literal.

    Returns:
        The evaluated literal (a list for the columns this notebook reads).

    Raises:
        ValueError / SyntaxError: propagated from ``ast.literal_eval`` when
        ``text`` is not a valid Python literal.
    """
    parsed = ast.literal_eval(text)
    return parsed


# Read the cleaned training set, parsing the 'tokens' column back into lists
# while loading, then discard the stored 'index' column.
data = pd.read_csv(
    '../Data/lda_train.csv',
    converters={'tokens': list_converter},
).drop(columns=['index'])
data.head()

In [None]:
#OCTIS expects a folder containing two files:
#the corpus as a .tsv file and the vocabulary as a .txt file with one word per line.

In [None]:
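#create tsv file with a 'label' column (intended split: 80% as train, 20% as validation data)
#NOTE: the 80/20 split is not implemented in the cell below -- every document is currently labelled 'train'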

In [None]:
#data
text = data['clean_text']
#create labels
# Every document receives the 'train' partition label; no other label is produced here.
label = ['train'] * len(text)

#store data in .tsv file
# Create the output folder first so this cell also works on a fresh checkout
# (to_csv does not create missing parent directories).
os.makedirs('../octis_data', exist_ok=True)
# Written without header or index: column 1 is the document text, column 2 its label.
corpus_tsv = pd.DataFrame({'text': text, 'label': label})
corpus_tsv.to_csv('../octis_data/corpus.tsv', sep='\t', header=False, index=False)

In [None]:
#create .txt vocabulary file
# Split each document on whitespace; this yields the same tokens as joining all
# documents with spaces and splitting once, without building one huge string.
all_words = [word for document in text for word in document.split()]
vocabulary = set(all_words)

# Write one word per line. The set is sorted so the file content is identical
# from run to run (set iteration order for strings varies between interpreter sessions).
with open('../octis_data/vocabulary.txt', 'w') as vocab_file:
    vocab_file.writelines(f'{word}\n' for word in sorted(vocabulary))

In [None]:
#build octis pipeline

In [None]:
# Create an empty OCTIS Dataset and populate it from the '../octis_data' folder,
# i.e. the folder holding the corpus.tsv and vocabulary.txt files written earlier
# in this notebook.
dataset = Dataset()
dataset.load_custom_dataset_from_folder('../octis_data')

In [None]:
# Instantiate the OCTIS LDA model with its fixed training settings.
# num_topics, alpha and eta are not set here: they are the hyperparameters
# listed in the search space of the optimisation cell.
lda_model = LDA(
    passes=10,
    chunksize=5000,
    eval_every=None,
    iterations=400,
    random_state=45,  # fixed seed -- presumably for repeatable training runs
)

In [None]:
#hyperparameter optimization to get best model
# Hyperparameters explored by the optimizer. The topic counts are passed as an
# ordered list (not a set) so the category order is explicit and stable.
search_space = {
    "num_topics": Categorical([5, 10, 15, 20, 25, 30]),
    "eta": Real(low=0.01, high=5.0),
    "alpha": Real(low=0.01, high=5.0),
}

# Evaluation metrics, each configured with topk = 10.
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')
cv = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_v')
diversity = TopicDiversity(topk=10)

# c_v coherence is the objective handed to the optimizer; diversity is only
# passed along as an extra metric.
# NOTE(review): npmi is constructed above but never passed to the optimizer --
# add it to this list if it should be recorded as well.
extra_metric = [diversity]

##
# perf_counter is a monotonic clock, so the elapsed time cannot be skewed by
# system clock adjustments during the (long) optimisation run.
start_time = time.perf_counter() #start counter for training time

optimizer = Optimizer()

optimization_result = optimizer.optimize(
    lda_model,
    dataset,
    search_space=search_space,
    metric=cv,
    model_runs=5,
    number_of_call=45,
    extra_metrics=extra_metric,
    save_models=False,
    save_path="results/lda/",
    plot_best_seen=True,
)

print(f"This model took {(time.perf_counter() - start_time)/60 :.2f} minutes to train")

# Persist the optimisation results as CSV in the current working directory.
optimization_result.save_to_csv("lda_results.csv")

print ("results saved")

In [None]:
# Scratch cell: leftover interactive help lookup on `optimizer` removed (IPython-only syntax, not part of the analysis).

In [None]:
# Scratch cell: leftover interactive help lookup on `LDA` removed (IPython-only syntax, not part of the analysis).

In [None]:
# Scratch cell: leftover debugging print of a literal tuple removed (not part of the analysis).

In [None]:
# Scratch cell: leftover interactive help lookup on `Categorical` removed (IPython-only syntax, not part of the analysis).