<font size="+12"><center>
    Data Science Packages Latent Dirichlet Allocation (LDA)
</font></center>

## Import libraries

In [1]:
import os
import json

from datetime import datetime
from pathlib import Path

from gensim import corpora, models
from sklearn.model_selection import train_test_split

In [24]:
LDA_PERCENTAGE_TEST_DATASET = 0.1 or os.getenv("LDA_PERCENTAGE_TEST_DATASET")

In [5]:
# Retrieve clean dataset
current_path = Path.cwd().parents[0]
data_path = current_path.joinpath("data/processed")

with open(f"{data_path}/clean_dataset.json") as json_file:
    clean_dataset = json.load(json_file)

texts_names = []
texts = []
for file_name, file_vocabulary in clean_dataset.items():
    texts.append(file_vocabulary)
    texts_names.append(file_name)

# Process data for LDA
    
# Assign a unique integer id to all words appearing in the corpus, creating a vocabulary corpus
dictionary = corpora.Dictionary(texts)
print("Number of unique tokens: %d" % len(dictionary))
# print(f"Token ID map:\n {dictionary.token2id}")

# Bag of Words (BoW) Representation
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

lda_percentage_training_dataset = (1 - float(LDA_PERCENTAGE_TEST_DATASET)) * 100
print("Training Dataset percentage is: %d" % lda_percentage_training_dataset)

lda_percentage_test_dataset = float(LDA_PERCENTAGE_TEST_DATASET) * 100
print("Test Dataset percentage is: %d" % lda_percentage_test_dataset)

corpus_train, corpus_test = train_test_split(corpus, test_size=LDA_PERCENTAGE_TEST_DATASET)

Number of unique tokens: 2949
Training Dataset percentage is: 90
Test Dataset percentage is: 10


## Set inputs for LDA

In [10]:
# HYPERPARAMETERS

# Number of topics
NUMBER_TOPICS = int(os.getenv("NUMBER_TOPICS", 10))

# ALPHA: Dirichlet hyperparameter alpha, Document-Topic Density.

# alpha controls the mixture of topics for any given document. 
# Turn it down, and the documents will likely have less of a mixture of topics.
# Turn it up, and the documents will likely have more of a mixture of topics.
LDA_ALPHA = os.getenv("LDA_ALPHA", 0.5)

# ETA: Dirichlet hyperparameter alpha, Topic-Word Density.

# The beta/eta hyperparameter controls the distribution of words per topic.
# Turn it down, and the topics will likely have less words.
# Turn it up, and the topics will likely have more words.
LDA_ETA = os.getenv("LDA_ETA", 0.001)

# Ideally, we want our composites to be made up of only a few topics
# and our parts to belong to only some of the topics. With this in mind,
# alpha and beta are typically set below one.

print(f"LDA hyperparameter Number of topics selected is:{NUMBER_TOPICS}")
print(f"LDA hyperparameter Alpha selected is: {LDA_ALPHA}")
print(f"LDA hyperparameter Eta selected is: {LDA_ETA}")

# Number of passes through the corpus during training.
passes = 100

# Maximum number of iterations through the corpus
# when inferring the topic distribution of a corpus.
iterations = 200

# Number of documents to be used in each training chunk.
chunksize = 100

inputs = {
    "corpus": corpus,
    "num_topics": NUMBER_TOPICS,
    "id2word": dictionary,
    "passes": passes,
    "chunksize": chunksize,
    "iterations": iterations,
    "alpha": LDA_ALPHA,
    "eta": LDA_ETA
}

MODEL_NAME = "model" or os.getenv("MODEL_NAME")
model_name = MODEL_NAME + "_" + f"t{NUMBER_TOPICS}" "_" + datetime.utcnow().strftime(
    "%Y-%m-%d_%H:%M:%S"
)

LDA hyperparameter Number of topics selected is:10
LDA hyperparameter Alpha selected is: 0.5
LDA hyperparameter Eta selected is: 0.001


## Run Latent Dirichlet Allocation (LDA)

In [11]:
ldamodel = models.ldamodel.LdaModel(**inputs)

## Store model

In [17]:
# Store LDA model
current_path = Path.cwd().parents[0]
models_path = current_path.joinpath("models")

model_repo_path = models_path.joinpath(model_name)
if not model_repo_path.exists():
    os.makedirs(model_repo_path)

complete_file_path = model_repo_path.joinpath(f"{model_name}_lda_model")

ldamodel.save(str(complete_file_path))

## Show topics

In [23]:
topics = ldamodel.print_topics()
for topic in topics:
    print(f"\nTopic: {topic}")


Topic: (0, '0.030*"visualization" + 0.027*"badge" + 0.026*"network" + 0.022*"current" + 0.020*"optional" + 0.019*"discussion" + 0.018*"speed" + 0.018*"continuous" + 0.018*"extension" + 0.017*"function"')

Topic: (1, '0.027*"bash" + 0.022*"script" + 0.016*"information" + 0.015*"instruction" + 0.014*"feature" + 0.013*"torch" + 0.013*"azure" + 0.013*"following" + 0.013*"content" + 0.012*"management"')

Topic: (2, '0.016*"classification" + 0.015*"order" + 0.014*"able" + 0.014*"method" + 0.014*"prediction" + 0.013*"regression" + 0.013*"linear" + 0.013*"learning" + 0.012*"models" + 0.011*"link"')

Topic: (3, '0.013*"quick" + 0.013*"training" + 0.010*"true" + 0.009*"compute" + 0.008*"feature" + 0.008*"master" + 0.007*"large" + 0.007*"layer" + 0.007*"fast" + 0.007*"start"')

Topic: (4, '0.310*"image" + 0.203*"official" + 0.099*"wiki" + 0.060*"workshop" + 0.049*"nips" + 0.047*"asset" + 0.045*"array" + 0.030*"interpreter" + 0.030*"systems" + 0.020*"dimensional"')

Topic: (5, '0.044*"great" + 0.