<center><font size="+12">
    Data Science Packages Latent Dirichlet Allocation (LDA) Hyperparameter Tuning
</font></center>

## Import libraries

In [32]:
import os
import json
import random

import tqdm
import numpy as np
from datetime import datetime
from pathlib import Path

from gensim import corpora, models
from sklearn.model_selection import train_test_split

In [9]:
# Fraction of the corpus held out as the test set (default 0.1).
# The environment variable has to be consulted first: a literal `0.1 or ...`
# short-circuits and the override is never read.
LDA_PERCENTAGE_TEST_DATASET = float(os.getenv("LDA_PERCENTAGE_TEST_DATASET") or 0.1)

In [10]:
# Retrieve clean dataset
current_path = Path.cwd().parents[0]
data_path = current_path.joinpath("data/processed")

# JSON is UTF-8 by specification, so do not depend on the platform default encoding.
with data_path.joinpath("clean_dataset.json").open(encoding="utf-8") as json_file:
    clean_dataset = json.load(json_file)

# Keys are the document names; values are presumably the token lists of each
# document (they are handed to gensim as tokenised texts below) - TODO confirm.
texts_names = list(clean_dataset.keys())
texts = list(clean_dataset.values())

# Process data for LDA

# Assign a unique integer id to all words appearing in the corpus, creating a vocabulary corpus
dictionary = corpora.Dictionary(texts)
print("Number of unique tokens: %d" % len(dictionary))
# print(f"Token ID map:\n {dictionary.token2id}")

# Bag of Words (BoW) Representation
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Normalise once so the printed percentages and the split use the same number.
test_fraction = float(LDA_PERCENTAGE_TEST_DATASET)

# round() before %d: float arithmetic such as 0.29 * 100 == 28.999999999999996
# would otherwise be truncated to 28.
lda_percentage_training_dataset = (1 - test_fraction) * 100
print("Training Dataset percentage is: %d" % round(lda_percentage_training_dataset))

lda_percentage_test_dataset = test_fraction * 100
print("Test Dataset percentage is: %d" % round(lda_percentage_test_dataset))

# NOTE(review): no random_state is passed, so the split differs on every run.
corpus_train, corpus_test = train_test_split(corpus, test_size=test_fraction)

Number of unique tokens: 2949
Training Dataset percentage is: 90
Test Dataset percentage is: 10


## Define inputs for hyperparameter tuning

In [35]:
# HYPERPARAMETERS
# Every value read from the environment arrives as a string, so it is cast
# explicitly before being used in numeric ranges.

step_size = 1

# Number of topics
# NOTE(review): both bounds read the same NUMBER_TOPICS variable, so setting it
# pins the search to a single topic count - confirm this is intended.
NUMBER_TOPICS_MIN = int(os.getenv("NUMBER_TOPICS", 10))
NUMBER_TOPICS_MAX = int(os.getenv("NUMBER_TOPICS", 10))

# The upper bound is inclusive, hence "+ step_size".
topics_range = range(NUMBER_TOPICS_MIN, NUMBER_TOPICS_MAX + step_size, step_size)

print(f"Range of topics selected between {NUMBER_TOPICS_MIN} and {NUMBER_TOPICS_MAX} with step {step_size}")

# ALPHA: Dirichlet hyperparameter alpha, Document-Topic Density.

alpha_step = 0.8

# alpha controls the mixture of topics for any given document.
# Turn it down, and the documents will likely have less of a mixture of topics.
# Turn it up, and the documents will likely have more of a mixture of topics.
# NOTE(review): when LDA_ALPHA is set, min == max and np.arange yields an empty
# range, so only the string priors below are explored - confirm this is intended.
LDA_ALPHA_MIN = float(os.getenv("LDA_ALPHA", 0.2))
LDA_ALPHA_MAX = float(os.getenv("LDA_ALPHA", 1))

# .tolist() yields plain Python floats instead of numpy scalars.
alpha_spectrum = np.arange(LDA_ALPHA_MIN, LDA_ALPHA_MAX, alpha_step).tolist()
alpha_spectrum.append("symmetric")
alpha_spectrum.append("asymmetric")

# ETA: Dirichlet hyperparameter eta (also called beta), Topic-Word Density.

eta_step = 0.8

# The beta/eta hyperparameter controls the distribution of words per topic.
# Turn it down, and the topics will likely have less words.
# Turn it up, and the topics will likely have more words.
# NOTE(review): same single-variable caveat as LDA_ALPHA applies to LDA_ETA.
LDA_ETA_MIN = float(os.getenv("LDA_ETA", 0.001))
LDA_ETA_MAX = float(os.getenv("LDA_ETA", 1))

eta_spectrum = np.arange(LDA_ETA_MIN, LDA_ETA_MAX, eta_step).tolist()
eta_spectrum.append("symmetric")

# Ideally, we want our composites to be made up of only a few topics
# and our parts to belong to only some of the topics. With this in mind,
# alpha and beta are typically set below one.

# Number of passes through the corpus during training.
passes = 100

# Maximum number of iterations through the corpus
# when inferring the topic distribution of a corpus.
iterations = 200

# Number of documents to be used in each training chunk.
chunksize = 100

# Size of the full grid: one LDA run per (topics, alpha, eta) combination.
total = len(topics_range) * len(alpha_spectrum) * len(eta_spectrum)
print(f"Total number of LDA run: {total}")

Range of topics selected between 10 and 10 with step 1
Total number of LDA run: 9


In [39]:
import math
from gensim.models import CoherenceModel

def _evaluate_metrics(
    ldamodel,
    corpus,
    texts,
    dictionary,
):
    """Evaluate perplexity and coherence for a trained LDA model.

    Args:
        ldamodel: trained model; must expose ``log_perplexity`` and is also
            handed to ``CoherenceModel``.
        corpus: corpus passed to ``ldamodel.log_perplexity``.
        texts: tokenised documents passed to ``CoherenceModel``.
        dictionary: dictionary passed to ``CoherenceModel``.

    Returns:
        Tuple ``(perplexity, coherence)``: the raw value returned by
        ``log_perplexity`` and the score returned by ``get_coherence``.
    """
    # NOTE(review): this is the raw log_perplexity value, not an exponentiated
    # perplexity. gensim documents it as a per-word likelihood bound, so values
    # closer to zero should indicate a better fit - confirm before ranking on it.
    perplexity = ldamodel.log_perplexity(corpus)

    # Coherence measure: read the environment first, fall back to "c_v"
    # (a literal `"c_v" or ...` would never consult the environment).
    COHERENCE_MEASURE = os.getenv("COHERENCE") or "c_v"

    print(f"\nCoherence quantity used is: {COHERENCE_MEASURE}")
    # Model Coherence Score
    coherence_model_lda = CoherenceModel(
        model=ldamodel, texts=texts, dictionary=dictionary, coherence=COHERENCE_MEASURE
    )
    coherence = coherence_model_lda.get_coherence()
    print(f"\nCoherence Score: {coherence}")

    return perplexity, coherence

## Hyperparameter Tuning

In [38]:
results = []

# Create Hyperparameter repo: one randomly named folder per tuning run.
current_path = Path.cwd().parents[0]

hp_model_path = current_path.joinpath(f"models/hyperparameters_{random.getrandbits(64):08x}")
hp_model_path.mkdir(parents=True, exist_ok=True)

# Base name of the stored models: read the environment first, fall back to
# "model" (a literal `"model" or ...` would never consult the environment).
MODEL_NAME = os.getenv("MODEL_NAME") or "model"

# Defined before the loop so that no iteration reads it before assignment.
# "results" references the list appended to below, so every dump is up to date.
complete_results_and_inputs = {
    "topics_range": list(topics_range),
    "alpha_spectrum": alpha_spectrum,
    "eta_spectrum": eta_spectrum,
    "results": results,
}

# NOTE(review): models are fitted and scored on the full `corpus`; the
# `corpus_train` / `corpus_test` split is not used here - confirm whether
# perplexity should be measured on the held-out documents instead.

# The context manager closes the progress bar even if a run raises.
with tqdm.tqdm(total=total) as pbar:

    for num_topics in topics_range:

        for alpha in alpha_spectrum:

            for eta in eta_spectrum:

                inputs = {
                    "corpus": corpus,
                    "num_topics": num_topics,
                    "id2word": dictionary,
                    "passes": passes,
                    "chunksize": chunksize,
                    "iterations": iterations,
                    "alpha": alpha,
                    "eta": eta,
                }

                # The timestamp (taken before training) distinguishes the runs.
                timestamp = datetime.utcnow().strftime("%Y-%m-%d_%H:%M:%S")
                model_name = f"{MODEL_NAME}_t{num_topics}_{timestamp}"

                ldamodel = models.ldamodel.LdaModel(**inputs)

                perplexity, coherence = _evaluate_metrics(
                    ldamodel=ldamodel, corpus=corpus, texts=texts, dictionary=dictionary
                )

                print(f"Number of Topics {num_topics}")
                print(f"Alpha {alpha} and eta {eta}")
                print(f"Perplexity {perplexity}")
                print(f"Coherence {coherence}")

                results.append([num_topics, alpha, eta, perplexity, coherence])

                # Store LDA model
                model_repo_path = hp_model_path.joinpath(model_name)
                model_repo_path.mkdir(parents=True, exist_ok=True)

                complete_file_path = model_repo_path.joinpath(f"{model_name}_lda_model")

                ldamodel.save(str(complete_file_path))

                # Rewrite the summary after every run so partial results
                # survive an interrupted search.
                with open(f"{hp_model_path}/hypeparameters_inputs.json", mode="w") as outfile:
                    json.dump(complete_results_and_inputs, outfile)

                pbar.update(1)

  0%|          | 0/9 [00:00<?, ?it/s]


Coherence quantity used is: c_v


 33%|███▎      | 3/9 [02:06<04:13, 42.21s/it]
 11%|█         | 1/9 [00:15<02:04, 15.60s/it]


Coherence Score: 0.29450834482470023
Number of Topics 10
Alpha 0.2 and eta 0.001
Perplexity -44.3736509631979
Coherence 0.29450834482470023

Coherence quantity used is: c_v


 22%|██▏       | 2/9 [00:28<01:39, 14.23s/it]


Coherence Score: 0.43583886574624325
Number of Topics 10
Alpha 0.2 and eta 0.801
Perplexity -7.823765027819895
Coherence 0.43583886574624325

Coherence quantity used is: c_v


 33%|███▎      | 3/9 [00:44<01:29, 14.89s/it]


Coherence Score: 0.42709673702023887
Number of Topics 10
Alpha 0.2 and eta symmetric
Perplexity -8.169935648876049
Coherence 0.42709673702023887

Coherence quantity used is: c_v


 44%|████▍     | 4/9 [00:59<01:15, 15.06s/it]


Coherence Score: 0.3077708500093541
Number of Topics 10
Alpha symmetric and eta 0.001
Perplexity -39.51110117086189
Coherence 0.3077708500093541

Coherence quantity used is: c_v


 56%|█████▌    | 5/9 [01:14<00:59, 14.85s/it]


Coherence Score: 0.35684465741286636
Number of Topics 10
Alpha symmetric and eta 0.801
Perplexity -7.836504838180229
Coherence 0.35684465741286636

Coherence quantity used is: c_v


 67%|██████▋   | 6/9 [01:27<00:43, 14.38s/it]


Coherence Score: 0.38730469733846884
Number of Topics 10
Alpha symmetric and eta symmetric
Perplexity -8.141127914818254
Coherence 0.38730469733846884

Coherence quantity used is: c_v


 78%|███████▊  | 7/9 [01:43<00:29, 14.65s/it]


Coherence Score: 0.27951869844049676
Number of Topics 10
Alpha asymmetric and eta 0.001
Perplexity -41.15276883720188
Coherence 0.27951869844049676

Coherence quantity used is: c_v


 89%|████████▉ | 8/9 [01:57<00:14, 14.57s/it]


Coherence Score: 0.6182993935883683
Number of Topics 10
Alpha asymmetric and eta 0.801
Perplexity -7.811780744683924
Coherence 0.6182993935883683

Coherence quantity used is: c_v


100%|██████████| 9/9 [02:08<00:00, 14.30s/it]


Coherence Score: 0.3587372475072122
Number of Topics 10
Alpha asymmetric and eta symmetric
Perplexity -8.056136496009634
Coherence 0.3587372475072122





## Visualize results

In [43]:
# Each result row is [num_topics, alpha, eta, perplexity, coherence].
# Rank a copy by coherence (best first) so the list stored in
# complete_results_and_inputs keeps its original run order.
results = sorted(complete_results_and_inputs["results"], key=lambda x: x[4], reverse=True)

print(f"Topics range: {topics_range}")
for n, result in enumerate(results, start=1):
    print(f"Sorted parameters by coherence n.{n}: {result}")

# `results` is already ranked, so the first row found for a topic count is
# the best configuration for that topic count.
optimized_per_topic = []
for topic_number in sorted(topics_range, reverse=True):
    best_for_topic = next((row for row in results if row[0] == topic_number), None)
    if best_for_topic is not None:
        optimized_per_topic.append(best_for_topic)

optimized_per_topic.sort(key=lambda x: x[4], reverse=True)

for optimized in optimized_per_topic:
    print(f"Optimized parameters for {optimized[0]} topics is: {optimized}")

# Guard against an empty search instead of failing with an IndexError.
if results:
    print(f"Parameters for max coherence: {results[0]}")
    print(f"Max Num topics: {results[0][0]}")
else:
    print("No results available.")
# TODO Add visualizations plots

Topics range: range(10, 11)
Sorted parameters by coherence n.1: [10, 'asymmetric', 0.801, -7.811780744683924, 0.6182993935883683]
Sorted parameters by coherence n.2: [10, 0.2, 0.801, -7.823765027819895, 0.43583886574624325]
Sorted parameters by coherence n.3: [10, 0.2, 'symmetric', -8.169935648876049, 0.42709673702023887]
Sorted parameters by coherence n.4: [10, 'symmetric', 'symmetric', -8.141127914818254, 0.38730469733846884]
Sorted parameters by coherence n.5: [10, 'asymmetric', 'symmetric', -8.056136496009634, 0.3587372475072122]
Sorted parameters by coherence n.6: [10, 'symmetric', 0.801, -7.836504838180229, 0.35684465741286636]
Sorted parameters by coherence n.7: [10, 'symmetric', 0.001, -39.51110117086189, 0.3077708500093541]
Sorted parameters by coherence n.8: [10, 0.2, 0.001, -44.3736509631979, 0.29450834482470023]
Sorted parameters by coherence n.9: [10, 'asymmetric', 0.001, -41.15276883720188, 0.27951869844049676]
Optimized parameters for 10 topics is: [10, 'asymmetric', 0.8