# Hyperparameter Tuning
We tune the following hyperparameters:
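- Number of topics
- Dirichlet hyperparameter alpha (prior on the document-topic distribution)
- Dirichlet hyperparameter beta (prior on the topic-word distribution; passed to gensim as `eta`)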

#### Future tuning:
- Dictionary filter params
    - no_below
    - no_above
    - keep_n

Inspiration: https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

More Info: https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

In [None]:
# Helper function
def compute_coherence_values(corpus, text_corpus, dictionary, k, a, b, workers=7):
    """
    Train an LDA model with the given hyperparameters and return its c_v coherence.

    Input:
        corpus - list containing a list for each doc with tuples of (word_id, freq)
        text_corpus - list containing a list for each doc with the cleaned text
        dictionary - id-to-word mapping; passed to the LDA model as id2word and
                     to the coherence model as dictionary
        k - number of topics
        a - alpha hyperparam
        b - beta hyperparam (passed to gensim as eta)
        workers - number of worker processes used for training; defaults to 7,
                  the value that used to be hard-coded
    Output:
        coherence score
    """
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        # Fixed seed so runs with the same parameters are comparable
        random_state=100,
        chunksize=100,
        passes=10,
        alpha=a,
        eta=b,
        per_word_topics=True,
        workers=workers,
    )

    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=text_corpus,
        dictionary=dictionary,
        coherence='c_v',
    )

    return coherence_model_lda.get_coherence()

## Grid Search

In [None]:
import numpy as np
import tqdm

# Nested dict with an (initially empty) 'Validation_Set' entry
grid = {'Validation_Set': {}}

# Topics range: min_topics up to, but not including, max_topics
min_topics, max_topics, step_size = 2, 11, 1
topics_range = range(min_topics, max_topics, step_size)

In [None]:
# Alpha candidates: values from 0.01 up to (not including) 1 in steps of 0.3,
# followed by the named 'symmetric' and 'asymmetric' options
alpha = [*np.arange(0.01, 1, 0.3), 'symmetric', 'asymmetric']

# Beta candidates: the same numeric values, followed by 'symmetric'
beta = [*np.arange(0.01, 1, 0.3), 'symmetric']

In [None]:
# Validation sets: the leading 75% of the documents and the full corpus.
# NOTE (original author's opinion): varying the data set size is probably not worthwhile.
num_of_docs = len(bow_corpus)
val_splits = [0.75, 1.0]
corpus_title = [f'{fraction*100}% Corpus' for fraction in val_splits]

# Document count for each split, then the matching leading slice of both corpora
val_lengths = [int(num_of_docs * fraction) for fraction in val_splits]
corpus_sets = [bow_corpus[:n_docs] for n_docs in val_lengths]
text_corpus_sets = [post_to_comments_list[:n_docs] for n_docs in val_lengths]

In [None]:
# One independent empty list per result column; the grid search appends to these
model_results = {
    column: []
    for column in ('Validation_Set', 'Topics', 'Alpha', 'Beta', 'Coherence')
}

In [None]:
%%time
# Can take a long time to run, so the search is disabled by default.
# Set RUN_GRID_SEARCH to True to execute it.
from itertools import product

RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    # Pair every validation corpus with its title and its text corpus
    validation_sets = list(zip(corpus_title, corpus_sets, text_corpus_sets))

    # Derive the progress-bar total from the grid instead of hard-coding it
    n_combinations = len(validation_sets) * len(topics_range) * len(alpha) * len(beta)

    # The context manager closes the progress bar even if a model fails to train
    with tqdm.tqdm(total=n_combinations) as pbar:
        # product() visits combinations in the same order as nested loops:
        # validation set outermost, then topics, then alpha, with beta innermost
        for (title, corpus_set, text_set), k, a, b in product(
                validation_sets, topics_range, alpha, beta):
            # get the coherence score for the given parameters
            cv = compute_coherence_values(corpus=corpus_set, text_corpus=text_set,
                                          dictionary=dictionary, k=k, a=a, b=b)
            # Save the model results
            model_results['Validation_Set'].append(title)
            model_results['Topics'].append(k)
            model_results['Alpha'].append(a)
            model_results['Beta'].append(b)
            model_results['Coherence'].append(cv)

            pbar.update(1)

    pd.DataFrame(model_results).to_csv('./output/lda_model_1_tuning_results.csv', index=False)

In [None]:
# Tabulate the collected grid-search results, one column per key of model_results
model_results_df = pd.DataFrame(data=model_results)

In [None]:
# Bare expression: in a notebook this displays the tuning results table
model_results_df

## Final Model

In [None]:
# Highest coherence score among all evaluated parameter combinations
max_coherence = model_results_df.loc[:, 'Coherence'].max()

In [None]:
# Row holding the best coherence score. head(1) keeps only the first match, so
# a tie between several parameter combinations still yields exactly one row,
# which the `.item()` calls on this frame require.
final_model_params = model_results_df.loc[model_results_df['Coherence'] == max_coherence].head(1)

In [None]:
# Pull the single value out of each column; .item() requires exactly one row
final_n_topics, final_alpha, final_beta = (
    final_model_params[column].item() for column in ("Topics", "Alpha", "Beta")
)
print(f'Num Topics {final_n_topics} - Alpha {final_alpha} - Beta {final_beta}')

### Train Final Model

In [None]:
# Train on the whole bag-of-words corpus with the selected hyperparameters
final_lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    # selected hyperparameters
    num_topics=final_n_topics,
    alpha=final_alpha,
    eta=final_beta,
    # fixed training settings
    chunksize=100,
    passes=10,
    random_state=100,
    per_word_topics=True,
    workers=7,
)

### Visualize Topics

In [None]:
# Visualize the final model's topics (plot_top_words is defined outside this section)
plot_top_words(
    final_lda_model,
    dictionary,
    final_n_topics,
    'Topics in LDA Model - Final',
)

In [None]:
# Prepare the pyLDAvis data for the final model and save it as HTML
LDAvis_prepared = gensim_lda.prepare(final_lda_model, bow_corpus, dictionary)
pyLDAvis.save_html(LDAvis_prepared, './output/lda_vis_model_final.html')

### Save Model

In [None]:
# Persist the trained final model (the variable holding it is final_lda_model)
final_lda_model.save("./model/lda_model_final")

### Evaluate Model

In [None]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=post_to_comments_list, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)