# Computing different metrics to evaluate topic models

In [None]:
import nlp
import json
    
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel

### Load and clean data

In [None]:
# Load the raw Lee corpus file and run the project's preprocessing over it.
# NOTE(review): `nlp` is a project-local module; presumably `data` is a
# sequence of raw documents and `docs` the matching preprocessed documents
# -- confirm against the nlp module.
data = nlp.load_file('../data/lee.txt')
docs = nlp.preprocess(data)
# Cell output: the number of loaded items plus the first raw entry and the
# first preprocessed entry, as a quick sanity check.
len(data), data[0], docs[0]

### Load computed models

In [None]:
# Hyper-parameter grid of the previously trained models.
alphas = [0.01, 0.1, 1, 10]
betas = [0.01, 0.1, 1, 10]
num_topics = [5, 10, 15, 20]

# Saved models are named "<alpha>-<beta>-<num_topics>" inside this directory.
path_template = '../data/models/lee/{}-{}-{}'

# One dict per grid point: the hyper-parameters plus the loaded LdaModel.
models = []
for a in alphas:
    for b in betas:
        for k in num_topics:
            name = path_template.format(a, b, k)
            entry = dict(alpha=a, beta=b, num_topics=k)
            entry['lda'] = LdaModel.load(name)
            models.append(entry)

### Compute coherence metrics

In [None]:
def add_metrics(model, measure_names, docs):
    """Score one model with every requested coherence measure.

    For each name in ``measure_names`` a ``CoherenceModel`` is built from
    the LDA model stored under ``model['lda']`` with ``docs`` as its texts,
    and the resulting coherence is written back to ``model`` under that
    same name. The dict is updated in place; nothing is returned.
    """
    for measure in measure_names:
        scorer = CoherenceModel(
            model=model['lda'],
            texts=docs,
            coherence=measure,
        )
        model[measure] = scorer.get_coherence()

This could take some time: 5 measures x 4 alphas x 4 betas x 4 topic counts = 320 metrics to compute.

In [None]:
# Names of the coherence measures; each one is passed to CoherenceModel as
# its `coherence` argument by add_metrics.
measure_names = ['u_mass', 'c_v', 'c_uci', 'c_npmi', 'c_w2v']
# Score every loaded model in place: add_metrics stores each measure on the
# model dict under the measure's own name.
for model in models:
    add_metrics(model, measure_names, docs)

### Add other metrics

In [None]:
def add_other_metrics(models, corpus):
    """Attach the non-coherence metrics to every model dict in ``models``.

    Currently this is a single metric: the value of
    ``log_perplexity(corpus)`` of the LDA model stored under ``'lda'``,
    saved under the key ``'perplexity'``. Each dict is updated in place and
    nothing is returned.

    NOTE(review): gensim documents ``log_perplexity`` as returning the
    per-word likelihood bound rather than the perplexity itself -- keep
    that in mind when reading the exported ``'perplexity'`` values.
    """
    for entry in models:
        lda = entry['lda']
        entry['perplexity'] = lda.log_perplexity(corpus)

In [None]:
# Build the corpus and its dictionary from the preprocessed documents using
# the project helper.
# NOTE(review): presumably min_count=10 filters out rare tokens and should
# match the setting used when the saved models were trained -- confirm
# against nlp.build_corpus_dictionary.
corpus, dictionary = nlp.build_corpus_dictionary(docs, min_count=10)
# Store the perplexity score on every model dict (in place).
add_other_metrics(models, corpus)

### Export metrics

In [None]:
# The LdaModel objects cannot be written as JSON, so drop them from every
# entry and keep only the hyper-parameters and computed metrics.
# NOTE: this mutates `models`; reload the models before re-running the metric
# cells above.
for model in models:
    model.pop('lda', None)

# Write all metrics as a single JSON array. `default=float` is only consulted
# for values json cannot serialize natively: NumPy scalars such as float32
# (which the metric computations may return) are converted to plain floats
# instead of aborting the export with a TypeError.
with open('../data/lee-metrics.json', 'w') as f:
    json.dump(models, f, default=float)

---

In [None]:
%load_ext autoreload
%autoreload 2