In [1]:
%pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import itertools
import os

from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

In [3]:
lemma_grouped = pd.read_pickle("../data/df_lemma_grouped_from2018.pkl")

In [4]:
lemma_grouped = lemma_grouped[lemma_grouped.apply(len) >= 10]

In [5]:
docs = lemma_grouped

In [6]:
def calc_model(docs, no_above, num_topics, alpha, eta, no_below=10, topn=10):
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token
    model = LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        chunksize=600,
        #alpha='auto',
        alpha=alpha,
        eta=eta,
        iterations=400,
        num_topics=num_topics,
        passes=20,
        eval_every=None
    )
    coherence_model_lda = CoherenceModel(model=model, texts=docs, dictionary=dictionary, coherence='c_v', topn=topn)
    return coherence_model_lda.get_coherence()

In [7]:
# print(calc_model(docs=docs, no_above=0.15, num_topics=100, alpha=1, eta=0.001))

In [8]:
import random

no_above=[0.1, 0.15]
num_topics=[50, 100, 125, 150, 175, 200, 250]
alpha=[1, 0.5, 2, 0.75, 0.25, 0.01, 0.001, 0.0001]
eta=[0.001, 0.01, 0.1, 1, 0.0001, 0.00001]
no_below=[10]
topn=[10]
columns=['no_above', 'num_topics', 'alpha', 'eta', ' no_below', 'topn', 'score']

random.shuffle(no_above)
random.shuffle(num_topics)
random.shuffle(alpha)
random.shuffle(eta)

model_perf_file_path = "../data/model_perf.csv"
if os.path.isfile(model_perf_file_path):
    df = pd.read_csv(model_perf_file_path)
else:
    df = pd.DataFrame(columns=columns)

for c in itertools.product(no_above, num_topics, alpha, eta, no_below, topn):
    if random.choice([True, False, False, False]):
        score = calc_model(docs=docs, no_above=c[0], num_topics=c[1], alpha=c[2], eta=c[3], no_below=c[4], topn=c[5])
        values = list(c)
        values.append(score)
        dfrow = pd.DataFrame([values], columns=columns)
        df = pd.concat([df, dfrow])
        df.to_csv(model_perf_file_path, index=False)
        df.reset_index(inplace=True, drop=True)
        print(df.iloc[[df['score'].idxmax(), -1]])
    

     no_above  num_topics  alpha     eta   no_below  topn     score
86        0.1         125   0.75  0.0001         10    10  0.619031
295       0.1         125   1.00  0.0001         10    10  0.572558
     no_above  num_topics  alpha     eta   no_below  topn     score
86        0.1         125   0.75  0.0001         10    10  0.619031
296       0.1         125   1.00  1.0000         10    10  0.501350
     no_above  num_topics  alpha      eta   no_below  topn     score
86        0.1         125   0.75  0.00010         10    10  0.619031
297       0.1         125   0.50  0.00001         10    10  0.533866
     no_above  num_topics  alpha     eta   no_below  topn     score
86        0.1         125   0.75  0.0001         10    10  0.619031
298       0.1         125   0.25  1.0000         10    10  0.531735
     no_above  num_topics  alpha     eta   no_below  topn     score
86        0.1         125   0.75  0.0001         10    10  0.619031
299       0.1         125   0.25  0.1000     