In [None]:
from gensim.models import LdaModel
import pickle as pkl
import numpy as np
from tqdm.notebook import tqdm
from pathlib import Path

# Load Data

In [None]:
# Read the pickled corpus from disk. Later cells take len(corpus) and pass it
# to the LDA model, so it is presumably a sequence of bag-of-words documents
# (TODO confirm the exact container type).
# NOTE: pickle.load executes arbitrary code embedded in the file; only load
# pickles that were produced by a trusted step of this project.
with Path('data/cleaned_corpus.pkl').open('rb') as corpus_file:
    corpus = pkl.load(corpus_file)

In [None]:
# Bounds of the topic counts to process. The loop below iterates
# range(min_topics, max_topics), so min_topics is inclusive and max_topics
# is exclusive.
min_topics, max_topics = 2, 40

In [None]:
# For every candidate topic count, load the matching trained LDA model, run it
# over the corpus and cache the per-document topic distribution on disk as a
# dense array of shape (len(corpus), num_topics).
# NOTE(review): range() stops before `max_topics`, so a model with exactly
# `max_topics` topics is never processed — confirm this is intended.
for nTopics in range(min_topics, max_topics):
    print(nTopics)

    # Load the model. Loading is best-effort: a topic count whose model cannot
    # be loaded is reported and skipped so the remaining counts still run.
    fname = f'trained_models/trained_lda_model_{nTopics}'
    try:
        lda_model = LdaModel.load(fname)
    except FileNotFoundError:
        print(f'No trained model for {nTopics} topics')
        continue
    except Exception as err:
        # Catch Exception instead of using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop the loop, and report the
        # actual cause rather than claiming the model file does not exist.
        print(f'Could not load trained model for {nTopics} topics: {err!r}')
        continue

    # Output file name; skip the inference when the result is already cached.
    distribution_file_name = Path(f'data/topic_distributions_{lda_model.num_topics}.npy')
    if distribution_file_name.is_file():
        print(f'Distribution for {nTopics} already exists.')
        continue

    # Run the model over the whole corpus.
    doc_lda = lda_model[corpus]

    # Extract the topic distributions for each paper as a numpy array.
    # Topics that are not reported for a document keep the initial value 0.
    hm = np.zeros([len(corpus), lda_model.num_topics])
    for i in tqdm(range(len(doc_lda))):
        # Element 0 of each result is iterated as (topic_id, probability) pairs.
        # NOTE(review): this assumes the model returns a tuple per document
        # (e.g. trained with per_word_topics=True) — confirm against training.
        for topic_pair in doc_lda[i][0]:
            hm[i, topic_pair[0]] = topic_pair[1]

    # Save topic distributions as numpy array.
    np.save(distribution_file_name, hm)
