In [1]:
from gensim.models import LdaModel
import pickle as pkl
import numpy as np
from tqdm.notebook import tqdm
from pathlib import Path

# Load Data

In [2]:
# Read the pickled, already-cleaned corpus from disk; it is later handed
# directly to the trained LDA models and sized with len().
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
corpus_path = Path('data/cleaned_corpus.pkl')
with corpus_path.open('rb') as corpus_file:
    corpus = pkl.load(corpus_file)

In [3]:
# Topic counts to process. The loop below uses range(min_topics, max_topics),
# so min_topics is inclusive and max_topics is exclusive (last count is 32).
min_topics, max_topics = 2, 33

In [5]:
# For every candidate topic count: load the matching trained LDA model, apply it
# to the corpus and cache the per-document topic distribution as a .npy matrix
# in data/. Topic counts with no saved model, or whose output file already
# exists, are skipped.
for nTopics in range(min_topics, max_topics):
    print(nTopics)

    # load the model
    try:
        fname = f'trained_models/trained_lda_model_{nTopics}'
        lda_model = LdaModel.load(fname)
    except FileNotFoundError:
        # Catch only a missing model file. A bare `except:` would also swallow
        # KeyboardInterrupt and real load failures (e.g. a corrupted file) and
        # misreport them as "no trained model".
        print(f'No trained model for {nTopics} topics')
        continue

    # output file name
    distribution_file_name = Path(f'data/topic_distributions_{lda_model.num_topics}.npy')
    if distribution_file_name.is_file():
        # Already computed on an earlier run; do not redo the inference.
        print(f'Distribution for {nTopics} already exists.')
        continue

    # run the model
    doc_lda = lda_model[corpus]

    # extract the topic distributions for each paper as numpy array:
    # one row per document, one column per topic. Topics that are not listed
    # for a document keep the initial value 0.
    hm = np.zeros([len(corpus), lda_model.num_topics])
    for i in tqdm(range(len(doc_lda))):
        # NOTE(review): assumes each transformed document is a tuple whose first
        # element is the list of (topic_id, probability) pairs, as when the model
        # was trained with per_word_topics=True - confirm against the training code.
        for topic_id, topic_prob in doc_lda[i][0]:
            hm[i, topic_id] = topic_prob

    # save topic distributions as numpy array
    np.save(distribution_file_name, hm)


2
Distribution for 2 already exists.
3
Distribution for 3 already exists.
4
Distribution for 4 already exists.
5
Distribution for 5 already exists.
6
Distribution for 6 already exists.
7
Distribution for 7 already exists.
8
Distribution for 8 already exists.
9
Distribution for 9 already exists.
10
Distribution for 10 already exists.
11
Distribution for 11 already exists.
12
Distribution for 12 already exists.
13
Distribution for 13 already exists.
14
Distribution for 14 already exists.
15


HBox(children=(FloatProgress(value=0.0, max=42154.0), HTML(value='')))


16


HBox(children=(FloatProgress(value=0.0, max=42154.0), HTML(value='')))


17


HBox(children=(FloatProgress(value=0.0, max=42154.0), HTML(value='')))


18


HBox(children=(FloatProgress(value=0.0, max=42154.0), HTML(value='')))


19
Distribution for 19 already exists.
20
Distribution for 20 already exists.
21
Distribution for 21 already exists.
22
Distribution for 22 already exists.
23
Distribution for 23 already exists.
24
Distribution for 24 already exists.
25
Distribution for 25 already exists.
26
Distribution for 26 already exists.
27
Distribution for 27 already exists.
28
Distribution for 28 already exists.
29
Distribution for 29 already exists.
30
Distribution for 30 already exists.
31
Distribution for 31 already exists.
32
Distribution for 32 already exists.
