In [1]:
import logging
import pickle as pkl
import gensim
import numpy as np
from gensim.models import LdaSeqModel

In [2]:
# Send log records of level INFO and above to a dedicated log file,
# each line formatted as "<timestamp>:<level>:<message>".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s:%(levelname)s:%(message)s",
    filename='log_files/gensim_prespecified_topics.log',
)

In [3]:
# Load the three pickled inputs produced earlier in the pipeline.
# NOTE(review): pickle.load executes arbitrary code embedded in the file —
# only run this against files you created yourself.

# Raw corpus table; must expose a `Year` column (used below for time slicing).
with open('data/raw_corpus_dtm.pkl', 'rb') as raw_corpus_file:
    corpus_df = pkl.load(raw_corpus_file)

# Cleaned corpus that is handed to the topic model as `corpus`.
with open('data/cleaned_corpus_dtm.pkl', 'rb') as cleaned_corpus_file:
    corpus = pkl.load(cleaned_corpus_file)

# Id-to-word mapping that is handed to the topic model as `id2word`.
with open("data/id2word_dtm.pkl", 'rb') as id2word_file:
    id2word = pkl.load(id2word_file)

In [4]:
# Count how many documents fall into each year: np.unique returns the distinct
# values of the Year column together with how often each one occurs.
# The counts (`time_slices`) are used as the time slicing when the sequential
# LDA model is trained.
uniqueyears, time_slices = np.unique(corpus_df.Year, return_counts=True)

# Show the (year, document count) pairs, one row per year.
year_count_table = np.asarray([uniqueyears, time_slices]).transpose()
print(year_count_table)


[['1991' 512]
 ['1992' 577]
 ['1993' 650]
 ['1994' 596]
 ['1995' 860]
 ['1996' 756]
 ['1997' 833]
 ['1998' 746]
 ['1999' 837]
 ['2000' 884]
 ['2001' 852]
 ['2002' 969]
 ['2003' 1056]
 ['2004' 1055]
 ['2005' 1052]
 ['2006' 1392]
 ['2007' 1491]
 ['2008' 1621]
 ['2009' 1589]
 ['2010' 1634]
 ['2011' 1770]
 ['2012' 2038]
 ['2013' 2123]
 ['2014' 2486]
 ['2015' 2365]
 ['2016' 2323]
 ['2017' 2376]
 ['2018' 2348]
 ['2019' 2485]]


In [5]:
# Choose the number of topics. The value is fixed up front and passed as
# `num_topics` when the sequential LDA model is trained.
nTopics = 40

In [9]:
# Train the sequential LDA model (dynamic topic model) with a prespecified
# number of topics, using the per-year document counts as time slices.
# NOTE(review): the last recorded run of this cell ended in a KeyboardInterrupt,
# so `lda_model` may not exist in a fresh kernel. Per the gensim documentation,
# `passes` is the number of passes for the *initial* LdaModel (default 10);
# 3000 looks very large — confirm it is intentional.
lda_model = LdaSeqModel(
    corpus=corpus,
    id2word=id2word,
    time_slice=time_slices,
    num_topics=nTopics,
    passes=3000,
    chunksize=100,
    random_state=100,
)

KeyboardInterrupt: 

In [None]:
# def train_dtm(articles, n_topics, outfile="dtm", dates=None, time_slices=None):
#     if time_slices is None and dates is not None:
#         counts = Counter(dates)
#         time_slices = list(counts.values())
#         print("Dates: ", counts)
#     print("Time slices:", time_slices)
#     chain_var = 0.1
#     common_dictionary = corpora.Dictionary(articles)
#     common_corpus = [common_dictionary.doc2bow(a) for a in articles]
#     ldaseq = LdaSeqModel(corpus=common_corpus, time_slice=time_slices,
#                          num_topics=n_topics, id2word=common_dictionary,
#                          chain_variance=chain_var)
#     model_file = "trained_models/"+outfile
#     ldaseq.save(model_file)
#     dict_filename = model_file+"_dict.pkl"
#     pickle.dump(common_dictionary, open(dict_filename, "wb"))
#     dict_filename = model_file+"_corpus.pkl"
#     pickle.dump(common_corpus, open(dict_filename, "wb"))

#     print("Saved DTM model as", model_file, "!")

In [None]:
# Persist the trained DTM; the number of topics is encoded in the file name.
dtm_model_path = f"trained_models/trained_DTM_{lda_model.num_topics}"
lda_model.save(dtm_model_path)

In [None]:
# Extract the topic distribution of every document as a numpy array of shape
# (number of documents, number of topics).
#
# The previous version iterated over `doc_lda`, a name that is never defined in
# this notebook (NameError on a fresh kernel). Its `doc_lda[i][0]` indexing
# matches the output of a plain LdaModel, not of LdaSeqModel. For a trained
# LdaSeqModel, `doc_topics(i)` returns the topic mixture of the i-th document
# of the training corpus as a sequence of `num_topics` proportions.
hm = np.zeros([len(corpus), lda_model.num_topics])
for i in range(len(corpus)):
    # The row assignment also checks that each mixture has exactly num_topics entries.
    hm[i, :] = lda_model.doc_topics(i)


In [None]:
# Write the document-topic matrix to disk; the number of topics is encoded in
# the file name (np.save appends the ".npy" extension when it is missing).
topic_dist_path = f'data/topic_distributions_DTM_{lda_model.num_topics}'
np.save(topic_dist_path, hm)