In [None]:
import argparse
import logging
import pickle as pkl
from collections import Counter

import gensim
import numpy as np
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.ldaseqmodel import LdaSeqModel

In [None]:
# The `time_slice` argument of ldaseqmodel lists how many documents fall into
# each consecutive time period, so the dataframe has to be sorted ascending
# (oldest to newest) before the counts are taken.
# Example: a corpus of 30 documents with 5 in the first period, 10 in the
# second and 15 in the third is described by time_slice=[5, 10, 15].
# NOTE(review): `data` is not defined in any visible cell of this notebook --
# confirm it is loaded (with a `Year` column) before this cell runs.

# Distinct values of data.Year together with the number of rows per value.
uniqueyears, time_slices = np.unique(data.Year, return_counts=True)

# Inspect the result as one (year, count) row per distinct year.
year_count_table = np.asarray((uniqueyears, time_slices)).transpose()
print(year_count_table)


In [None]:
# Route log records of level INFO and above to a log file, formatted as
# "<timestamp>:<level>:<message>".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s:%(levelname)s:%(message)s",
    filename='log_files/gensim_prespecified_topics.log',
)

In [None]:
def _read_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(path, 'rb') as handle:
        return pkl.load(handle)


# Load the cleaned corpus and its id -> word mapping from disk.
corpus = _read_pickle('data/cleaned_corpus.pkl')
id2word = _read_pickle("data/id2word.pkl")

In [None]:
# Number of topics the LDA model below is trained with.
nTopics: int = 25

In [None]:
# Fit a multicore LDA model on the loaded corpus using the topic count chosen
# above. `random_state` is fixed so that repeated runs use the same seed.
# Left at their library defaults (previously experimented with, now disabled):
#   iterations=10000, minimum_probability=0
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=nTopics,
    passes=3000,
    chunksize=100,
    random_state=100,
    per_word_topics=True,
)

In [None]:
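# Time slicing: the helper defined in the next cell trains a dynamic topic model (LdaSeqModel) over time slices.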


In [None]:
def train_dtm(articles, n_topics, outfile="dtm", dates=None, time_slices=None,
              chain_variance=0.1):
    """Train a dynamic topic model (gensim ``LdaSeqModel``) and save it to disk.

    Three artefacts are written under ``trained_models/``: the model itself
    (``<outfile>``), the pickled dictionary (``<outfile>_dict.pkl``) and the
    pickled bag-of-words corpus (``<outfile>_corpus.pkl``).

    Args:
        articles: Iterable of tokenised documents (each an iterable of string
            tokens). Documents must be ordered so that each time slice is a
            contiguous run, in the order the slices are listed.
        n_topics: Number of topics passed to ``LdaSeqModel``.
        outfile: Base file name of the saved artefacts.
        dates: Optional per-document date labels, used only when
            ``time_slices`` is not given. Slice sizes are the label counts in
            order of first appearance, so the labels (and ``articles``) must
            already be sorted chronologically.
        time_slices: Optional sequence with the number of documents in each
            time slice. Takes precedence over ``dates``.
        chain_variance: Forwarded to ``LdaSeqModel(chain_variance=...)``. The
            default of 0.1 is the value this function used before the
            parameter existed.

    Returns:
        The trained ``LdaSeqModel`` instance.

    Raises:
        ValueError: If neither ``dates`` nor ``time_slices`` is given, or if
            the slice sizes do not add up to the number of documents.
    """
    # The documents are traversed twice (dictionary, then bag-of-words), so
    # materialise them once; this also makes one-shot iterators usable.
    articles = list(articles)

    if time_slices is None:
        if dates is None:
            raise ValueError(
                "train_dtm needs either `time_slices` or `dates` to define the time slices"
            )
        counts = Counter(dates)
        time_slices = list(counts.values())
        print("Dates: ", counts)
    print("Time slices:", time_slices)

    # Every document has to belong to exactly one slice; catch a mismatch
    # before the (expensive) training starts.
    n_sliced_docs = sum(time_slices)
    if n_sliced_docs != len(articles):
        raise ValueError(
            f"time slices cover {n_sliced_docs} documents but {len(articles)} were given"
        )

    common_dictionary = corpora.Dictionary(articles)
    common_corpus = [common_dictionary.doc2bow(a) for a in articles]
    ldaseq = LdaSeqModel(corpus=common_corpus, time_slice=time_slices,
                         num_topics=n_topics, id2word=common_dictionary,
                         chain_variance=chain_variance)

    model_file = "trained_models/" + outfile
    ldaseq.save(model_file)

    # Persist the dictionary and corpus next to the model. The notebook
    # imports pickle under the alias `pkl`; context managers make sure the
    # files are flushed and closed.
    dict_filename = model_file + "_dict.pkl"
    with open(dict_filename, "wb") as dict_file:
        pkl.dump(common_dictionary, dict_file)
    corpus_filename = model_file + "_corpus.pkl"
    with open(corpus_filename, "wb") as corpus_file:
        pkl.dump(common_corpus, corpus_file)

    print("Saved DTM model as", model_file, "!")
    return ldaseq

In [None]:
# Persist the trained LDA model; the file name carries the topic count.
lda_model_path = f"trained_models/trained_lda_model_final_{lda_model.num_topics}"
lda_model.save(lda_model_path)

In [None]:
# Extract the topic distributions for each paper as a numpy array of shape
# (number of documents, number of topics).
# Apply the trained model to the whole corpus. This name was previously used
# without ever being defined, so the cell failed on a fresh kernel.
doc_lda = lda_model[corpus]

hm = np.zeros([len(corpus), lda_model.num_topics])
for doc_idx, doc_result in enumerate(doc_lda):
    # The first element of each per-document result holds the
    # (topic_id, probability) pairs; topics that are not listed stay at 0.
    for topic_id, topic_prob in doc_result[0]:
        hm[doc_idx, topic_id] = topic_prob


In [None]:
# Write the document-topic matrix to disk; the file name carries the topic count.
topic_distributions_path = f'data/topic_distributions_final_{lda_model.num_topics}'
np.save(topic_distributions_path, hm)