In [None]:
import gensim
import numpy as np
import pandas as pd

import itertools

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def add_topics_vector(corpus, ldamodel):
    """Build a document-by-topic DataFrame of topic weights.

    Parameters
    ----------
    corpus : iterable
        Documents in whatever form ``ldamodel.get_document_topics`` accepts.
    ldamodel : object
        Model exposing ``get_document_topics(corpus)``. When it also has a
        ``num_topics`` attribute, that value fixes the number of columns.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per topic, labelled
        ``'Topic 1'`` ... ``'Topic K'``.
    """
    # Without an explicit size, corpus2csc infers the topic count from the
    # largest topic id it actually encounters, so trailing topics that are
    # absent from every document's topic list would silently lose their
    # columns. Fall back to inference only when the model has no num_topics.
    num_topics = getattr(ldamodel, 'num_topics', None)

    doc_topics = ldamodel.get_document_topics(corpus)
    # The sparse matrix is (topics x documents); transpose to get one row
    # per document before densifying.
    topics_by_doc_csc = gensim.matutils.corpus2csc(doc_topics, num_terms=num_topics)
    doc_topic_matrix = topics_by_doc_csc.T.toarray()

    topic_labels = ['Topic {}'.format(i + 1) for i in range(doc_topic_matrix.shape[1])]
    return pd.DataFrame(doc_topic_matrix, columns=topic_labels)

In [None]:
def visualize(topic_vectors):
    """Plot document-topic weights as a plain and as a clustered heatmap.

    Parameters
    ----------
    topic_vectors : pandas.DataFrame
        Matrix-like data passed unchanged to ``sns.heatmap`` and
        ``sns.clustermap``.

    Returns
    -------
    tuple
        ``(heatmap_axes, cluster_grid)``: the object returned by
        ``sns.heatmap`` and the object returned by ``sns.clustermap``.
    """
    # First figure: unclustered heatmap on its own tall canvas.
    _, heat_ax = plt.subplots(figsize=(6, 15))
    heat_ax.set_title('Document topic heatmap')
    heatmap_axes = sns.heatmap(topic_vectors, ax=heat_ax)

    # Second figure: clustermap builds its own figure, so title it via suptitle.
    cluster_grid = sns.clustermap(topic_vectors, col_cluster=True, figsize=(6, 15))
    cluster_grid.fig.suptitle('Document topic heatmap - clustered')

    return heatmap_axes, cluster_grid

In [None]:
def get_datetime(topic_vectors, data):
    """Index the topic vectors by the parsed ``Date`` column of ``data``.

    Parameters
    ----------
    topic_vectors : pandas.DataFrame
        Frame to be re-indexed; it is not modified.
    data : pandas.DataFrame
        Frame providing a ``'Date'`` column, joined to ``topic_vectors``
        by index label.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``topic_vectors``, indexed by the
        datetime-converted ``Date`` values and sorted by that index.
    """
    return (
        pd.concat([topic_vectors, data['Date']], axis=1)
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
        .set_index('Date')
        .sort_index()
    )

In [None]:
def heat_map_time(start_date, end_date, data):
    """Draw a row-clustered heatmap for the rows between two index labels.

    Parameters
    ----------
    start_date, end_date
        Bounds of the label-based slice ``data.loc[start_date:end_date]``.
    data : pandas.DataFrame
        Frame to slice; every one of its columns is plotted.

    Returns
    -------
    The object returned by ``sns.clustermap`` (column clustering disabled).
    """
    column_labels = list(data)
    window = data.loc[start_date:end_date]
    return sns.clustermap(window[column_labels], col_cluster=False)

In [None]:
def get_dominant_topic(ldamodel, corpus, data):
    """Find the highest-weighted topic of every document.

    Parameters
    ----------
    ldamodel : object
        Model supporting ``ldamodel[corpus]`` (an iterable of per-document
        ``(topic_num, weight)`` lists) and ``show_topic(topic_num)`` (an
        iterable of ``(word, weight)`` pairs).
    corpus : iterable
        Documents to score.
    data : pandas.DataFrame
        Frame with ``'Abstract'`` and ``'Title'`` columns. Rows are joined to
        the per-document results by index label, so ``data`` is expected to
        be indexed 0..n-1 in corpus order.

    Returns
    -------
    pandas.DataFrame
        Columns ``Document_No``, ``Dominant_Topic``, ``Topic_Perc_Contrib``
        (rounded to 4 decimals), ``Keywords`` (comma-joined topic words),
        ``Text`` and ``Title``.
    """
    # With per_word_topics enabled gensim documents that each item is a
    # (topics, word_topics, phi) triple; only the first element is wanted.
    per_word_topics = getattr(ldamodel, 'per_word_topics', False)

    keywords_by_topic = {}  # show_topic() is invariant per topic, so cache it
    doc_numbers = []
    records = []
    for doc_no, row in enumerate(ldamodel[corpus]):
        if per_word_topics:
            row = row[0]
        if len(row) == 0:
            # No topic assigned: leave a gap instead of shifting later rows,
            # so results stay aligned with `data`.
            continue

        # max() returns the first maximal pair, matching a stable
        # descending sort followed by taking element 0.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        doc_numbers.append(doc_no)
        records.append(
            (int(topic_num), round(float(prop_topic), 4), keywords_by_topic[topic_num])
        )

    # Build the frame once: DataFrame.append no longer exists in pandas >= 2.0
    # and growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(
        records,
        columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'],
        index=pd.Index(doc_numbers, dtype='int64'),
    )

    sent_topics_df = pd.concat([sent_topics_df, data['Abstract'], data['Title']], axis=1)
    sent_topics_df = sent_topics_df.reset_index()
    sent_topics_df.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text', 'Title']
    return sent_topics_df

In [None]:
def get_representative_doc(sent_topics_df):
    """Pick, for each dominant topic, the document with the highest contribution.

    Parameters
    ----------
    sent_topics_df : pandas.DataFrame
        Six-column frame containing at least ``'Dominant_Topic'`` and
        ``'Topic_Perc_Contrib'``; it is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``Dominant_Topic`` value (in group order), with a
        fresh 0..k-1 index and columns ``Document_No``, ``Topic_Num``,
        ``Topic_Perc_Contrib``, ``Keywords``, ``Text``, ``Title``. Empty when
        the input has no groups.
    """
    # Collect the top row of every group and concatenate once, rather than
    # repeatedly concatenating onto a growing (initially empty) frame.
    top_rows = [
        grp.sort_values('Topic_Perc_Contrib', ascending=False).head(1)
        for _, grp in sent_topics_df.groupby('Dominant_Topic')
    ]

    if top_rows:
        repr_doc = pd.concat(top_rows, axis=0)
    else:
        # No groups (e.g. empty input): keep the schema, return zero rows.
        repr_doc = sent_topics_df.iloc[0:0].copy()

    repr_doc = repr_doc.reset_index(drop=True)
    repr_doc.columns = ['Document_No', 'Topic_Num', 'Topic_Perc_Contrib', 'Keywords', 'Text', 'Title']
    return repr_doc

In [None]:
def get_topic_distribution(sent_topics_df, repr_doc):
    """Summarise how many documents each topic dominates.

    Parameters
    ----------
    sent_topics_df : pandas.DataFrame
        Frame with a ``'Dominant_Topic'`` column (one row per document).
    repr_doc : pandas.DataFrame
        Frame with ``'Topic_Num'`` and ``'Keywords'`` columns (one row per
        topic); it is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``repr_doc`` with columns ``Topic_Num``,
        ``Topic_Keywords``, ``Num_Documents`` and ``Perc_Documents`` (share
        of all documents, rounded to 4 decimals).
    """
    topic_counts = sent_topics_df['Dominant_Topic'].value_counts()
    topic_contribution = (topic_counts / topic_counts.sum()).round(4)

    df_dominant_topics = repr_doc[['Topic_Num', 'Keywords']].copy()
    # value_counts() is indexed by topic id while repr_doc is indexed by row
    # position; aligning the two by index pairs counts with the wrong topic
    # whenever some topic id is never dominant. Look counts up by topic id.
    df_dominant_topics['Num_Documents'] = df_dominant_topics['Topic_Num'].map(topic_counts)
    df_dominant_topics['Perc_Documents'] = df_dominant_topics['Topic_Num'].map(topic_contribution)
    df_dominant_topics.columns = ['Topic_Num', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

    return df_dominant_topics