In [1]:
import glob
import pickle as pkl

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

# gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.models import LdaModel


In [2]:
# load model
# `fname` is a relative path, so it is resolved against the notebook's working directory.
fname = 'ldamodel_output_hiddenstories'
# LdaModel.load reads a model that was saved to disk earlier; nothing is trained here.
lda_model = LdaModel.load(fname)

In [3]:
# plotting palette: twenty (R, G, B) triples written on a 0-255 integer scale
# NOTE(review): no rescaling to 0-1 appears in the visible cells, and the palette
# itself is not referenced by the visible plotting code — confirm it is still needed
import matplotlib.colors as mcolors

tableau20 = [
    (31, 119, 180), (174, 199, 232),
    (255, 127, 14), (255, 187, 120),
    (44, 160, 44), (152, 223, 138),
    (214, 39, 40), (255, 152, 150),
    (148, 103, 189), (197, 176, 213),
    (140, 86, 75), (196, 156, 148),
    (227, 119, 194), (247, 182, 210),
    (127, 127, 127), (199, 199, 199),
    (188, 189, 34), (219, 219, 141),
    (23, 190, 207), (158, 218, 229),
]
# alternative palette (disabled):
# colorz=[color for name, color in mcolors.TABLEAU_COLORS.items()]

# human-readable topic names; entry k is used as the display name of topic k
# when the node table is built further down
labels = [
    'Precipitation Extremes & Distributions',
    'Climate Change',
    'River Networks, Topography, & Scaling',
    'Hydrogeology',
    'Forecasting',
    'Energy Balance (Hydrometeorology, Land Surface)',
    'Snow and Ice',
    'Sediment & Erosion',
    'Water Resources Management',
    'Hydrogeomorphology',
    'Uncertainty and Calibration',
    'Statistical Hydrology',
    'Quality & Transport',
    'Miscellaneous',
    'Soil Moisture',
    'Floods and Discharge',
    'Precipitation Trends & Climatology',
]

In [4]:
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only open pickles that were produced by this project.

# raw corpus dataframe (its 'Year' column is read by the correlation cell)
with open('corpus_raw.pkl', 'rb') as raw_file:
    corpus_df = pkl.load(raw_file)

# cleaned corpus: the pickle holds a sequence and the corpus is its first element
with open('corpus_hiddenstories.pkl', 'rb') as clean_file:
    corpus = pkl.load(clean_file)[0]

In [5]:
# run the model
# Applies the LDA model to the whole corpus by indexing it with `corpus`.
# NOTE(review): `doc_lda` is not referenced again in the visible cells; the
# per-paper loop below queries the model directly — confirm this is still needed.
doc_lda = lda_model[corpus]

In [6]:
# collect the per-paper topic probabilities into a dense (papers x topics) matrix

# start from zeros so that topics the model does not report for a paper stay at 0
hm = np.zeros((len(corpus), lda_model.num_topics))

# query the model one paper at a time
for paper in tqdm(range(len(corpus))):
    # element 0 of the model output is used as the list of (topic id, probability)
    # NOTE(review): presumably the model returns a tuple because it was trained
    # with per_word_topics=True — confirm
    topic_probs = lda_model[corpus[paper]][0]
    for topic_id, prob in topic_probs:
        hm[paper, topic_id] = prob

100%|██████████| 45686/45686 [03:00<00:00, 252.44it/s]


In [16]:
# spot check: probability of topic index 2 for paper index 1
# (rows of hm are papers, columns are topics)
print(hm[1][2])

0.024660997092723846


In [9]:
# topic-topic correlation matrices: one over all papers, plus one per year

# every distinct year present in the corpus dataframe (np.unique returns them sorted)
years = np.unique(corpus_df['Year'])

# correlation over the full corpus; hm is transposed so each topic is one variable
correlations = np.corrcoef(hm.T)

# per-year storage: a (year, topic, topic) array, filled below
correlations_years = np.zeros((len(years), lda_model.num_topics, lda_model.num_topics))

# one correlation matrix per year
for y, year in enumerate(years):
    # rows of hm for the papers recorded in this year
    hm_year = hm[corpus_df['Year'] == year, :]

    # topic-topic correlation restricted to this year's papers
    correlations_years[y, :, :] = np.corrcoef(hm_year.T)

In [10]:
# build the chord-diagram edge list: one row per unordered topic pair (i < j)

n_topics = hm.shape[1]

# strict upper triangle in row-major order, i.e. the same (i, j) sequence a
# nested "for i / for j > i" loop would visit; i is the target, j the source
target_idx, source_idx = np.triu_indices(n_topics, k=1)

# correlations scaled to percent; atleast_2d covers the single-topic case where
# np.corrcoef returns a scalar instead of a matrix
scaled = np.atleast_2d(correlations)[target_idx, source_idx] * 100

# an undefined (NaN) correlation, e.g. from a topic with zero variance, becomes 0
# so the pair simply gets no link and the integer cast below cannot fail
scaled = np.where(np.isnan(scaled), 0.0, scaled)

# split into two non-negative strengths: p_value for positive correlations,
# n_value for the magnitude of negative ones
p_value = np.maximum(scaled, 0)
n_value = np.maximum(-scaled, 0)

# assemble the frame in one step instead of growing it row by row;
# 'value' is the column the plots filter on and starts out as the positive strength.
# The int64 cast truncates the percentages toward zero.
links = pd.DataFrame({
    'source': source_idx,
    'target': target_idx,
    'value': p_value,
    'n_value': n_value,
    'p_value': p_value,
}).astype('int64')

In [11]:
# build the chord-diagram node table: one row per topic, in topic order

n_topics = hm.shape[1]

# construct the frame in one step instead of growing it row by row;
# labels are looked up one index at a time so that having fewer labels than
# topics still raises IndexError rather than silently dropping nodes
nodes = pd.DataFrame({
    'group': [0] * n_topics,   # every topic is placed in group 0
    'name': [labels[i] for i in range(n_topics)],
})

# change data type
nodes['group'] = nodes['group'].astype('int64')

In [12]:
import pandas as pd
import holoviews as hv
from holoviews import opts, dim
# NOTE(review): `data` is not referenced anywhere in the visible cells
from bokeh.sampledata.les_mis import data

# render through the bokeh backend with output size 200
hv.extension('bokeh')
hv.output(size = 200)

# positive correlations: the plotted/filtered column takes the positive strength
links['value'] = links['p_value']

# node table keyed on 'index', so the integer source/target ids in `links`
# can be matched to topic names
node_dataset = hv.Dataset(pd.DataFrame(nodes), 'index')

# keep only links selected by the value range starting at 5 (no upper bound)
chord = hv.Chord((links, node_dataset)).select(value=(5, None))

# colour edges by their source node and nodes by their index; label nodes by name
chord_style = opts.Chord(
    cmap='Category20',
    edge_cmap='Category20',
    edge_color=dim('source').str(),
    labels='name',
    node_color=dim('index').str(),
)

# disabled matplotlib title/export kept for reference:
# plt.title('Positive Correlations', fontsize=22)
# plt.savefig(f'figures/chord_diagram_positive.png')

# last expression of the cell, so the styled chord diagram is displayed
chord.opts(chord_style)




In [13]:
# negative correlations: the plotted/filtered column takes the negative strength
# (this overwrites links['value'] in place, so the frame stays in this state afterwards)
links['value'] = links['n_value']

# node table keyed on 'index', so the integer source/target ids in `links`
# can be matched to topic names
node_dataset = hv.Dataset(pd.DataFrame(nodes), 'index')

# keep only links selected by the value range starting at 15 (no upper bound)
chord = hv.Chord((links, node_dataset)).select(value=(15, None))

# colour edges by their source node and nodes by their index; label nodes by name
chord_style = opts.Chord(
    cmap='Category20',
    edge_cmap='Category20',
    edge_color=dim('source').str(),
    labels='name',
    node_color=dim('index').str(),
)

# disabled title/export calls kept for reference:
# plt.title('Negative Correlations', fontsize=22)
# plt.savefig(f'figures/chord_diagram_negative.png')
# hv.save(chord, 'penguin_plot.png')

# last expression of the cell, so the styled chord diagram is displayed
chord.opts(chord_style)