In [1]:
import pickle as pkl

import colorcet
import holoviews as hv
import numpy as np
import pandas as pd
from gensim.models import LdaModel
from holoviews import opts, dim

In [2]:
# Optimal number of topics (as chosen by the author).
# This value selects which trained LDA model file is loaded below and how
# many topics are pulled from that model.
nTopics = 16

In [3]:
# Human-readable topic labels.
# Entry i labels column i of the topic distribution array when the chord
# diagram nodes are built, so the order here must follow the model's topic
# order.
topic_names = [
    'Land Surface',
    'Hydrogeology',
    'Modeling',
    'Soil Moisture',
    'Stochastic Hydrology',
    'Rainfall and Interception',
    'Precipitation',
    'Watershed Hydrology',
    'Water Resources Management',
    'Channel Flow',
    'Snow Hydrology',
    'Groundwater',
    'Rivers and Streams',
    'Floods',
    'Sediment Transport',
    'Climate Change',
]

# Load Data

In [4]:
# Load the trained LDA model saved for the configured number of topics
model_path = f'trained_models/trained_lda_model_{nTopics}'
lda_model = LdaModel.load(model_path)

In [5]:
# Load the topic distribution array saved for the loaded model's topic count.
# Columns are treated as topics further down; rows are presumably documents
# (TODO confirm against the script that wrote this file).
distributions_path = f'data/topic_distributions_{lda_model.num_topics}.npy'
topic_distributions = np.load(distributions_path)

In [6]:
# Pull the top 20 words of every topic from the model, unformatted
# (formatted=False asks gensim for raw entries rather than display strings)
topics = lda_model.show_topics(num_topics=nTopics, num_words=20, formatted=False)

# Create Chord Diagrams

In [7]:
# Correlation matrix between topics: the array is transposed so that each
# topic (column of topic_distributions) is one variable for np.corrcoef
correlations = np.corrcoef(topic_distributions.T)

# Treat weak correlations (|r| < 0.05) as no correlation at all
low_idx = np.nonzero(np.abs(correlations) < 0.05)
correlations[low_idx] = 0

In [8]:
# Links for Chord Diagrams
# One link per unordered topic pair (i < j), in the same order as a nested
# "for i / for j > i" loop. The higher topic index is stored as the source
# and the lower one as the target.
target_idx, source_idx = np.triu_indices(topic_distributions.shape[1], k=1)

# Correlation of each pair as a percentage, split into its positive part
# (p_value) and the magnitude of its negative part (n_value); the part that
# does not apply to a pair is 0.
pair_pct = correlations[target_idx, source_idx] * 100
p_value = np.maximum(pair_pct, 0)
n_value = np.maximum(-pair_pct, 0)

# Build the whole table in one go instead of growing it cell by cell.
# 'value' is the column the chord diagram selects on; it starts out as the
# positive part. The int64 cast truncates the percentages toward zero.
links = pd.DataFrame({
    'source': source_idx,
    'target': target_idx,
    'value': p_value,
    'n_value': n_value,
    'p_value': p_value,
}).astype('int64')

In [9]:
# Nodes for Chord Diagrams
# One node per topic (column of topic_distributions). The row position is
# the node's index, 'name' is its label and every node is placed in group 0.
n_nodes = topic_distributions.shape[1]
nodes = pd.DataFrame({
    'group': np.zeros(n_nodes, dtype='int64'),
    # Indexing by position (rather than slicing) keeps the IndexError when
    # there are fewer names than topics, instead of silently dropping nodes.
    'name': [topic_names[i] for i in range(n_nodes)],
})

In [10]:
# Use Bokeh as the HoloViews plotting backend
hv.extension('bokeh')
# Enlarge the rendered output (size=200)
hv.output(size=200)

In [11]:
# Chord diagram of the positive correlations
# Use the positive part of each correlation as the link weight
links['value'] = links['p_value']

# Node table keyed by its index, then the chord restricted to links whose
# value falls in the range starting at 5 with no upper bound
node_data = hv.Dataset(pd.DataFrame(nodes), 'index')
chord = hv.Chord((links, node_data)).select(value=(5, None))

# Colour nodes by their index and edges by their source node, and label
# each node with its topic name
chord_style = opts.Chord(
    labels='name',
    cmap='glasbey_light',
    node_color=dim('index').str(),
    edge_cmap='glasbey_light',
    edge_color=dim('source').str(),
)
chord.opts(chord_style)

In [12]:
# plot negative correlations
# n_value holds the magnitude of each negative correlation (in percent), so
# the same value range selection as for the positive plot applies here.
links.value = links.n_value
chord = hv.Chord((links, hv.Dataset(pd.DataFrame(nodes), 'index'))).select(value=(5, None))
chord.opts(
    opts.Chord(cmap='glasbey_light',
               edge_cmap='glasbey_light',
               edge_color=dim('source').str(),
               labels='name',
               node_color=dim('index').str()))