In [None]:
import numpy as np
import pickle as pkl
import pandas as pd 
from gensim.models import LdaModel
import json
import re
import warnings

In [None]:
# Number of topics; selects which saved model file is loaded below.
nTopics = 25

# Restore the trained LDA model saved for this topic count.
lda_model = LdaModel.load(
    f'trained_models/trained_lda_model_{nTopics}'
)

# Read the saved topic-distribution array; the filename is keyed on the
# loaded model's own topic count.
topic_distributions = np.load(
    f'data/topic_distributions_{lda_model.num_topics}.npy'
)

# Ask the model for 20 words per topic across all nTopics topics,
# as unformatted output (formatted=False).
topics = lda_model.show_topics(
    num_topics=nTopics,
    num_words=20,
    formatted=False,
)

# Unpickle the raw corpus (presumably a dataframe, going by the name).
# NOTE: unpickling runs arbitrary code -- only load files you trust.
with open('data/raw_corpus.pkl', 'rb') as f:
    corpus_df = pkl.load(f)

In [None]:
# Define topic names
# Short human-readable labels, one per topic (25 entries).
# Presumably position i labels LDA topic i -- TODO confirm against the model.
topic_names = [
    "Precip Variability & Extr",
    "Hydrogeochemistry",
    "Uncertainty",
    "Soil Moisture",
    "Statistical Hydrology",
    "Rainfall-Runoff",
    "Precip Observation",
    "Modeling & Calibration",
    "Water Management",
    "Snow Hydrology",
    "Streamflow Processes",
    "Water Quality",
    "Channel Flow",
    "Floods",
    "Sediment & Erosion",
    "Climate Change",
    "Subsurface Flow & Trans",
    "Scaling & Spatial Variabil",
    "Land Surface Fluxes",
    "Hydrogeology",
    "Human Interv & Eff",
    "Land Cover",
    "Systems Hydrology",
    "Modeling & Forecasting",
    "Groundwater",
]

In [None]:
# Named colors (name -> '#RRGGBB' hex string) to associate with the topics.
custom_colors = {
    'burlywood': '#DEB887',
    'chocolate': '#D2691E',
    'crimson': '#DC143C',
    'darkgreen': '#006400',
    'darkorange': '#FF8C00',
    'darkslategrey': '#2F4F4F',
    'deepskyblue': '#00BFFF',
    'dimgray': '#696969',
    'firebrick': '#B22222',
    'gold': '#FFD700',
    'goldenrod': '#DAA520',
    'lawngreen': '#7CFC00',
    'lightcoral': '#F08080',
    'lightpink': '#FFB6C1',
    'mediumvioletred': '#C71585',
    'orangered': '#FF4500',
    'orchid': '#DA70D6',
    'royalblue': '#4169E1',
    'slateblue': '#6A5ACD',
    'springgreen': '#00FF7F',
    'steelblue': '#4682B4',
    'teal': '#008080',
    'turquoise': '#40E0D0',
    'yellow': '#FFFF00',
    'blueviolet': '#8A2BE2',
    'yellowgreen': '#9ACD32',
}


def _hex_to_unit_rgb(hex_code):
    """Convert a '#RRGGBB' string into an (r, g, b) tuple of floats in [0, 1].

    Each two-digit hex channel is an integer in 0..255, so it is divided by
    255 (not 256) so that 'FF' maps to exactly 1.0.
    """
    digits = hex_code.lstrip('#')
    return tuple(int(digits[k:k + 2], 16) / 255 for k in (0, 2, 4))


# Same colors as a list of unit-interval RGB tuples, in dict insertion order.
colorlist = [_hex_to_unit_rgb(hex_code) for hex_code in custom_colors.values()]

In [None]:
# Start with two separate, empty containers: one for nodes, one for links
node_list, link_list = [], []

In [None]:
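# Create a dataframe for link_list with
# cols: [number, source, target, value]
#   number = row number (TODO confirm)
#   source, target = the two paper ids that form the pair
#   value = 1/JSD? (inverse Jensen-Shannon distance between the pair)
#   group = topics (not in the column list above -- TODO confirm whether links carry a group)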

# Create a second dataframe for node_list with
# cols: [group, name]
#   group = topics
#   name = title

In [None]:
#calculate JSD for all pairs of papers
def calc_KL_divergence(p1,p2):
    """Kullback-Leibler divergence KL(p1 || p2), using the natural logarithm.

    Parameters
    ----------
    p1, p2 : array-like
        Values to compare; they are broadcast against each other.

    Returns
    -------
    float
        ``-sum(p1 * log(p2 / p1))`` taken over the entries where ``p1`` is
        non-zero. Entries where ``p1 == 0`` contribute nothing. The result is
        ``inf`` when ``p2`` is zero at a position where ``p1`` is positive.
    """
    p1 = np.asarray(p1, dtype=float)
    p2 = np.asarray(p2, dtype=float)
    # The ratio/log is evaluated everywhere and then masked, so silence the
    # divide-by-zero / invalid-value warnings from the masked-out positions.
    with np.errstate(divide='ignore', invalid='ignore'):
        terms = np.where(p1 != 0, p1 * np.log(p2 / p1), 0.0)
    # nansum keeps the previous behaviour of skipping any remaining NaN terms.
    return -np.nansum(terms)
def jensen_shannon_distance(p1,p2):
    """Jensen-Shannon distance between ``p1`` and ``p2``.

    This is the square root of the Jensen-Shannon divergence, i.e. the mean
    of the KL divergences of ``p1`` and ``p2`` from their midpoint
    ``0.5 * (p1 + p2)``, as computed by ``calc_KL_divergence``.

    Parameters
    ----------
    p1, p2 : array-like
        Values to compare.

    Returns
    -------
    float
        The distance; 0 when the two inputs are identical.
    """
    p1 = np.asarray(p1, dtype=float)
    p2 = np.asarray(p2, dtype=float)
    midpoint = 0.5 * (p1 + p2)
    divergence = (0.5 * calc_KL_divergence(p1, midpoint)
                  + 0.5 * calc_KL_divergence(p2, midpoint))
    # Round-off can push the divergence a hair below zero for nearly
    # identical inputs; clamp so the square root does not turn that into NaN.
    return np.sqrt(np.maximum(divergence, 0.0))

In [None]:
# Builds one node record per item of `corpus` and, for every ordered pair of
# items, a link record whose value is the reciprocal of a distance.
# NOTE(review): none of the following names is defined in the cells above, so
# this cell would raise NameError on a fresh kernel unless they come from elsewhere:
#   * `corpus`     (the object loaded above is `corpus_df`)
#   * `nodes_list` (the list initialised above is `node_list`)
#   * `JSD`        (the function defined above is `jensen_shannon_distance`)
for p1, paper1 in enumerate(corpus):
    # Node group = index of the largest entry in a row of topic_distributions.
    # NOTE(review): the row is selected with `paper1` (the item), not `p1`
    # (its position) -- confirm which one is intended.
    record = {"name": paper1, "group": np.argmax(topic_distributions[paper1, :])}
    nodes_list.append(record)
    for p2,paper2 in enumerate(corpus):
        # NOTE(review): the distance is called on the items themselves rather
        # than on rows of topic_distributions; a zero distance (e.g. an item
        # paired with itself) would make this a division by zero -- confirm.
        distance = 1/JSD(paper1,paper2)
        # NOTE(review): this link record is not appended to `link_list`
        # anywhere in the lines visible here.
        record = {"value": distance, "source": paper1, "target": paper2}