In [1]:
import numpy as np
import pickle as pkl
import pandas as pd 
from gensim.models import LdaModel
import json

In [2]:
nTopics = 25

In [3]:
# Load model
lda_model = LdaModel.load(f'trained_models/trained_lda_model_{nTopics}')

# Load topic distributions
topic_distributions = np.load(f'data/topic_distributions_{lda_model.num_topics}.npy')

# Pull topics
topics = lda_model.show_topics(formatted=False, num_topics=nTopics, num_words=20)

# load raw corpus dataframe
with open('data/raw_corpus.pkl', 'rb') as f:
    corpus_df = pkl.load(f)

In [4]:
#convert all nans to zeros and all zeros to a very small number
# topic_distributions = np.nan_to_num(topic_distributions)
topic_distributions = np.where(topic_distributions == 0, 0.000001, topic_distributions)
# topic_distributions

In [5]:
# np.where(topic_distributions == 0)[0]

In [6]:
# # Define topic names
# topic_names = [
#     'Precip Variability & Extr',
#     'Hydrogeochemistry',
#     'Uncertainty',
#     'Soil Moisture',
#     'Statistical Hydrology',
#     'Rainfall-Runoff',
#     'Precip Observation',
#     'Modeling & Calibration',
#     'Water Management',
#     'Snow Hydrology',
#     'Streamflow Processes',
#     'Water Quality',
#     'Channel Flow',
#     'Floods',
#     'Sediment & Erosion',
#     'Climate Change',
#     'Subsurface Flow & Trans',
#     'Scaling & Spatial Variabil',
#     'Land Surface Fluxes',
#     'Hydrogeology',
#     'Human Interv & Eff',
#     'Land Cover',
#     'Systems Hydrology',
#     'Modeling & Forecasting',
#     'Groundwater'
# ]

In [7]:
# # Define colors to associate with each topic
# custom_colors = {
#  'burlywood': '#DEB887',
#  'chocolate': '#D2691E',
#  'crimson': '#DC143C',
#  'darkgreen': '#006400',
#  'darkorange': '#FF8C00',
#  'darkslategrey': '#2F4F4F',
#  'deepskyblue': '#00BFFF',
#  'dimgray': '#696969',
#  'firebrick': '#B22222',
#  'gold': '#FFD700',
#  'goldenrod': '#DAA520',
#  'lawngreen': '#7CFC00',
#  'lightcoral': '#F08080',
#  'lightpink': '#FFB6C1',
#  'mediumvioletred': '#C71585',
#  'orangered': '#FF4500',
#  'orchid': '#DA70D6',
#  'royalblue': '#4169E1',
#  'slateblue': '#6A5ACD',
#  'springgreen': '#00FF7F',
#  'steelblue': '#4682B4',
#  'teal': '#008080',
#  'turquoise': '#40E0D0',
#  'yellow': '#FFFF00',
#  'blueviolet': '#8A2BE2',
#  'yellowgreen': '#9ACD32'}

# # turn into a list
# colorlist = []
# for i, color in enumerate(custom_colors.values()):
#     colorlist.append(tuple(int(color.lstrip('#')[i:i+2], 16) for i in (0, 2, 4)))
#     colorlist[i] = (colorlist[i][0] / 256, colorlist[i][1] / 256, colorlist[i][2] / 256)

In [8]:
#calculate JSD for all pairs of papers
#the max force values (dist) are capped to 1000 later on
def calc_KL_divergence(paper1,paper2):
    return -np.nansum(paper1 * np.log(paper2/paper1))
def jensen_shannon_distance(paper1,paper2):
    M=0.5*(paper1+paper2)
    D1=calc_KL_divergence(paper1,M)
    D2=calc_KL_divergence(paper2,M)
    JSDiv = 0.5*D1+0.5*D2
    JSD = np.sqrt(JSDiv)
    return JSD

In [9]:
#Select by year and journal
year_id = input("Insert year of publication: ")
journal_id = input("Insert journal id (options: WRR, HESS, JHM, HSJ, JH, HP): ")
df_year = corpus_df.loc[corpus_df['Year'] == year_id]
df_year_journal = df_year.loc[df_year['Journal'] == journal_id]
df_year_journal

Insert year of publication: 2011
Insert journal id (options: WRR, HESS, JHM, HSJ, JH, HP): HESS


Unnamed: 0,DOI,Year,Journal,Title,Abstract,Affiliation
1947,10.5194/hess-15-2763-2011,2011,HESS,Interpolation of groundwater quality parameter...,"For many environmental variables, measurements...",
1948,10.5194/hess-15-2777-2011,2011,HESS,Spectral representation of the annual cycle in...,The annual cycle of temperature and precipitat...,
1949,10.5194/hess-15-2821-2011,2011,HESS,Sediment transport modelling in a distributed ...,Bedload sediment transport and erosion process...,
1950,10.5194/hess-15-2853-2011,2011,HESS,Low-frequency variability of European runoff,This study investigates the low-frequency comp...,
1951,10.5194/hess-15-2871-2011,2011,HESS,Analyses of impacts of China's international t...,This study provides an insight into the impact...,
...,...,...,...,...,...,...
2212,10.5194/hess-15-3829-2011,2011,HESS,Assimilation of ASCAT near- surface soil moist...,This study examines whether the assimilation o...,
2213,10.5194/hess-15-3843-2011,2011,HESS,Integral quantification of seasonal soil moist...,Soil moisture at the plot or hill-slope scale ...,
2214,10.5194/hess-15-3861-2011,2011,HESS,What can we learn from long-term groundwater d...,"Future risks for groundwater resources, due to...",
2215,10.5194/hess-15-3877-2011,2011,HESS,Parameterization of a bucket model for soil-ve...,We investigate the potential impact of account...,


In [10]:
#Select minimum paper correlation cutoff point (use quantiles instead)
min_cutoff = input("Select minimum paper distance cutoff (Options: High, Medium, Low): ")
min_cutoff = min_cutoff.lower()

if min_cutoff == 'high':
    cut_val = 2.5
elif min_cutoff == 'medium':
    cut_val = 2.0
elif min_cutoff == 'low':
    cut_val == 1.5   


#select topic
sel_topic = input("Select topic; refer to the list of topics below and choose your number(for all topics, input 'all' ")
# '0'     'Precip Variability & Extr',
# '1'     'Hydrogeochemistry',
# '2'     'Uncertainty',
# '3'    'Soil Moisture',
# '4'    'Statistical Hydrology',
# '5'    'Rainfall-Runoff',
# '6'    'Precip Observation',
# '7'    'Modeling & Calibration',
# '8'    'Water Management',
# '9'    'Snow Hydrology',
# '10'    'Streamflow Processes',
# '11'   'Water Quality',
# '12'     'Channel Flow',
# '13'     'Floods',
# '14'     'Sediment & Erosion',
# '15'     'Climate Change',
# '16'     'Subsurface Flow & Trans',
# '17'     'Scaling & Spatial Variabil',
# '18'     'Land Surface Fluxes',
# '19'     'Hydrogeology',
# '20'     'Human Interv & Eff',
# '21'     'Land Cover',
# '22'     'Systems Hydrology',
# '23'     'Modeling & Forecasting',
# '24'     'Groundwater'

Select minimum paper distance cutoff (Options: High, Medium, Low): Medium
Select topic; refer to the list of topics below and choose your number(for all topics, input 'all' all


In [11]:
# jsd_np_array[jsd_np_array =< cutoff_value] = np.nan

In [21]:
#initiate individual lists for nodes and links
node_list = []
link_list = []

# dist_values = np.full([corpus_df.shape[0], corpus_df.shape[0]], np.nan)
# dist_values = np.full([1000, 1000], -0.1)
    
for p1, paper1 in enumerate(df_year_journal["Title"][:]):
    max_topic = np.argmax(topic_distributions[p1])
    grp = {"group" : max_topic, "name": paper1}
    node_list.append(grp)
    for p2, paper2 in enumerate(df_year_journal["Title"][p1:]):
        if p1 == p2:
            dist = 0
        else:
            #round to 2 decimal places
            if sel_topic == 'all':
                JSD = jensen_shannon_distance(topic_distributions[p1, :], topic_distributions[p2, :])
            else:
                JSD = jensen_shannon_distance(topic_distributions[p1, sel_topic], topic_distributions[p2, sel_topic])
            dist = round(1/JSD, 2)
            if dist >= cut_val and dist <= 10:
                link = {"source": p1, "target": p2, "value": dist}
                link_list.append(link)
            continue


In [22]:
link_list

[{'source': 0, 'target': 4, 'value': 4.02},
 {'source': 0, 'target': 38, 'value': 2.73},
 {'source': 0, 'target': 51, 'value': 2.27},
 {'source': 0, 'target': 193, 'value': 2.03},
 {'source': 1, 'target': 12, 'value': 2.71},
 {'source': 1, 'target': 19, 'value': 2.7},
 {'source': 1, 'target': 27, 'value': 3.63},
 {'source': 1, 'target': 41, 'value': 2.36},
 {'source': 1, 'target': 58, 'value': 2.98},
 {'source': 1, 'target': 64, 'value': 2.16},
 {'source': 1, 'target': 71, 'value': 2.29},
 {'source': 1, 'target': 74, 'value': 2.23},
 {'source': 1, 'target': 102, 'value': 2.07},
 {'source': 1, 'target': 123, 'value': 2.27},
 {'source': 1, 'target': 166, 'value': 2.23},
 {'source': 1, 'target': 213, 'value': 2.95},
 {'source': 1, 'target': 217, 'value': 2.17},
 {'source': 1, 'target': 242, 'value': 2.28},
 {'source': 1, 'target': 244, 'value': 3.27},
 {'source': 1, 'target': 267, 'value': 2.0},
 {'source': 2, 'target': 124, 'value': 2.0},
 {'source': 3, 'target': 13, 'value': 2.38},
 {'s

In [23]:
# # save the lists
# with open("node_list_full20.txt", "wb") as fp:
#     pkl.dump(node_list, fp)
# with open("link_list_full20.txt", "wb") as fp:
#     pkl.dump(link_list, fp)

In [24]:
#initiate json file
json_prep = {"links":link_list, "nodes":node_list}
# json_prep = {"links":link_list}
#json does not recognize NumPy data types; defining own encoder
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NpEncoder, self).default(obj)

#dumping the data into json file
json_dump = json.dumps(json_prep, indent=1, sort_keys=True, cls=NpEncoder)

In [25]:
#pd.DataFrame(json_prep['nodes']).head()

In [26]:
# pd.DataFrame(json_prep['links']).head()

In [27]:
#save output
filename_out = 'hiddenstories_year_journal.json'
json_out = open(filename_out,'w')
json_out.write(json_dump)
json_out.close()