In [1]:
import numpy as np
import pickle as pkl
import pandas as pd 
from gensim.models import LdaModel
import json

In [2]:
# Number of LDA topics: picks the trained model file that is loaded and how many topics are pulled from it.
nTopics = 25

In [3]:
# Restore the trained LDA model saved for the configured number of topics
lda_model = LdaModel.load(f'trained_models/trained_lda_model_{nTopics}')

# Topic distributions stored on disk; the file name is keyed by the model's own topic count
topic_distributions = np.load(f'data/topic_distributions_{lda_model.num_topics}.npy')

# Top 20 words of every topic, requested with formatted=False
topics = lda_model.show_topics(num_topics=nTopics, num_words=20, formatted=False)

# Raw corpus dataframe.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file you trust.
with open('data/raw_corpus.pkl', 'rb') as corpus_file:
    corpus_df = pkl.load(corpus_file)

In [4]:
# Replace exact zeros with a very small positive number (1e-6).
# NaNs are NOT touched here: the np.nan_to_num step is disabled.
# Presumably this keeps the log/ratio terms of the divergence finite — confirm.
topic_distributions = np.where(
    topic_distributions == 0,
    1e-6,
    topic_distributions,
)

In [5]:
# np.where(topic_distributions == 0)[0]

In [6]:
# # Define topic names
# topic_names = [
#     'Precip Variability & Extr',
#     'Hydrogeochemistry',
#     'Uncertainty',
#     'Soil Moisture',
#     'Statistical Hydrology',
#     'Rainfall-Runoff',
#     'Precip Observation',
#     'Modeling & Calibration',
#     'Water Management',
#     'Snow Hydrology',
#     'Streamflow Processes',
#     'Water Quality',
#     'Channel Flow',
#     'Floods',
#     'Sediment & Erosion',
#     'Climate Change',
#     'Subsurface Flow & Trans',
#     'Scaling & Spatial Variabil',
#     'Land Surface Fluxes',
#     'Hydrogeology',
#     'Human Interv & Eff',
#     'Land Cover',
#     'Systems Hydrology',
#     'Modeling & Forecasting',
#     'Groundwater'
# ]

In [7]:
# # Define colors to associate with each topic
# custom_colors = {
#  'burlywood': '#DEB887',
#  'chocolate': '#D2691E',
#  'crimson': '#DC143C',
#  'darkgreen': '#006400',
#  'darkorange': '#FF8C00',
#  'darkslategrey': '#2F4F4F',
#  'deepskyblue': '#00BFFF',
#  'dimgray': '#696969',
#  'firebrick': '#B22222',
#  'gold': '#FFD700',
#  'goldenrod': '#DAA520',
#  'lawngreen': '#7CFC00',
#  'lightcoral': '#F08080',
#  'lightpink': '#FFB6C1',
#  'mediumvioletred': '#C71585',
#  'orangered': '#FF4500',
#  'orchid': '#DA70D6',
#  'royalblue': '#4169E1',
#  'slateblue': '#6A5ACD',
#  'springgreen': '#00FF7F',
#  'steelblue': '#4682B4',
#  'teal': '#008080',
#  'turquoise': '#40E0D0',
#  'yellow': '#FFFF00',
#  'blueviolet': '#8A2BE2',
#  'yellowgreen': '#9ACD32'}

# # turn into a list
# colorlist = []
# for i, color in enumerate(custom_colors.values()):
#     colorlist.append(tuple(int(color.lstrip('#')[i:i+2], 16) for i in (0, 2, 4)))
#     colorlist[i] = (colorlist[i][0] / 256, colorlist[i][1] / 256, colorlist[i][2] / 256)

In [8]:
# Jensen-Shannon distance (JSD) between the topic distributions of two papers.
# The link-building cell turns it into a force value of 1/JSD and keeps values up to 10.
def calc_KL_divergence(paper1, paper2):
    """Return the Kullback-Leibler divergence KL(paper1 || paper2) in nats.

    Parameters
    ----------
    paper1, paper2 : array-like
        Topic distributions of equal length (NumPy arrays or plain sequences).

    Returns
    -------
    float
        sum(paper1 * log(paper1 / paper2)). Terms that evaluate to NaN
        (e.g. a zero entry in ``paper1``) are skipped, i.e. treated as 0.
    """
    p = np.asarray(paper1)
    q = np.asarray(paper2)
    # Zero entries produce inf/NaN intermediates that nansum drops on purpose;
    # silence the matching divide/invalid warnings instead of flooding the output.
    with np.errstate(divide='ignore', invalid='ignore'):
        terms = p * np.log(q / p)
    return -np.nansum(terms)


def jensen_shannon_distance(paper1, paper2):
    """Return the Jensen-Shannon distance between two topic distributions.

    The distance is the square root of the Jensen-Shannon divergence, computed
    with natural logarithms.

    Parameters
    ----------
    paper1, paper2 : array-like
        Topic distributions of equal length (NumPy arrays or plain sequences).

    Returns
    -------
    float
        0 for identical distributions; NaN only if the divergence itself is NaN.
    """
    # Convert first so plain lists are added element-wise rather than concatenated.
    p = np.asarray(paper1)
    q = np.asarray(paper2)
    M = 0.5 * (p + q)
    D1 = calc_KL_divergence(p, M)
    D2 = calc_KL_divergence(q, M)
    JSDiv = 0.5 * D1 + 0.5 * D2
    # Round-off can push the divergence a hair below zero; clamp so sqrt does not return NaN.
    # np.maximum (unlike the built-in max) still propagates a genuine NaN.
    JSD = np.sqrt(np.maximum(JSDiv, 0.0))
    return JSD

In [9]:
# Select the papers of one journal published in one year.
# input() always returns text: strip stray whitespace, and compare the Year column
# as text so the filter also works if that column holds numbers rather than strings.
year_id = input("Insert year of publication: ").strip()
journal_id = input("Insert journal id (options: WRR, HESS, JHM, HSJ, JH, HP): ").strip()
df_year = corpus_df.loc[corpus_df['Year'].astype(str) == year_id]
df_year_journal = df_year.loc[df_year['Journal'] == journal_id]
df_year_journal

Insert year of publication: 2012
Insert journal id (options: WRR, HESS, JHM, HSJ, JH, HP): HSJ


Unnamed: 0,DOI,Year,Journal,Title,Abstract,Affiliation
12359,10.1080/02626667.2012.727091,2012,HSJ,Improving the calibration strategy of the phys...,The use of a physically-based hydrological mod...,
12360,10.1080/02626667.2012.726357,2012,HSJ,Design event selection in bivariate hydrologic...,In the bivariate analysis of hydrological even...,
12361,10.1080/02626667.2012.727212,2012,HSJ,Modelling streamflow trends for a watershed wi...,Streamflow variability in the Upper and Lower ...,
12362,10.1080/02626667.2012.726993,2012,HSJ,Hydrological regime of remote catchments with ...,The Baker basin (27 000 km(2)) is located in o...,
12363,10.1080/02626667.2012.728707,2012,HSJ,Modelling the impacts of land-cover change on ...,A modelling experiment is used to examine diff...,
...,...,...,...,...,...,...
12465,10.1080/02626667.2012.710335,2012,HSJ,A new measure for assessing the efficiency of ...,There is a lack of consistency and generality ...,
12466,10.1080/02626667.2012.710334,2012,HSJ,Incorporating elevation in rainfall interpolat...,This paper compares the performance of three g...,
12467,10.1080/02626667.2012.712740,2012,HSJ,Optimization of cumulative trapped sediment cu...,Reservoir silting is one of the principal prob...,
12468,10.1080/02626667.2012.715747,2012,HSJ,"Modelling indicators of water security, water ...",The GWAVA (Global Water AVailability Assessmen...,


In [10]:
# Select the minimum cutoff; the link-building cell compares it against 1/JSD.
min_cutoff = input("Select minimum paper distance cutoff (Options: High, Medium, Low): ")

# Lookup table instead of an if/elif chain: the 'Low' branch used to compare
# (cut_val == 1.5) instead of assigning, so cut_val was never set for it.
cutoff_values = {'High': 2.5, 'Medium': 2.0, 'Low': 1.5}
cutoff_key = min_cutoff.strip().capitalize()
if cutoff_key not in cutoff_values:
    # Fail here rather than later with a NameError or a stale cut_val from an earlier run.
    raise ValueError(
        f"Unknown cutoff {min_cutoff!r}; expected one of: {', '.join(cutoff_values)}"
    )
cut_val = cutoff_values[cutoff_key]

#select topic
sel_topic = input("Select topic; refer to the list of topics below and choose your number: ")
# '0'     'Precip Variability & Extr',
# '1'     'Hydrogeochemistry',
# '2'     'Uncertainty',
# '3'    'Soil Moisture',
# '4'    'Statistical Hydrology',
# '5'    'Rainfall-Runoff',
# '6'    'Precip Observation',
# '7'    'Modeling & Calibration',
# '8'    'Water Management',
# '9'    'Snow Hydrology',
# '10'    'Streamflow Processes',
# '11'   'Water Quality',
# '12'     'Channel Flow',
# '13'     'Floods',
# '14'     'Sediment & Erosion',
# '15'     'Climate Change',
# '16'     'Subsurface Flow & Trans',
# '17'     'Scaling & Spatial Variabil',
# '18'     'Land Surface Fluxes',
# '19'     'Hydrogeology',
# '20'     'Human Interv & Eff',
# '21'     'Land Cover',
# '22'     'Systems Hydrology',
# '23'     'Modeling & Forecasting',
# '24'     'Groundwater'

Select minimum paper distance cutoff (Options: High, Medium, Low): Medium
Select topic; refer to the list of topics below and choose your number: 5


In [None]:
# Build the node and link lists of the graph for the selected papers.
node_list = []
link_list = []

# Row position of every selected paper inside the full corpus.
# The loop counter alone only counts within the selection, so using it to index
# topic_distributions would read the distributions of the first papers in the corpus.
# NOTE(review): assumes topic_distributions rows follow the row order of corpus_df — confirm
# against the code that wrote the .npy file.
corpus_rows = np.flatnonzero(corpus_df.index.isin(df_year_journal.index))
if len(corpus_rows) != len(df_year_journal):
    raise ValueError("corpus_df index labels are not unique; cannot map selected papers to corpus rows")

titles = df_year_journal["Title"].tolist()

for p1, paper1 in enumerate(titles):
    dist_p1 = topic_distributions[corpus_rows[p1]]
    # A node is coloured/grouped by its most probable topic.
    max_topic = np.argmax(dist_p1)
    node_list.append({"group": max_topic, "name": paper1})

    # Visit every unordered pair exactly once (p2 > p1). source/target are positions
    # in node_list, so they stay local to the selection.
    for p2 in range(p1 + 1, len(titles)):
        JSD = jensen_shannon_distance(dist_p1, topic_distributions[corpus_rows[p2]])
        if not JSD > 0:
            # Zero or NaN distance has no finite force value; such pairs never pass the <= 10 cap.
            continue
        # Force value is the inverse distance, rounded to 2 decimal places.
        dist = round(1 / JSD, 2)
        if cut_val <= dist <= 10:
            link_list.append({"source": p1, "target": p2, "value": dist})


In [None]:
# Display the node list: one {"group", "name"} dict per selected paper.
node_list

In [None]:
# # save the lists
# with open("node_list_full20.txt", "wb") as fp:
#     pkl.dump(node_list, fp)
# with open("link_list_full20.txt", "wb") as fp:
#     pkl.dump(link_list, fp)

In [None]:
# Assemble the payload for the JSON file: the graph's links and its nodes
json_prep = dict(links=link_list, nodes=node_list)
# The stock JSON encoder rejects NumPy data types, so a custom encoder converts them
class NpEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy integers, NumPy floats and NumPy arrays."""

    def default(self, obj):
        """Convert a NumPy value to its built-in equivalent; defer anything else to the base class."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        # Not a NumPy type handled here: the base implementation raises TypeError.
        return super().default(obj)

# Serialise the payload to a JSON string (keys sorted, 1-space indent, NumPy values converted)
json_dump = json.dumps(
    json_prep,
    cls=NpEncoder,
    sort_keys=True,
    indent=1,
)

In [None]:
#pd.DataFrame(json_prep['nodes']).head()

In [None]:
# pd.DataFrame(json_prep['links']).head()

In [None]:
# Save output. The with-block closes the file even if the write raises.
filename_out = 'hiddenstories_year_journal.json'
with open(filename_out, 'w') as json_out:
    json_out.write(json_dump)