In [1]:
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt
from gensim.models import LdaModel
import pandas as pd
import ecopy as ep
from scipy import stats

nTopics = 45

# Load model
lda_model = LdaModel.load(f'trained_models/trained_lda_model_new_{nTopics}')
# Load topic distributions
topic_distributions = np.load(f'data/topic_distributions_broad_{lda_model.num_topics}.npy')
topic_distributions_wogw = np.load(f'data/topic_distributions_broad_wogw_{lda_model.num_topics}.npy')
# Pull topics
topics = lda_model.show_topics(formatted=False, num_topics=nTopics, num_words=20)
# load raw corpus dataframe
with open('data/raw_corpus_broad.pkl', 'rb') as f:
    corpus_df = pkl.load(f)
with open('data/raw_corpus_broad_wogw.pkl', 'rb') as f:
    corpus_df_wogw = pkl.load(f)

In [16]:
short_corpus_df = corpus_df[3000:4000]
short_corpus_df

Unnamed: 0,DOI,Year,Journal,Title,Abstract,Affiliation
3052,10.5194/hess-18-1323-2014,2014,HESS,Long term soil moisture mapping over the Tibet...,This paper discusses soil moisture retrievals ...,
3053,10.5194/hess-18-1359-2014,2014,HESS,Portfolio optimisation for hydropower producer...,"In deregulated electricity markets, hydropower...",
3054,10.5194/hess-18-1369-2014,2014,HESS,A journey of a thousand miles begins with one ...,When simulating social action in modeling effo...,
3055,10.5194/hess-18-1383-2014,2014,HESS,Relationships between environmental governance...,We investigate relationships between environme...,
3056,10.5194/hess-18-1397-2014,2014,HESS,Hydrodynamic controls on oxygen dynamics in a ...,Oxygen depletion in coastal and estuarine wate...,
3057,10.5194/hess-18-1413-2014,2014,HESS,Socio-hydrology and the science-policy interfa...,While there is a popular perception that Canad...,
3058,10.5194/hess-18-1423-2014,2014,HESS,Does consideration of water routing affect sim...,The cycling of carbon (C) in terrestrial ecosy...,
3059,10.5194/hess-18-1439-2014,2014,HESS,Climate and topographic controls on simulated ...,Natural grasses in semiarid rangelands constit...,
3060,10.5194/hess-18-1457-2014,2014,HESS,Water balance of selected floodplain lake basi...,This study is the first attempt in the literat...,
3061,10.5194/hess-18-1467-2014,2014,HESS,Technical Note: Alternative in-stream denitrif...,The Integrated Catchment model for Nitrogen (I...,


In [19]:
# Pull years
years = np.unique(short_corpus_df['Year'])
years

array(['1998', '1999', '2000', '2001', '2002', '2014', '2015', '2016'],
      dtype=object)

In [30]:
# Pull papers
papers = np.unique(short_corpus_df['Title'])
papers

array(['A 2-D process-based model for suspended sediment dynamics: a first step\n',
       'A 2600-year history of floods in the Bernese Alps, Switzerland:\n',
       'A Bayesian approach to flow record infilling and extension for reservoir\n',
       'A Bayesian consistent dual ensemble Kalman filter for state-parameter\n',
       'A Bayesian technique for conditioning radar precipitation estimates to\n',
       'A Lagrangian model for soil water dynamics during rainfall-driven\n',
       "A ``mental models{''} approach to the communication of subsurface\n",
       'A century-scale, human-induced ecohydrological evolution of wetlands of\n',
       'A comparison of artificial neural networks used for river flow\n',
       'A comparison of interpolation methods on the basis of data obtained from\n',
       'A comparison of methods for determining field evapotranspiration:\n',
       'A comparison of methods for estimating soil characteristics in regional\n',
       'A comparison of the 

In [35]:
topic_distributions_short = topic_distributions[3000:4000]
topic_distributions_short.shape

(1000, 45)

In [28]:
# Paper wise diversity metrics
diversity_metrics = ['shannon', 'simpson', 'gini-simpson', 'dominance', 'even']

shannon_diversity_paper = {}
simpson_diversity_paper = {}
gini_diversity_paper = {}
dominance_paper = {}
shannon_diversity_mean_paper = []
simpson_diversity_mean_paper = []
gini_diversity_mean_paper = []
dominance_mean_paper = []
topic_distributions_paper_list = []
#creating an array of nans with dimensions of years and topics
#the year-paper topic dist will be appended to it later
# arr = np.full([len(years),nTopics], np.nan)
                           
for y, year in enumerate(years):
    for p, paper in enumerate(papers):
        #making an array of year-paper wise topic distributions 
        topic_distributions_paper = topic_distributions_short[(short_corpus_df['Title'] == paper),:]
#         print(topic_distributions_paper)
        #np.append(arr,topic_distributions_paper,axis=0)
        topic_distributions_paper_list.append(topic_distributions_paper)
        
paper_dist_arr = np.array(topic_distributions_paper_list)
paper_dist_arr
#saving the array because it takes a while to populate
# with open('topic_distribution_paperwise.npy', 'wb') as f:
#     np.save(f, topic_distributions_paper)

array([array([], shape=(0, 45), dtype=float64),
       array([], shape=(0, 45), dtype=float64),
       array([], shape=(0, 45), dtype=float64), ...,
       array([], shape=(0, 45), dtype=float64),
       array([], shape=(0, 45), dtype=float64),
       array([[0.        , 0.        , 0.        , 0.20053232, 0.        ,
        0.11871985, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.0300748 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.07951614, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.04090529, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.14634413, 0.        , 0.02654486, 0.        ,
        0.        , 0.01310461, 0.        , 0.15011118, 0.        ,
        0.0295524 , 0.09293776, 0.06633557, 0.        , 0.        ]])],
      dtype=object)

In [22]:
# topic_distributions_paper = np.load(f'topic_distribution_paperwise.npy')
# topic_distributions_paper

In [36]:
paper_dist_arr.shape

(7912,)

In [24]:
#Calculating diversity metrics of papers per year from the year-paper topic dist array
#Calculate the mean and append them
for y, year in enumerate(years):
    
    for p, paper in enumerate(papers):
        shannon_diversity_paper[year] = ep.diversity(paper_dist_arr[p], method = 'shannon', breakNA=False, num_equiv=False)
#     simpson_diversity_paper[year] = ep.diversity(topic_distributions_paper, method = 'simpson', breakNA=False, num_equiv=False)
#     gini_diversity_paper[year] = ep.diversity(topic_distributions_paper, method = 'gini_simpson', breakNA=False, num_equiv=False)
#     dominance[year] = ep.diversity(topic_distributions_paper, method = 'dominance', breakNA=False, num_equiv=False)
    
        shannon_diversity_mean_paper.append(np.mean(shannon_diversity_paper[year]))
#     simpson_diversity_mean_paper.append(np.mean(simpson_diversity_paper[year]))
#     gini_diversity_mean_paper.append(np.mean(gini_diversity_paper[year]))
#     dominance_mean_paper.append(np.mean(dominance_paper[year]))

shannon_diversity_mean_paper

# fig, axs = plt.subplots(2,2,figsize=(15,15))    
# axs[0, 0].plot(shannon_diversity_mean_paper[:-1])
# axs[0, 0].set_title('Shannon (paperwise)')
# axs[0, 1].plot(simpson_diversity_mean_paper[:-1], 'tab:orange')
# axs[0, 1].set_title('Simpson')
# axs[1, 0].plot(gini_diversity_mean_paper[:-1],'tab:green')
# axs[1, 0].set_title('Gini-Simpson')
# axs[1, 1].plot(dominance_mean_paper[:-1], 'tab:red')
# axs[1, 1].set_title('Dominance')
# plt.savefig('figures/diversity_paper_year_mean.png')
    

ValueError: Cannot apply_along_axis when any iteration dimensions are 0

In [13]:
div = ep.diversity(paper_dist_arr[199], method = 'shannon', breakNA=False, num_equiv=False)
div

array([1.79595125])