In [1]:
import os, sys, time, resource, re, gc, shutil
import operator
from multiprocess import Pool
from functools import partial
from urllib.parse import urlparse, parse_qsl
from django.db.models import Count
import matplotlib
matplotlib.use('pgf')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import django
from adjustText import adjust_text

sys.path.append('/home/galm/software/django/tmv/BasicBrowser/')
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "BasicBrowser.settings")
django.setup()

from scoping.models import *
from tmv_app.models import *
from django.db.models import F, Sum, Count
from matplotlib import gridspec

from utils.utils import *


import statsmodels.formula.api as sm, statsmodels.stats.api as sms

pgf_with_latex = {
    "text.usetex": True,            # use LaTeX to write all text
    "pgf.rcfonts": False,           # Ignore Matplotlibrc
    "text.latex.unicode": True,
    "pgf.preamble": [
        #r"\usepackage[utf8x]{inputenc}",
        r"\usepackage{xcolor}"
    ],
    "pgf.texsystem" : "xelatex",
    "figure.figsize": [7.2,5]
}
matplotlib.rcParams.update(pgf_with_latex)


from run_id import run_id

run_id = 1056

runstat = RunStats.objects.get(pk=run_id)
topics = DynamicTopic.objects.filter(run_id=run_id)

  from pandas.core import datetools


In [2]:
tsums = topics.aggregate(
    ip_score=Sum('ipcc_score'),
    score=Sum('ipcc_time_score')
)
tsums

topics.update(
    ipcc_share=F('ipcc_score')/tsums['ip_score'],
    share=F('ipcc_time_score')/tsums['score']
)

100

In [3]:
def calculate_deviations(df):
    df['deviation'] = df['ipcc_share'] - df['share']
    df['abs_md'] = abs(df['deviation'])

    md = df['deviation'].max()
    rae = df['abs_md'].mean()
    lh = df['abs_md'].sum() / 2

    df['representation'] = df['ipcc_share'] / df['share']  
    
    df_disp = {'MD':md,'Rae':rae,'L-H':lh}
    return [df,df_disp]


In [4]:



df = pd.DataFrame.from_dict(
    list(topics.values(
        'title',
        'score',
        'ipcc_coverage',
        'share',
        'ipcc_score',
        'ipcc_share',
        'ipcc_time_score',
        'primary_wg'
    ))
)

df, df_disp  = calculate_deviations(df)


df.sort_values('representation').head()

Unnamed: 0,ipcc_coverage,ipcc_score,ipcc_share,ipcc_time_score,primary_wg,score,share,title,deviation,abs_md,representation
4,0.038956,14.536417,0.002499,373.149456,1,837.086135,0.009454,"{reaction, catalyst, sorbent}",-0.006954,0.006954,0.264364
3,0.043934,4.468382,0.000768,101.707543,1,338.460305,0.002577,"{membrane, separation, selectivity}",-0.001808,0.001808,0.298144
95,0.047512,2.074573,0.000357,43.664265,2,213.584173,0.001106,"{biochar, amendment, application}",-0.00075,0.00075,0.322427
6,0.049103,12.406336,0.002133,252.659603,2,422.463804,0.006401,"{root, fine, shoot}",-0.004268,0.004268,0.333224
71,0.049797,12.344545,0.002122,247.898329,1,617.389036,0.00628,"{adsorption, capacity, adsorbent}",-0.004158,0.004158,0.337932


In [5]:

from matplotlib import ticker


plt.rcParams["figure.figsize"] = [7.2,5]

def plot_representation(df, ax, nts=5, xspace=3,yspace = 0.15, fmin=None, fmax=None):
    
    md = df['deviation'].max()
    rae = df['abs_md'].mean()
    lh = df['abs_md'].sum() / 2

    pdf = df.sort_values('representation',ascending=False).reset_index()#.set_index('title')

    pdf = pdf[pdf['share'] > 0.001]

    pdf['lrep'] = np.log(pdf['representation'])
    #pdf['lrep'] = pdf['representation']
    pdf.set_index('title')['lrep'].plot(kind="bar",ax=ax,color="grey")

    i = 0

    rmax = pdf['lrep'].max()
    if fmax:
        rmax = fmax
    rmin = pdf['lrep'].min()
    if fmin:
        rmin = -fmin
    
    for index, row in pdf.head(nts).iterrows():
        i+=1
        s = round(row['ipcc_share']*100,1)
        v = round(row['share']*100,1)
        ax.annotate(
            s="{} [{},{}]".format(row['title'],s,v),
            xy=(-0.5+i,row['lrep']),
            xytext=((xspace-3)*-1+i*xspace,rmax-rmax*yspace*(i-1)),
            arrowprops=dict(
                facecolor='black', 
                #shrink=0.05,
                width=0.1,
                headwidth=0.2
            ),
            ha="left",
            va="bottom"
        )

    i = 0
    for index, row in pdf.sort_values('representation').head(nts).iterrows():
        i+=1
        s = round(row['ipcc_share']*100,1)
        v = round(row['share']*100,1)
        ax.annotate(
            s="{} [{},{}]".format(row['title'],s,v),
            xy=(len(pdf)-0.5-i,row['lrep']),
            xytext=(len(pdf)+xspace-3-i*xspace,rmin-rmin*yspace*(i-1)),
            arrowprops=dict(
                facecolor='black', 
                #shrink=0.05,
                width=0.1,
                headwidth=0.2
            ),
            ha="right",
            va="top"
        )
        
    #ax.set_yscale('log')
    
    
    if rmin > -1:
        rmin = -1
    if np.exp(rmax) < 2:
        rmax = np.log(2.1)
    
    lmin = int(1/np.exp(rmin)//1)*-1
    lmax = int(np.exp(rmax)//1)
    


    ytick_labels = [i for i in range(lmin,lmax+1) if i not in [-1,1]]
    yticks = []
    for x in ytick_labels:
        if x < 0:
            y = np.log(-1/x)
        elif x==0:
            y = 0
        else:
            y = np.log(x)
        yticks.append(y)
    
    ax.set_yticks(yticks)
    ax.set_yticklabels(ytick_labels)
    
    #ax.yaxis.set_major_formatter(ticker.FormatStrFormatter("%.1f"))
        
    rmax = pdf['lrep'].max()
    rmin = pdf['lrep'].min()
    
    ax.text(
        len(pdf)*0.066,-0.2,
        "MD: {:.3f}\nRae: {:.3f}\nL-H: {:.3f}".format(md,rae,lh),
        va="top",
        ha="left",
        bbox={'facecolor':'red', 'alpha':0.3,'pad':10}
   )

    #ax.text(1,-1*0.2,"MD: {:.3f}".format(md))   
    #ax.text(1,-1*0.4,"Rae: {:.3f}".format(rae))   
    #ax.text(1,-1*0.6,"L-H: {:.3f}".format(lh))   
    
    ax.get_xaxis().set_visible(False)#.set_ticks([])
    
    ax.set_ylim((rmin+rmin*0.1,rmax+rmax*0.15))

fig, ax = plt.subplots()
plot_representation(df,ax,nts=10,xspace=6,yspace=0.07)
plt.tight_layout()
plt.savefig('../plots/ipcc_representation/ipcc_rep_{}_all.pdf'.format(run_id),bbox_inches='tight')
plt.show()

In [6]:
rmin = -1.5
rmax = 1
lmin = int(1/np.exp(rmin)//1)*-1
lmax = int(np.exp(rmax)//1)

ytick_labels = [i for i in range(lmin,lmax) if i not in [-1,1]]
print(ytick_labels)
yticks = []
for x in ytick_labels:
    if x < 0:
        x = np.log(-1/x)
    elif x==0:
        x = 0
    else:
        x = np.log(x)
    yticks.append(x)
#yticks = [np.log(x) if x!=0 else 0 for x in ytick_labels ]
print(yticks)

[-4, -3, -2, 0]
[-1.3862943611198906, -1.0986122886681098, -0.6931471805599453, 0]


plt.rcParams["figure.figsize"] = [16,12]

stat = RunStats.objects.get(pk=run_id)

fig = plt.figure()


disp = pd.DataFrame(columns=['AR','MD','Rae','L-H'])
#fig, axs = plt.subplots(2,3,sharey=True)

for i, tp in enumerate(stat.periods.filter(n__lt=6)):
    if i == 0:
        ax = fig.add_subplot(2,3,i+1)
        ax1 = ax
    else:
        ax = fig.add_subplot(2,3,i+1,sharey=ax1)
    print(tp)
    tdts = TimeDTopic.objects.filter(dtopic__run_id=run_id,period=tp)
    
    df = pd.DataFrame.from_dict(
        list(tdts.values(
            'dtopic__title',
            'score',
            'share',
            'ipcc_score',
            'ipcc_share',
            #'ipcc_time_score'
        ))
    )
    
    df = df.rename(columns={'dtopic__title':'title'})
    
    df, df_disp  = calculate_deviations(df)
    df_disp['AR'] = tp.title   
    disp = disp.append(pd.DataFrame.from_dict([df_disp]))

    plot_representation(df, ax, 2.3,1)
    
    ax.set_title(tp.title)
    
    if tp.title=="AR5":
        ax = fig.add_subplot(2,3,6)
        
        disp = disp.set_index('AR')
        
        disp.plot(ax=ax)
        
        ax.set_title("Disproportionality")
    
    
    
    #plt.savefig('../plots/ipcc_representation/ipcc_rep_{}_{}.png'.format(run_id,tp.title),bbox_inches='tight')
    #plt.show()

plt.savefig('../plots/ipcc_representation/ipcc_rep_{}_ARS.png'.format(run_id),bbox_inches='tight')
plt.show()

disp.head()

In [7]:
tds = topics.filter(timedtopic__period__n__lt=6).values(
    'title','timedtopic__period__title','timedtopic__period__n','timedtopic__score','score'
).order_by('id','timedtopic__period__n')

tdf = pd.DataFrame.from_dict(list(tds))

#tdf['ys'] = tdf[]

tdf['share'] = tdf['timedtopic__score'] / tdf['score']

tdf['ys'] = tdf['timedtopic__period__n'] * tdf['share']


tdf.head(12)
#tdf.groupby('')

Unnamed: 0,score,timedtopic__period__n,timedtopic__period__title,timedtopic__score,title,share,ys
0,551.007169,1,AR1,0.210497,"{absorption, solvent, amine}",0.000382,0.000382
1,551.007169,2,AR2,1.329605,"{absorption, solvent, amine}",0.002413,0.004826
2,551.007169,3,AR3,8.701525,"{absorption, solvent, amine}",0.015792,0.047376
3,551.007169,4,AR4,11.694946,"{absorption, solvent, amine}",0.021225,0.084899
4,551.007169,5,AR5,115.116711,"{absorption, solvent, amine}",0.208921,1.044603
5,756.179748,1,AR1,0.17961,"{material, waste, concrete}",0.000238,0.000238
6,756.179748,2,AR2,2.74337,"{material, waste, concrete}",0.003628,0.007256
7,756.179748,3,AR3,14.185333,"{material, waste, concrete}",0.018759,0.056278
8,756.179748,4,AR4,27.380182,"{material, waste, concrete}",0.036209,0.144834
9,756.179748,5,AR5,204.953712,"{material, waste, concrete}",0.271038,1.355192


In [8]:
means = tdf.groupby('title')['ys'].mean()

means = pd.DataFrame({'ys' : tdf.groupby('title')['ys'].mean()}).reset_index()

mdf = df.merge(means)

mdf.head()

means.sort_values('ys',ascending=False).head()



Unnamed: 0,title,ys
84,"{specie, community, distribution}",0.664012
83,"{solar, radiation, cycle}",0.629354
56,"{ozone, stratospheric, tropospheric}",0.564229
24,"{delta, isotope, value}",0.552021
68,"{record, holocene, cal}",0.539743


In [9]:
for name, group in tdf.groupby('title'):
    l = []
    for index, y in group.iterrows():
        print(y.share)
        l = l + [y.timedtopic__period__n] * round(y.share*100)
        
    print(l)
    
    print(np.mean(l))
    
    mdf.loc[mdf['title']==name]['av_year'] = np.mean(l)
    
    break

0.0003820214495843115
0.002413044655095269
0.015792035624465243
0.021224670710399733
0.20892053250460182
[3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
4.76


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [10]:
def year_av(x):
    group = tdf[tdf['title']==x['title']]
    l = []
    for index, y in group.iterrows():
        l = l + [y.timedtopic__period__n] * round(y.share*100)
    return np.mean(l)

mdf['year_av'] = df.apply(year_av,axis=1)

mdf.head()

Unnamed: 0,ipcc_coverage,ipcc_score,ipcc_share,ipcc_time_score,primary_wg,score,share,title,deviation,abs_md,representation,ys,year_av
0,0.171033,43.05518,0.007402,251.735971,2,545.019761,0.006378,"{drought, index, severe}",0.001025,0.001025,1.160669,0.409082,4.6
1,0.188841,15.690584,0.002698,83.088751,2,480.721366,0.002105,"{household, farmer, farm}",0.000593,0.000593,1.28152,0.161454,4.9375
2,0.091848,22.612274,0.003888,246.193343,2,582.905857,0.006237,"{fish, fishery, marine}",-0.00235,0.00235,0.623299,0.404555,4.902439
3,0.043934,4.468382,0.000768,101.707543,1,338.460305,0.002577,"{membrane, separation, selectivity}",-0.001808,0.001808,0.298144,0.289349,4.866667
4,0.038956,14.536417,0.002499,373.149456,1,837.086135,0.009454,"{reaction, catalyst, sorbent}",-0.006954,0.006954,0.264364,0.425139,4.840909


In [11]:
x = 'year_av'
y = 'representation'

fig, ax = plt.subplots()

cmap = {1: "#66c2a5", 2: "#fc8d62" , 3: "#8da0cb"}
colors = [cmap[i] for i in mdf['primary_wg']]


mdf.plot.scatter(
    x,y,s=mdf['score']*0.03,
    c = colors,ax=ax
)

z = np.polyfit(x=mdf[x], y=mdf[y], deg=1)
p = np.poly1d(z)
mdf['trendline'] = p(mdf.loc[:, x])

plt.plot(mdf[x],mdf['trendline'])

qs = 5

mdf['x_q'] = pd.qcut(mdf[x], qs, labels=False)
mdf['y_q'] = pd.qcut(mdf[y], qs, labels=False)
texts = []


#for i, x in mdf[mdf['year_av']> 5.42].iterrows():
for i, row in mdf[(mdf['x_q'].isin([0,qs-1])) & (mdf['y_q'].isin([0,qs-1]))].iterrows():
    texts.append(plt.text(row[x],row[y],row['title'],ha='center', va='center'))
    
adjust_text(texts, 
            arrowprops=dict(arrowstyle="-", color='black', lw=0.5))

plt.axhline(1,c="grey")
plt.axvline(mdf[x].median(),c="grey")



# Put labels on the quadrants
# x
d1 = mdf[x].median()-plt.xlim()[0]
d2 = mdf[x].median()-plt.xlim()[1]

xlabpoints = [mdf[x].median()-d1/2,mdf[x].median()-d2/2]
xrange = mdf[x].max()-mdf[x].min()
xlabpoints = [mdf[x].min()+xrange*0.15,mdf[x].min()+xrange*0.85]
labels = [
    ['Under-represented,\nolder topics','Under-represented,\nnewer topics'],
    ['Over-represented,\nolder topics','Over-represented,\nnewer topics']
]

pylims = plt.ylim()

tpad = (pylims[1]-pylims[0])*0.13


ops = [operator.lt,operator.gt]


for i in [0,1]:
    for j in [0,1]:
        if j==0:
            tpadx=tpad*-1
        else:
            tpadx=tpad*0.75
            
        q = mdf[(ops[i](mdf[x],mdf[x].median())) & (ops[j](mdf[y],1))]
        qshare = q['score'].sum()/mdf['score'].sum()
        plt.text(
            xlabpoints[i],
            pylims[j]+tpadx,
            labels[j][i] + " {:.0%}".format(qshare),
            va="center",ha="center",
            bbox={'facecolor':'red', 'alpha':qshare*2-0.25, 'pad':10},
            fontsize=7
        )

        

#ax.get_yaxis().set_visible(False)
#ax.tick_params(axis=u'both', which=u'both',length=0)
#ax2.tick_params(axis=u'both', which=u'both',length=0)
plt.xlabel('Assessment period\noccurence')
plt.ylabel('Representation')

plt.tight_layout(h_pad=18)

fig.patch.set_facecolor('#f0f0f0')    
#plt.tight_layout()

plt.savefig(
    '../plots/ipcc_representation/ipcc_rep_new{}_all.pdf'.format(run_id),
    bbox_inches='tight',
    facecolor=fig.get_facecolor(),
    pad_inches=0.2
)

plt.show()

In [12]:
plt.close()

In [13]:
mdf.sort_values('year_av').head(15)

Unnamed: 0,ipcc_coverage,ipcc_score,ipcc_share,ipcc_time_score,primary_wg,score,share,title,deviation,abs_md,representation,ys,year_av,trendline,x_q,y_q
28,0.15204,47.712085,0.008203,313.812664,2,307.807518,0.00795,"{specie, community, distribution}",0.000253,0.000253,1.031778,0.664012,3.397959,1.386148,0,3
10,0.055043,22.050028,0.003791,400.59424,2,636.465243,0.010149,"{leaf, photosynthetic, photosynthesis}",-0.006358,0.006358,0.373536,0.520974,4.193548,1.093805,0,0
59,0.093869,26.48893,0.004554,282.189169,1,539.99404,0.007149,"{ch4, oxidation, uptake}",-0.002595,0.002595,0.637019,0.43134,4.215686,1.085671,0,1
62,0.287525,115.086966,0.019786,400.26715,1,534.824203,0.010141,"{ozone, stratospheric, tropospheric}",0.009646,0.009646,1.951213,0.564229,4.223881,1.08266,0,4
79,0.302665,109.726271,0.018865,362.533631,1,544.623473,0.009185,"{cloud, feedback, radiative}",0.00968,0.00968,2.053954,0.510667,4.229508,1.080592,0,4
6,0.049103,12.406336,0.002133,252.659603,2,422.463804,0.006401,"{root, fine, shoot}",-0.004268,0.004268,0.333224,0.503083,4.237288,1.077733,0,0
19,0.058937,14.525168,0.002497,246.452329,1,451.888276,0.006244,"{n2o, denitrification, nitrous-oxide}",-0.003746,0.003746,0.39996,0.456248,4.240741,1.076464,0,0
37,0.114995,41.774649,0.007182,363.272442,1,605.525292,0.009203,"{methane, concentration, oxidation}",-0.002021,0.002021,0.780385,0.492041,4.258621,1.069894,0,2
24,0.122154,24.089763,0.004142,197.208792,2,352.273767,0.004996,"{rice, paddy, field}",-0.000855,0.000855,0.828962,0.466521,4.290909,1.05803,0,2
42,0.267495,137.001454,0.023554,512.164849,1,764.521203,0.012975,"{ocean, circulation, deep}",0.010579,0.010579,1.815281,0.521795,4.295082,1.056496,0,4


In [14]:
mdf[mdf[y].isna()]

Unnamed: 0,ipcc_coverage,ipcc_score,ipcc_share,ipcc_time_score,primary_wg,score,share,title,deviation,abs_md,representation,ys,year_av,trendline,x_q,y_q
