In [12]:
import os, sys, time, resource, re, gc, shutil

mp_pdf = True
if mp_pdf:
    extension = "pdf"
    import matplotlib
    matplotlib.use('pgf')
    pgf_with_latex = {
        "text.usetex": True,            # use LaTeX to write all text
        "pgf.rcfonts": False,           # Ignore Matplotlibrc
        "text.latex.unicode": True,
        "pgf.preamble": [
            #r"\usepackage[utf8x]{inputenc}",
            r"\usepackage{xcolor}"
        ],
        "pgf.texsystem" : "xelatex",
        "figure.figsize": [12,7]
    }
    matplotlib.rcParams.update(pgf_with_latex)
else:
    extension = "png"
    
import operator
from multiprocess import Pool
from functools import partial
from urllib.parse import urlparse, parse_qsl
from django.db.models import Count

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import django
from adjustText import adjust_text

sys.path.append('/home/galm/software/django/tmv/BasicBrowser/')
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "BasicBrowser.settings")
django.setup()

from scoping.models import *
from tmv_app.models import *
from django.db.models import F, Sum, Count
from matplotlib import gridspec

from utils.utils import *
from copy import deepcopy


from run_id import run_id

run_id = 1056

run_id = 1429

run_id = 1771
run_id = 1810
run_id = 1817

runstat = RunStats.objects.get(pk=run_id)

if runstat.method=="DT":
    topics = DynamicTopic.objects.filter(run_id=run_id)
else:
    topics = Topic.objects.filter(run_id=run_id)

In [2]:
if runstat.method=="DT":
    
    tsums = topics.aggregate(
        ip_score=Sum('ipcc_score'),
        score=Sum('ipcc_time_score')
    )
    tsums
    
    topics.update(
        ipcc_share=F('ipcc_score')/tsums['ip_score'],
        share=F('ipcc_time_score')/tsums['score']
    )
    

In [3]:
def calculate_deviations(df):
    df['deviation'] = df['ipcc_share'] - df['share']
    df['abs_md'] = abs(df['deviation'])

    md = df['deviation'].max()
    rae = df['abs_md'].mean()
    lh = df['abs_md'].sum() / 2

    df['representation'] = df['ipcc_share'] / df['share']  
    
    df_disp = {'MD':md,'Rae':rae,'L-H':lh}
    return [df,df_disp]


In [4]:


if runstat.method=="DT":
    df = pd.DataFrame.from_dict(
        list(topics.values(
            'title',
            'score',
            'ipcc_coverage',
            'share',
            'ipcc_score',
            'ipcc_share',
            'ipcc_time_score',
            'primary_wg'
        ))
    )
else:
    df = pd.DataFrame.from_dict(
        list(topics.values(
            'title',
            'score',
            'ipcc_coverage',
            'share',
            'ipcc_score',
            'ipcc_share',
            'primary_wg'
        ))
    )
    
    
    

df, df_disp  = calculate_deviations(df)


df.sort_values('representation').head()

Unnamed: 0,ipcc_coverage,ipcc_score,ipcc_share,primary_wg,score,share,title,deviation,abs_md,representation
80,0.017321,1.506018,0.000428,1,240.982405,0.004735,"{membrane, separation, performance}",-0.004307,0.004307,0.090396
13,0.018144,2.239638,0.000637,3,242.60493,0.004767,"{hydrogen, cell, steam}",-0.00413,0.00413,0.13353
87,0.036192,4.989478,0.001418,1,326.482114,0.006415,"{engine, diesel, combustion}",-0.004997,0.004997,0.221054
30,0.031483,4.460379,0.001268,3,282.225417,0.005545,"{coal, combustion, gasification}",-0.004278,0.004278,0.228601
68,0.03635,5.365833,0.001525,3,325.977451,0.006405,"{waste, landfill, treatment}",-0.00488,0.00488,0.238096


In [5]:

from matplotlib import ticker


plt.rcParams["figure.figsize"] = [7.2,5]

def plot_representation(df, ax, nts=5, xspace=3,yspace = 0.15, fmin=None, fmax=None):
    
    md = df['deviation'].max()
    rae = df['abs_md'].mean()
    lh = df['abs_md'].sum() / 2

    pdf = df.sort_values('representation',ascending=False).reset_index()#.set_index('title')

    pdf = pdf[pdf['share'] > 0.001]

    pdf['lrep'] = np.log(pdf['representation'])
    #pdf['lrep'] = pdf['representation']
    pdf.set_index('title')['lrep'].plot(kind="bar",ax=ax,color="grey")

    i = 0

    rmax = pdf['lrep'].max()
    if fmax:
        rmax = fmax
    rmin = pdf['lrep'].min()
    if fmin:
        rmin = -fmin
    
    for index, row in pdf.head(nts).iterrows():
        i+=1
        s = round(row['ipcc_share']*100,1)
        v = round(row['share']*100,1)
        ax.annotate(
            s="{} [{},{}]".format(row['title'],s,v),
            xy=(-0.5+i,row['lrep']),
            xytext=((xspace-3)*-1+i*xspace,rmax-rmax*yspace*(i-1)),
            arrowprops=dict(
                facecolor='black', 
                #shrink=0.05,
                width=0.1,
                headwidth=0.2
            ),
            ha="left",
            va="bottom"
        )

    i = 0
    for index, row in pdf.sort_values('representation').head(nts).iterrows():
        i+=1
        s = round(row['ipcc_share']*100,1)
        v = round(row['share']*100,1)
        ax.annotate(
            s="{} [{},{}]".format(row['title'],s,v),
            xy=(len(pdf)-0.5-i,row['lrep']),
            xytext=(len(pdf)+xspace-3-i*xspace,rmin-rmin*yspace*(i-1)),
            arrowprops=dict(
                facecolor='black', 
                #shrink=0.05,
                width=0.1,
                headwidth=0.2
            ),
            ha="right",
            va="top"
        )
        
    #ax.set_yscale('log')
    
    
    if rmin > -1:
        rmin = -1
    if np.exp(rmax) < 2:
        rmax = np.log(2.1)
    
    lmin = int(1/np.exp(rmin)//1)*-1
    lmax = int(np.exp(rmax)//1)
    


    ytick_labels = [i for i in range(lmin,lmax+1) if i not in [-1,1]]
    yticks = []
    for x in ytick_labels:
        if x < 0:
            y = np.log(-1/x)
        elif x==0:
            y = 0
        else:
            y = np.log(x)
        yticks.append(y)
    
    ax.set_yticks(yticks)
    ax.set_yticklabels(ytick_labels)
    
    #ax.yaxis.set_major_formatter(ticker.FormatStrFormatter("%.1f"))
        
    rmax = pdf['lrep'].max()
    rmin = pdf['lrep'].min()
    
    ax.text(
        len(pdf)*0.066,-0.2,
        "MD: {:.3f}\nRae: {:.3f}\nL-H: {:.3f}".format(md,rae,lh),
        va="top",
        ha="left",
        bbox={'facecolor':'red', 'alpha':0.3,'pad':10}
   )

    #ax.text(1,-1*0.2,"MD: {:.3f}".format(md))   
    #ax.text(1,-1*0.4,"Rae: {:.3f}".format(rae))   
    #ax.text(1,-1*0.6,"L-H: {:.3f}".format(lh))   
    
    ax.get_xaxis().set_visible(False)#.set_ticks([])
    
    ax.set_ylim((rmin+rmin*0.1,rmax+rmax*0.15))

fig, ax = plt.subplots()
plot_representation(df,ax,nts=10,xspace=6,yspace=0.07)
plt.tight_layout()
if mp_pdf:
    plt.savefig('../plots/ipcc_representation/ipcc_rep_{}_all.pdf'.format(run_id),bbox_inches='tight')
else:
    plt.savefig('../plots/ipcc_representation/ipcc_rep_{}_all.png'.format(run_id),bbox_inches='tight')
plt.show()

In [6]:
fig, axs = plt.subplots(3, figsize=(12,24))
for i, (name, group) in enumerate(df.groupby('primary_wg')):
    ax = axs[i]
    plot_representation(group,ax,nts=10,xspace=2,yspace=0.07)
    
plt.tight_layout()
    #plt.savefig(f'../plots/ipcc_representation/ipcc_rep_{run_id}_wg_{}.pdf',bbox_inches='tight')

In [7]:
rmin = -1.5
rmax = 1
lmin = int(1/np.exp(rmin)//1)*-1
lmax = int(np.exp(rmax)//1)

ytick_labels = [i for i in range(lmin,lmax) if i not in [-1,1]]
print(ytick_labels)
yticks = []
for x in ytick_labels:
    if x < 0:
        x = np.log(-1/x)
    elif x==0:
        x = 0
    else:
        x = np.log(x)
    yticks.append(x)
#yticks = [np.log(x) if x!=0 else 0 for x in ytick_labels ]
print(yticks)

[-4, -3, -2, 0]
[-1.3862943611198906, -1.0986122886681098, -0.6931471805599453, 0]


In [8]:
import pandas as pd
if runstat.method=="DT":

    tds = topics.filter(timedtopic__period__n__lt=6).values(
        'title','timedtopic__period__title','timedtopic__period__n','timedtopic__score','score'
    ).order_by('id','timedtopic__period__n')

    tdf = pd.DataFrame.from_dict(list(tds))

    #tdf['ys'] = tdf[]

    tdf['share'] = tdf['timedtopic__score'] / tdf['score']

    tdf['ys'] = tdf['timedtopic__period__n'] * tdf['share']


    tdf.head(12)
    #tdf.groupby('')
else:
    topic_period_scores = []
    periods = TimePeriod.objects.filter(title__regex="^AR",n__lt=6,n__gt=0).distinct('title')
    ttps = TopicTimePeriodScores.objects.filter(topic__run_id=run_id,score__isnull=False)
    if ttps.count() < (topics.count() * periods.count()-2): 
        print("calculating topictimeperiodscores")
        for p in TimePeriod.objects.filter(title__regex="^AR",n__lt=6,n__gt=0).distinct('title'):
            pdts = DocTopic.objects.filter(
                topic__run_id=run_id,
                doc__PY__in=p.ys
            ).values(
                'topic__title','topic__score'
            ).annotate(
                timedtopic__score = Sum('score')
            )
            for pdt in list(pdts):
                pdt['timedtopic__period__n'] = p.n
                t = Topic.objects.get(run_id=run_id,title=pdt['topic__title'])
                ttps, created = TopicTimePeriodScores.objects.get_or_create(
                    topic=t,
                    period=p
                )
                ttps.score = pdt['timedtopic__score']
                ttps.save()
                topic_period_scores.append(pdt)
        tdf = pd.DataFrame.from_dict(topic_period_scores)
        tdf['share'] = tdf['timedtopic__score'] / tdf['topic__score']

        tdf['ys'] = tdf['timedtopic__period__n'] * tdf['share']     
    else:
        print("getting topictimeperiodscores")
        tds = topics.filter(topictimeperiodscores__period__n__lt=6).values(
            'title','topictimeperiodscores__period__title',
            'topictimeperiodscores__period__n','topictimeperiodscores__score','score'
        ).order_by('id','topictimeperiodscores__period__n')   
        
        tdf = pd.DataFrame.from_dict(list(tds))
            
        tdf['share'] = tdf['topictimeperiodscores__score'] / tdf['score']

        tdf['ys'] = tdf['topictimeperiodscores__period__n'] * tdf['share']
    tdf = tdf.rename(columns={
        "topic__title":"title",
        "topictimeperiodscores__period__n":"timedtopic__period__n"
    }).dropna().reset_index(drop=True)
    
tdf.head()

calculating topictimeperiodscores


Unnamed: 0,timedtopic__period__n,timedtopic__score,topic__score,title,share,ys
0,1,0.005783,431.169955,"{adaptation, vulnerability, farmer}",1.3e-05,1.3e-05
1,1,0.049895,386.448988,"{adsorption, capacity, adsorbent}",0.000129,0.000129
2,1,0.061564,314.498023,"{aerosol, forcing, radiative}",0.000196,0.000196
3,1,0.112939,444.522727,"{air, pollution, quality}",0.000254,0.000254
4,1,0.176755,343.495484,"{arctic, permafrost, warm}",0.000515,0.000515


In [9]:
means = tdf.groupby('title')['ys'].mean()

means = pd.DataFrame({'ys' : tdf.groupby('title')['ys'].mean()}).reset_index()

mdf = df.merge(means)

mdf.head()

means.sort_values('ys',ascending=False).head()



Unnamed: 0,title,ys
55,"{ice, sheet, sea}",0.510694
67,"{ozone, stratospheric, tropospheric}",0.504946
77,"{record, glacial, late}",0.503512
10,"{cal, pollen, holocene}",0.497324
65,"{ocean, surface, deep}",0.497109


In [10]:
def year_av(x):
    group = tdf[tdf['title']==x['title']]
    l = []
    for index, y in group.iterrows():
        l = l + [y.timedtopic__period__n] * round(y.share*100)
    return np.mean(l)

mdf['year_av'] = df.apply(year_av,axis=1)

print(mdf[pd.isna(mdf['year_av'])].shape)

mdf.head()

(0, 12)


Unnamed: 0,ipcc_coverage,ipcc_score,ipcc_share,primary_wg,score,share,title,deviation,abs_md,representation,ys,year_av
0,0.066673,19.255231,0.005472,2,554.874805,0.010902,"{plant, seed, stress}",-0.00543,0.00543,0.501945,0.435662,4.4
1,0.11662,33.604496,0.00955,1,457.805413,0.008995,"{flux, measurement, surface}",0.000555,0.000555,1.061741,0.495118,4.258621
2,0.147173,18.484558,0.005253,2,366.670462,0.007204,"{urban, city, urbanization}",-0.001951,0.001951,0.729181,0.311181,4.757576
3,0.14254,24.011693,0.006824,1,325.479917,0.006395,"{wind, speed, turbine}",0.000429,0.000429,1.067089,0.434953,4.520833
4,0.150288,55.688689,0.015827,2,742.046488,0.01458,"{effect, treatment, response}",0.001247,0.001247,1.08552,0.407993,4.456522


In [11]:
x = 'year_av'
y = 'representation'
y = 'lrep'

mdf['lrep'] = np.log(mdf['representation'])

fig, ax = plt.subplots()

cmap = {1: "#66c2a5", 2: "#fc8d62" , 3: "#8da0cb"}
colors = [cmap[i] for i in mdf['primary_wg']]


mdf.plot.scatter(
    x,y,s=mdf['score']*0.03,
    c = colors,ax=ax
)


# Fit a trendline
#z = np.polyfit(x=mdf[x], y=mdf[y], deg=1)
#p = np.poly1d(z)
#mdf['trendline'] = p(mdf.loc[:, x])
#plt.plot(mdf[x],mdf['trendline'])


# Quantile x and y
qs = 6
mdf['x_q'] = pd.qcut(mdf[x], qs, labels=False)
mdf['y_q'] = pd.qcut(mdf[y], qs, labels=False)
texts = []

# write texts for dots in outer quantiles
for i, row in mdf[(mdf['x_q'].isin([0,qs-1])) & (mdf['y_q'].isin([0,qs-1]))].iterrows(): 
    texts.append(plt.text(row[x],row[y],row['title'],ha='center', va='center'))
    
# Tweak the labels to avoid overlap    
adjust_text(
    texts, 
    arrowprops=dict(arrowstyle="-", color='black', lw=0.5)
)


# Split the graph into quadrants
ymid = 0
plt.axhline(ymid,c="grey")
plt.axvline(mdf[x].median(),c="grey")



# Put labels on the quadrants
# x
d1 = mdf[x].median()-plt.xlim()[0]
d2 = mdf[x].median()-plt.xlim()[1]

xlabpoints = [mdf[x].median()-d1/2,mdf[x].median()-d2/2]
xrange = mdf[x].max()-mdf[x].min()
xlabpoints = [mdf[x].min()+xrange*0.15,mdf[x].min()+xrange*0.85]
labels = [
    ['Under-represented,\nolder topics','Under-represented,\nnewer topics'],
    ['Over-represented,\nolder topics','Over-represented,\nnewer topics']
]

pylims = plt.ylim()

tpad = (pylims[1]-pylims[0])*0.13


ops = [operator.lt,operator.gt]


for i in [0,1]:
    for j in [0,1]:
        if j==0:
            tpadx=tpad*-1
        else:
            tpadx=tpad*0.66

            
        q = mdf[(ops[i](mdf[x],mdf[x].median())) & (ops[j](mdf[y],ymid))]
        qshare = q['score'].sum()/mdf['score'].sum()
        plt.text(
            xlabpoints[i],
            pylims[j]+tpadx,
            labels[j][i] + " {:.0%}".format(qshare),
            va="center",ha="center",
            bbox={'facecolor':'red', 'alpha':qshare*2-0.25, 'pad':6},
            fontsize=7
        )

        
fmax = None
fmin = None
rmax = mdf['lrep'].max()
if fmax:
    rmax = fmax
rmin = mdf['lrep'].min()
if fmin:
    rmin = -fmin
    
if rmin > -1:
    rmin = -1
if np.exp(rmax) < 2:
    rmax = np.log(2.1)

lmin = int(1/np.exp(rmin)//1)*-1
lmax = int(np.exp(rmax)//1)



ytick_labels = [i for i in range(lmin,lmax+1) if i not in [-1,1]]
yticks = []
for x in ytick_labels:
    if x < 0:
        y = np.log(-1/x)
    elif x==0:
        y = 0
    else:
        y = np.log(x)
    yticks.append(y)
    
ax.set_yticks(yticks)
ax.set_yticklabels(ytick_labels)

#ax.get_yaxis().set_visible(False)
#ax.tick_params(axis=u'both', which=u'both',length=0)
#ax2.tick_params(axis=u'both', which=u'both',length=0)
plt.xlabel('Assessment period\noccurence')
plt.ylabel('Representation')

plt.tight_layout(h_pad=100)

fig.patch.set_facecolor('#f0f0f0')    


plt.savefig(
    '../plots/ipcc_representation/ipcc_rep_new{}_all.{}'.format(run_id,extension),
    bbox_inches='tight',
    facecolor=fig.get_facecolor(),
    pad_inches=0.2
)

plt.show()

ValueError: RGBA values should be within 0-1 range

In [None]:
mdf.to_csv('../tables/newness_representation.csv',index=False)

In [None]:
for name, group in mdf.groupby('primary_wg'):
    group = group.sort_values('lrep', ascending=False).reset_index(drop=True)
    fig, ax = plt.subplots(figsize=(12,8))
    
    #plot_representation(group,ax,nts=10,xspace=2,yspace=0.07)
    plt.xticks(rotation=45,ha="right")
    ax.bar(group.index, group['lrep'], color=cmap[group['primary_wg'][0]])
    ax.set_xticks(group.index)
    ax.set_xticklabels(group['title'])
    fig.patch.set_facecolor('#f0f0f0') 
    
    plt.savefig(
        '../plots/ipcc_representation/ipcc_rep_wg_{}_{}_all.{}'.format(name,run_id,extension),
        bbox_inches='tight',
        facecolor=fig.get_facecolor(),
        pad_inches=0.2
    )
    
    plt.show()
    

In [None]:
plt.close()

In [None]:
mdf.sort_values('year_av', ascending=False).tail(15)

In [None]:
mdf = mdf.sort_values(['primary_wg','year_av'],ascending=False).reset_index(drop=True)

mdf.head()

mdf[['ipcc_share','share','representation','primary_wg','title','year_av']].to_latex(
    '../tables/ipcc_shares_time.tex',
    index_names=False,
)

In [None]:
q = runstat.query
colors = ['#7fc97f','#beaed4','#fdc086','#ffff99','#386cb0','#f0027f','#F0F0F0']

oecd_cats = [
    {'wc__oecd': 'Agricultural Sciences'},
    {'wc__oecd': 'Engineering and Technology'},
    {'wc__oecd': 'Humanities'},
    {'wc__oecd': 'Medical and Health Sciences'},
    {'wc__oecd': 'Natural Sciences'},
    {'wc__oecd': 'Social Sciences'},
    #{'wc__oecd': None}
]

d_count = Doc.objects.filter(query=q,PY__lte=AR.objects.get(ar=5).end).count()

ip_d_count = Doc.objects.filter(query=q, ipccref__isnull=False).count()

oecd_df = pd.DataFrame()
for i,c in enumerate(oecd_cats):
    ds = Doc.objects.filter(
        query=q,
        wc__oecd=c['wc__oecd'],
        PY__lte=AR.objects.get(ar=5).end
    )
    ipds = ds.filter(ipccref__isnull=False)
    c['share'] = ds.count() / d_count
    c['ipcc_share'] = ipds.count() / ip_d_count
    c['color'] = colors[i]
    
oecd_df = pd.DataFrame.from_dict(oecd_cats)

oecd_df.head()

In [None]:
oecd_df['representation'] = oecd_df['ipcc_share'] / oecd_df['share']

oecd_df['lrep'] = np.log(oecd_df['representation'])

oecd_df = oecd_df.sort_values('representation', ascending=False).reset_index(drop=True)

fig, ax = plt.subplots(figsize=(8,8))

ax.bar(
    oecd_df.index, 
    oecd_df['lrep'],
    color=oecd_df['color'],
    alpha=0.7,
    linewidth=1.5,
    edgecolor="grey"
)

ax.set_xticks(oecd_df.index)
ax.set_xticklabels(oecd_df['wc__oecd'])

ax.set_xlabel('Category')
ax.set_ylabel('Log representation')

ax.axhline(0,c="grey")

ax.set_title("Disciplinary representation in IPCC")

plt.xticks(rotation=45,ha="right")

fig.patch.set_facecolor('#f0f0f0')    


In [None]:
q = runstat.query
colors = ['#7fc97f','#beaed4','#fdc086','#ffff99','#386cb0','#f0027f','#F0F0F0']
oecd_cats = [
    {'wc__oecd': 'Agricultural Sciences'},
    {'wc__oecd': 'Engineering and Technology'},
    {'wc__oecd': 'Humanities'},
    {'wc__oecd': 'Medical and Health Sciences'},
    {'wc__oecd': 'Natural Sciences'},
    {'wc__oecd': 'Social Sciences'},
    {'wc__oecd': None}
]



oecd_df = pd.DataFrame()

ar_oe_cats = []

all_docs_count =  Doc.objects.filter(query=q).count()

for ar in AR.objects.filter(ar__gt=0,ar__lt=6).order_by('ar'):
    print(ar)
    ys = range(ar.start,ar.end+1)
    d_count = Doc.objects.filter(query=q, PY__in=ys).count()
    ip_d_count = Doc.objects.filter(query=q, PY__in=ys, ipccref__isnull=False).count()
    for i,c in enumerate(oecd_cats):
        d = {"wc__oecd": c["wc__oecd"]}
        ds = Doc.objects.filter(
            query=q,
            wc__oecd=c['wc__oecd'],
            PY__in=ys,
            PY__lte=AR.objects.get(ar=5).end
        )
        ipds = ds.filter(ipccref__isnull=False)
        all_ars_share = Doc.objects.filter(query=q,wc__oecd=c['wc__oecd']).count() / all_docs_count
        d['ar'] = ar.ar
        d['share'] = ds.count() / d_count
        d['ipcc_share'] = ipds.count() / ip_d_count
        d['color'] = colors[i]
        d['total_share'] = all_ars_share
        
        ar_oe_cats.append(d)

oecd_df_ys = pd.DataFrame.from_dict(ar_oe_cats)

oecd_df_ys.head()

In [None]:
oecd_df_ys['representation'] = oecd_df_ys['ipcc_share'] / oecd_df_ys['share']

oecd_df_ys.sort_values('representation').head(15)

In [None]:
fig, ax = plt.subplots(figsize=(8,8))

oecd_df_ys['lrep'] = np.log(oecd_df_ys['representation'])

fdf = oecd_df_ys

fdf['ar_share_total'] = fdf.groupby('ar')['share'].transform('sum')

for name, group in fdf.groupby('wc__oecd'):
    color = group.reset_index()['color'][0]
    group['s_share'] = group['share'] / group['ar_share_total']
    ax.plot(
        group['ar'],
        group['lrep'],
        label='_nolegend_',
        lw=2,
        color="grey"
    )
    ax.plot(
        group['ar'],
        group['lrep'],
        #label=name,
        label='_nolegend_',
        color=color
    )
    ax.scatter(
        group['ar'],
        group['lrep'],
        label=name,
        color=color,
        s = 150*group['s_share'],
        edgecolor="grey"
    )
    
    ax.set_xticks([1,2,3,4,5])
    
ax.axhline(0,c="grey")
    
plt.legend()


ax.set_xlabel('Assessment Period')
ax.set_ylabel('Log representation')

fig.patch.set_facecolor('#f0f0f0')    

plt.savefig(
    '../plots/ipcc_representation/ipcc_rep_oecds_time.{}'.format(extension),
    bbox_inches='tight',
    facecolor=fig.get_facecolor(),
    pad_inches=0.2
)


In [None]:
cats = list(WC.objects.distinct('oecd_fos_text').values('oecd_fos_text','oecd_fos','oecd'))
cats

In [None]:
q = runstat.query
colors = ['#386cb0','#beaed4','#ffff99','#7fc97f','#f0027f','#fdc086','#F0F0F0']

d_count = Doc.objects.filter(query=q,PY__lte=AR.objects.get(ar=5).end).count()

ip_d_count = Doc.objects.filter(query=q, ipccref__isnull=False).count()

oecd_df = pd.DataFrame()
for i,c in enumerate(cats):
    ds = Doc.objects.filter(
        query=q,
        wc__oecd_fos_text=c['oecd_fos_text'],
        PY__lte=AR.objects.get(ar=5).end
    )
    ipds = ds.filter(ipccref__isnull=False)
    c['share'] = ds.count() / d_count
    c['ipcc_share'] = ipds.count() / ip_d_count
    c['color'] = colors[int(c['oecd_fos'])-1]
    
    
cat_df = pd.DataFrame.from_dict(cats)

cat_df.head()

In [None]:
cat_df['representation'] = cat_df['ipcc_share'] / cat_df['share']

cat_df['lrep'] = np.log(cat_df['representation'])

cat_df = cat_df.sort_values('representation', ascending=False).reset_index(drop=True)

fig, ax = plt.subplots(figsize=(8,8))

ax.bar(
    cat_df.index, 
    cat_df['lrep'],
    alpha=0.7,
    linewidth=1.5,
    edgecolor="grey",
    color=cat_df['color']
)

ax.set_xticks(cat_df.index)
ax.set_xticklabels(cat_df['oecd_fos_text'])

ax.set_xlabel('Category')
ax.set_ylabel('Log representation')

ax.axhline(0,c="grey")

ax.set_title("Disciplinary representation in IPCC")

plt.xticks(rotation=45,ha="right")

fig.patch.set_facecolor('#f0f0f0')    


In [None]:
fig = plt.figure(figsize=(12,16))

ax = None

for i, (name, group) in enumerate(cat_df.groupby('oecd')):
    if ax:
        ax = fig.add_subplot(3,2,i+1, sharey=ax)
    else:
        ax = fig.add_subplot(3,2,i+1)
    group = group.reset_index(drop=True)
    ax.bar(
        group.index, 
        group['lrep'],
        alpha=0.7,
        linewidth=1.5,
        edgecolor="grey",
        color=group['color']
    )

    ax.set_xticks(group.index)
    ax.set_xticklabels(group['oecd_fos_text'])
    

    ax.set_xlabel('Category')
    ax.set_ylabel('Log representation')

    ax.axhline(0,c="grey")

    ax.set_title("Disciplinary representation in IPCC")

    plt.xticks()

    fig.patch.set_facecolor('#f0f0f0')    

    plt.setp(ax.get_xticklabels(), rotation=30, ha="right")
fig.tight_layout()


plt.savefig(
    f'../plots/ipcc_representation/ipcc_subcats_{run_id}.{extension}',
    bbox_inches='tight',
    facecolor=fig.get_facecolor(),
    pad_inches=0.2
)

In [None]:
q = runstat.query

cat_df = pd.DataFrame()

ar_cats = []

for ar in AR.objects.filter(ar__gt=0,ar__lt=6).order_by('ar'):
    print(ar)
    ys = range(ar.start,ar.end+1)
    d_count = Doc.objects.filter(query=q, PY__in=ys).count()
    ip_d_count = Doc.objects.filter(query=q, PY__in=ys, ipccref__isnull=False).count()
    for i,c in enumerate(cats):
        d = {"oecd_fos_text": c["oecd_fos_text"]}
        ds = Doc.objects.filter(
            query=q,
            wc__oecd_fos_text=c['oecd_fos_text'],
            PY__in=ys,
            PY__lte=AR.objects.get(ar=5).end
        )
        ipds = ds.filter(ipccref__isnull=False)
        d['oecd'] = c['oecd']
        d['ar'] = ar.ar
        d['share'] = ds.count() / d_count
        d['ipcc_share'] = ipds.count() / ip_d_count
        d['color'] = colors[int(c['oecd_fos'])-1]
        
        
        ar_cats.append(d)

cat_df_ys = pd.DataFrame.from_dict(ar_cats)

cat_df_ys.head()

In [None]:
cat_df_ys['representation'] = cat_df_ys['ipcc_share'] / cat_df_ys['share']

cat_df_ys.sort_values('representation').head(15)

In [None]:
fdf['share'].sum()

In [None]:

cat_df_ys['lrep'] = np.log(cat_df_ys['representation'])


for dis, fdf in cat_df_ys.groupby('oecd'):
    fig, ax = plt.subplots(figsize=(8,8))
    texts = []
    fdf['ar_share_total'] = fdf.groupby('ar')['share'].transform('sum')
    
    for name, group in fdf.groupby('oecd_fos_text'):
        group['s_share'] = group['share'] / group['ar_share_total']
        group = group[np.isfinite(group['lrep'])].sort_values('ar').reset_index(drop=True)
        
        if len(group) == 0:
            print("empty group")
            continue

        color = group.reset_index()['color'][0]
        ax.plot(
            group['ar'],
            group['lrep'],
            label='_nolegend_',
            lw=2,
            color="grey"
        )
        ax.plot(
            group['ar'],
            group['lrep'],
            label=name,
            color=color
        )
        ax.scatter(
            group['ar'],
            group['lrep'],
            label=name,
            color=color,
            s = 150*group['s_share'],
            edgecolor="grey"
        )
        x = group['ar'][0] 
        y = group['lrep'][0]
        if np.isfinite(y):
            texts.append(
            ax.text(
                x,y,
                name
            )
            )

    adjust_text(
        texts, 
        arrowprops=dict(arrowstyle="-", color='black', lw=1.5)
    )

    ax.set_xticks([1,2,3,4,5])

    ax.axhline(0,c="grey")

    #plt.legend()


    ax.set_xlabel('Assessment Period')
    ax.set_ylabel('Log representation')

    fig.patch.set_facecolor('#f0f0f0')    

    plt.savefig(
        f'../plots/ipcc_representation/ipcc_rep_cats_time_{dis}.{extension}',
        bbox_inches='tight',
        facecolor=fig.get_facecolor(),
        pad_inches=0.2
    )
    
    plt.show()
    plt.close()


In [None]:
fig, ax = plt.subplots(figsize=(8,8))

oecd_df_ys['lrep'] = np.log(oecd_df_ys['representation'])

fdf = oecd_df_ys[oecd_df_ys['ar']>1]

fdf['ar_share_total'] = fdf.groupby('ar')['share'].transform('sum')


ticks = []
labels = []

for i, (name, group) in enumerate(fdf.sort_values('total_share').groupby('wc__oecd', sort=False)):
    group = group.replace(-np.inf, np.nan).dropna().sort_values('ar').reset_index(drop=True)
    name = f"{name} ({group['total_share'][0]:.1%})"
    print(name)
    ticks.append(i)
    labels.append(name)
    color = group.reset_index()['color'][0]
    group['s_share'] = group['share'] / group['ar_share_total']
    
    start = group['lrep'][0]
    end = group['lrep'][len(group)-1]
    size = 75
    ax.plot(
        [i,i],
        [start,end],
        color="grey",
        zorder=-50
    )
    ax.scatter(
        i,
        start,
        label=name,
        color=color,
        s = size,
        edgecolor="grey",
        #marker="x"
    )
    ax.scatter(
        i,
        end,
        label=name,
        color=color,
        s = size,
        edgecolor="grey",
        marker="^"
    )
    
    
ax.set_xticks(ticks)
ax.set_xticklabels(labels, ha="right", rotation=45)
ax.axhline(0,c="grey")

fig.patch.set_facecolor('#f0f0f0') 


plt.savefig(
    '../plots/ipcc_representation/ipcc_rep_oecds_simplified.{}'.format(extension),
    bbox_inches='tight',
    facecolor=fig.get_facecolor(),
    pad_inches=0.2
)


plt.show()



In [None]:
fdf.sort_values('total_share')