# Boxplot

Boxplot of female researchers by seniority

In [1]:
import pandas as pd 
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
from collections import Counter

In [2]:
data = pd.read_csv('../data/seniority_researchers.csv')

In [3]:
data.head()

Unnamed: 0,INDICATOR,Indicator,LOCATION,Country,TIME,Time,Value,Flag Codes,Flags
0,21001,Researchers (HC) - Total,AUT,Austria,2011,2011,65609.0,,
1,21001,Researchers (HC) - Total,AUT,Austria,2013,2013,71448.0,,
2,21001,Researchers (HC) - Total,AUT,Austria,2015,2015,78051.0,,
3,21001,Researchers (HC) - Total,BEL,Belgium,2011,2011,63207.0,,
4,21001,Researchers (HC) - Total,BEL,Belgium,2013,2013,66724.0,,


In [4]:
Counter(data.Time)  # use 2015 data 

Counter({2011: 249,
         2013: 246,
         2015: 372,
         2012: 213,
         2014: 217,
         2016: 358,
         2017: 172})

In [5]:
# filtering by seniority groups 
seniority = ['Researchers (HC) - Female - Category A ','Researchers (HC) - Female - Category B ',
 'Researchers (HC) - Female - Category C','Researchers (HC) - Female - Category D',
 'Researchers (HC) - Female - Not specified seniority levels']

# only years with the data 
data_seniority_A = data[(data.Indicator == 'Researchers (HC) - Female - Category A ') & (data.Time.isin([2015, 2016, 2017])) & ~(data.Value.isna())]
data_seniority_B = data[(data.Indicator == 'Researchers (HC) - Female - Category B ') & (data.Time.isin([2015, 2016, 2017])) & ~(data.Value.isna())]
data_seniority_C = data[(data.Indicator == 'Researchers (HC) - Female - Category C') & (data.Time.isin([2015, 2016, 2017])) & ~(data.Value.isna())]
data_seniority_D = data[(data.Indicator == 'Researchers (HC) - Female - Category D') & (data.Time.isin([2015, 2016, 2017])) & ~(data.Value.isna())]



In [6]:
# find the sum to normalize the values 
group_a_sum = dict(data_seniority_A.groupby(['Time'])['Value'].sum())
group_b_sum = dict(data_seniority_B.groupby(['Time'])['Value'].sum())
group_c_sum = dict(data_seniority_C.groupby(['Time'])['Value'].sum())
group_d_sum = dict(data_seniority_D.groupby(['Time'])['Value'].sum())

In [7]:
def normalize_counts(row, sum_dictionary):
    """ Normalize head counts by year and seniority category """
    if row['Time'] == 2015:
        value = row['Value']/sum_dictionary[2015]
    elif row['Time'] == 2016:
        value = row['Value']/sum_dictionary[2016]
    elif row['Time'] == 2017:
        value = row['Value']/sum_dictionary[2017]        
    else:
        value = np.nan  
    return value 


In [8]:
# adding normalized values for boxplot 
data_seniority_A['normalized_val'] = data_seniority_A.apply(lambda row: np.round(normalize_counts(row, group_a_sum),4), axis=1)
data_seniority_B['normalized_val'] = data_seniority_B.apply(lambda row: np.round(normalize_counts(row, group_b_sum),4), axis=1)
data_seniority_C['normalized_val'] = data_seniority_C.apply(lambda row: np.round(normalize_counts(row, group_c_sum),4), axis=1)
data_seniority_D['normalized_val'] = data_seniority_D.apply(lambda row: np.round(normalize_counts(row, group_d_sum),4), axis=1)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [10]:
trace0 = go.Box(
    y=data_seniority_A.normalized_val,
    x=data_seniority_A.Time,
    name='Seniority Category A',
    marker=dict(
        color='#3D9970'
    )
)
trace1 = go.Box(
    y=data_seniority_B.normalized_val,
    x=data_seniority_B.Time,
    name='Seniority Category B',
    marker=dict(
        color='#FF4136'
    )
)
trace2 = go.Box(
    y=data_seniority_C.normalized_val,
    x=data_seniority_C.Time,
    name='Seniority Category C',
    marker=dict(
        color='#FF851B'
    )
)
trace3 = go.Box(
    y=data_seniority_D.normalized_val,
    x=data_seniority_D.Time,
    name='Seniority Category D',
    marker=dict(
        color='#4F86F7'
    )
)

data = [trace0, trace1, trace2, trace3]
layout = go.Layout(
    title = go.layout.Title(
        text = 'Normalized Head Counts of Female Researchers Across Seniority Levels (2015-2017)'
    ),
    xaxis=dict(
        title='Year',
        zeroline=False
    ),
    yaxis=dict(
        title='Normalized Head Counts',
        zeroline=False
    ),
    boxmode='group',
    annotations=[
        dict(
            x=1.11,
            y=1.05,
            align="left",
            valign="top",
            text='Seniority Category',
            showarrow=False,
            xref="paper",
            yref="paper",
            xanchor="center",
            yanchor="top"
        )
    ]
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Normalized Head Counts of Female Researchers Across Seniority Levels (2015-2017)')