# Scatterplot

Scatter plot of the head count of female R&D researchers in each field of research over time (2011–2017).

In [1]:
import pandas as pd 
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go

In [2]:
# Load the researcher indicators table; later cells read its
# Indicator, Time and Value columns.
source_csv = '../data/field_rd_researchers.csv'
data = pd.read_csv(source_csv)

In [3]:
# Show the distinct indicator labels so the exact strings can be used for filtering.
data['Indicator'].unique()

array(['Researchers (FTE) - Total ', 'Researchers (HC) - Total ',
       'Researchers (FTE) - Female', 'Researchers (FTE) - % Female',
       'Researchers (HC) - Female', 'Researchers (HC) - % Female',
       'Researchers (FTE) - Natural sciences',
       'Researchers (FTE) - Engineering and technology',
       'Researchers (FTE) - Medical and health sciences',
       'Researchers (FTE) - Agricultural and veterinary sciences',
       'Researchers (FTE) - Social sciences',
       'Researchers (FTE) - Humanities and the arts',
       'Researchers (FTE) - Not specified fields',
       'Researchers (FTE) - Natural sciences %',
       'Researchers (FTE) - Engineering and technology %',
       'Researchers (FTE) - Medical and health sciences %',
       'Researchers (FTE) - Agricultural and veterinary sciences %',
       'Researchers (FTE) - Social sciences %',
       'Researchers (FTE) - Humanities and the arts %',
       'Researchers (FTE) - Not specified fields %',
       'Researchers (FTE

In [4]:
# Keep only the female head-count (HC) indicators that are broken down by
# field, and drop rows that have no Value.
female_hc_by_field = [
    'Researchers (HC) - Female - Natural sciences',
    'Researchers (HC) - Female - Engineering and technology',
    'Researchers (HC) - Female - Medical and health sciences',
    'Researchers (HC) - Female - Agricultural and veterinary sciences',
    'Researchers (HC) - Female - Social sciences',
    'Researchers (HC) - Female - Humanities and the arts',
    'Researchers (HC) - Female - Not specified fields',
]
in_selected_fields = data.Indicator.isin(female_hc_by_field)
has_value = data.Value.notna()
data = data[in_selected_fields & has_value]

In [5]:
# One frame per field, selected by its exact indicator label.
indicator = data.Indicator
data_sci = data.loc[indicator == 'Researchers (HC) - Female - Natural sciences']
data_eng = data.loc[indicator == 'Researchers (HC) - Female - Engineering and technology']
data_med = data.loc[indicator == 'Researchers (HC) - Female - Medical and health sciences']
data_agri = data.loc[indicator == 'Researchers (HC) - Female - Agricultural and veterinary sciences']
data_social = data.loc[indicator == 'Researchers (HC) - Female - Social sciences']
data_hum = data.loc[indicator == 'Researchers (HC) - Female - Humanities and the arts']
data_unspecified = data.loc[indicator == 'Researchers (HC) - Female - Not specified fields']


In [6]:
def _yearly_total(frame):
    """Return a two-column frame (Time, Value) with Value summed per Time.

    'Value' is selected *before* aggregating so that only that column is
    summed. Calling ``.sum()`` on the whole group would also try to aggregate
    the remaining (text) columns, which concatenates strings or raises on
    recent pandas versions even though those columns are discarded afterwards.
    """
    return frame.groupby('Time', as_index=False)['Value'].sum()


# Collapse each per-field frame to one total per Time value.
data_sci = _yearly_total(data_sci)
data_eng = _yearly_total(data_eng)
data_med = _yearly_total(data_med)
data_agri = _yearly_total(data_agri)
data_social = _yearly_total(data_social)
data_hum = _yearly_total(data_hum)
data_unspecified = _yearly_total(data_unspecified)


In [7]:
def _field_trace(frame, label, color):
    """Build the marker-only scatter trace for one field.

    Parameters
    ----------
    frame : DataFrame with 'Time' and 'Value' columns (one row per Time).
    label : legend entry for the field.
    color : marker fill colour as an 'rgb(r, g, b)' string.

    The x values are taken from the frame's own Time column rather than a
    hard-coded list of years, so each Value stays paired with its year even
    when a field has no data for some of the years.
    """
    return go.Scatter(
        x = frame.Time,
        y = frame.Value,
        name = label,
        mode = 'markers',
        marker = dict(
            size = 10,
            color = color,
            line = dict(width = 2),  # outline drawn around each marker
        ),
    )


# One trace per field; each field gets its own fixed colour.
trace0 = _field_trace(data_sci, 'Natural sciences', 'rgb(230, 159, 0)')
trace1 = _field_trace(data_eng, 'Engineering and technology', 'rgb(86, 180, 233)')
trace2 = _field_trace(data_med, 'Medical and health sciences', 'rgb(0, 158, 115)')
trace3 = _field_trace(data_agri, 'Agricultural and veterinary sciences', 'rgb(240, 228, 66)')
trace4 = _field_trace(data_social, 'Social sciences', 'rgb(0, 114, 178)')
trace5 = _field_trace(data_hum, 'Humanities and the arts', 'rgb(213, 94, 0)')
trace6 = _field_trace(data_unspecified, 'Not specified fields', 'rgb(204, 121, 167)')

# Empty trace whose only purpose is to put a bold "Fields" heading at the top
# of the legend; its line colour is fully transparent so nothing is drawn.
dummy_trace = go.Scatter(
    x=[None], y=[None],
    name='<b>Fields</b>',
    line={'color': 'rgba(0, 0, 0, 0)'}
)

# NOTE: from here on `data` is the list of traces, no longer the DataFrame
# loaded earlier; re-running the filtering cells requires reloading the CSV.
data = [dummy_trace, trace0, trace1, trace2, trace3, trace4, trace5, trace6]

layout = dict(title = 'Head Count of Female researchers across fields (2011-2017)',
              yaxis = dict(title = "Total Head Count of Female Researchers", zeroline = False),
              xaxis = dict(title = "Year", zeroline = False)
             )

In [8]:
# Combine the traces and the layout into a figure spec and pass it to
# py.iplot under the plot name 'female_researchers_fields'.
fig = {'data': data, 'layout': layout}
py.iplot(fig, filename = 'female_researchers_fields')


Consider using IPython.display.IFrame instead

