# CORDIS Data IO

Manual inspection of the raw CORDIS datasets, used to work out the options needed by the functions that load and parse them.

In [None]:
import pandas as pd
import json

In [None]:
# Show up to 100 columns when a DataFrame is displayed. The fully qualified
# option key is used because the bare 'max_columns' pattern matches more than
# one option in pandas versions that also define 'styler.render.max_columns',
# which makes set_option raise OptionError.
pd.set_option('display.max_columns', 100)
# Programme keys; they match the per-programme directory names used in the
# data paths read further down in this notebook.
fps = ['h2020', 'fp7', 'fp6', 'fp5', 'fp4', 'fp3', 'fp2', 'fp1']

In [None]:
# Options for the project files: how to read the CSV, and which columns hold
# delimited lists or should be dropped afterwards.
project_read_opts = {
    'sep': ';',
    'decimal': ',',
    'parse_dates': ['startDate', 'endDate'],
}
project_parse_opts = {
    'list_sep': ';',
    'list_cols': ['participants', 'participantCountries', 'programme'],
    'drop_cols': ['subjects'],
}


# Options for the H2020 organisation file.
h2020_org_read_opts = {
    'sep': ';',
    'decimal': ',',
    'parse_dates': ['lastUpdateDate'],
}
h2020_org_parse_opts = {
    'list_sep': ';',
    'list_cols': ['programme'],
    'drop_cols': [],
}

# Options for the FP7 organisation file (no date columns are parsed).
fp7_org_read_opts = {
    'sep': ';',
    'decimal': ',',
    'parse_dates': [],
}
fp7_org_parse_opts = {
    'list_sep': ';',
    'list_cols': ['programme'],
    'drop_cols': [],
}

In [None]:
# Read options keyed by dataset type and then by programme. Every programme's
# project file shares the same options; organisation options are only defined
# for h2020 and fp7.
read_opts = {
    'projects': {fp: project_read_opts for fp in fps},
    'organizations': {
        'h2020': h2020_org_read_opts,
        'fp7': fp7_org_read_opts,
    },
}
# Parse options keyed by dataset type. Project options are shared by all
# programmes.
# NOTE(review): the original referenced an undefined `org_parse_opts`; the
# per-programme dicts defined in this notebook are used instead, mirroring the
# shape of read_opts['organizations'] — confirm this is what the consumer of
# cordis_parse_opts.json expects.
parse_opts = {
    'projects': project_parse_opts,
    'organizations': {
        'h2020': h2020_org_parse_opts,
        'fp7': fp7_org_parse_opts,
    },
}

In [None]:
import json

In [None]:
# Write both option tables to JSON files, parse options first.
for target, payload in (
    ('../../data/aux/cordis_parse_opts.json', parse_opts),
    ('../../data/aux/cordis_read_opts.json', read_opts),
):
    with open(target, 'w') as f:
        json.dump(payload, f)

In [None]:
# NOTE(review): `cordis_raw_params` is not assigned anywhere in the visible
# part of this notebook, so this cell raises NameError on a fresh kernel —
# confirm where it is meant to come from, or remove the cell.
cordis_raw_params

## H2020

### H2020 Projects

In [None]:
# Load the raw H2020 projects CSV: ';' as field separator, ',' as decimal
# mark, and the start/end date columns passed to parse_dates.
h2020_projects = pd.read_csv(
    '../../data/raw/cordis/h2020/raw/h2020_projects.csv',
    parse_dates=['startDate', 'endDate'],
    decimal=',',
    sep=';',
)

In [None]:
# Print the (rows, columns) shape, then preview the first two rows.
print(h2020_projects.shape)
h2020_projects.head(2)

### H2020 Orgs

In [None]:
# Load the raw H2020 organisations CSV (';' separator, ',' decimal mark).
h2020_orgs = pd.read_csv(
    '../../data/raw/cordis/h2020/h2020_organizations.csv',
    decimal=',',
    sep=';',
)

In [None]:
# Print the (rows, columns) shape, then preview the first two rows.
print(h2020_orgs.shape)
h2020_orgs.head(2)

## FP7

### FP7 Projects

In [None]:
# Load the raw FP7 projects CSV: ';' as field separator, ',' as decimal mark,
# and the start/end date columns passed to parse_dates.
fp7_projects = pd.read_csv(
    '../../data/raw/cordis/fp7/raw/fp7_projects.csv',
    parse_dates=['startDate', 'endDate'],
    decimal=',',
    sep=';',
)

In [None]:
# Print the (rows, columns) shape, then preview the first two rows.
print(fp7_projects.shape)
fp7_projects.head(2)

### FP7 Orgs

In [None]:
# Load the raw FP7 organisations CSV (';' separator, ',' decimal mark).
fp7_orgs = pd.read_csv(
    '../../data/raw/cordis/fp7/fp7_organizations.csv',
    decimal=',',
    sep=';',
)

In [None]:
# Print the (rows, columns) shape, then preview the first five rows.
print(fp7_orgs.shape)
fp7_orgs.head(5)

## FP6

In [None]:
# Load the raw FP6 projects CSV: ';' as field separator, ',' as decimal mark,
# and the start/end date columns passed to parse_dates.
fp6_projects = pd.read_csv(
    '../../data/raw/cordis/fp6/raw/fp6_projects.csv',
    parse_dates=['startDate', 'endDate'],
    decimal=',',
    sep=';',
)

In [None]:
# Print the (rows, columns) shape, then preview the first two rows.
print(fp6_projects.shape)
fp6_projects.head(2)

In [None]:
# Load the raw FP6 organisations file. Unlike the other files read in this
# notebook, it is read with a tab separator (',' is still the decimal mark).
fp6_orgs = pd.read_csv(
    '../../data/raw/cordis/fp6/fp6_organizations.csv',
    decimal=',',
    sep='\t',
)

In [None]:
# Print the (rows, columns) shape, then preview the first five rows.
print(fp6_orgs.shape)
fp6_orgs.head(5)

## FP5

In [None]:
# Load the raw FP5 projects CSV: ';' as field separator, ',' as decimal mark,
# and the start/end date columns passed to parse_dates.
fp5_projects = pd.read_csv(
    '../../data/raw/cordis/fp5/raw/fp5_projects.csv',
    parse_dates=['startDate', 'endDate'],
    decimal=',',
    sep=';',
)

In [None]:
# Print the (rows, columns) shape, then preview the first two rows.
print(fp5_projects.shape)
fp5_projects.head(2)

## FP4

In [None]:
# Load the raw FP4 projects CSV: ';' as field separator, ',' as decimal mark,
# and the start/end date columns passed to parse_dates.
fp4_projects = pd.read_csv(
    '../../data/raw/cordis/fp4/raw/fp4_projects.csv',
    parse_dates=['startDate', 'endDate'],
    decimal=',',
    sep=';',
)

In [None]:
# Print the (rows, columns) shape, then preview the first two rows.
print(fp4_projects.shape)
fp4_projects.head(2)

## FP3

In [None]:
# Load the raw FP3 projects CSV: ';' as field separator, ',' as decimal mark,
# and the start/end date columns passed to parse_dates.
fp3_projects = pd.read_csv(
    '../../data/raw/cordis/fp3/raw/fp3_projects.csv',
    parse_dates=['startDate', 'endDate'],
    decimal=',',
    sep=';',
)

In [None]:
# Print the (rows, columns) shape, then preview the first two rows.
print(fp3_projects.shape)
fp3_projects.head(2)

## FP2

In [None]:
# Load the raw FP2 projects CSV: ';' as field separator, ',' as decimal mark,
# and the start/end date columns passed to parse_dates.
fp2_projects = pd.read_csv(
    '../../data/raw/cordis/fp2/raw/fp2_projects.csv',
    parse_dates=['startDate', 'endDate'],
    decimal=',',
    sep=';',
)

In [None]:
# Print the (rows, columns) shape, then preview the first two rows.
print(fp2_projects.shape)
fp2_projects.head(2)

## FP1

In [None]:
# Load the raw FP1 projects CSV: ';' as field separator, ',' as decimal mark,
# and the start/end date columns passed to parse_dates.
fp1_projects = pd.read_csv(
    '../../data/raw/cordis/fp1/raw/fp1_projects.csv',
    parse_dates=['startDate', 'endDate'],
    decimal=',',
    sep=';',
)

In [None]:
# Print the (rows, columns) shape, then preview the first two rows.
print(fp1_projects.shape)
fp1_projects.head(2)

In [None]:
# Pool the 'rcn' column values of every project table into one flat list,
# in the same table order as they were loaded.
rcn = [
    value
    for table in (
        h2020_projects,
        fp7_projects,
        fp6_projects,
        fp5_projects,
        fp4_projects,
        fp3_projects,
        fp2_projects,
        fp1_projects,
    )
    for value in table['rcn'].values
]

In [None]:
import numpy as np

In [None]:
# Number of distinct rcn values across all project tables.
np.unique(rcn).size

In [None]:
# Total number of pooled rcn values, duplicates included.
len(rcn)

In [None]:
# rcn values that occur in both the FP5 and the FP6 project tables.
set(fp5_projects['rcn']) & set(fp6_projects['rcn'])

In [None]:
# FP5 project rows whose 'status' value is not null.
fp5_projects[fp5_projects.status.notnull()]

In [None]:
# Print the (rows, columns) shape, then preview the first rows.
# NOTE(review): `h2020_sdg` is not loaded anywhere in the visible part of this
# notebook, so this cell raises NameError on a fresh kernel — confirm where
# the table is meant to be read from.
print(h2020_sdg.shape)
h2020_sdg.head()

In [None]:
# Column names '1_pred'..'16_pred' and their 'sdg_1'..'sdg_16' counterparts,
# in matching order.
pred_cols = [f'{number}_pred' for number in range(1, 17)]
sdg_cols = [f'sdg_{number}' for number in range(1, 17)]

In [None]:
# Index both tables by 'rcn', right-join the prediction columns onto the
# projects (so the result follows the rows of h2020_sdg), then rename each
# '<n>_pred' column to 'sdg_<n>'.
h2020_projects = (
    h2020_projects
    .set_index('rcn')
    .merge(
        h2020_sdg.set_index('rcn')[pred_cols],
        how='right',
        left_index=True,
        right_index=True,
    )
    .rename(columns=dict(zip(pred_cols, sdg_cols)))
)

In [None]:
# Horizontal bar chart of the column-wise sum of each sdg_* column.
h2020_projects[sdg_cols].sum().plot.barh()

In [None]:
# Per start year: sum every sdg_* column, then add those sums across columns.
by_start_year = h2020_projects.groupby(h2020_projects['startDate'].dt.year)
by_start_year[sdg_cols].sum().sum(axis=1)

In [None]:
# Per start year: divide the sum of each sdg_* column by the count of non-null
# 'id' values in that year, and plot the resulting ratios.
by_start_year = h2020_projects.groupby(h2020_projects['startDate'].dt.year)
sdg_sums = by_start_year[sdg_cols].sum()
id_counts = by_start_year['id'].count()
sdg_sums.divide(id_counts, axis=0).plot()