# CORDIS SDG Analysis

In this notebook we perform a preliminary analysis to test the hypothesis:

_There is a positive link between the R&I funding and national performance on the SDG index._

We investigate the relationship between research specialisation in the H2020 programme and the 2019 SDG Index scores for goals 3, 6, 7 and 11.

## Preamble

In [None]:
%run ../notebook_preamble.ipy

from sdg_mapping.cordis import load_cordis_projects, load_cordis_project_sdgs
from sdg_mapping.cordis.cordis_utils import FRAMEWORK_PROGRAMMES
from sdg_mapping.utils.sdg_utils import sdg_hex_color_codes, sdg_names
from sdg_mapping.sdg_index.sdg_index_utils import load_sdg_index

import os
from itertools import chain
from collections import Counter

import seaborn as sns

In [None]:
# Directory into which every figure produced by this notebook is saved.
fig_dir = os.path.join(project_dir, 'reports', 'analysis_cordis_sdg_index')
# Create it up front so the `plt.savefig` calls below do not fail when the folder does not exist yet.
os.makedirs(fig_dir, exist_ok=True)

In [None]:
project_h2020_df = load_cordis_projects('h2020').set_index('rcn')
project_sdgs_h2020_df = load_cordis_project_sdgs('h2020', 'label').set_index('rcn')

sdg_index_df = load_sdg_index(2019, index_type='report')

# Column 0 flags the projects whose other label columns sum to zero.
# It is reset to 0 *before* the mask is computed so that re-running this cell gives the same result.
project_sdgs_h2020_df[0] = 0
no_sdg_label = project_sdgs_h2020_df.sum(axis=1) == 0
# Single `.loc` assignment instead of chained indexing (`df[0][mask] = 1`), which may write
# to a temporary copy and is a no-op under pandas Copy-on-Write.
project_sdgs_h2020_df.loc[no_sdg_label, 0] = 1

# Goals analysed in this notebook and the matching SDG Index score columns.
sdg_keys = [3, 6, 7, 11]
sdg_index_score_keys = ['goal_{}_score'.format(g) for g in sdg_keys]

## SDG Index Scores

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

# One histogram per goal, drawn in that goal's colour on a common 0-100 score axis.
for ax, col, g in zip(axs, sdg_index_score_keys, sdg_keys):
    sdg_index_df[col].plot.hist(bins=20, color=sdg_hex_color_codes()[g], ax=ax)
    ax.set_xlim(0, 100)
    ax.set_xlabel('SDG Index Score')
    ax.set_title(f'SDG {g}')

plt.tight_layout()
plt.savefig(f'{fig_dir}/sdg_index_score_hist.png', dpi=300);

- The distributions of the SDG Index scores for each goal are very different. In some cases, the distribution is very broad, whereas for others (e.g. Goal 7) the scores are highly concentrated.
- This highlights that the nature of what drives progress in each of these goals is different. This tallies with what we have already seen, which is that the relationship between level of research and performance on the goals is very different.
- The index scores for each SDG are in fact an average of multiple scores that correspond to individual SDG targets and indicators.


## Mapping Country Codes

In [None]:
def replace_coordinator_eu_codes(x):
    """Swap the EU-specific country codes 'EL' and 'UK' for 'GR' and 'GB'.

    Args:
        x: A country code.

    Returns:
        The replacement code if `x` is 'EL' or 'UK', otherwise `x` unchanged.
    """
    return {'EL': 'GR', 'UK': 'GB'}.get(x, x)

def fillna_list(x):
    """Coerce a cell value into a list so that it can always be iterated.

    Args:
        x: A list, another list-like (tuple, set, array, ...), a null scalar
            (None / NaN) or a single non-null scalar.

    Returns:
        list: `x` itself if it is already a list, a list copy of any other
            list-like, `[]` for a null scalar and `[x]` for any other scalar.
            The original returned None for the last case, which broke callers
            that iterate over the result.
    """
    if isinstance(x, list):
        return x
    # Checked before `pd.isnull`, which returns an array (not a bool) for list-likes.
    if pd.api.types.is_list_like(x):
        return list(x)
    if pd.isnull(x):
        return []
    return [x]
    
def replace_participant_eu_codes(x):
    """Convert a list of participant country codes using `iso2_to_iso3_map`.

    'EL' and 'UK' are first replaced by 'GR' and 'GB'. 'XK' (Kosovo, which is
    not in the SDG Index data) is dropped. Any other code missing from
    `iso2_to_iso3_map` raises a KeyError.

    Args:
        x (iterable): Country codes of a project's participants.

    Returns:
        list: The mapped codes, in the original order.
    """
    eu_to_iso2 = {'EL': 'GR', 'UK': 'GB'}
    return [
        iso2_to_iso3_map[eu_to_iso2.get(code, code)]
        for code in x
        if code != 'XK'
    ]

In [None]:
country_df = pd.read_json(f'{data_path}/raw/countries/countries_restcountries_api.json')
iso2_to_iso3_map = dict(zip(country_df['alpha2Code'], country_df['alpha3Code']))

# NOTE: both columns are overwritten in place, so this cell must only be run once per data load.
# Running it again would feed the already-converted codes through the mapping a second time.
project_h2020_df['coordinator_country'] = (
    project_h2020_df['coordinator_country']
    .apply(replace_coordinator_eu_codes)
    .map(iso2_to_iso3_map)
)
project_h2020_df['participant_countries'] = project_h2020_df['participant_countries'].apply(fillna_list)

project_h2020_df['participant_countries'] = (
    project_h2020_df['participant_countries']
    .apply(replace_participant_eu_codes)
)

### Country Location Quotients

In [None]:
def generate_all_countries(projects):
    """List every country involved in each project, coordinator first.

    Args:
        projects (pd.DataFrame): Must contain a 'coordinator_country' column
            (one code or null per project) and a 'participant_countries'
            column (a list of codes or null per project).

    Returns:
        list: One new list of country codes per row of `projects`, in row
            order. A null coordinator is left out.
    """
    # Fill nulls on a local copy of the column; the caller's DataFrame is left untouched.
    participants = projects['participant_countries'].apply(fillna_list)
    return [
        # `list(p)` copies, so the result never aliases the lists stored in `projects`.
        list(p) if pd.isnull(c) else [c] + p
        for c, p in zip(projects['coordinator_country'], participants)
    ]

def create_quotient(X, binary=False):
    """Calculate the location quotient (LQ) of every cell of `X`.

    The LQ is the share of a sector within a location (cell / row total)
    divided by the share of that sector in the whole table
    (column total / grand total).

    Args:
        X (pandas.DataFrame): DataFrame where rows are locations, columns are sectors
            and values are activity in a given sector at a location.
        binary (bool, optional): If True, return booleans marking LQ > 1
            instead of the LQ values.

    Returns:
        pandas.DataFrame: Same index and columns as `X`.
    """
    activity = X.values
    share_within_location = activity / activity.sum(axis=1, keepdims=True)
    share_overall = activity.sum(axis=0) / activity.sum()
    lq = pd.DataFrame(share_within_location / share_overall,
                      index=X.index, columns=X.columns)

    if binary:
        return lq > 1
    return lq

def create_cordis_country_lq(projects, values, country_col, binary=False):
    '''create_cordis_country_lq
    Calculates the country based location quotient for CORDIS project participants
    or coordinators.

    Args:
        projects (pd.DataFrame): CORDIS projects dataframe. Should have
            project rcn as the index.
        values (pd.DataFrame): A dataframe of values to calculate location quotient.
            Should have project rcn as the index.
        country_col (str): Name of `projects` column with countries of interest.
        binary (bool, optional): Passed on to `create_quotient`; if True the
            result holds booleans marking LQ > 1 instead of LQ values.

    Returns:
        (pd.DataFrame): CORDIS country based location quotients
    '''
    # The per-country aggregation is shared with `create_cordis_country_sum`
    # so the two functions cannot drift apart.
    country_values = create_cordis_country_sum(projects, values, country_col)
    return create_quotient(country_values, binary=binary)

def create_cordis_country_sum(projects, values, country_col):
    '''create_cordis_country_sum
    Sums `values` per country over the CORDIS projects each country appears in.

    Args:
        projects (pd.DataFrame): CORDIS projects dataframe. Should have
            project rcn as the index.
        values (pd.DataFrame): A dataframe of values to sum per country.
            Should have project rcn as the index.
        country_col (str): Name of `projects` column with countries of interest.
            Cells may hold a single country or a list of countries.

    Returns:
        (pd.DataFrame): One row per country, one column per column of `values`.
    '''
    # One row per (project, country) pair; the project rcn index is kept.
    project_countries = projects[country_col].explode()
    values_by_project_country = pd.merge(
        project_countries,
        values,
        how='inner',
        left_index=True,
        right_index=True,
    )
    return values_by_project_country.groupby(country_col).sum()

def normalise(df, norm=0):
    '''normalise
    Scales a dataframe so that its columns or its rows sum to one.

    Args:
        df (pd.DataFrame): A quantitative dataframe.
        norm (int): 0 divides every column by its column total, 1 divides
            every row by its row total.

    Returns:
        (pd.DataFrame): The normalised dataframe, or `df` unchanged when
            `norm` is neither 0 nor 1.
    '''
    if norm == 0:
        return df.divide(df.sum(axis=0), axis=1)
    if norm == 1:
        return df.divide(df.sum(axis=1), axis=0)
    return df

In [None]:
# Countries involved in each project in any role (coordinator followed by participants).
project_h2020_df['all_countries'] = generate_all_countries(project_h2020_df)

# Location quotients of the per-country SDG label sums, for each definition of "involved".
lq_all = create_cordis_country_lq(project_h2020_df, project_sdgs_h2020_df, 'all_countries')
lq_coord = create_cordis_country_lq(project_h2020_df, project_sdgs_h2020_df, 'coordinator_country')
lq_part = create_cordis_country_lq(project_h2020_df, project_sdgs_h2020_df, 'participant_countries')

## Activity LQ and SDG Score

### All Countries

In [None]:
# Number of distinct countries (nulls excluded) appearing in any role.
n_all = project_h2020_df['all_countries'].explode().nunique()

print(f'{n_all} countries have coordinated or participated in a project')

In [None]:
fig, ax = plt.subplots()

# Cumulative, normalised step histogram of the number of projects per country.
hist_opts = {'bins': 100,
             'cumulative': True,
             'histtype': 'step',
             'density': True,  # boolean flag; the string 'normed' only worked because it is truthy
             'linewidth': 2,
             'ax': ax
            }
c = project_h2020_df['all_countries'].explode().value_counts()
c.plot.hist(**hist_opts)
ax.set_xlim(c.min(), c.max())
ax.set_xlabel('Number of Projects')
ax.set_ylabel('Normalised Cumulative Frequency')
plt.tight_layout()

plt.savefig(f'{fig_dir}/cordis_country_coord_part_count_hist.png', dpi=300);

- 151 countries have coordinated or participated in an H2020 project.
- Normalised cumulative frequency distribution of number of projects that each country is involved in shows that:
    - 50% of these countries have been a part of 13 projects or fewer
    - 75% have been a part of 103 projects or fewer
- Only the top 12% of countries have participated in 1,000 projects or more.


In [None]:
# Minimum number of projects a country must appear in to be kept in the comparison below.
min_project_count = 100

def filter_countries_by_project_count(projects, country_col, min_count):
    """Return the countries that appear in at least `min_count` projects.

    Args:
        projects (pd.DataFrame): Projects dataframe, one row per project.
        country_col (str): Column of `projects` holding a country or a list
            of countries per project.
        min_count (int): Minimum number of appearances required.

    Returns:
        numpy.ndarray: The qualifying countries, most frequent first.
    """
    project_counts = projects[country_col].explode().value_counts()
    keep = (project_counts >= min_count).to_numpy()
    return project_counts.index[keep].values

# Countries appearing in at least `min_project_count` projects in any role.
top_countries = filter_countries_by_project_count(project_h2020_df, 'all_countries', min_project_count)

In [None]:
# SDG Index scores for the selected countries; rows with no score for any of the four goals are dropped.
# NOTE(review): assumes the 'id' column holds the same country codes as `top_countries` - confirm.
sdg_scores = (sdg_index_df
              .set_index('id')[sdg_index_score_keys]
              .reindex(top_countries)
              .dropna(how='all'))
# The inner join keeps only the countries that have SDG Index scores.
lq_sdg_score_by_country = (lq_all[sdg_keys]
                           .reindex(top_countries)
                           .merge(sdg_scores, left_index=True, right_index=True, how='inner'))

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

# One panel per goal: project-count LQ (x) against SDG Index score (y).
for g, s, ax in zip(sdg_keys, sdg_index_score_keys, axs):
    ax.scatter(lq_sdg_score_by_country[g], lq_sdg_score_by_country[s], color=sdg_hex_color_codes()[g], alpha=0.7)
    ax.set_title(sdg_names()[g])
    ax.set_xlabel('LQ')
    ax.set_ylabel('Score')
    # Truncate the x axis so that LQ outliers do not compress the rest of the points.
    ax.set_xlim(0, 2)
    
plt.tight_layout()
plt.savefig(f'{fig_dir}/lq_sdg_index_coord_part_truncated_scatter.png', dpi=300);

- Calculated the relative research specialisation in each SDG topic based on the number of projects for each country. Subset to countries with ≥100 projects. This left 38 European and other high income countries.
- Strong but small +ve correlation between LQ and Goal 3. Weak and -ve correlation between LQ and Goals 6 and 11. Weak or non-existent correlation with Goal 7. (x axes are limited to exclude LQ outliers)
- Interesting to think about why this may be:
    - Much health research is carried out in medical and public healthcare settings so perhaps greater specialisation in Goal 3 indicates a greater capacity for health and medical research in general
    - Is clean water generally more of a problem in industrialised nations and if so is the higher level of research specialisation driven by necessity?
    
We see more or less the same things when looking at only the coordinators and only the participants below.


### Coordinator Countries

In [None]:
# Number of distinct countries (nulls excluded) that coordinated at least one project.
n_coord = project_h2020_df['coordinator_country'].explode().nunique()

print(f'{n_coord} countries have coordinated a project')

In [None]:
fig, ax = plt.subplots()

# Cumulative, normalised step histogram of the number of coordinated projects per country.
hist_opts = {'bins': 100,
             'cumulative': True,
             'histtype': 'step',
             'density': True,  # boolean flag; the string 'normed' only worked because it is truthy
             'linewidth': 2,
             'ax': ax
            }
c = project_h2020_df['coordinator_country'].explode().value_counts()
c.plot.hist(**hist_opts)
ax.set_xlim(c.min(), c.max())
plt.tight_layout()

In [None]:
# Lower threshold than for the all-countries analysis: a country must have coordinated at least 50 projects.
min_project_count = 50

# NOTE: this overwrites the `top_countries` defined for the all-countries analysis above.
top_countries = filter_countries_by_project_count(project_h2020_df, 'coordinator_country', min_project_count)

In [None]:
# SDG Index scores for the selected coordinator countries; rows with no score at all are dropped.
sdg_scores = (sdg_index_df
              .set_index('id')[sdg_index_score_keys]
              .reindex(top_countries)
              .dropna(how='all'))
# Coordinator-based LQs joined to the scores; the inner join keeps countries that have scores.
lq_sdg_score_by_country = (lq_coord[sdg_keys]
                           .reindex(top_countries)
                           .merge(sdg_scores, left_index=True, right_index=True, how='inner'))

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

# One panel per goal: coordinator-based LQ (x) against SDG Index score (y) with a fitted regression line.
for g, s, ax in zip(sdg_keys, sdg_index_score_keys, axs):
    # x and y must be passed by keyword: they are keyword-only in current seaborn releases.
    sns.regplot(
        x=lq_sdg_score_by_country[g],
        y=lq_sdg_score_by_country[s],
        color=sdg_hex_color_codes()[g],
        ax=ax
    )
    ax.set_title(sdg_names()[g])
    ax.set_xlabel('LQ')
    ax.set_ylabel('Score')

plt.tight_layout()

### Participant Countries

In [None]:
# Number of distinct countries (nulls excluded) that participated in at least one project.
n_part = project_h2020_df['participant_countries'].explode().nunique()

print(f'{n_part} countries have participated in a project')

In [None]:
fig, ax = plt.subplots()

# Cumulative, normalised step histogram of the number of participations per country.
hist_opts = {'bins': 100,
             'cumulative': True,
             'histtype': 'step',
             'density': True,  # boolean flag; the string 'normed' only worked because it is truthy
             'linewidth': 2,
             'ax': ax
            }
c = project_h2020_df['participant_countries'].explode().value_counts()
c.plot.hist(**hist_opts)
ax.set_xlim(c.min(), c.max())
plt.tight_layout()

In [None]:
# A country must have participated in at least 100 projects to be kept.
min_project_count = 100

# NOTE: this overwrites the `top_countries` defined for the coordinator analysis above.
top_countries = filter_countries_by_project_count(project_h2020_df, 'participant_countries', min_project_count)

In [None]:
# SDG Index scores for the selected participant countries; rows with no score at all are dropped.
sdg_scores = (sdg_index_df
              .set_index('id')[sdg_index_score_keys]
              .reindex(top_countries)
              .dropna(how='all'))
# Participant-based LQs joined to the scores; the inner join keeps countries that have scores.
lq_sdg_score_by_country = (lq_part[sdg_keys]
                           .reindex(top_countries)
                           .merge(sdg_scores, left_index=True, right_index=True, how='inner'))

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

# One panel per goal: participant-based LQ (x) against SDG Index score (y) with a fitted regression line.
for g, s, ax in zip(sdg_keys, sdg_index_score_keys, axs):
    # x and y must be passed by keyword: they are keyword-only in current seaborn releases.
    sns.regplot(
        x=lq_sdg_score_by_country[g],
        y=lq_sdg_score_by_country[s],
        color=sdg_hex_color_codes()[g],
        ax=ax
    )
    ax.set_title(sdg_names()[g])
    ax.set_xlabel('LQ')
    ax.set_ylabel('Score')

plt.tight_layout()

## Funding LQ and SDG Score

In [None]:
xml_parsed_dir = f'{data_path}/processed/cordis/h2020/h2020_orgs_xml'

# Read every file in the directory and stack them into one organisation-level table.
dfs = [
    pd.read_json(os.path.join(xml_parsed_dir, file_name))
    for file_name in os.listdir(xml_parsed_dir)
]
orgs_h2020_df = pd.concat(dfs, sort=True)

del dfs

# Add the three-letter country code, then index by project rcn (several organisations share one rcn).
orgs_h2020_df['iso3_code'] = orgs_h2020_df['iso2_code'].map(iso2_to_iso3_map)
orgs_h2020_df = orgs_h2020_df.set_index('rcn')

In [None]:
# One row per (project, organisation): the project's SDG label columns next to the organisation's fields.
# The left join keeps projects that have no organisation record (their organisation fields are null).
sdg_orgs_h2020 = project_sdgs_h2020_df.merge(orgs_h2020_df, left_index=True, right_index=True, how='left')

In [None]:
# Funding attributed to each SDG, per (project, organisation) row: the organisation's
# EC contribution multiplied by the project's SDG label columns.
sdg_project_funds = sdg_orgs_h2020[sdg_keys + [0]].mul(sdg_orgs_h2020['ecContribution'], axis=0)
# Take each row's country from the same row of `sdg_orgs_h2020`, positionally. Merging
# `orgs_h2020_df['iso3_code']` back in on the non-unique rcn index would be a many-to-many join,
# pairing every organisation's funding with the country of every organisation in the project.
sdg_project_funds['iso3_code'] = sdg_orgs_h2020['iso3_code'].to_numpy()
sdg_country_funds = sdg_project_funds.groupby('iso3_code').sum()

In [None]:
sdg_scores = (sdg_index_df
              .set_index('id')[sdg_index_score_keys])

# Select the countries explicitly instead of reusing whatever `top_countries` was last set to
# (the participant-only selection from the previous section).
min_project_count = 100
top_countries = filter_countries_by_project_count(project_h2020_df, 'all_countries', min_project_count)

# Funding-based LQ for the four goals, joined to the SDG Index scores of the selected countries.
fund_lq_all = (create_quotient(sdg_country_funds)
               .dropna()[sdg_keys]
               .reindex(top_countries)
               .merge(sdg_scores, left_index=True, right_index=True, how='inner'))

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

# One panel per goal: funding-based LQ (x) against SDG Index score (y).
for g, s, ax in zip(sdg_keys, sdg_index_score_keys, axs):
    ax.scatter(fund_lq_all[g], fund_lq_all[s], color=sdg_hex_color_codes()[g], alpha=0.7)
    ax.set_title(sdg_names()[g])
    ax.set_xlabel('Funding LQ')
    ax.set_ylabel('Score')
    
plt.tight_layout()
plt.savefig(f'{fig_dir}/funding_lq_sdg_index_coord_part_scatter.png', dpi=300);

Basing specialisation on the amount of funding received instead of the number of projects:

- Broadly the same trends are seen between LQ calculated based on the number of projects and calculated based on the amount of funding.
- We see the same trends regardless of whether we only take into account project coordinators or participants.


## Share of Funding

In [None]:
# Fraction of each country's total funding that falls under each of the four goals.
# The denominator is the row total over all columns of `sdg_country_funds`, i.e. it includes column 0.
sdg_funding_share = sdg_country_funds.divide(sdg_country_funds.sum(axis=1), axis=0)[sdg_keys].dropna()

In [None]:
# Countries appearing in at least 100 projects in any role.
min_project_count = 100
top_countries = filter_countries_by_project_count(project_h2020_df, 'all_countries', min_project_count)

In [None]:
# Funding shares of the selected countries joined to their SDG Index scores.
# The inner join keeps only countries present in `sdg_scores`.
fund_share_all = (
    sdg_funding_share
    .reindex(top_countries)
    .merge(sdg_scores, left_index=True, right_index=True, how='inner'))

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

# One panel per goal: share of funding in percent (x) against SDG Index score (y).
for g, s, ax in zip(sdg_keys, sdg_index_score_keys, axs):
    ax.scatter(fund_share_all[g] * 100, fund_share_all[s], color=sdg_hex_color_codes()[g], alpha=0.7)
    ax.set_title(sdg_names()[g])
    ax.set_xlabel('SDG Funding Share (%)')
    ax.set_ylabel('Score')
    
plt.tight_layout()
plt.savefig(f'{fig_dir}/fund_share_vs_sdg_index_coord_part_scatter.png', dpi=300);

And we see the same again if we calculate the share of funding given to research on an SDG in each country.

## Distribution of Score Breakdowns

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3), sharey=True)

# One strip plot per goal showing the spread of each of its normalised indicator columns.
for g, ax in zip(sdg_keys, axs):
    cols = [c for c in sdg_index_df.columns if f'sdg{g}' in c and 'normalized' in c]
    # Tick labels use the text after the last underscore of each column name.
    names = [c.rsplit('_', 1)[-1] for c in cols]
    indicator_scores = sdg_index_df[cols].dropna()
    sns.stripplot(data=indicator_scores, ax=ax, color=sdg_hex_color_codes()[g], alpha=0.2)
    ax.set_xticklabels(names, rotation=45, ha='right')
    ax.set_ylabel('Normalised SDG Index Score')
    ax.set_title(sdg_names()[g])

plt.tight_layout()
plt.savefig(f'{fig_dir}/sdg_index_breakdown_strip.png', dpi=300);

We see that there is a great variety between the distributions of individual indicators within each aggregate SDG indicator. This suggests that we should be looking at the relationships between levels of SDG research and individual target areas.

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

# One correlation heatmap per goal across that goal's normalised indicator columns.
for g, ax in zip(sdg_keys, axs):
    cols = [c for c in sdg_index_df.columns if f'sdg{g}' in c and 'normalized' in c]
    names = [c.split('_')[-1] for c in cols]
    # Hand the labels to seaborn so that one tick is drawn per indicator. With the default
    # automatic tick thinning, relabelling afterwards can mismatch the number of ticks.
    sns.heatmap(data=sdg_index_df[cols].corr(), ax=ax, xticklabels=names, yticklabels=names)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    plt.setp(ax.get_yticklabels(), rotation=0)
    ax.set_title(sdg_names()[g])

plt.tight_layout()
plt.savefig(f'{fig_dir}/sdg_index_corr_heatmap.png', dpi=300);

Furthermore, some of the sub-indicators are not correlated, showing that they may not necessarily progress by the same mechanisms or at the same rate, or that they might in fact work in contradiction.

## LQ and SDG Indicators

Here we look at the individual indicators contained within each SDG Index score for the four goals.

### Goal 3

In [None]:
# Normalised indicator columns belonging to Goal 3, and short labels taken from
# the text after the last underscore of each column name.
goal_3_indicator_cols = [c for c in sdg_index_df.columns if 'sdg3' in c and 'normalized' in c]
goal_3_indicator_names = [c.rsplit('_', 1)[-1] for c in goal_3_indicator_cols]

In [None]:
# Goal 3 funding LQ per country, with that country's Goal 3 indicator columns attached.
# The left join keeps every country in `fund_lq_all`, even if it has no indicator data.
goal_3_fund_lq_indicators = pd.merge(
                                 fund_lq_all[3],
                                 sdg_index_df.set_index('id')[goal_3_indicator_cols],
                                 left_index=True,
                                 right_index=True,
                                 how='left'
                             )

In [None]:
fig, axs = plt.subplots(nrows=4, ncols=4, figsize=(14, 10), sharex=True, sharey=True)

# One panel per Goal 3 indicator; `zip` stops at the shorter input, so at most 16 indicators are drawn.
for col, name, ax in zip(goal_3_indicator_cols, goal_3_indicator_names, axs.ravel()):
    ax.scatter(
        goal_3_fund_lq_indicators[3], 
        goal_3_fund_lq_indicators[col], 
        color=sdg_hex_color_codes()[3],
        alpha=0.7
    )
    ax.set_title(name)
    ax.set_xlabel('Research Funding LQ')
    ax.set_ylabel('Normalised Indicator Score')

plt.tight_layout()
plt.savefig(f'{fig_dir}/funding_lq_sdg_indicators_sdg3_coord_part_scatter.png', dpi=300);

### Goal 6

In [None]:
# Normalised indicator columns belonging to Goal 6, and short labels taken from
# the text after the last underscore of each column name.
goal_6_indicator_cols = [c for c in sdg_index_df.columns if 'sdg6' in c and 'normalized' in c]
goal_6_indicator_names = [c.rsplit('_', 1)[-1] for c in goal_6_indicator_cols]

In [None]:
# Goal 6 funding LQ per country, with that country's Goal 6 indicator columns attached.
# The left join keeps every country in `fund_lq_all`, even if it has no indicator data.
goal_6_fund_lq_indicators = pd.merge(
                                 fund_lq_all[6],
                                 sdg_index_df.set_index('id')[goal_6_indicator_cols],
                                 left_index=True,
                                 right_index=True,
                                 how='left'
                             )

In [None]:
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(11, 5.5), sharex=True, 
                        sharey=True
                       )

# One panel per Goal 6 indicator; `zip` stops at the shorter input, so at most 6 indicators are drawn.
for col, name, ax in zip(goal_6_indicator_cols, goal_6_indicator_names, axs.ravel()):
    ax.scatter(
        goal_6_fund_lq_indicators[6], 
        goal_6_fund_lq_indicators[col], 
        color=sdg_hex_color_codes()[6],
        alpha=0.7
    )
    ax.set_title(name)
    ax.set_xlabel('Research Funding LQ')
    ax.set_ylabel('Normalised Indicator Score')

plt.tight_layout()
plt.savefig(f'{fig_dir}/funding_lq_sdg_indicators_sdg6_coord_part_scatter.png', dpi=300);

### Goal 7

In [None]:
# Normalised indicator columns belonging to Goal 7, and short labels taken from
# the text after the last underscore of each column name.
goal_7_indicator_cols = [c for c in sdg_index_df.columns if 'sdg7' in c and 'normalized' in c]
goal_7_indicator_names = [c.rsplit('_', 1)[-1] for c in goal_7_indicator_cols]

In [None]:
# Goal 7 funding LQ per country, with that country's Goal 7 indicator columns attached.
# The left join keeps every country in `fund_lq_all`, even if it has no indicator data.
goal_7_fund_lq_indicators = pd.merge(
                                 fund_lq_all[7],
                                 sdg_index_df.set_index('id')[goal_7_indicator_cols],
                                 left_index=True,
                                 right_index=True,
                                 how='left'
                             )

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(11, 2.75), sharex=True, 
                        sharey=True
                       )

# One panel per Goal 7 indicator; `zip` stops at the shorter input, so at most 3 indicators are drawn.
for col, name, ax in zip(goal_7_indicator_cols, goal_7_indicator_names, axs.ravel()):
    ax.scatter(
        goal_7_fund_lq_indicators[7], 
        goal_7_fund_lq_indicators[col], 
        color=sdg_hex_color_codes()[7],
        alpha=0.7
    )
    ax.set_title(name)
    ax.set_xlabel('Research Funding LQ')
    ax.set_ylabel('Normalised Indicator Score')

plt.tight_layout()
plt.savefig(f'{fig_dir}/funding_lq_sdg_indicators_sdg7_coord_part_scatter.png', dpi=300);

### Goal 11

In [None]:
# Normalised indicator columns belonging to Goal 11, and short labels taken from
# the text after the last underscore of each column name.
goal_11_indicator_cols = [c for c in sdg_index_df.columns if 'sdg11' in c and 'normalized' in c]
goal_11_indicator_names = [c.rsplit('_', 1)[-1] for c in goal_11_indicator_cols]

# Goal 11 funding LQ per country, with that country's Goal 11 indicator columns attached.
# The left join keeps every country in `fund_lq_all`, even if it has no indicator data.
goal_11_fund_lq_indicators = pd.merge(
    fund_lq_all[11],
    sdg_index_df.set_index('id')[goal_11_indicator_cols],
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(11, 2.75), sharex=True, 
                        sharey=True
                       )

# One panel per Goal 11 indicator; `zip` stops at the shorter input, so at most 3 indicators are drawn.
for col, name, ax in zip(goal_11_indicator_cols, goal_11_indicator_names, axs.ravel()):
    ax.scatter(
        goal_11_fund_lq_indicators[11], 
        goal_11_fund_lq_indicators[col], 
        color=sdg_hex_color_codes()[11],
        alpha=0.7
    )
    ax.set_title(name)
    ax.set_xlabel('Research Funding LQ')
    ax.set_ylabel('Normalised Indicator Score')

plt.tight_layout()
plt.savefig(f'{fig_dir}/funding_lq_sdg_indicators_sdg11_coord_part_scatter.png', dpi=300);

## SDG Index Analysis

Cluster map of countries based on the correlation of their SDG Index score profiles.

In [None]:
# There are 17 SDGs; `range(1, 17)` stopped at goal 16. Columns absent from the data are skipped
# rather than raising a KeyError.
goal_score_cols = [c for c in (f'goal_{g}_score' for g in range(1, 18)) if c in sdg_index_df.columns]
# Correlate countries with each other (hence the transpose) on their goal-score profiles.
sns.clustermap(sdg_index_df.set_index('id')[goal_score_cols].dropna().T.corr());

Pair plot of SDG Index scores showing a wide variety of different relationships between the SDGs in terms of performance. Positive, negative and null correlations of varying degrees of strength!

In [None]:
# There are 17 SDGs; `range(1, 17)` stopped at goal 16. Columns absent from the data are skipped
# rather than raising a KeyError.
goal_score_cols = [c for c in (f'goal_{g}_score' for g in range(1, 18)) if c in sdg_index_df.columns]
sns.pairplot(sdg_index_df[goal_score_cols]);

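### Collecting Organisation Information

**Note:** the cells in this section parse the raw CORDIS project XML files and write the `orgs_parsed_*.json` files that the *Funding LQ and SDG Score* section above reads. If those files do not exist yet, run this section first.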

In [None]:
from bs4 import BeautifulSoup
from xml.etree import ElementTree
import os

In [None]:
# Directory holding the raw CORDIS H2020 project XML files.
xml_project_dir = f'{data_path}/raw/cordis/h2020/h2020_projects_xml/'

# Path to a single example project file; it is not referenced again in the cells below.
test_file = os.path.join(xml_project_dir, 'project-rcn-193159_en.xml')

In [None]:
def get_text(element, tag):
    """Return the text of the first `tag` found under `element`.

    Args:
        element: Any object with a `find(tag)` method whose result exposes `.text`.
        tag (str): Name of the child tag to look up.

    Returns:
        The `.text` of the match, or None when `find` returns None.
    """
    match = element.find(tag)
    return None if match is None else match.text

def parse_xml_project(project_soup):
    """Extract one record per <organization> tag from a parsed project XML document.

    Args:
        project_soup: Parsed project document exposing `find_all` (e.g. a
            BeautifulSoup object).

    Returns:
        list: One dict per organisation holding the tag's attributes plus
            'org_id', 'org_rcn', 'iso2_code', 'eu_code', 'legal_name', 'lat'
            and 'lon'. 'lat' and 'lon' are None when the geolocation is
            missing or is not of the form '<lat>,<lon>'. When the document
            has no organisations the list contains a single empty dict.
    """
    orgs_parsed = []
    # Read from the argument; the original read the global `soup` and only
    # worked because the calling loop happened to use that variable name.
    for org in project_soup.find_all('organization'):
        # Copy so that the extra keys are not written back onto the parsed tag.
        attrs = dict(org.attrs)
        attrs['org_id'] = get_text(org, 'id')
        attrs['org_rcn'] = get_text(org, 'rcn')
        attrs['iso2_code'] = get_text(org, 'isoCode')
        attrs['eu_code'] = get_text(org, 'euCode')
        attrs['legal_name'] = get_text(org, 'legalName')

        lat, lon = None, None
        latlon = get_text(org, 'geolocation')
        if latlon:
            parts = latlon.split(',')
            # Guard against values without a comma, which used to raise IndexError.
            if len(parts) >= 2:
                lat, lon = parts[0], parts[1]
        attrs['lat'] = lat
        attrs['lon'] = lon
        orgs_parsed.append(attrs)

    if not orgs_parsed:
        # Keep a placeholder record so the caller can still attach the project rcn.
        orgs_parsed.append({})
    return orgs_parsed

In [None]:
def chunks(lst, n):
    """Lazily split `lst` into consecutive slices of length `n` (the last may be shorter)."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [None]:
import json

In [None]:
# Only the XML files, in a stable order, so that stray files are ignored and the contents of
# each output chunk are reproducible between runs (`os.listdir` order is arbitrary).
files = sorted(name for name in os.listdir(xml_project_dir) if name.endswith('.xml'))

orgs_out_dir = f'{data_path}/processed/cordis/h2020/h2020_orgs_xml'
os.makedirs(orgs_out_dir, exist_ok=True)

# Parse the projects in batches of 500 and write one JSON file of organisation records per batch.
for i, chunk in enumerate(chunks(files, 500)):
    orgs = []
    for file_name in chunk:
        with open(os.path.join(xml_project_dir, file_name), 'r') as f:
            soup = BeautifulSoup(f.read(), features='xml')
        parsed = parse_xml_project(soup)

        # File names look like 'project-rcn-193159_en.xml'. The rcn is taken from the bare
        # file name rather than from the full path.
        rcn = file_name.split('-')[-1].split('_')[0]
        for p in parsed:
            p['rcn'] = rcn

        orgs.extend(parsed)
    with open(f'{orgs_out_dir}/orgs_parsed_{str(i).zfill(2)}.json', 'w') as f:
        json.dump(orgs, f)

In [None]:
# Spot-check one of the organisation files written by the parsing loop.
xml_org_df = pd.read_json(f'{data_path}/processed/cordis/h2020/h2020_orgs_xml/orgs_parsed_01.json')
xml_org_df.head()