# CORDIS SDG Analysis

In this notebook we perform a preliminary analysis to test the hypothesis:

_There is a positive link between the R&I funding and national performance on the SDG index._

We investigate the relationship between research specialisation in the H2020 programme and the 2019 SDG Index scores for goals 3, 6, 7 and 11.

## Preamble

In [None]:
%run ../notebook_preamble.ipy

from sdg_mapping.cordis import load_cordis_projects, load_cordis_project_sdgs
from sdg_mapping.cordis.cordis_utils import FRAMEWORK_PROGRAMMES
from sdg_mapping.utils.sdg_utils import sdg_hex_color_codes, sdg_names
from sdg_mapping.sdg_index.sdg_index_utils import load_sdg_index

import os
from itertools import chain
from collections import Counter

import seaborn as sns

In [None]:
# Presumably the output directory for this notebook's figures.
# NOTE(review): `fig_dir` is not read by any `plt.savefig` call visible in this notebook;
# those calls write to `reports/analysis_cordis_sdg_collaborations` instead - confirm which
# directory is intended.
fig_dir = os.path.join(project_dir, 'reports', 'analysis_cordis_sdg_index')

In [None]:
# H2020 projects and their SDG labels, both indexed by project `rcn`.
project_h2020_df = load_cordis_projects('h2020').set_index('rcn')
project_sdgs_h2020_df = load_cordis_project_sdgs('h2020', 'label').set_index('rcn')

sdg_index_df = load_sdg_index(2019, index_type='report')

# Column 0 flags projects whose label columns sum to zero.
# The column is reset to 0 first so that it does not contribute to the row sum, then assigned
# in a single step: the previous chained form `df[0][mask] = 1` triggers
# SettingWithCopyWarning and does not write through under pandas copy-on-write.
project_sdgs_h2020_df[0] = 0
project_sdgs_h2020_df[0] = (project_sdgs_h2020_df.sum(axis=1) == 0).astype(int)

# Goals analysed throughout the notebook and the matching SDG Index score column names.
sdg_keys = [3, 6, 7, 11]
sdg_index_score_keys = ['goal_{}_score'.format(g) for g in sdg_keys]

## Mapping Country Codes

In [None]:
def replace_coordinator_eu_codes(x):
    """Translate the EU-specific country codes 'EL' and 'UK' to 'GR' and 'GB'.

    Any other value is returned unchanged.
    """
    return {'EL': 'GR', 'UK': 'GB'}.get(x, x)

def fillna_list(x):
    """Coerce a cell value to a list, mapping missing values to an empty list.

    Args:
        x: A list, another list-like (tuple, set, array, Series), a missing
            value (None / NaN / NA) or a single scalar.

    Returns:
        list: `x` itself if it is already a list, `[]` for a missing value,
            `list(x)` for other list-likes, and `[x]` for a non-null scalar.
    """
    if isinstance(x, list):
        return x
    if x is None:
        return []
    # List-likes must be handled before `pd.isnull`, which is element-wise on them and
    # would make the truth test below ambiguous.
    if pd.api.types.is_list_like(x):
        return list(x)
    if pd.isnull(x):
        return []
    # Previously a non-null scalar fell through and implicitly returned None, which breaks
    # callers that iterate over or concatenate the result.
    return [x]
    
def replace_participant_eu_codes(x):
    """Convert a list of participant country codes to ISO3 codes.

    'EL' and 'UK' are first translated to 'GR' and 'GB'. 'XK' (Kosovo, which is not
    in the SDG Index data) is dropped. Every remaining code is looked up in the
    module-level `iso2_to_iso3_map`; an unknown code raises KeyError.
    """
    eu_to_iso2 = {'EL': 'GR', 'UK': 'GB'}
    return [
        iso2_to_iso3_map[eu_to_iso2.get(code, code)]
        for code in x
        if code != 'XK'
    ]

In [None]:
country_df = pd.read_json(f'{data_path}/raw/countries/countries_restcountries_api.json')
# Lookup from the alpha-2 to the alpha-3 country code, built from the two code columns.
iso2_to_iso3_map = dict(zip(country_df['alpha2Code'], country_df['alpha3Code']))

# Coordinator: translate EU-specific codes, then map to alpha-3 (unknown codes become NaN).
project_h2020_df['coordinator_country'] = (
    project_h2020_df['coordinator_country']
    .apply(replace_coordinator_eu_codes)
    .map(iso2_to_iso3_map)
)

# Participants: make every cell a list, then convert each code in the list to alpha-3.
project_h2020_df['participant_countries'] = (
    project_h2020_df['participant_countries']
    .apply(fillna_list)
    .apply(replace_participant_eu_codes)
)

### Country Location Quotients

In [None]:
def generate_all_countries(projects):
    """Build, per project, the list of coordinator plus participant countries.

    If `participant_countries` contains nulls, the column is filled in place on
    `projects` via `fillna_list` before the lists are built.

    Args:
        projects (pd.DataFrame): Must have `coordinator_country` and
            `participant_countries` columns.

    Returns:
        list: One list per row; the coordinator is prepended unless it is null.
    """
    if projects['participant_countries'].isnull().any():
        projects['participant_countries'] = projects['participant_countries'].apply(fillna_list)

    return [
        participants if pd.isnull(coordinator) else [coordinator] + participants
        for coordinator, participants in zip(projects['coordinator_country'],
                                             projects['participant_countries'])
    ]

def create_quotient(X, binary=False):
    """Calculate the location quotient.

    Each row's share of activity per column is divided by that column's share of
    the total activity over all rows.

    Args:
        X (pandas.DataFrame): Rows are locations, columns are sectors and values
            are the activity in a given sector at a location.
        binary (bool, optional): If True, return a boolean frame that is True
            where the quotient is greater than 1.

    Returns:
        pandas.DataFrame: Quotients (or booleans) with the index and columns of `X`.
    """
    counts = X.values
    row_shares = counts / counts.sum(1)[:, np.newaxis]
    overall_shares = counts.sum(0) / counts.sum()
    quotient = pd.DataFrame(row_shares / overall_shares, index=X.index, columns=X.columns)

    if binary:
        return quotient > 1
    return quotient

def create_cordis_country_lq(projects, values, country_col, binary=False):
    '''create_cordis_country_lq
    Location quotient of `values` by country for CORDIS projects.

    The list-valued `country_col` is exploded to one row per (project, country),
    joined to `values` on the index, summed per country and passed to
    `create_quotient`.

    Args:
        projects (pd.DataFrame): CORDIS projects dataframe. Should have
            project rcn as the index.
        values (pd.DataFrame): Values to calculate the location quotient from.
            Should have project rcn as the index.
        country_col (str): Name of the `projects` column with the countries of interest.
        binary (bool, optional): Forwarded to `create_quotient`.

    Returns:
        (pd.DataFrame): CORDIS country based location quotients
    '''
    per_country = (
        projects[country_col]
        .explode()
        .to_frame()
        .merge(values, left_index=True, right_index=True, how='inner')
        .groupby(country_col)
        .sum()
    )
    return create_quotient(per_country, binary=binary)

def create_cordis_country_sum(projects, values, country_col):
    '''create_cordis_country_sum
    Sums `values` by country for CORDIS projects.

    The list-valued `country_col` is exploded to one row per (project, country),
    joined to `values` on the index and summed per country.

    Args:
        projects (pd.DataFrame): CORDIS projects dataframe. Should have
            project rcn as the index.
        values (pd.DataFrame): Values to sum. Should have project rcn as the index.
        country_col (str): Name of the `projects` column with the countries of interest.

    Returns:
        (pd.DataFrame): One row per country, one column per column of `values`.
    '''
    exploded = projects[country_col].explode().to_frame()
    joined = exploded.merge(values, left_index=True, right_index=True, how='inner')
    return joined.groupby(country_col).sum()

def normalise(df, norm=0):
    '''normalise
    Divide a dataframe by its column or row totals.

    Args:
        df (pd.DataFrame): A quantitative dataframe.
        norm (int): The axis along which to normalise. 0 divides each column by
            its sum and 1 divides each row by its sum. Any other value returns
            `df` unchanged.

    Returns:
        (pd.DataFrame): The normalised dataframe.
    '''
    if norm not in (0, 1):
        return df

    sum_axis = 0 if norm == 0 else 1
    totals = df.sum(axis=sum_axis)
    return df.divide(totals, axis=1 - sum_axis)

In [None]:
# Per project: the coordinator country (when not null) followed by the participant countries,
# as built by `generate_all_countries`.
project_h2020_df['all_countries'] = generate_all_countries(project_h2020_df)

## Country Collaboration Networks

In [None]:
from collections import Counter
from itertools import combinations
import networkx as nx

In [None]:
def cooccurrence_edges(groups):
    """cooccurrence_edges
    Count how often each unordered pair of items appears together in a group.

    Args:
        groups (iterable): Iterable of iterables of sortable items.

    Returns:
        collections.Counter: Maps each sorted 2-tuple of items to the number of
            times that pair is produced across all groups.
    """
    return Counter(
        tuple(sorted(pair))
        for group in groups
        for pair in combinations(group, 2)
    )

def cooccurence_nodes(groups):
    """cooccurrence_nodes
    Count how many times each item appears across all groups.

    Args:
        groups (iterable): Iterable of iterables of hashable items.

    Returns:
        collections.Counter: Maps each item to its number of occurrences.
    """
    return Counter(chain.from_iterable(groups))

def country_collaboration_network(project_df, sdg_df, country_col, goal):
    """country_collaboration_network
    Build a weighted co-occurrence graph of countries for one goal.

    Args:
        project_df (pd.DataFrame): Projects with a list-valued `country_col`.
        sdg_df (pd.DataFrame): Rows where column `goal` equals 1 select the projects.
        country_col (str): Column of `project_df` holding the country lists.
        goal: Column label in `sdg_df`.

    Returns:
        networkx.Graph: Node `weight` is the number of occurrences of a country
            and edge `weight` the number of co-occurrences of a pair of countries
            in the selected projects.
    """
    goal_ids = sdg_df[sdg_df[goal] == 1].index.values
    goal_countries = project_df[country_col].reindex(goal_ids)

    node_weights = cooccurence_nodes(goal_countries)
    edge_weights = cooccurrence_edges(goal_countries)

    graph = nx.Graph()
    for country, count in node_weights.items():
        graph.add_node(country, weight=count)
    for (source, target), count in edge_weights.items():
        graph.add_edge(source, target, weight=count)

    return graph

In [None]:
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))

# One panel per goal: draw the largest connected component of the country
# collaboration network for that goal.
for goal, ax in zip(sdg_keys, axs.ravel()):
    g = country_collaboration_network(project_h2020_df, project_sdgs_h2020_df, 'all_countries', goal)
    # `nx.connected_component_subgraphs` was removed in networkx 2.4; take the largest
    # node set from `connected_components` and build the subgraph from it instead.
    giant = g.subgraph(max(nx.connected_components(g), key=len))
    nx.draw(giant, node_color=sdg_hex_color_codes()[goal], edge_color='gray', alpha=0.7, ax=ax)
    ax.set_title(sdg_names()[goal])

## Country Diversity

In [None]:
from skbio.diversity.alpha import shannon, simpson
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Binary project x country indicator matrix built from the `all_countries` lists,
# and the number of distinct countries per project.
mlb = MultiLabelBinarizer()
dums = pd.DataFrame(
    mlb.fit_transform(project_h2020_df['all_countries']),
    index=project_h2020_df.index,
    columns=mlb.classes_,
)
sums = dums.sum(axis=1)

### Distribution of Number of Collaborators

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

# One histogram per goal of the per-project values in `sums`, restricted to
# projects labelled with the goal and with a value greater than 1.
for goal, ax in zip(sdg_keys, axs):
    is_goal = project_sdgs_h2020_df[goal] == 1
    ids = project_sdgs_h2020_df[is_goal].index.values

    sums_goal = sums.reindex(ids)
    sums_goal = sums_goal.loc[sums_goal > 1]

    sums_goal.plot.hist(ax=ax, bins=range(1, 31), color=sdg_hex_color_codes()[goal])
    ax.set_title(sdg_names()[goal])
    ax.set_xlabel('Collaborators per Project')

plt.tight_layout()
plt.savefig(f'{project_dir}/reports/analysis_cordis_sdg_collaborations/n_collaborators_by_sdg_hist.png', dpi=300);

\*projects with more than one participant

### Collaborator Diversity over Time

In [None]:
# One-hot encode each organisation's `org_type`, sum the indicators per `rcn` to get
# org-type counts per project, then compute a Simpson and a Shannon index for each row.
# NOTE(review): `orgs_h2020_df` is first assigned further down, in the "Organisation
# Collaboration" section, so this cell raises NameError on a fresh Restart & Run All.
# NOTE(review): this rebinds `dums`, which until here held the project x country indicator
# matrix; later cells that read `dums` (including "Country Clusters") see these org-type
# counts instead - confirm that is intended.
dums = pd.get_dummies(orgs_h2020_df['org_type']).groupby('rcn').sum()
simps = dums.apply(simpson, axis=1)
shans = dums.apply(shannon, axis=1)

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

# One panel per goal: per-project Simpson index by start year (points) with the
# yearly mean overlaid (line).
for goal, ax in zip(sdg_keys, axs.ravel()):
    is_goal = project_sdgs_h2020_df[goal] == 1
    ids = project_sdgs_h2020_df[is_goal].index.values

    # Keep goal projects whose row total in `dums` is greater than 1.
    sums = dums.sum(axis=1)
    simp = simps[sums > 1].reindex(ids).dropna()
    ids = simp.index.values

    years = project_h2020_df['start_date'].dt.year.loc[ids]

    div = pd.DataFrame({'simp': simp, 'year': years})
    in_window = (div['year'] > 2014) & (div['year'] < 2020)
    div = div[in_window]

    sns.stripplot(data=div, x='year', y='simp', ax=ax,
                  color=sdg_hex_color_codes()[goal], alpha=0.3)

    # Index reset to 0..n-1 before plotting, which lines the means up with the
    # strip plot's category positions.
    yearly_mean = div.groupby('year')['simp'].mean().reset_index(drop=True)
    yearly_mean.plot(color=sdg_hex_color_codes()[goal], linewidth=2, ax=ax)

    ax.set_xlabel('Year')
    ax.set_ylabel('Simpson Index')

plt.tight_layout()
plt.savefig(f'{project_dir}/reports/analysis_cordis_sdg_collaborations/org_simpson_vs_year_by_sdg_stripplot.png', dpi=300);

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

# One panel per goal: per-project Shannon index by start year (points) with the
# yearly mean overlaid (line).
for goal, ax in zip(sdg_keys, axs.ravel()):
    is_goal = project_sdgs_h2020_df[goal] == 1
    ids = project_sdgs_h2020_df[is_goal].index.values

    # Keep goal projects whose row total in `dums` is greater than 1.
    sums = dums.sum(axis=1)
    shan = shans[sums > 1].reindex(ids).dropna()
    ids = shan.index.values

    years = project_h2020_df['start_date'].dt.year.loc[ids]

    div = pd.DataFrame({'shan': shan, 'year': years})
    in_window = (div['year'] > 2014) & (div['year'] < 2020)
    div = div[in_window]

    sns.stripplot(data=div, x='year', y='shan', ax=ax,
                  color=sdg_hex_color_codes()[goal], alpha=0.3)

    # Index reset to 0..n-1 before plotting, which lines the means up with the
    # strip plot's category positions.
    yearly_mean = div.groupby('year')['shan'].mean().reset_index(drop=True)
    yearly_mean.plot(color=sdg_hex_color_codes()[goal], linewidth=2, ax=ax)

    ax.set_xlabel('Year')
    ax.set_ylabel('Shannon Index')

plt.tight_layout()
plt.savefig(f'{project_dir}/reports/analysis_cordis_sdg_collaborations/org_shannon_vs_year_by_sdg_stripplot.png', dpi=300);

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3), sharey=True)

# These do not depend on the goal, so they are computed once rather than per iteration.
# `totals` is 1 for a project with more than one distinct org type and 0 otherwise,
# restricted to projects whose row total in `dums` is greater than 1.
sums = dums.sum(axis=1)
totals = dums[sums > 1]
totals = ((totals > 0).sum(axis=1) > 1).astype(int)

for goal, ax in zip(sdg_keys, axs.ravel()):
    ids = project_sdgs_h2020_df[project_sdgs_h2020_df[goal] == 1].index.values
    years = project_h2020_df['start_date'].dt.year.loc[ids]

    # Aligns on the project index; rows without a year (projects outside this goal)
    # are removed by the year filter below.
    div = pd.DataFrame({'totals': totals, 'year': years})
    div = div[(div['year'] > 2014) & (div['year'] < 2020)]

    # The mean of a 0/1 indicator is a fraction; scale it to match the '%' axis label.
    counts = div.groupby('year').mean() * 100
    counts.plot(ax=ax, color=sdg_hex_color_codes()[goal], linewidth=2, legend=None, marker='o')
    ax.set_xlabel('Year')
    ax.set_ylabel('% of Multi Org Type Projects')

plt.tight_layout()
plt.savefig(f'{project_dir}/reports/analysis_cordis_sdg_collaborations/frac_multi_org_projects_vs_year_by_sdg_line.png', dpi=300);

### Distribution of SDG Share by Country

In [None]:
from operator import itemgetter

In [None]:
country_sdg_counts = create_cordis_country_sum(project_h2020_df, project_sdgs_h2020_df, 'all_countries')
# Keep only countries whose counts summed over all columns reach at least 10.
has_enough_projects = country_sdg_counts.sum(axis=1) >= 10
country_sdg_counts = country_sdg_counts[has_enough_projects]

# Row-normalised: each column's share of a country's total, in percent.
sdg_share_country = normalise(country_sdg_counts, norm=1) * 100

fig, ax = plt.subplots(figsize=(6, 4))
data = sdg_share_country[sdg_keys].melt()
palette = itemgetter(*sdg_keys)(sdg_hex_color_codes())
sns.boxplot(data=data, x='variable', y='value', palette=palette, ax=ax)
ax.set_xlabel('Goal')
ax.set_ylabel('Country Project Share (%)')

plt.tight_layout()
plt.savefig(f'{project_dir}/reports/analysis_cordis_sdg_collaborations/country_project_share_by_sdg_boxplot.png', dpi=300);

Here we can see each SDG as the share of a country's projects. For example, for Goal 3 we can see that at least one country has over 40% of its projects related to the goal.

This shows that although some goals are much more prevalent than others, there are countries which specialise much more than others.

## Country Clusters

In [None]:
from umap import UMAP
import altair as alt

In [None]:
# Embed the rows of `dums` whose row total is greater than 1 into two dimensions.
# NOTE(review): when the notebook is run top to bottom, `dums` was last assigned in
# "Collaborator Diversity over Time" (org-type counts), not the country indicator
# matrix - confirm which matrix this section is meant to embed.
multi_mask = dums.sum(axis=1) > 1

# UMAP is stochastic; a fixed seed keeps the layout the same across re-runs.
umap = UMAP(random_state=42)
umap_vecs = umap.fit_transform(dums[multi_mask])
umap_df = pd.DataFrame(umap_vecs, columns=['x', 'y'], index=dums[multi_mask].index)

In [None]:
# Tooltip text: the comma-joined country list of each embedded project.
umap_df['all_countries'] = project_h2020_df['all_countries'].loc[umap_df.index].str.join(', ')

# Plot at most 5000 points. Capping at the number of rows avoids the ValueError that
# `sample(5000)` raises on a smaller frame, and the seed makes the subsample repeatable.
n_points = min(5000, len(umap_df))

alt.Chart(umap_df.sample(n=n_points, random_state=42)).mark_point().encode(
    x='x',
    y='y',
    tooltip='all_countries'
)

- get country sdg project count by date
- does the number of collaborations drive the future specialisation of projects by that country?

- Does the SDG Index performance of a country go up after its first collaboration with another country on an SDG?
- Is SDG Index performance impacted by participation in a project?
- What is the difference in SDG profiles between private and public sector?
- Does an increase in SDG research correspond to an increase or decrease in SDG Index performance?


1. Calculate the difference in SDG index performance between the first year of a collaboration and the year after.
2. Figure out if the difference is more or less than the average increase that year? Or if the percentage increases in performance are better or worse than those who do not participate. Or does the rate of improvement change?

## Organisation Collaboration

We want to know whether different SDGs have different collaboration structures between organisations. Are there more distinct or centralised communities for one SDG? Can we describe those communities by the distribution of the disciplinary specialisations of their organisations? E.g. does an SDG have more distinct communities that also have stronger disciplinary specialisations, or is there a highly interdisciplinary overlap?

Are there collaboration clusters?

Change in network structure over time:

- Motif analysis?
- Change in centrality
- Distribution of centrality?
- Community detection?

How does this network composition relate to SDG Index performance?

What are the differences between countries' performance on the SDG Index based on their collaboration (% of projects done in collaboration vs change in SDG Index score)? Does collaboration lead to a greater than expected increase in share of projects?

In [None]:
xml_parsed_dir = f'{data_path}/processed/cordis/h2020/h2020_orgs_xml'

# Read every parsed organisation file in the directory.
# `os.listdir` returns entries in arbitrary order; sorting makes the row order of the
# concatenated frame deterministic, which matters for the later
# `drop_duplicates('legal_name')` calls that keep the first occurrence.
dfs = [
    pd.read_json(os.path.join(xml_parsed_dir, file))
    for file in sorted(os.listdir(xml_parsed_dir))
]

orgs_h2020_df = pd.concat(dfs, sort=True)

# Release the per-file frames once they have been concatenated.
del dfs

orgs_h2020_df['iso3_code'] = orgs_h2020_df['iso2_code'].map(iso2_to_iso3_map)
# Index by project `rcn` and drop the rows labelled 'metadata.xml'.
orgs_h2020_df = orgs_h2020_df.set_index('rcn').drop('metadata.xml')
# Literal (non-regex) removal of '/' from the org type codes.
orgs_h2020_df['org_type'] = orgs_h2020_df['org_type'].str.replace('/', '', regex=False)
# Organisation legal names per project.
project_orgs = orgs_h2020_df.groupby('rcn')['legal_name'].apply(list)

### Number of Participants by Organisation Type

In [None]:
def sdg_project_ids(sdg_df, goal):
    """Return the index values, cast to int, of the rows where column `goal` equals 1.

    Args:
        sdg_df (pd.DataFrame): Frame with one column per goal.
        goal: Column label in `sdg_df`.

    Returns:
        numpy.ndarray: Integer ids of the matching rows.
    """
    is_goal = sdg_df[goal] == 1
    goal_index = sdg_df[is_goal].index
    return goal_index.astype(int).values

In [None]:
# Cast the `rcn` index to int so it can be matched against the integer ids returned by
# `sdg_project_ids`.
orgs_h2020_df.index = orgs_h2020_df.index.astype(int)

In [None]:
# Ids of the Goal 3 projects, and the subset of those ids present in the organisations index.
# NOTE(review): `proj_ids` is not read again in the visible part of this notebook and `ids`
# is reassigned by the following cells - this looks like a leftover exploratory check.
ids = sdg_project_ids(project_sdgs_h2020_df, 3)
proj_ids = orgs_h2020_df.index.intersection(ids).astype(int)

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

# One bar chart per goal: number of organisation rows of each org type in the goal's projects.
for ax, goal in zip(axs, sdg_keys):

    ids = sdg_project_ids(project_sdgs_h2020_df, goal)
    # Select by membership rather than `.loc[ids]`, which raises KeyError as soon as one
    # labelled project has no organisation rows.
    goal_orgs = orgs_h2020_df[orgs_h2020_df.index.isin(ids)]
    org_type_counts = goal_orgs['org_type'].value_counts()
    org_type_counts.sort_index().plot.bar(color=sdg_hex_color_codes()[goal], ax=ax)

    ax.set_xlabel('Org Type')
    ax.set_ylabel('Involvement Frequency')

plt.tight_layout()
plt.savefig(
    f'{project_dir}/reports/analysis_cordis_sdg_collaborations/n_org_type_by_sdg_bar.png', 
    dpi=300
);

In [None]:
fig, ax = plt.subplots()

# Percentage of organisation rows of each org type, per goal.
org_type_shares = {}

for goal in sdg_keys:

    ids = sdg_project_ids(project_sdgs_h2020_df, goal)
    # Select by membership rather than `.loc[ids]`, which raises KeyError as soon as one
    # labelled project has no organisation rows.
    goal_orgs = orgs_h2020_df[orgs_h2020_df.index.isin(ids)]
    org_type_shares[goal] = goal_orgs['org_type'].value_counts(normalize=True).sort_index() * 100

# Building the frame from a dict keeps the union of org types across goals. Assigning
# columns one at a time to an empty DataFrame fixes the index on the first goal and
# silently drops any org type that goal does not contain.
df = pd.DataFrame(org_type_shares)

# Order org types by their total share across the goals, largest first.
df = df.loc[df.sum(axis=1).sort_values(ascending=False).index]

sns.heatmap(df.T, ax=ax, cmap='viridis', annot=True)
ax.set_xlabel('Org Type')
ax.set_ylabel('Goal')
plt.tight_layout()
plt.savefig(
    f'{project_dir}/reports/analysis_cordis_sdg_collaborations/frac_org_type_by_sdg_heatmap.png', 
    dpi=300
);

In [None]:
fig, ax = plt.subplots(ncols=2, figsize=(10, 4))

# Fraction of organisation rows of each org type, per goal.
org_type_shares = {}

for goal in sdg_keys:

    ids = sdg_project_ids(project_sdgs_h2020_df, goal)
    # Select by membership rather than `.loc[ids]`, which raises KeyError as soon as one
    # labelled project has no organisation rows.
    goal_orgs = orgs_h2020_df[orgs_h2020_df.index.isin(ids)]
    org_type_shares[goal] = goal_orgs['org_type'].value_counts(normalize=True).sort_index()

# A dict of Series keeps the union of org types across goals (column-wise assignment to an
# empty DataFrame would drop types missing from the first goal).
df = pd.DataFrame(org_type_shares)

# `order` is reused by the coordinator heatmap cell further down.
order = df.sum(axis=1).sort_values(ascending=False).index
df = df.loc[order]

# Baseline: org type shares over all organisation rows. This previously read `orgs`, which
# is not defined until the coordinator cells below and so failed on a fresh kernel.
all_org_frac = orgs_h2020_df['org_type'].value_counts(normalize=True)
# Ratio of the goal share to the overall share: 1 means the type is represented as expected.
df_spec = df.divide(all_org_frac, axis=0).loc[order]

sns.heatmap(df.T * 100, ax=ax[0], cmap='viridis', annot=True,
            cbar_kws={'label': 'Share of Organisations'})
sns.heatmap(df_spec.T, ax=ax[1], cmap='bwr_r', annot=True, 
            cbar_kws={'label': 'Representation'}, center=1
           )
ax[0].set_xlabel('Org Type')
ax[1].set_ylabel('Goal')
plt.tight_layout()

The number of instances of different organisation types participating in a project shows that some SDGs are much more likely to be carried out by certain types of organisation.

### Number of Project Coordinators by Organisation Type

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

# Coordinator rows only; this does not depend on the goal, so it is filtered once.
orgs = orgs_h2020_df[orgs_h2020_df['type'] == 'coordinator']

# One bar chart per goal: number of coordinators of each org type.
for ax, goal in zip(axs, sdg_keys):

    ids = sdg_project_ids(project_sdgs_h2020_df, goal)
    # Select by membership rather than `.loc[ids]`, which raises KeyError as soon as one
    # labelled project has no coordinator row.
    org_type_counts = orgs[orgs.index.isin(ids)]['org_type'].value_counts()
    org_type_counts.sort_index().plot.bar(color=sdg_hex_color_codes()[goal], ax=ax)
    ax.set_xlabel('Org Type')
    ax.set_ylabel('Coordinator Frequency')

plt.tight_layout()
plt.savefig(
    f'{project_dir}/reports/analysis_cordis_sdg_collaborations/n_coord_vs_org_type_by_sdg_bar.png', 
    dpi=300
);

In [None]:
fig, ax = plt.subplots(ncols=2, figsize=(10, 4))

# Coordinator rows only; this does not depend on the goal, so it is filtered once.
orgs = orgs_h2020_df[orgs_h2020_df['type'] == 'coordinator']

# Fraction of coordinators of each org type, per goal.
coordinator_shares = {}

for goal in sdg_keys:

    ids = sdg_project_ids(project_sdgs_h2020_df, goal)
    # Select by membership rather than `.loc[ids]`, which raises KeyError as soon as one
    # labelled project has no coordinator row.
    goal_coords = orgs[orgs.index.isin(ids)]
    coordinator_shares[goal] = goal_coords['org_type'].value_counts(normalize=True).sort_index()

# A dict of Series keeps the union of org types across goals (column-wise assignment to an
# empty DataFrame would drop types missing from the first goal).
df = pd.DataFrame(coordinator_shares)

# `order` comes from the participant heatmap cell above, so both figures list the org types
# in the same order. `reindex` tolerates org types that have no coordinators, where
# `.loc[order]` would raise KeyError.
df = df.reindex(order)

# Baseline: org type shares over all coordinators.
all_org_frac = orgs['org_type'].value_counts(normalize=True)
# Ratio of the goal share to the overall share: 1 means the type is represented as expected.
df_spec = df.divide(all_org_frac, axis=0).reindex(order)

sns.heatmap(df.T * 100, ax=ax[0], cmap='viridis', annot=True,
            cbar_kws={'label': 'Share of Coordinators'})
sns.heatmap(df_spec.T, ax=ax[1], cmap='bwr_r', annot=True, 
            cbar_kws={'label': 'Representation'}, center=1
           )
ax[0].set_xlabel('Org Type')
ax[1].set_ylabel('Goal')
plt.tight_layout()

In [None]:
# Baseline: percentage of all coordinators that belong to each org type.
coords_all = orgs_h2020_df[orgs_h2020_df['type'] == 'coordinator']
coord_counts_all = coords_all['org_type'].value_counts() / coords_all.shape[0] * 100
coord_counts_all = coord_counts_all.sort_index()

fig, axs = plt.subplots(ncols=4, figsize=(15, 3), sharey=True)

# Same coordinator subset as `coords_all`; bound to `orgs` as in the neighbouring cells.
orgs = coords_all

for ax, goal in zip(axs, sdg_keys):

    ids = sdg_project_ids(project_sdgs_h2020_df, goal)
    # Select by membership rather than `.loc[ids]`, which raises KeyError as soon as one
    # labelled project has no coordinator row.
    org_type_counts = orgs[orgs.index.isin(ids)]['org_type'].value_counts() / len(ids) * 100
    # Use the baseline's org types, in the same order, so that bar i and marker i always
    # refer to the same type even when a goal has no coordinators of some type.
    org_type_counts = org_type_counts.reindex(coord_counts_all.index, fill_value=0)
    org_type_counts.plot.bar(color=sdg_hex_color_codes()[goal], ax=ax)
    # pandas draws the bars at integer positions 0..n-1; put the baseline markers there too.
    ax.scatter(np.arange(len(coord_counts_all)), coord_counts_all.values,
               color='C0', zorder=5, marker='_', s=100)
    ax.set_xlabel('Org Type')
    ax.set_ylabel('Coordinator Frequency')

plt.tight_layout()
plt.savefig(
    f'{project_dir}/reports/analysis_cordis_sdg_collaborations/frac_coord_vs_org_type_by_sdg_bar.png', 
    dpi=300
);

The same ordering broadly holds true across SDGs if we only look at the project coordinators, although we see that the number of public and other organisations decreases dramatically.

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

# Coordinator rows only; this does not depend on the goal, so it is filtered once.
coordinators = orgs_h2020_df[orgs_h2020_df['type'] == 'coordinator']

# One bar chart per goal: number of distinct coordinating organisations of each org type.
for ax, goal in zip(axs, sdg_keys):

    ids = sdg_project_ids(project_sdgs_h2020_df, goal)
    # Restrict to the goal's projects first, then de-duplicate by legal name. De-duplicating
    # over all coordinators first keeps only each organisation's first project, so an
    # organisation was lost from a goal whenever that first project belonged elsewhere.
    orgs = coordinators[coordinators.index.isin(ids)]
    orgs = orgs.drop_duplicates(subset='legal_name')
    org_type_counts = orgs['org_type'].value_counts()
    org_type_counts.plot.bar(color=sdg_hex_color_codes()[goal], ax=ax)
    ax.set_xlabel('Org Type')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.savefig(
    f'{project_dir}/reports/analysis_cordis_sdg_collaborations/n_unique_orgs_vs_org_type_by_sdg_bar.png', 
    dpi=300
);

If we look instead at the number of organisations involved in each SDG, we see a very different picture. There are generally very large numbers of private organisations in comparison to all other types. This makes sense perhaps because private organisations are typically smaller and focus on one area or project, while organisations such as universities and foundations will be involved in very large portfolios of work and may hold multiple simultaneous projects on a topic.

In [None]:
# Number of distinct organisations (by legal name) of each org type over all projects.
unique_org_type_counts = orgs_h2020_df.drop_duplicates('legal_name')['org_type'].value_counts()

fig, axs = plt.subplots(ncols=4, figsize=(15, 3), sharey=True)

# One bar chart per goal: distinct organisations in the goal's projects as a percentage of
# all distinct organisations of the same type.
for ax, goal in zip(axs, sdg_keys):

    ids = sdg_project_ids(project_sdgs_h2020_df, goal)
    # Select by membership rather than `.loc[ids]`, which raises KeyError as soon as one
    # labelled project has no organisation rows.
    orgs = orgs_h2020_df[orgs_h2020_df.index.isin(ids)]
    org_type_counts = orgs.drop_duplicates(subset='legal_name')['org_type'].value_counts()
    org_type_counts = org_type_counts / unique_org_type_counts * 100
    org_type_counts.plot.bar(color=sdg_hex_color_codes()[goal], ax=ax)
    ax.set_xlabel('Org Type')
    ax.set_ylabel('% of Orgs')

plt.tight_layout()
plt.savefig(
    f'{project_dir}/reports/analysis_cordis_sdg_collaborations/frac_unique_orgs_vs_org_type_by_sdg_bar.png', 
    dpi=300
);

But what about the actual percentage of all organisations in each type that have been involved in SDG related work? As we can see, significant proportions of higher education establishments tend to be involved in projects across all the Goals. Each goal has its own profile however; private companies are highly represented for SDG 7, while public institutions are the most highly represented in SDG 11, which might be expected due to the nature of these Goals' topics.

So what about the distribution of projects among these organisations? For each organisation type, do we have many organisations involved in many projects or a small number of organisations hoarding all of them?

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3), sharey=True)

# One panel per goal: for each org type, the empirical cumulative distribution of each
# organisation's percentage share of that type's rows within the goal.
for goal, ax in zip(sdg_keys, axs):

    ids = sdg_project_ids(project_sdgs_h2020_df, goal)

    # Rows per (org type, organisation), pivoted to organisations x org types.
    # Select by membership rather than `.loc[ids]`, which raises KeyError as soon as one
    # labelled project has no organisation rows.
    a = (orgs_h2020_df[orgs_h2020_df.index.isin(ids)]
         .groupby(['org_type', 'legal_name'])['type']
         .count()
         .unstack(level=0)
        )
    x = a / a.sum() * 100

    for c in x.columns:
        X = x[c].dropna()

        xs = np.sort(X)
        # `np.float` was removed in NumPy 1.24; dividing the array by the int is equivalent.
        n = np.arange(1, len(X) + 1) / len(X)
        ax.step(xs, n, label=c) 

    ax.set_xlim(-0.1, 4)
    ax.set_ylabel('Cumulative Frequency (Norm)')
    ax.set_xlabel('Proportion of Projects (%)')
    ax.set_title(sdg_names()[goal])
    ax.legend()
plt.tight_layout()
plt.savefig(
    f'{project_dir}/reports/analysis_cordis_sdg_collaborations/project_share_vs_org_by_org_type_by_sdg_cumulhist.png', 
    dpi=300
);

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

# One panel per goal: for each org type, the empirical cumulative distribution of the
# number of rows each organisation has within the goal.
for goal, ax in zip(sdg_keys, axs):

    ids = sdg_project_ids(project_sdgs_h2020_df, goal)

    # Rows per (org type, organisation), pivoted to organisations x org types.
    # Select by membership rather than `.loc[ids]`, which raises KeyError as soon as one
    # labelled project has no organisation rows.
    a = (orgs_h2020_df[orgs_h2020_df.index.isin(ids)]
         .groupby(['org_type', 'legal_name'])['type']
         .count()
         .unstack(level=0)
        )
    x = a

    for c in x.columns:
        X = x[c].dropna()

        xs = np.sort(X)
        # `np.float` was removed in NumPy 1.24; dividing the array by the int is equivalent.
        n = np.arange(1, len(X) + 1) / len(X)
        ax.step(xs, n, label=c) 

    ax.set_ylabel('Cumulative Frequency (Norm)')
    ax.set_xlabel('N Projects')
    ax.set_title(sdg_names()[goal])
    ax.legend()
plt.tight_layout()
plt.savefig(
    f'{project_dir}/reports/analysis_cordis_sdg_collaborations/n_projects_vs_org_by_org_type_by_sdg_cumulhist.png', 
    dpi=300
);

Here we look at the organisational share of projects within each organisation type and for each SDG. This gives us an idea as to how concentrated projects are within organisations. The further a curve reaches to the top left of the graph, the more distributed projects are (more organisations have a smaller share of projects), whereas a collapsing curve means that there are more organisations that have accumulated a larger share of projects.

For each SDG, the trend and level of concentration is different. For example, projects in Goals 3 and 7 tend to be much more distributed than projects for Goal 6. We can see that private organisations tend to have a much more distributed share of projects. For the other types, the ordering of highest to lowest distributed appears to depend on the goal. For example in SDG 3, public institutions have a highly concentrated project distribution. For sustainable cities and communities however, they are the second most distributed.

In [None]:
fig, ax = plt.subplots()

# Rows per (org type, organisation) over all projects, pivoted to organisations x org types.
a = (orgs_h2020_df
         .groupby(['org_type', 'legal_name'])['type']
         .count()
         .unstack(level=0)
        )

# Each organisation's percentage share of its org type's rows.
x = a / a.sum() * 100

# One empirical cumulative distribution curve per org type.
for c in x.columns:
    X = x[c].dropna()

    xs = np.sort(X)
    # `np.float` was removed in NumPy 1.24; dividing the array by the int is equivalent.
    n = np.arange(1, len(X) + 1) / len(X)
    ax.step(xs, n, label=c) 

ax.set_xlim(-0.1, 1.5)
ax.set_ylabel('Cumulative Frequency (Normalised)')
ax.set_xlabel('Proportion of Projects (%)')
ax.legend()
plt.tight_layout()
plt.savefig(
    f'{project_dir}/reports/analysis_cordis_sdg_collaborations/project_share_vs_org_by_org_type_cumulhist.png', 
    dpi=300
);

In [None]:
fig, ax = plt.subplots()

# Rows per (org type, organisation) over all projects, pivoted to organisations x org types.
a = (orgs_h2020_df
         .groupby(['org_type', 'legal_name'])['type']
         .count()
         .unstack(level=0)
        )

x = a

# One empirical cumulative distribution curve per org type.
for c in x.columns:
    X = x[c].dropna()

    xs = np.sort(X)
    # `np.float` was removed in NumPy 1.24; dividing the array by the int is equivalent.
    n = np.arange(1, len(X) + 1) / len(X)
    ax.step(xs, n, label=c) 

ax.set_ylabel('Cumulative Frequency (Normalised)')
ax.set_xlabel('Number of Projects')
ax.legend()
plt.tight_layout()
plt.savefig(
    f'{project_dir}/reports/analysis_cordis_sdg_collaborations/n_projects_vs_org_by_org_type_cumulhist.png', 
    dpi=300
);

For reference this is the distribution across all H2020 projects.

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3), sharey=True) 

# One panel per goal: box plot, per org type, of each organisation's percentage share of
# that type's rows within the goal.
for goal, ax in zip(sdg_keys, axs):

    ids = sdg_project_ids(project_sdgs_h2020_df, goal)

    # Select by membership rather than `.loc[ids]`, which raises KeyError as soon as one
    # labelled project has no organisation rows.
    x = (orgs_h2020_df[orgs_h2020_df.index.isin(ids)]
         .groupby(['org_type', 'legal_name'])['type']
         .count()
         .unstack(level=0)
        )
    x = x / x.sum() * 100    
    # Long format: one row per (org type, organisation) value.
    x = x.melt()

    sns.boxplot(data=x, x='org_type', y='value', showfliers=False, ax=ax, color=sdg_hex_color_codes()[goal])

    ax.set_title(sdg_names()[goal])
plt.tight_layout()
plt.savefig(
    f'{project_dir}/reports/analysis_cordis_sdg_collaborations/project_share_vs_org_by_org_type_by_sdg_box.png', 
    dpi=300
);

In [None]:
fig, axs = plt.subplots(ncols=4, figsize=(15, 3), sharey=True) 

# One panel per goal: box plot, per org type, of the number of rows each organisation has
# within the goal.
for goal, ax in zip(sdg_keys, axs):

    ids = sdg_project_ids(project_sdgs_h2020_df, goal)

    # Select by membership rather than `.loc[ids]`, which raises KeyError as soon as one
    # labelled project has no organisation rows.
    x = (orgs_h2020_df[orgs_h2020_df.index.isin(ids)]
         .groupby(['org_type', 'legal_name'])['type']
         .count()
         .unstack(level=0)
        ) 
    # Long format: one row per (org type, organisation) value.
    x = x.melt()

    sns.boxplot(data=x, x='org_type', y='value', showfliers=False, ax=ax, color=sdg_hex_color_codes()[goal])

    ax.set_title(sdg_names()[goal])
plt.tight_layout()
plt.savefig(
    f'{project_dir}/reports/analysis_cordis_sdg_collaborations/n_projects_vs_org_by_org_type_by_sdg_box.png', 
    dpi=300
);

These boxplots are another way of seeing the same project concentration levels as the cumulative histograms above.

What about the levels of collaboration between different types of organisation?

## Organisation Type Combinations

In [None]:
# Number of non-null `org_type` entries per project (`rcn`).
# NOTE(review): `collab_counts` is rebound to a dict in the "Pairwise Organisation Type
# Combinations" section before it is read anywhere in the visible notebook.
collab_counts = orgs_h2020_df.groupby('rcn')['org_type'].count()

In [None]:
# Long-format goal membership: one row per (project, goal) where the project is labelled
# with that goal, indexed by project with a single `goal` column.
s = (
    project_sdgs_h2020_df[sdg_keys]
    .unstack()
    .reset_index(level=0)
    .rename(columns={'level_0': 'goal', 0: 'is_goal'})
    .loc[lambda frame: frame['is_goal'] == 1]
    .drop(columns='is_goal')
)

In [None]:
import ast

def sorted_set(x):
    """Return the distinct elements of `x` as a sorted list."""
    unique_items = set(x)
    return sorted(unique_items)

In [None]:
# Per project, the sorted set of org types taking part, stringified so that each distinct
# combination becomes one indicator column.
org_type_sets = orgs_h2020_df.groupby('rcn')['org_type'].apply(sorted_set)
df_combos = pd.get_dummies(org_type_sets.astype(str))

# Turn the stringified lists back into readable 'A, B, C' labels.
cols = [', '.join(sorted(ast.literal_eval(label))) for label in df_combos.columns]
df_combos.columns = cols

# Order the columns by label length, shortest label first.
cols.sort(key=len)
df_combos = df_combos[cols]

In [None]:
# Percentage of all rows of `df_combos` (one row per project) that have each org type
# combination; the per-goal shares are divided by this in the plots below.
combo_shares = df_combos.sum() / df_combos.shape[0] * 100

In [None]:
fig, axs = plt.subplots(nrows=4, figsize=(8, 8), sharex=True)

# Per-goal percentage of projects with each org type combination.
combos = {}

for goal, ax in zip(sdg_keys, axs.ravel()):

    ids = sdg_project_ids(project_sdgs_h2020_df, goal)

    # Select by membership rather than `.loc[ids]`, which raises KeyError as soon as one
    # labelled project is missing from `df_combos`.
    a = df_combos[df_combos.index.isin(ids)]
    a = pd.Series(a.sum() / a.shape[0] * 100)
    combos[goal] = a
    # Ratio to the share over all projects: above 1 means the combination is over-represented.
    x = a / combo_shares
    x.plot.bar(ax=ax, color=['C0' if n >= 1 else 'C3' for n in x])
    ax.set_title(sdg_names()[goal])
    ax.axhline(1, c='gray', linestyle='--')
    ax.set_xlabel('Organisational Combination')
    ax.set_ylabel('Relative Proportion')

plt.tight_layout()
plt.savefig(
    f'{project_dir}/reports/analysis_cordis_sdg_collaborations/org_type_combo_representation_by_sdg_bar.png', 
    dpi=300
);

In [None]:
fig, axs = plt.subplots(nrows=4, figsize=(8, 16))

# Per-goal percentage of projects with each org type combination.
combos = {}

for goal, ax in zip(sdg_keys, axs.ravel()):

    ids = sdg_project_ids(project_sdgs_h2020_df, goal)

    # Select by membership rather than `.loc[ids]`, which raises KeyError as soon as one
    # labelled project is missing from `df_combos`.
    a = df_combos[df_combos.index.isin(ids)]
    a = pd.Series(a.sum() / a.shape[0] * 100)
    combos[goal] = a
    # Ratio to the share over all projects, sorted ascending so each panel reads left to
    # right from most under-represented to most over-represented.
    x = a / combo_shares
    x_sorted = x.sort_values()
    x_sorted.plot.bar(ax=ax, color=['C0' if n >= 1 else 'C3' for n in x_sorted])
    ax.set_title(sdg_names()[goal])
    ax.axhline(1, c='gray', linestyle='--')
    ax.set_xlabel('Organisational Combination')
    ax.set_ylabel('Relative Proportion')

plt.tight_layout()
plt.savefig(
    f'{project_dir}/reports/analysis_cordis_sdg_collaborations/org_type_combo_representation_by_sdg_sorted_bar.png', 
    dpi=300
);

Here we look at the levels of the different forms of organisational composition for each SDG compared to the levels across all projects. This reveals whether there are particular organisational compositions that are preferred for particular goals.

For Good Health and Well-being we can see that there are more projects that involve collaborations between public bodies and research organisations than average, as well as those with public, research and private organisations. Almost every other type of combination is at or below the average level.

On the other hand, Goals 6 and 11 have an over-representation of projects that involve a wider spectrum of organisation types.

Goal 7 has a significant representation of combinations that include private and 'other' type organisations.

Do countries with high performance have usual or unusual combinations of organisation types for a particular SDG?

## Pairwise Organisation Type Combinations

In [None]:
# Org types taking part in each project (one list entry per organisation row).
collabs = orgs_h2020_df.groupby('rcn')['org_type'].apply(list)

# Count every unordered pair of org types within each project, over all projects.
pair_counter = Counter(
    chain.from_iterable(combinations(sorted(c), 2) for c in collabs)
)
collab_counts_all = pd.Series(pair_counter)

# Pair counts as fractions of all pairs, pivoted to an org type x org type matrix.
collab_frac_all = collab_counts_all / collab_counts_all.sum()
frac_total = collab_frac_all.sum()
collab_frac_all = collab_frac_all.unstack().T / frac_total

In [None]:
# Per goal: counts of every unordered pair of org types within each of the goal's projects.
collab_counts = {}

for goal in sdg_keys:
    ids = sdg_project_ids(project_sdgs_h2020_df, goal)
    # Select by membership rather than `.loc[ids]`, which raises KeyError as soon as one
    # labelled project is missing from `collabs`.
    collabs_g = collabs[collabs.index.isin(ids)]
    collab_counts[goal] = pd.Series(
        Counter(chain.from_iterable(combinations(sorted(c), 2) for c in collabs_g))
    )

In [None]:
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(8, 6), sharex=True, sharey=True)

# One heat map per goal: the goal's org type pair fractions divided by the fractions over
# all projects, so that 1 (the colour map centre) means "as expected".
for goal, ax in zip(sdg_keys, axs.ravel()):

    goal_counts = collab_counts[goal]
    goal_frac = goal_counts.unstack().T / goal_counts.sum().sum()
    x = goal_frac / collab_frac_all

    sns.heatmap(x, ax=ax, cmap='coolwarm_r', center=1, square=True)
    ax.set_title(sdg_names()[goal])
    ax.set_xlabel('Org Type')
    ax.set_ylabel('Org Type')

plt.tight_layout()

plt.savefig(
    f'{project_dir}/reports/analysis_cordis_sdg_collaborations/org_type_pairwise_combo_representation_by_sdg_heatmap.png', 
    dpi=600
);

If we look at pairwise collaborations only, we see some of the underlying patterns. Good Health and Well-being has a significant over-representation of higher education institutions collaborating with other types of institutions, and themselves, as well as public bodies and research institutions.

**This is in effect a function of frequency - how do we address this? Using a model or by normalising to account for frequency?**

What about combinations of countries?

Are there goals which are similar in terms of function or system but have totally different organisational compositions?

We can get a sense of the concentration of projects Of the projects that organisations of a particular type are involved in, 

In [None]:
# Project duration in years, from the start and end dates.
duration_days = (project_h2020_df['end_date'] - project_h2020_df['start_date']).dt.days
duration_years = duration_days / 365.25

# Report the mean duration of the projects labelled with each goal.
for goal in sdg_keys:
    is_goal = project_sdgs_h2020_df[goal] == 1
    ids = project_sdgs_h2020_df[is_goal].index.values
    y = duration_years.loc[ids]
    print(f'Mean duration for Goal {goal}: {np.round(y.mean(), 3)}')

## Collaboration Networks

In [None]:
from gensim.corpora import Dictionary

In [None]:
import itertools
import numpy
import warnings

from collections import Counter, defaultdict
from gensim.corpora import Dictionary
from graph_tool.all import Graph
from itertools import chain, combinations

from rhodonite.cooccurrence import cooccurrence_graph
from rhodonite.cooccurrence.normalise import association_strength

In [None]:
# Attach each project's goal flags to the organisation rows sharing its index value.
sdg_orgs_h2020 = pd.merge(
    project_sdgs_h2020_df,
    orgs_h2020_df,
    left_index=True,
    right_index=True,
    how='left',
)
# Organisation names collected into one list per index value of orgs_h2020_df.
project_orgs = orgs_h2020_df['legal_name'].groupby(orgs_h2020_df.index).apply(list)

# Per-organisation totals of the goal flags (column 0 is the "no goal" flag).
goal_columns = sdg_keys + [0]
org_sdg_counts = sdg_orgs_h2020.groupby('legal_name')[goal_columns].sum()
org_sdg_specialisation = create_quotient(org_sdg_counts)

In [None]:
# Split the organisation rows into coordinators and everyone else.
is_coordinator = orgs_h2020_df['type'] == 'coordinator'
coords_h2020_df = orgs_h2020_df[is_coordinator]
part_h2020_df = orgs_h2020_df[~is_coordinator]

In [None]:
# One row per participant, paired on the shared index with the coordinator
# name(s) found there; the right join keeps participants that have no
# matching coordinator row.
coordinator_names = coords_h2020_df[['legal_name']]
participant_names = part_h2020_df[['legal_name']]
collab_pairs_h2020_df = coordinator_names.merge(
    participant_names,
    left_index=True,
    right_index=True,
    how='right',
    suffixes=('_c', '_p'),
)

In [None]:
# Add the specialisation columns twice: first for the coordinator, then for the
# participant. The second merge collides on the goal column names, which is
# what produces the '<goal>_c' / '<goal>_p' suffixed columns.
# NOTE: this rebinds collab_pairs_h2020_df, so re-running the cell merges again.
collab_pairs_h2020_df = (
    collab_pairs_h2020_df
    .merge(org_sdg_specialisation, left_on='legal_name_c', right_index=True, how='left')
    .merge(
        org_sdg_specialisation,
        left_on='legal_name_p',
        right_index=True,
        how='left',
        suffixes=('_c', '_p'),
    )
)

In [None]:
# Distribution, per goal, of (coordinator specialisation - participant
# specialisation) over the coordinator/participant pairs of that goal's
# projects, scaled by its standard deviation so the panels share an x-axis.
fig, axs = plt.subplots(ncols=4, figsize=(15, 3), sharex=True)

for goal, ax in zip(sdg_keys, axs):
    ids = project_sdgs_h2020_df[project_sdgs_h2020_df[goal] == 1].index.values
    # Only projects that actually appear in the pairs table can be selected.
    collab_pairs_goal = collab_pairs_h2020_df.loc[collab_pairs_h2020_df.index.intersection(ids)]
    diff = collab_pairs_goal[f'{goal}_c'] - collab_pairs_goal[f'{goal}_p']
    diff = diff / diff.std()
    # `density` is a boolean flag; the previous string 'normed' only worked
    # because any non-empty string is truthy.
    diff.plot.hist(ax=ax, color=sdg_hex_color_codes()[goal], bins=25, density=True)
    # Dashed line marks the mean of the scaled differences.
    ax.axvline(diff.mean(), color='gray', linestyle='--')
    print(goal, diff.mean())
    ax.set_xlabel('Specialisation Difference (Norm)')

plt.tight_layout();

In general we see that the specialisation of a project coordinator is lower than that of the other participants on a project. We see differences between the goals too, with Goal 3 having the smallest relative difference between coordinators and participants.

One issue here is that specialisation tends to be higher for organisations that take on fewer projects and that those with fewer projects are less likely to be coordinators.

In [None]:
# Add the raw per-organisation goal counts for the coordinator and then the
# participant; the name collision in the second merge yields the
# '<goal>_count_c' / '<goal>_count_p' columns.
# NOTE: this rebinds collab_pairs_h2020_df, so re-running the cell merges again.
collab_pairs_h2020_df = (
    collab_pairs_h2020_df
    .merge(org_sdg_counts, left_on='legal_name_c', right_index=True, how='left')
    .merge(
        org_sdg_counts,
        left_on='legal_name_p',
        right_index=True,
        how='left',
        suffixes=('_count_c', '_count_p'),
    )
)

In [None]:
# Distribution, per goal, of (coordinator count - participant count) over the
# coordinator/participant pairs of that goal's projects. The x-axes are left
# independent and the differences are not rescaled.
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

for ax, goal in zip(axs, sdg_keys):
    has_goal = project_sdgs_h2020_df[goal] == 1
    ids = project_sdgs_h2020_df[has_goal].index.values
    in_goal = collab_pairs_h2020_df.index.intersection(ids)
    collab_pairs_goal = collab_pairs_h2020_df.loc[in_goal]
    diff = collab_pairs_goal[f'{goal}_count_c'] - collab_pairs_goal[f'{goal}_count_p']
    diff.plot.hist(ax=ax, color=sdg_hex_color_codes()[goal], bins=25)
    # Dashed line marks the mean difference.
    ax.axvline(diff.mean(), color='gray', linestyle='--')
    print(goal, diff.mean())
    ax.set_xlabel('Project Count Difference')

plt.tight_layout();

We see that normally the number of projects a coordinator has engaged in is larger than the number of projects other participants have engaged in.

## Graph Metrics

In [None]:
# Keep only the entries whose organisation list has more than one element.
project_orgs_collab = project_orgs[project_orgs.apply(len) > 1]

In [None]:
# Build one organisation co-occurrence graph per goal from the multi-organisation
# projects flagged with that goal.
collab_graphs = {}
for goal in sdg_keys:
    has_goal = project_sdgs_h2020_df[goal] == 1
    ids = project_sdgs_h2020_df[has_goal].index.values

    sdg_orgs = project_orgs_collab.loc[project_orgs_collab.index.intersection(ids)]
    # Map organisation names to integer ids, one id list per project.
    dictionary = Dictionary(sdg_orgs)
    org_ids = list(map(dictionary.doc2idx, sdg_orgs))

    # The two extra return values are stored as the vertex property 'o' and the
    # edge property 'co' (presumably occurrence / co-occurrence counts — confirm
    # against the rhodonite documentation).
    g_co, o, co = cooccurrence_graph(org_ids)
    g_co.vp['o'] = o
    g_co.ep['co'] = co
    collab_graphs[goal] = g_co

### Degree Distribution

In [None]:
from graph_tool.stats import vertex_hist

In [None]:
# Log-log scatter of the out-degree histogram of each goal's collaboration graph.
fig, axs = plt.subplots(ncols=4, figsize=(15, 3), sharex=True)
for ax, goal in zip(axs.ravel(), sdg_keys):
    goal_graph = collab_graphs[goal]
    n, bins = vertex_hist(goal_graph, deg='out')
    # The first bin edge is skipped so the edges line up one-to-one with the counts.
    upper_edges = bins[1:]
    ax.scatter(upper_edges, n, color=sdg_hex_color_codes()[goal], alpha=0.7)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_ylim(0.9, 1000)
    ax.set_xlabel('Node Degree')
    ax.set_ylabel('Frequency')

plt.tight_layout();

In [None]:
# Same degree histogram as a log-log scatter, but with the bin edges divided by
# the number of vertices in each goal's graph.
# NOTE(review): the x-axis label still reads 'Node Degree' although the values
# plotted are degree / number of vertices — consider relabelling.
fig, axs = plt.subplots(ncols=4, figsize=(15, 3), sharex=True)
for ax, goal in zip(axs.ravel(), sdg_keys):
    goal_graph = collab_graphs[goal]
    n, bins = vertex_hist(goal_graph, deg='out')
    bins = bins / goal_graph.num_vertices()
    # The first bin edge is skipped so the edges line up one-to-one with the counts.
    ax.scatter(bins[1:], n, color=sdg_hex_color_codes()[goal], alpha=0.7)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_ylim(0.9, 1000)
    ax.set_xlabel('Node Degree')
    ax.set_ylabel('Frequency')

plt.tight_layout();

In [None]:
# Cumulative out-degree distribution per goal, scaled so the curve ends at 1.
fig, axs = plt.subplots(ncols=4, figsize=(15, 3), sharex=True)
for ax, goal in zip(axs.ravel(), sdg_keys):
    n, bins = vertex_hist(collab_graphs[goal], deg='out')
    cumulative = np.cumsum(n)
    ax.step(bins[1:], cumulative / cumulative.max(), color=sdg_hex_color_codes()[goal], alpha=0.7)
    ax.set_xscale('log')

plt.tight_layout();

### Centrality

In [None]:
from graph_tool.centrality import betweenness, eigenvector, pagerank

In [None]:
# Compute betweenness for each goal's graph, cache it on the graph as the
# 'betweeness' vertex and edge properties, and plot the vertex distribution.
# The 'co' edge property is not passed as a weight.
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))
for ax, goal in zip(axs.ravel(), sdg_keys):
    goal_graph = collab_graphs[goal]
    vertex_btw, edge_btw = betweenness(goal_graph)
    goal_graph.vp['betweeness'] = vertex_btw
    goal_graph.ep['betweeness'] = edge_btw
    ax.hist(goal_graph.vp['betweeness'].a, bins=25, color=sdg_hex_color_codes()[goal])

plt.tight_layout();

In [None]:
# Compute eigenvector centrality for each goal's graph, cache it as the
# 'eigenvector' vertex property, and plot its distribution.
# The 'co' edge property is not passed as a weight.
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))
for ax, goal in zip(axs.ravel(), sdg_keys):
    goal_graph = collab_graphs[goal]
    # The first return value is discarded; only the per-vertex map is kept.
    _, vertex_eig = eigenvector(goal_graph)
    goal_graph.vp['eigenvector'] = vertex_eig
    ax.hist(goal_graph.vp['eigenvector'].a, bins=25, color=sdg_hex_color_codes()[goal])

plt.tight_layout();

In [None]:
# Compute PageRank for each goal's graph, cache it as the 'pagerank' vertex
# property, and plot its distribution.
# The 'co' edge property is not passed as a weight.
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))
for ax, goal in zip(axs.ravel(), sdg_keys):
    goal_graph = collab_graphs[goal]
    goal_graph.vp['pagerank'] = pagerank(goal_graph)
    ax.hist(goal_graph.vp['pagerank'].a, bins=25, color=sdg_hex_color_codes()[goal])

plt.tight_layout();

In [None]:
from graph_tool.clustering import local_clustering, global_clustering
from graph_tool.draw import graph_draw

In [None]:
# Compute the local clustering coefficient of every vertex in each goal's graph,
# cache it as the 'cluster_coeff' vertex property, and plot its distribution.
# The 'co' edge property is not passed as a weight.
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))
for ax, goal in zip(axs.ravel(), sdg_keys):
    goal_graph = collab_graphs[goal]
    goal_graph.vp['cluster_coeff'] = local_clustering(goal_graph)
    ax.hist(goal_graph.vp['cluster_coeff'].a, bins=25, color=sdg_hex_color_codes()[goal])

plt.tight_layout();

In [None]:
# Store each vertex's out-degree, divided by (number of vertices - 1), as the
# 'od' vertex property of its goal graph and plot the distribution.
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))
for ax, goal in zip(axs.ravel(), sdg_keys):
    goal_graph = collab_graphs[goal]
    goal_graph.vp['od'] = goal_graph.new_vertex_property('float')
    x = goal_graph.get_out_degrees(vs=list(goal_graph.vertices()))
    # N - 1 is the number of other vertices a vertex could be connected to.
    x = x / (goal_graph.num_vertices() - 1)
    goal_graph.vp['od'].a = x
    ax.hist(goal_graph.vp['od'].a, bins=50, color=sdg_hex_color_codes()[goal])

plt.tight_layout();

In [None]:
from rhodonite.utils.tabular import vertices_to_dataframe

In [None]:
# Correlation matrix of the vertex properties stored on each goal's graph.
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))

# Friendlier labels for the two tersely named properties.
readable_names = {'o': 'frequency', 'od': 'out degree'}

for ax, goal in zip(axs.ravel(), sdg_keys):
    vertex_df = (
        vertices_to_dataframe(collab_graphs[goal])
        .set_index('v')
        .rename(columns=readable_names)
    )
    sns.heatmap(vertex_df.corr(), cmap='viridis', ax=ax, vmin=-0.6)

plt.tight_layout();

In [None]:
# Rebuild, per goal, the name -> id Dictionary from the same project selection
# used when the collaboration graphs were constructed, and keep it for lookups.
dictionaries = {}

for goal in sdg_keys:
    has_goal = project_sdgs_h2020_df[goal] == 1
    ids = project_sdgs_h2020_df[has_goal].index.values

    in_goal = project_orgs_collab.index.intersection(ids)
    sdg_orgs = project_orgs_collab.loc[in_goal]
    dictionaries[goal] = Dictionary(sdg_orgs)

In [None]:
# Mean (min-max scaled) betweenness and local clustering coefficient per
# organisation type, one colour per goal.
fig, ax = plt.subplots(figsize=(8, 3), ncols=2)

# Loop-invariant pieces are built once rather than once per goal.
metric_cols = ['betweeness', 'cluster_coeff']
org_types = orgs_h2020_df[['org_type', 'legal_name']].drop_duplicates()
org_types_flat = org_types.reset_index()

for goal in sdg_keys:
    dictionary = dictionaries[goal]
    mms = MinMaxScaler()
    vertex_df = vertices_to_dataframe(collab_graphs[goal]).set_index('v')
    # Translate vertex ids back to organisation names through the goal's
    # Dictionary (assumes its ids match the graph's vertex indices — both are
    # built from the same project selection; confirm if either cell changes).
    vertex_df['legal_name'] = vertex_df.index.map(dictionary)
    vertex_df = vertex_df.merge(org_types_flat, on='legal_name', how='left')
    # Scale both metrics to [0, 1] within the goal so goals are comparable.
    vertex_df[metric_cols] = mms.fit_transform(vertex_df[metric_cols])
    v = vertex_df.groupby('org_type')[metric_cols].mean().sort_index()
    ax[0].scatter(v.index, v['betweeness'], color=sdg_hex_color_codes()[goal])
    ax[1].scatter(v.index, v['cluster_coeff'], color=sdg_hex_color_codes()[goal])

# Axis labels do not depend on the goal, so they are set once.
ax[0].set_ylabel('Betweeness')
ax[1].set_ylabel('Clustering Coefficient')
plt.tight_layout()

Create a GraphView for the nodes in each European country and calculate the clustering coefficient for that graph.

In [None]:
from graph_tool import GraphView

In [None]:
def generate_eu_country_codes():
    """Return the sorted alpha-2 codes of countries listed in the 'EU' regional bloc.

    The codes are read from the REST Countries dump under ``data_path``. Great
    Britain's ISO code 'GB' is replaced with 'UK', the code used in CORDIS.
    """
    countries = pd.read_json(f'{data_path}/raw/countries/countries_restcountries_api.json')
    # One entry per matching bloc record of each country.
    eu_codes = [
        alpha2
        for alpha2, blocs in zip(countries['alpha2Code'], countries['regionalBlocs'])
        for bloc in blocs
        if bloc['acronym'] == 'EU'
    ]
    return sorted('UK' if alpha2 == 'GB' else alpha2 for alpha2 in eu_codes)

In [None]:
# Sorted alpha-2 codes of the countries in the 'EU' regional bloc, with 'UK' in
# place of 'GB'; used below to select organisations and to index result tables.
europe = generate_eu_country_codes()

In [None]:
# For every goal and every EU country, restrict the goal's collaboration graph
# to the organisations of that country and record the global clustering
# coefficient (first element) and its error estimate (second element).
clustering_coeffs = defaultdict(list)
clustering_err = defaultdict(list)

for goal in sdg_keys:
    goal_graph = collab_graphs[goal]
    token2id = dictionaries[goal].token2id
    for country in europe:
        # NOTE(review): this compares 'nuts_code' directly with a two-letter
        # country code — confirm the column holds country-level codes.
        country_orgs = orgs_h2020_df[orgs_h2020_df['nuts_code'] == country]['legal_name'].drop_duplicates()
        # Only organisations that appear in this goal's graph have a vertex id.
        country_org_vertices = [token2id[k] for k in country_orgs if k in token2id]

        is_country = goal_graph.new_vp('bool')
        for org in country_org_vertices:
            is_country[org] = True
        # The view must be taken on the graph for *this* goal: the filter
        # property map belongs to it. It was previously hard-coded to the
        # Goal 3 graph, so every goal reported numbers from that graph.
        gv = GraphView(goal_graph, vfilt=is_country)
        coeff = global_clustering(gv)
        clustering_coeffs[goal].append(coeff[0])
        clustering_err[goal].append(coeff[1])

In [None]:
# Number of distinct organisations per EU country among the projects of each goal.
n_orgs = {}

for goal in sdg_keys:
    ids = project_sdgs_h2020_df[project_sdgs_h2020_df[goal] == 1].index.values
    # Drop project ids that have no organisation rows: label-based selection
    # with missing labels raises a KeyError on current pandas. The order of
    # the remaining ids is preserved.
    ids = ids[np.isin(ids, orgs_h2020_df.index)]
    countries = orgs_h2020_df[['nuts_code', 'legal_name']].loc[ids]
    # Count every organisation once, under the first code it appears with.
    countries = countries.drop_duplicates('legal_name')
    # Align to the EU country list; countries without organisations become NaN.
    n_orgs[goal] = countries['nuts_code'].value_counts().reindex(europe)

In [None]:
# Rebind n_orgs from a dict of per-goal Series (each reindexed to `europe`) to a
# single DataFrame with one column per goal.
n_orgs = pd.DataFrame(n_orgs)

In [None]:
# Tabulate the per-country results: one row per EU country code, one column per goal.
cluster_df = pd.DataFrame(clustering_coeffs, index=europe)
cluster_err_df = pd.DataFrame(clustering_err, index=europe)

In [None]:
# Country-level clustering coefficient (with its error estimate) against the
# number of organisations in that country, one panel per goal.
fig, axs = plt.subplots(ncols=4, figsize=(15, 3))
for ax, goal in zip(axs.ravel(), sdg_keys):
    ax.errorbar(
        x=n_orgs[goal],
        y=cluster_df[goal],
        yerr=cluster_err_df[goal],
        color=sdg_hex_color_codes()[goal],
        linestyle="None",
        marker='o',
        alpha=0.7,
    )
    ax.set_ylabel('Clustering Coefficient')
    ax.set_xlabel('N Orgs (Nodes)')
plt.tight_layout();