# CORDIS SDG EDA

Initial high-level analysis of CORDIS projects labelled with Sustainable Development Goals (SDGs).

Quality
- [x] Percentage of projects that had classification successfully performed

EDA
- [x] Change in SDG activity over time
- [x] Change in funding for SDGs over time
- [x] SDG activity and specialisation by coordinator country
- [x] SDG activity and specialisation by participant countries
- [x] SDG activity and specialisation by country involvement

Extra Ideas
- Diversity index of countries by SDG
- Diversity index of SDGs by country
- Distance between a country's SDG profiles in different FPs
- Difference between a country's participation and coordinator profiles

In [None]:
%run ../notebook_preamble.ipy

In [None]:
from sdg_mapping.cordis import load_cordis_projects, load_cordis_project_sdgs
from sdg_mapping.cordis.cordis_utils import FRAMEWORK_PROGRAMMES
from sdg_mapping.utils.sdg_utils import sdg_hex_color_codes, sdg_names

import seaborn as sns

In [None]:
# Per-framework-programme tables, keyed by the entries of FRAMEWORK_PROGRAMMES.
projects = {}
project_sdgs = {}

for fp in FRAMEWORK_PROGRAMMES:
    # Both tables are indexed by the 'rcn' column so they can be joined on it.
    projects[fp] = load_cordis_projects(fp).set_index('rcn')
    project_sdgs[fp] = load_cordis_project_sdgs(fp, 'strict_label').set_index('rcn')

## Classification Coverage

In [None]:
def coverage(projects, sdgs):
    '''Return the number of rows in sdgs as a percentage of rows in projects.

    Args:
        projects (pd.DataFrame): Table whose row count is the denominator.
        sdgs (pd.DataFrame): Table whose row count is the numerator.

    Returns:
        float: Row count of sdgs divided by row count of projects, times 100.
    '''
    n_labelled, n_total = len(sdgs), len(projects)
    return n_labelled / n_total * 100

# Print the coverage percentage for every framework programme.
for fp in FRAMEWORK_PROGRAMMES:
    c = coverage(projects[fp], project_sdgs[fp])
    print(f'{c:.2f}% of {fp.upper()} projects were successfully classified')

In [None]:
# Directory that every plt.savefig call in this notebook writes to.
fig_dir = f'{project_dir}/reports/eda/figures'

In [None]:
# Coverage (%) per framework programme, in FRAMEWORK_PROGRAMMES order.
c = [coverage(projects[fp], project_sdgs[fp]) for fp in FRAMEWORK_PROGRAMMES]

# Bar chart of coverage with upper-cased programme names on the x axis.
fig, ax = plt.subplots()
ax.bar([fp.upper() for fp in FRAMEWORK_PROGRAMMES], c)
ax.set_ylabel('Classifier Coverage (%)')
ax.set_xlabel('Framework Programme')
plt.savefig(f'{fig_dir}/classification_coverage_bar.png', dpi=300);

## H2020, FP7 and FP6

### Aggregate Activity

In [None]:
def sum_columns(X, norm=False):
    '''Sum every column of X, optionally relative to the number of rows.

    Args:
        X (pd.DataFrame): DataFrame where columns are counts of categories.
        norm (bool): If True, each column total is divided by the number of
            rows in X, giving a per-row rate for each column. The returned
            values do not necessarily add to 1. Default is False.

    Returns:
        pd.Series: One value per column of X.
    '''
    totals = X.sum()
    return totals / X.shape[0] if norm else totals

def plot_sdg_activity(sdg_df, ax=None, norm=False):
    '''Plot a horizontal bar chart of summed activity per SDG column.

    Args:
        sdg_df (pd.DataFrame): Table with one column per SDG; its column
            sums (see sum_columns) are plotted.
        ax (matplotlib.axes.Axes, optional): Axes to draw on. When omitted,
            the current axes (plt.gca()) at call time are used.
        norm (bool): If True, plot column sums as a percentage of the number
            of rows instead of raw sums. Default is False.
    '''
    # Resolve the axes at call time. A default of `ax=ax` is evaluated once,
    # when the function is defined, so it would capture whichever global
    # `ax` existed then (or fail if none did).
    if ax is None:
        ax = plt.gca()

    activity = sum_columns(sdg_df, norm=norm)
    if norm:
        activity = activity * 100

    # Bars are coloured and labelled using the SDG lookup helpers; the axis
    # is inverted so the first column is drawn at the top.
    activity.plot.barh(color=list(sdg_hex_color_codes().values()), ax=ax)
    ax.set_yticklabels(list(sdg_names().values()))
    ax.invert_yaxis()

    ax.set_xlabel('% of Projects' if norm else 'Number of Projects')

In [None]:
# One panel per framework programme (the last three entries of
# FRAMEWORK_PROGRAMMES), showing raw column sums per SDG.
fig, axs = plt.subplots(ncols=3, figsize=(15, 4), sharey=True)

for ax, fp in zip(axs, FRAMEWORK_PROGRAMMES[-3:]):
    plot_sdg_activity(project_sdgs[fp], ax)
    ax.set_title(fp.upper())
    
plt.tight_layout()
    
plt.savefig(f'{fig_dir}/cordis_sdg_project_count_by_fp_barh.png', dpi=300);

In [None]:
# Same three panels as the count figure, but with norm=True so each bar is a
# percentage of the programme's rows.
fig, axs = plt.subplots(ncols=3, figsize=(15, 4), sharey=True)

for ax, fp in zip(axs, FRAMEWORK_PROGRAMMES[-3:]):
    plot_sdg_activity(project_sdgs[fp], ax, norm=True)
    ax.set_title(fp.upper())
    
plt.tight_layout()
    
plt.savefig(f'{fig_dir}/cordis_sdg_project_share_by_fp_barh.png', dpi=300);

### Multiple Goal Frequency

In [None]:
def sum_rows(X, norm=False, sort='index', sort_ascending=False):
    '''Count how many rows of X share each row total.

    Args:
        X (pd.DataFrame): Table whose rows are summed.
        norm (bool): If True, counts are divided by their overall sum.
        sort (str): 'index' sorts by row total, 'values' sorts by count;
            any other value leaves the value_counts order untouched.
        sort_ascending (bool): Sort direction for the chosen sort.

    Returns:
        pd.Series: Indexed by row total, holding counts (or fractions).
    '''
    counts = X.sum(axis=1).value_counts()

    # Dispatch on the requested sort; unknown keys mean "do not sort".
    sorters = {'index': counts.sort_index, 'values': counts.sort_values}
    sorter = sorters.get(sort)
    if sorter is not None:
        counts = sorter(ascending=sort_ascending)

    return counts / counts.sum() if norm else counts

def plot_sdg_count(sdg_df, ax, norm=False):
    '''Plot the distribution of row totals of sdg_df as horizontal bars.

    Args:
        sdg_df (pd.DataFrame): Table passed to sum_rows.
        ax (matplotlib.axes.Axes): Axes to draw on.
        norm (bool): If True, plot percentages instead of raw counts.
    '''
    counts = sum_rows(sdg_df, norm=norm)
    if norm:
        counts = counts * 100
        xlabel = '% of Projects'
    else:
        xlabel = 'Number of Projects'

    counts.plot.barh(ax=ax)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of SDGs')

In [None]:
# One panel per framework programme (last three entries), showing the
# percentage of rows by row total of the SDG label table.
fig, axs = plt.subplots(ncols=3, figsize=(15, 4))

for ax, fp in zip(axs, FRAMEWORK_PROGRAMMES[-3:]):
    plot_sdg_count(project_sdgs[fp], ax, norm=True)
    ax.set_title(fp.upper())
    
plt.savefig(f'{fig_dir}/cordis_sdgs_per_project_by_fp_barh.png', dpi=300);

### Change in Relative SDG Activity Over Time

In [None]:
def merge_projects_sdgs(project_dfs, sdg_dfs):
    '''Stack the project and SDG tables and join them on their indexes.

    Args:
        project_dfs (iterable of pd.DataFrame): Project tables to stack.
        sdg_dfs (iterable of pd.DataFrame): SDG label tables to stack.

    Returns:
        pd.DataFrame: Right join of the stacked projects onto the stacked
            SDG labels, so every SDG row is kept.
    '''
    all_projects = pd.concat(project_dfs, axis=0)
    all_sdgs = pd.concat(sdg_dfs, axis=0)
    return all_projects.merge(
        all_sdgs, how='right', left_index=True, right_index=True)

def plot_sdg_frequency_fps(project_df, fps=FRAMEWORK_PROGRAMMES, sdg_cols=None):
    '''Plot the share of projects labelled with each SDG, grouped by programme.

    Args:
        project_df (pd.DataFrame): Projects with a 'framework_programme'
            column and one column per SDG.
        fps (iterable of str): Framework programmes to plot; names are
            upper-cased before selecting them.
        sdg_cols (list, optional): SDG columns to plot. Defaults to the
            integers 1 to 16.
    '''
    # Resolve the SDG columns locally rather than reading the module-level
    # `sdg_keys`: the notebook only defines it in a cell after this function
    # is first called, so relying on it raises NameError on a clean run.
    if sdg_cols is None:
        sdg_cols = list(range(1, 17))

    fp_project_count = project_df['framework_programme'].value_counts()
    fps = [fp.upper() for fp in fps]

    fig, ax = plt.subplots(figsize=(15, 7))
    # Rows of `freqs` are SDGs, columns are programmes, values are the
    # percentage of each programme's rows carrying that SDG.
    freqs = (project_df
             .groupby('framework_programme')[sdg_cols].sum()
             .divide(fp_project_count, axis=0)
             .T
             * 100)[fps]
    freqs.plot.bar(cmap='viridis_r', ax=ax)

    ax.set_xticklabels(sdg_names().values(), rotation=45, ha='right')
    ax.set_ylabel('% of Projects')
    plt.tight_layout();

In [None]:
# Merge every framework programme into one table and plot SDG frequency for
# all of them.
all_project_df = merge_projects_sdgs(projects.values(), project_sdgs.values())
plot_sdg_frequency_fps(all_project_df, fps=FRAMEWORK_PROGRAMMES)
plt.savefig(f'{fig_dir}/cordis_project_share_vs_sdg_by_fp_bar.png', dpi=300);

### Combining H2020, FP7 and FP6

In [None]:
# Integer column labels of the SDG columns (1-16).
sdg_keys = list(range(1, 17))
# Same labels plus 0, the column that add_no_sdg fills for rows with no SDG.
sdg_keys_all = list(range(0, 17))

In [None]:
def add_no_sdg(df, sdg_cols):
    '''Add an indicator column 0 that is 1 where a row has no SDG label.

    Modifies df in place and also returns it.

    Args:
        df (pd.DataFrame): Table containing the columns in sdg_cols.
        sdg_cols (list): Columns whose row sum decides the indicator.

    Returns:
        pd.DataFrame: The same df object, with column 0 set.
    '''
    df[0] = 0
    unlabelled = df[sdg_cols].sum(axis=1).eq(0)
    # Assign positionally so index labels play no part in the write.
    df[0] = unlabelled.astype('int64').values
    return df

In [None]:
# Merge projects and SDG labels for the first three entries of
# FRAMEWORK_PROGRAMMES and flag rows without any SDG.
# NOTE(review): the section heading names H2020, FP7 and FP6, and the per-FP
# figures earlier in the notebook select FRAMEWORK_PROGRAMMES[-3:]; confirm
# that [:3] picks the intended programmes.
project_df = merge_projects_sdgs([projects[fp] for fp in FRAMEWORK_PROGRAMMES[:3]],
                                 [project_sdgs[fp] for fp in FRAMEWORK_PROGRAMMES[:3]])
project_df = add_no_sdg(project_df, sdg_keys)

### Project Validation

In [None]:
def validate(df, sdg):
    '''Interactively review a random sample of projects labelled with an SDG.

    Prints the title and objective of up to 10 sampled rows of df whose
    column `sdg` equals 1, and reads one line of input after each.

    Args:
        df (pd.DataFrame): Projects with 'title' and 'objective' columns and
            a column named by `sdg`.
        sdg (int): Column label of the SDG to review.

    Returns:
        list of str: The responses typed for each sampled project, in order.
    '''
    # Filter the frame that was passed in, not the global project_df.
    labelled = df[df[sdg] == 1]
    # Cap the sample size so groups with fewer than 10 rows do not raise.
    rows = labelled.sample(min(10, len(labelled)), random_state=0)

    print(f'=== {sdg_names().get(sdg, "No SDG")} ===')
    responses = []
    for _, row in rows.iterrows():
        # Item access avoids clashing with Series attributes; str() keeps a
        # missing title from breaking .upper().
        title = str(row['title'])
        objective = row['objective']
        print(f'>>> {title.upper()}')
        print(f'{objective}')
        responses.append(input())
    return responses

### Top Tf-Idf Terms

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from stop_words import get_stop_words

In [None]:
# Fit a bigram-only (ngram_range=(2,2)) tf-idf model on project objectives,
# with English stop words removed; missing objectives become empty strings.
stops = get_stop_words('en')
tfidf = TfidfVectorizer(min_df=10, max_df=0.3, ngram_range=(2,2), 
                        stop_words=stops, token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b')
tfidf_vecs = tfidf.fit_transform(project_df['objective'].fillna(''))

In [None]:
def get_top_tfidf_scores(doc_ids, tfidf, tfidf_vecs, topn=20):
    '''Return the terms with the highest summed tf-idf weight over documents.

    Args:
        doc_ids (array-like): Row selector (boolean mask or integer
            positions) into tfidf_vecs.
        tfidf: Fitted vectoriser exposing get_feature_names_out() or the
            older get_feature_names().
        tfidf_vecs: Document-by-term matrix (rows are documents).
        topn (int): Number of terms to return. Defaults to 20.

    Returns:
        np.ndarray: Up to topn terms, highest summed weight first.
    '''
    # Sum over the selected documents (axis=0) to get one score per term.
    # Summing over axis=1 would give one score per document, which cannot
    # be used to index the vocabulary.
    term_scores = np.asarray(tfidf_vecs[doc_ids].sum(axis=0)).ravel()
    top_term_ids = np.argsort(term_scores)[::-1][:topn]

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older vectorisers.
    get_names = (getattr(tfidf, 'get_feature_names_out', None)
                 or tfidf.get_feature_names)
    return np.array(get_names())[top_term_ids]

In [None]:
# Top tf-idf terms for the rows whose column 7 equals 1.
get_top_tfidf_scores((project_df[7] == 1).values, tfidf, tfidf_vecs)

### SDG Activity by Year

In [None]:
def mask_by_year(df, date_col='start_date', start_year=2004, end_year=2019):
    '''Keep the rows whose date falls within an inclusive range of years.

    Args:
        df (pd.DataFrame): Table with a datetime column.
        date_col (str): Name of the datetime column to filter on.
        start_year (int): First year to keep (inclusive).
        end_year (int): Last year to keep (inclusive).

    Returns:
        (pd.DataFrame): Rows of df with start_year <= year <= end_year.
    '''
    years = df[date_col].dt.year
    in_range = years.between(start_year, end_year)
    return df[in_range]


def normalised_count_activity(df, groupby, value_columns, baseline_col):
    '''Express summed value columns as a percentage of rows in each group.

    Args:
        df (pd.DataFrame): Table to aggregate.
        groupby: Anything accepted by DataFrame.groupby.
        value_columns (list): Columns to sum within each group.
        baseline_col: Column whose non-null count per group is the
            denominator.

    Returns:
        pd.DataFrame: Groups as index, value_columns as columns, values in
            percent.
    '''
    group_sums = df.groupby(groupby)[value_columns].sum()
    group_sizes = df.groupby(groupby)[baseline_col].count()
    return group_sums.divide(group_sizes, axis=0) * 100


def plot_sdg_normalised_count_activity(project_df):
    '''Plot, per start year, the percentage of projects labelled with each SDG.

    Args:
        project_df (pd.DataFrame): Projects with a 'start_date' datetime
            column, an 'id' column and SDG columns labelled 1 to 16.
    '''
    yearly = pd.Grouper(key='start_date', freq='Y')
    normed = normalised_count_activity(
        project_df,
        groupby=yearly,
        value_columns=list(range(1, 17)),
        baseline_col='id',
    )
    fig, axs = plt.subplots(figsize=(7, 5))

    # Swap integer column labels for SDG names so the legend is readable.
    named = normed.rename(columns=sdg_names())
    named.plot(
        color=sdg_hex_color_codes().values(),
        ax=axs,
        legend=None,
        alpha=.7,
        linewidth=2,
    )
    axs.set_xlabel('Project Start Year')
    axs.set_ylabel('Share of Projects in Year (%)')
    # Place the legend outside the axes, to the right.
    axs.legend(loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
# Restrict the combined table to projects starting in 2004-2019 (inclusive).
# This rebinds project_df, so all later cells work on the filtered rows.
start_year = 2004
end_year = 2019
project_df = mask_by_year(project_df, date_col='start_date', start_year=start_year, end_year=end_year)

plot_sdg_normalised_count_activity(project_df)
plt.savefig(f'{fig_dir}/cordis_sdg_project_share_vs_time_line.png', dpi=300, bbox_inches="tight");

In [None]:
# Reference table of framework programmes; later cells read its 'name' and
# 'start_year' columns.
fp_details = pd.read_csv('../../data/raw/cordis/ref/cordis_fp_details.csv')

In [None]:
# Number of projects per start year, with dashed lines marking the first
# year of H2020 and FP7.
fig, ax = plt.subplots()
total_projects_by_year = (
    project_df
    .groupby(pd.Grouper(key='start_date', freq='Y'))['id']
    .count()
)
total_projects_by_year.plot(ax=ax)

for fp_name in ('h2020', 'fp7'):
    fp_first_year = fp_details.set_index('name').loc[fp_name]['start_year']
    ax.axvline(
        pd.to_datetime(f"{fp_first_year}-01-01"),
        color='gray',
        linestyle='--',
    )

ax.set_xlabel('Year')
ax.set_ylabel('Number of Projects')

plt.savefig(f'{fig_dir}/cordis_n_projects_vs_year_line.png', dpi=300, bbox_inches="tight");

### Temporal Trends in SDG Quotient

In [None]:
def create_quotient(X, binary=False):
    """Calculate the location quotient.

    For every cell, divides the column's share of its row total by the
    column's share of the grand total.

    Args:
        X (pandas.DataFrame): DataFrame where rows are locations, columns are
            sectors and values are activity in a given sector at a location.
        binary (bool, optional): If True, return booleans marking quotients
            strictly greater than 1.

    Returns:
        pandas.DataFrame: Same index and columns as X.
    """
    values = X.values
    row_share = values / values.sum(axis=1, keepdims=True)
    overall_share = values.sum(axis=0) / values.sum()
    quotient = pd.DataFrame(
        row_share / overall_share, index=X.index, columns=X.columns)

    if binary:
        return quotient > 1
    return quotient

def sdg_group_quotient(df, groupby):
    '''Location quotient of SDG columns per group.

    The quotient is computed over all columns in sdg_keys_all; only the
    columns in sdg_keys are returned.

    Args:
        df (pd.DataFrame): Table containing the columns in sdg_keys_all.
        groupby (str or pd.Grouper): Grouping passed to DataFrame.groupby.

    Returns:
        quotient (pd.DataFrame): Groups as index, sdg_keys as columns.
    '''
    group_totals = df.groupby(groupby)[sdg_keys_all].sum()
    full_quotient = create_quotient(group_totals)
    return full_quotient[sdg_keys]

def plot_sdg_group_quotient_change_bar(df, first=0, last=-1):
    '''Plot grouped horizontal bars of quotients, ordered by their change.

    Args:
        df (pd.DataFrame): Quotients with groups as rows and SDGs as columns.
        first (int): Row position used as the start of the change.
        last (int): Row position used as the end of the change.
    '''
    # SDGs are ordered by how much the quotient changed between the rows.
    change = df.iloc[last] - df.iloc[first]
    order = change.sort_values().index.values

    fig, ax = plt.subplots(figsize=(5, 10))
    reversed_groups = df.iloc[::-1]
    reversed_groups[order].T.plot.barh(
        ax=ax,
        color=['#014092', '#177afd', '#8bbdfe'],
        edgecolor='white',
    )
    ax.set_yticklabels([sdg_names()[sdg] for sdg in order])
    # A quotient of 1 is drawn as a dashed reference line.
    ax.axvline(1, color='gray', linestyle='--')
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_xlabel('Quotient')

In [None]:
# SDG quotient per framework programme, plotted and saved.
sdg_quotient_fp = sdg_group_quotient(project_df, 'framework_programme')
plot_sdg_group_quotient_change_bar(sdg_quotient_fp)

plt.savefig(f'{fig_dir}/cordis_activity_specialisation_vs_sdg_by_fp_barh.png', dpi=300, bbox_inches="tight");

### SDG Funding by Year

In [None]:
def sdg_project_funds(df, fund_column):
    '''Multiply the SDG columns by the funding received by each project.

    Args:
        df (pd.DataFrame): Table containing the columns in sdg_keys_all and
            fund_column.
        fund_column (str): Name of the funding column.

    Returns:
        pd.DataFrame: The sdg_keys_all columns, each row scaled by that
            row's value in fund_column.
    '''
    # Shape the funding values as a column so they broadcast across SDGs.
    funding = df[fund_column].values.reshape(-1, 1)
    return df[sdg_keys_all] * funding

def sdg_funding_by_group(df, groupby, fund_column='ec_max_contribution', norm=False):
    '''Sum the funding attributed to each SDG within each group.

    Args:
        df (pd.DataFrame): Projects with the SDG columns, fund_column and
            the column(s) referred to by groupby.
        groupby (str or pd.Grouper): Column name, or a Grouper whose `key`
            names a column of df.
        fund_column (str): Funding column. Defaults to 'ec_max_contribution'.
        norm (bool): If True, divide each group's SDG funding by the group's
            total of fund_column. Defaults to False.

    Returns:
        pd.DataFrame: Groups as index, sdg_keys as columns.
    '''
    sdg_funds = sdg_project_funds(df, fund_column)

    # sdg_project_funds returns only SDG columns, so copy the grouping
    # column across before grouping. isinstance against the public
    # pd.Grouper covers time-based groupers without touching pandas
    # internals.
    if isinstance(groupby, str):
        sdg_funds[groupby] = df[groupby]
    elif isinstance(groupby, pd.Grouper) and groupby.key is not None:
        sdg_funds[groupby.key] = df[groupby.key]

    sdg_funds_group = sdg_funds.groupby(groupby)[sdg_keys].sum()

    if norm:
        # Group totals are only needed for the normalised result.
        total_funds = df.groupby(groupby)[fund_column].sum()
        sdg_funds_group = sdg_funds_group.divide(total_funds, axis=0)

    return sdg_funds_group

def plot_sdg_funds_vs_time_line(df, norm=True):
    '''Plot SDG funding over time as one line per SDG.

    Args:
        df (pd.DataFrame): Time periods as index, SDG keys as columns.
        norm (bool): If True, values are multiplied by 100 and labelled as
            a percentage share; otherwise they are divided by 1e9 and
            labelled in billions of euros. Defaults to True.
    '''
    fig, ax = plt.subplots(figsize=(7, 5))

    if norm:
        scaled = df * 100
        ylabel = 'Share of Funding in Year (%)'
    else:
        scaled = df / 1e9
        ylabel = 'Funding (€bn)'

    # Rename on the scaled copy so the caller's frame keeps its labels.
    scaled.columns = [sdg_names()[sdg] for sdg in scaled.columns]

    scaled.plot(
        color=sdg_hex_color_codes().values(), ax=ax, alpha=.9, linewidth=2)

    ax.set_xlabel('Project Start Year')
    ax.set_ylabel(ylabel)
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
# Yearly grouper on the project start date, reused by the next cell too.
start_date_1yr_grouper = pd.Grouper(key='start_date', freq='Y')
# Absolute SDG funding per start year.
sdg_funding = sdg_funding_by_group(project_df, start_date_1yr_grouper, 
                                        fund_column='ec_max_contribution', norm=False)
plot_sdg_funds_vs_time_line(sdg_funding, norm=False)
plt.savefig(f'{fig_dir}/cordis_funding_vs_time_by_sdg_line.png', dpi=300, bbox_inches="tight");

In [None]:
# SDG funding per start year as a share of that year's total funding.
sdg_funding_norm = sdg_funding_by_group(project_df, start_date_1yr_grouper, 
                                        fund_column='ec_max_contribution', norm=True)
plot_sdg_funds_vs_time_line(sdg_funding_norm, norm=True)
plt.savefig(f'{fig_dir}/cordis_funding_share_vs_time_by_sdg_line.png', dpi=300, bbox_inches="tight");

### Temporal Trends in SDG Funding Quotient

In [None]:
# Quotient of funding-weighted SDG columns per framework programme. The
# grouping is passed as a Series because sdg_project_funds returns only the
# SDG columns.
sdg_fund_quotient_fp = sdg_group_quotient(
    sdg_project_funds(project_df, 'ec_max_contribution'), 
    project_df['framework_programme'])
plot_sdg_group_quotient_change_bar(sdg_fund_quotient_fp)
plt.savefig(f'{fig_dir}/cordis_funding_specialisation_vs_sdg_by_fp_barh.png', dpi=300, bbox_inches="tight");

### Coordinators

In [None]:
import seaborn as sns

In [None]:
# One-hot encode the coordinator country, then keep the countries that
# coordinate more than 50 projects.
coordinator_ohe = pd.get_dummies(project_df['coordinator_country'])
top_countries = coordinator_ohe.sum()[coordinator_ohe.sum() > 50].index

In [None]:
# Each coordinator country's percentage of all projects.
project_count_by_country = project_df['coordinator_country'].value_counts()
project_share_by_country = (project_count_by_country / project_count_by_country.sum()) * 100

# Each coordinator country's percentage of every SDG column's total.
sdg_count_by_country = project_df.groupby('coordinator_country')[sdg_keys_all].sum()
sdg_share_by_country = (sdg_count_by_country / sdg_count_by_country.sum()) * 100



In [None]:
def coordinator_country_share_of_projects(df):
    '''Return each coordinator country's fraction of all projects.

    Args:
        df (pd.DataFrame): Projects with a 'coordinator_country' column.

    Returns:
        pd.Series: Fractions indexed by country.
    '''
    country_counts = df['coordinator_country'].value_counts()
    total = country_counts.sum()
    return country_counts / total

def coordinator_country_share_of_sdg_projects(df, include_all=False):
    '''Return each coordinator country's fraction of every SDG column total.

    Args:
        df (pd.DataFrame): Projects with a 'coordinator_country' column and
            the columns in sdg_keys_all.
        include_all (bool): Currently unused.

    Returns:
        pd.DataFrame: Countries as index, sdg_keys_all as columns.
    '''
    country_totals = df.groupby('coordinator_country')[sdg_keys_all].sum()
    column_totals = country_totals.sum()
    return country_totals / column_totals

In [None]:
def plot_sdg_share_by_coordinator_country_bar(share_sdg, share_all, country=None, topn=10):
    '''Plot, per SDG, the countries with the largest share of that SDG.

    Bars show a country's share of the SDG; dots show the same country's
    share in share_all for comparison.

    Args:
        share_sdg (pd.DataFrame): Shares with countries as index and SDG
            keys as columns.
        share_all (pd.Series): Overall share per country.
        country (str, optional): If given, panels are ordered by this
            country's share (column 0 excluded) and its bars are drawn in a
            different colour.
        topn (int): Number of countries per panel. Defaults to 10.
    '''
    if country is None:
        order = sdg_keys
    else:
        ranked = share_sdg.loc[country].sort_values(ascending=False).index.values
        order = ranked[ranked != 0]

    fig, axs = plt.subplots(figsize=(15, 7), ncols=4, nrows=4)

    for sdg, ax in zip(order, axs.ravel()):
        top = share_sdg[sdg].sort_values(ascending=False)[:topn]
        if country is None:
            color = 'C0'
        else:
            color = ['C1' if c == country else 'C0' for c in top.index.values]

        top_percent = top * 100
        top_percent.plot.bar(ax=ax, color=color, alpha=0.6)
        ax.set_title(sdg_names()[sdg])

        shares = share_all.loc[top.index.values] * 100
        ax.scatter(shares.index.values, shares.values, color=color)
        ax.set_xlabel('Coordinator Country')
        ax.set_ylabel('% of SDG')

    plt.tight_layout()

In [None]:
# Share of each SDG by coordinator country, compared with each country's
# share of all projects.
plot_sdg_share_by_coordinator_country_bar(coordinator_country_share_of_sdg_projects(project_df),
                                         coordinator_country_share_of_projects(project_df))
plt.savefig(f'{fig_dir}/cordis_sdg_activity_share_vs_coordinator_by_sdg_bar.png', dpi=300);

### Coordinator Country Specialisation

#### Get EU Country Codes

In [None]:
def generate_eu_country_codes():
    '''Return the sorted alpha-2 codes of countries in the 'EU' regional bloc.

    Reads the countries JSON file under data_path. 'GB' is replaced with
    'UK' because Britain is called 'UK' in CORDIS.

    Returns:
        list of str: Sorted country codes.
    '''
    country_df = pd.read_json(f'{data_path}/raw/countries/countries_restcountries_api.json')
    codes_and_blocs = zip(country_df['alpha2Code'], country_df['regionalBlocs'])
    eu_codes = [
        code
        for code, blocs in codes_and_blocs
        for bloc in blocs
        if bloc['acronym'] == 'EU'
    ]

    # Britain called 'UK' in CORDIS
    return sorted('UK' if code == 'GB' else code for code in eu_codes)

In [None]:
# Sorted EU country codes, with 'UK' in place of 'GB'.
europe = generate_eu_country_codes()

In [None]:
def coordinator_country_sdg_lq(df, min_cnt=0):
    '''coordinator_country_sdg_lq
    Location quotient of SDG projects for each coordinator country.

    Args:
        df (pd.DataFrame): CORDIS projects with a coordinator_country column
            and SDG labels under integer headers.
        min_cnt (int): Keep only countries whose summed SDG counts reach
            this value. Defaults to 0.

    Returns:
        lq (pd.DataFrame): Location quotients with countries as index and
            the sdg_keys_all columns.
    '''
    counts = df.groupby('coordinator_country')[sdg_keys_all].sum()
    # The quotient uses every country; the threshold only filters the output.
    has_enough = counts.sum(axis=1) >= min_cnt
    return create_quotient(counts)[has_enough]

In [None]:
# eu_coordinator_sdg_lq = coordinator_sdg_lq.reindex(europe).dropna(how='all')
# eu_high = project_count_by_country[project_count_by_country > 50].reindex(europe).dropna().index.values

def plot_sdg_lq_by_country_bar(lq_df, countries=None, topn=10):
    '''plot_sdg_lq_by_country_bar
    Plots a 4x4 grid of bar charts, one per SDG, of the countries with the
    highest location quotient.

    Args:
        lq_df (pd.DataFrame): Location quotients with countries as index and
            SDG keys as columns.
        countries (list, optional): Countries to consider. Rows with any
            missing value after reindexing are dropped.
        topn (int): Number of countries per panel. Defaults to 10.
    '''
    kept = lq_df.reindex(countries).dropna()
    countries = kept.index.values

    fig, axs = plt.subplots(figsize=(15, 7), ncols=4, nrows=4)

    for sdg, ax in zip(sdg_keys, axs.ravel()):
        country_lq = lq_df[sdg].loc[countries]
        top = country_lq.sort_values(ascending=False)[:topn]
        top.plot.bar(ax=ax, color=sdg_hex_color_codes()[sdg])
        ax.set_title(sdg_names()[sdg])
        ax.set_xlabel('Coordinator Country')
        ax.set_ylabel('LQ')
        # A quotient of 1 is drawn as a dashed reference line.
        ax.axhline(1, color='gray', linestyle='--')

    plt.tight_layout()

In [None]:
# Coordinator-country location quotients, limited to countries whose summed
# SDG counts are at least 50.
coordinator_lq = coordinator_country_sdg_lq(project_df, 50)
plot_sdg_lq_by_country_bar(coordinator_lq, countries=None, topn=10)

plt.savefig(f'{fig_dir}/cordis_specialisation_vs_coordinator_by_sdg_bar.png', dpi=300);

In [None]:
from sdg_mapping.geo.nuts import load_nuts_regions

In [None]:
def plot_europe_sdg_choropleth(df, label, countries=None):
    '''plot_europe_sdg_choropleth
    Plots a 4x4 axis of choropleths with one for each SDG.

    Args:
        df (pd.DataFrame): Values to map, with one column per key in
            sdg_keys. Values are aligned to regions by index against the
            regions' 'CNTR_CODE' column.
        label (str): Label for each colour bar.
        countries (list, optional): If given, regions are reindexed to
            these country codes before plotting.
    '''
    nuts = load_nuts_regions(2016, f'{data_path}/raw/shapefiles', level=0, resolution=60, countries=None)
    # Indexing by country code does not depend on the SDG, so do it once.
    nuts_by_country = nuts.set_index('CNTR_CODE')

    fig, axs = plt.subplots(ncols=4, nrows=4, figsize=(15,12))

    for sdg, ax in zip(sdg_keys, axs.ravel()):
        # Work on a copy so each panel gets its own 'value' column.
        nuts_sdg = nuts_by_country.copy()
        nuts_sdg['value'] = df[sdg]
        if countries is not None:
            nuts_sdg = nuts_sdg.reindex(countries)
        nuts_sdg.plot(column='value', ax=ax, cmap='viridis_r', legend=True,
                     edgecolor='#bbbbbb', linewidth=.5,
                     legend_kwds={'orientation': "vertical", 'shrink': .4, 'label': label})
        # Fixed longitude/latitude window applied to every panel.
        ax.set_xlim(-25, 45)
        ax.set_ylim(30, 75)
        ax.set_title(sdg_names()[sdg])

        ax.axis('off')

    # Lay the figure out once, after all panels exist, rather than on every
    # loop iteration.
    plt.tight_layout()

In [None]:
# Map the coordinator-country location quotients, one panel per SDG.
plot_europe_sdg_choropleth(coordinator_lq, 'LQ')
plt.savefig(f'{fig_dir}/cordis_specialisation_vs_coordinator_by_sdg_europe_choropleth.png', dpi=300);

### All Country Specialisation

In [None]:
def generate_all_countries(coordinator, participants):
    '''Combine a project's coordinator and participant countries in one list.

    Args:
        coordinator: Coordinator country, or a null value (None/NaN) when
            missing.
        participants: List of participant countries; any non-list value
            (for example NaN) is treated as no participants.

    Returns:
        list: The coordinator (if not null) followed by the participants.
            Always a new list, so the caller's list is never aliased.
    '''
    countries = [] if pd.isnull(coordinator) else [coordinator]
    # isinstance (rather than an exact type comparison) also accepts list
    # subclasses.
    if isinstance(participants, list):
        countries.extend(participants)
    return countries

In [None]:
def create_country_ohe(df, col):
    '''create_country_ohe
    Creates a one hot encoding of countries involved in a project
    with project RCN as the index.

    Args:
        df (pd.DataFrame): Projects; its index is reused as the index of
            the result.
        col (str): Name of the column of df holding, per row, an iterable
            of countries.

    Returns:
        countries_df (pd.DataFrame): One row per row of df and one 0/1
            column per distinct country found in df[col].
    '''
    # Function-scope import so the block does not depend on the name being
    # provided by another cell.
    from sklearn.preprocessing import MultiLabelBinarizer

    # Encode the frame that was passed in, not the global project_df, so the
    # result always matches the caller's rows and index.
    mlb_all = MultiLabelBinarizer()
    mlb_vecs_all = mlb_all.fit_transform(df[col])
    countries_df = pd.DataFrame(
        mlb_vecs_all,
        columns=mlb_all.classes_,
        index=df.index.values)
    countries_df.index.name = 'rcn'
    return countries_df

In [None]:
def multi_country_sdg_count(df, col, norm=False):
    '''multi_country_sdg_count
    Sums the SDG columns of df per country, counting a project once for
    every country flagged for it in the one-hot encoding.

    Args:
        df (pd.DataFrame): Projects containing the columns in sdg_keys_all.
        col (str): Column name passed on to create_country_ohe.
        norm (bool): If True, divide the sums by the number of rows of the
            result. Defaults to False.

    Returns:
        sdg_count (pd.DataFrame): Countries as index, summed SDG columns.
    '''
    countries_ohe = create_country_ohe(df, col)

    # Reshape the one-hot matrix to long form: one row per (project, country)
    # pair, indexed by project, with the country in column 'level_1' and the
    # 0/1 flag in column 0.
    x = countries_ohe.T.unstack().reset_index(level=1)
    # Keep only the pairs where the country is flagged for the project.
    x = x[x[0] == 1]
    x.drop(0, axis=1, inplace=True)
    x.rename(columns={'level_1': 'country'}, inplace=True)
    # Attach each project's SDG columns by index.
    x = x.merge(df[sdg_keys_all], left_index=True, right_index=True, how='left')
    sdg_count = x.groupby('country').sum()
    if norm:
        # NOTE(review): the divisor is the number of countries (rows of
        # sdg_count), not a project count - confirm this is intended.
        sdg_count = sdg_count.divide(sdg_count.shape[0])
    return sdg_count

def multi_country_share_of_projects(df, col):
    '''Return each country's fraction of all country involvements.

    Args:
        df (pd.DataFrame): Projects, passed to create_country_ohe.
        col (str): Column name passed to create_country_ohe.

    Returns:
        pd.Series: Fractions indexed by country.
    '''
    involvement_counts = create_country_ohe(df, col).sum()
    total = involvement_counts.sum()
    return involvement_counts / total

def multi_country_share_of_sdg_projects(df, col):
    '''Return each involved country's fraction of every SDG column total.

    Args:
        df (pd.DataFrame): Projects, passed to multi_country_sdg_count.
        col (str): Name of the column of country lists.

    Returns:
        pd.DataFrame: Countries as index, SDG columns, each column divided
            by its own total.
    '''
    # Use the arguments rather than the global project_df and a hard-coded
    # column name, so the function works on any frame it is given.
    count = multi_country_sdg_count(df, col)
    return count / count.sum()

In [None]:
# Per project, the list of coordinator plus participant countries.
project_df['all_countries'] = project_df.apply(
    lambda row: generate_all_countries(row['coordinator_country'], row['participant_countries']), axis=1)
# SDG counts per involved country.
all_countries_sdg_activity_df = multi_country_sdg_count(project_df, 'all_countries')

# Share of each SDG by involved country, compared with each country's share
# of all involvements.
plot_sdg_share_by_coordinator_country_bar(multi_country_share_of_sdg_projects(project_df, 'all_countries'),
                                         multi_country_share_of_projects(project_df, 'all_countries'))

plt.savefig(f'{fig_dir}/cordis_sdg_activity_share_vs_involved_by_sdg_bar.png', dpi=300)

In [None]:
def multi_country_location_quotient(df, col, min_cnt=0):
    '''Location quotient of SDG activity for every involved country.

    Args:
        df (pd.DataFrame): Projects, passed to multi_country_sdg_count.
        col (str): Name of the column of country lists.
        min_cnt (int): Keep only countries whose summed SDG counts reach
            this value. Defaults to 0.

    Returns:
        pd.DataFrame: Countries as index, sdg_keys as columns.
    '''
    # Count from the frame passed in rather than the global project_df.
    counts = multi_country_sdg_count(df, col)
    lq = create_quotient(counts)[sdg_keys]
    return lq[counts.sum(axis=1) >= min_cnt]

In [None]:
# Location quotients for all involved countries whose summed SDG counts are
# at least 50.
all_lq = multi_country_location_quotient(project_df, 'all_countries', 50)
plot_sdg_lq_by_country_bar(all_lq, countries=None, topn=10)
plt.savefig(f'{fig_dir}/cordis_specialisation_vs_involved_by_sdg_bar.png', dpi=300);

In [None]:
# Map the involved-country location quotients, one panel per SDG.
plot_europe_sdg_choropleth(all_lq, 'LQ')
plt.savefig(f'{fig_dir}/cordis_specialisation_vs_involved_by_sdg_europe_choropleth.png', dpi=300);

### All Country Specialisation Clustering

In [None]:
from sklearn.metrics import pairwise_distances

In [None]:
# Same quotient with a lower threshold (10) so more countries are clustered.
all_cluster_lq = multi_country_location_quotient(project_df, 'all_countries', 10)

In [None]:
# Cluster countries by the cosine similarity (1 - cosine distance) of their
# SDG location-quotient profiles.
sns.clustermap(pd.DataFrame(1 - pairwise_distances(all_cluster_lq[sdg_keys], metric='cosine'), 
                            index=all_cluster_lq.index.values, columns=all_cluster_lq.index.values))
plt.savefig(f'{fig_dir}/cordis_sdg_specialisation_involved_cosine_clustermap.png', dpi=300)