In [None]:
%run ../notebook_preamble.ipy

import os
import seaborn as sns
from sklearn.metrics import precision_recall_fscore_support

from sdg_mapping.cordis import load_cordis_projects, load_cordis_project_sdgs
from sdg_mapping.cordis.cordis_utils import FRAMEWORK_PROGRAMMES
from sdg_mapping.utils.sdg_utils import sdg_hex_color_codes, sdg_names

### Import CORDIS

In [None]:
# Collect one frame per framework programme for each of the three tables
# (project metadata, strict SDG labels, SDG probabilities), all indexed by
# `rcn`, then stack them into single frames.
project_frames, label_frames, prob_frames = [], [], []

for fp in ('fp6', 'fp7', 'h2020'):
    project_frames.append(load_cordis_projects(fp).set_index('rcn'))
    label_frames.append(
        load_cordis_project_sdgs(fp, 'strict_label').set_index('rcn')
    )
    prob_frames.append(
        load_cordis_project_sdgs(fp, 'probability').set_index('rcn')
    )

projects = pd.concat(project_frames)
project_sdgs = pd.concat(label_frames)
project_sdg_probs = pd.concat(prob_frames)

### Import Annotated

In [None]:
# Directory holding the annotated exports; the label definitions live in a
# `labels` sub-directory of it.
annotated_dir = f'{data_path}/interim/doccano/results'
label_dir = f'{annotated_dir}/labels'

In [None]:
# Load every annotated CSV export into `dfs`, keyed by SDG number, with the
# annotation label converted to 1 ('y') / 0 ('n') and the frame indexed by rcn.
dfs = {}
for file in os.listdir(annotated_dir):
    # Match on the extension itself so names that merely contain '.csv'
    # (e.g. 'x.csv.bak') are not read as annotation files.
    if not file.endswith('.csv'):
        continue

    fin = os.path.join(annotated_dir, file)
    df = pd.read_csv(fin)

    # Parse the SDG number from the file name only, so that underscores or
    # dots in the directory part of the path cannot corrupt the parse.
    # Presumably file names end in '_sdg<N>.csv' (the [3:] drops a
    # three-character prefix) -- TODO confirm against the export naming.
    n = int(file.split('_')[-1].split('.')[0][3:])

    # Translate label ids into their 'suffix_key' using the per-SDG label
    # definition file, then into a binary target.
    label_path = os.path.join(label_dir, f'labels_sdg{n}.json')
    labels = pd.read_json(label_path)
    label_map = dict(zip(labels['id'], labels['suffix_key']))
    df['label'] = df['label'].map(label_map).map({'y': 1, 'n': 0})

    df = df.rename(columns={'meta.rcn': 'rcn'}).set_index('rcn')

    dfs[n] = df

### Calculate Classification Scores

In [None]:
# Per-SDG precision / recall / F1 / support of the strict labels against the
# manual annotations. Each entry of `scores` is a flat vector ordered as
# [class-0 precision, recall, f1, support, class-1 precision, recall, f1, support].
scores = []

for sdg in range(1, 17):
    # Right join keeps exactly the annotated projects for this SDG.
    df_score = project_sdgs.merge(dfs[sdg], left_index=True, right_index=True, how='right')
    # Pin the class order to [0, 1]: without `labels`, an SDG where only one
    # class occurs would yield a single-class result and a shorter vector,
    # which no longer fits the fixed 8-column layout used downstream.
    score = precision_recall_fscore_support(
        df_score['label'], df_score[sdg], labels=[0, 1]
    )
    # (4 metrics x 2 classes) -> transpose -> flatten class by class.
    scores.append(np.array(score).T.ravel())

In [None]:
# One row per SDG (1-16); columns hold the per-class metrics, class 0 first.
score_df = pd.DataFrame(
    np.array(scores),
    columns=[
        f'{cls}_{metric}'
        for cls in (0, 1)
        for metric in ('precision', 'recall', 'f1', 'support')
    ],
    index=range(1, 17),
)

In [None]:
# Heatmap of precision / recall / F1 per SDG, rows ordered by the F1 score of
# the positive class and labelled with the SDG names.
fig, ax = plt.subplots(figsize=(10,8))

_score_df = (
    score_df
    .drop(columns=[c for c in score_df.columns if 'support' in c])
    .sort_values(by='1_f1', ascending=False)
)
names = sdg_names()
_score_df.index = [names[i] for i in _score_df.index.values]

sns.heatmap(_score_df, fmt='.2f', annot=True, cmap='RdBu', ax=ax)
# White divider between the three class-0 columns and the three class-1 columns.
ax.axvline(3, color='#ffffff', linewidth=3)
plt.tight_layout()
plt.savefig('../../reports/eda/figures/cordis_precision_recall_f1_heatmap.png', dpi=300);

Based on the F1 scores for the positive class (the `1_f1` column in the heatmap above):

- Performing well: 2, 3, 6, 7, 11
- Performing moderately: 8, 16
- Performing badly: 1, 4, 5, 9, 10, 12, 13, 14, 15

### Print out titles of misclassified projects (FNs and FPs)

In [None]:
def _print_top_misclassified(df_errors, n=5):
    """Print the first `n` rows of `df_errors` as '> <prob> <text part>'.

    `df_errors` must have a 'text' and a 'prob' column. The printed text is
    the segment following the first '=== ' marker in 'text' (presumably the
    project title -- verify against the annotation export format).
    """
    for text, prob in zip(df_errors['text'].values[:n], df_errors['prob'].values[:n]):
        print(f'> {prob:.2f}', text.split('=== ')[1])


# NOTE: this cell deliberately does not rebind `scores`; resetting it here
# would wipe the metrics collected by the scoring cell and break any re-run
# of the cells that build `score_df` from it.
names = sdg_names()

for sdg in range(1, 17):
    print('===', names[sdg], '===')
    # Annotated projects for this SDG joined with the strict labels, plus the
    # classifier probability for the same SDG.
    _df = project_sdgs.merge(dfs[sdg], left_index=True, right_index=True, how='right')
    _df['prob'] = project_sdg_probs.loc[_df.index, sdg]

    # Predicted positive but annotated negative, and vice versa.
    df_fp = _df[(_df[sdg] == 1) & (_df['label'] == 0)]
    df_fn = _df[(_df[sdg] == 0) & (_df['label'] == 1)]

    print('False Positives')
    _print_top_misclassified(df_fp)
    print('\nFalse Negatives')
    _print_top_misclassified(df_fn)
    print('')