In [None]:
%run ../notebook_preamble.ipy

from sdg_mapping.cordis import load_cordis_projects, load_cordis_project_sdgs
from sdg_mapping.cordis.cordis_utils import FRAMEWORK_PROGRAMMES
from sdg_mapping.utils.sdg_utils import sdg_hex_color_codes, sdg_names

import seaborn as sns
import os

In [None]:
# Load every framework programme's project table and its SDG probability
# table, index each by `rcn`, and stack them into one frame apiece.
projects = pd.concat(
    [load_cordis_projects(fp).set_index('rcn') for fp in FRAMEWORK_PROGRAMMES]
)
project_sdgs = pd.concat(
    [
        load_cordis_project_sdgs(fp, 'probability').set_index('rcn')
        for fp in FRAMEWORK_PROGRAMMES
    ]
)

In [None]:
# Right join on the shared `rcn` index: every row of `project_sdgs` is kept,
# with the project metadata attached where an index match exists.
projects = pd.merge(
    projects,
    project_sdgs,
    how='right',
    left_index=True,
    right_index=True,
)

In [None]:
annotated_dir = f'{data_path}/interim/doccano/results'
label_dir = f'{data_path}/interim/doccano/results/labels'

# Map SDG number -> annotated DataFrame indexed by `rcn`, with a binary `label`.
dfs = {}
for file in os.listdir(annotated_dir):
    # Match the extension exactly; a substring test would also pick up names
    # such as `foo.csv.bak`.
    if not file.endswith('.csv'):
        continue

    # The SDG number is the digits after the 3-character prefix of the last
    # `_`-separated token (presumably `..._sdg<N>.csv`, mirroring the
    # `labels_sdg<N>.json` files read below). Parse from the file name only so
    # underscores or dots in the directory path cannot interfere.
    n = int(file.split('_')[-1].split('.')[0][3:])

    df = pd.read_csv(os.path.join(annotated_dir, file))

    # Translate label ids to their `suffix_key`, then 'y'/'n' to 1/0.
    # Anything else becomes NaN.
    labels = pd.read_json(os.path.join(label_dir, f'labels_sdg{n}.json'))
    label_map = dict(zip(labels['id'], labels['suffix_key']))
    df['label'] = df['label'].map(label_map).map({'y': 1, 'n': 0})

    dfs[n] = df.rename(columns={'meta.rcn': 'rcn'}).set_index('rcn')

In [None]:
# Remove every project that already appears in one of the annotated sets.
for annotated in dfs.values():
    already_labelled = projects.index.intersection(annotated.index.values)
    projects = projects.drop(index=already_labelled)

In [None]:
# The SDG probability columns are keyed by the integers 1-16. Define the keys
# here so this cell does not rely on a definition from a later cell.
sdg_keys = list(range(1, 17))

# Number of projects per SDG whose predicted probability exceeds 0.5.
(projects[sdg_keys] > .5).sum()

In [None]:
# Show the titles of the 20 projects with the highest SDG 13 probability.
ranked_by_sdg13 = projects.sort_values(13, ascending=False)
top_titles = ranked_by_sdg13['title'].values[:20]
for title in top_titles:
    print('>>>', title)

### Probability Distributions

In [None]:
# One histogram of predicted probabilities per SDG (columns 1-16) on a 4x4 grid.
fig, axs = plt.subplots(ncols=4, nrows=4, figsize=(15, 13))

for sdg, ax in enumerate(axs.ravel(), start=1):
    projects[sdg].plot.hist(ax=ax)

    # Label each panel so the SDGs can be told apart when skimming the figure.
    ax.set_title(f'SDG {sdg}')
    ax.set_xlabel('Predicted probability')
    # Log-scaled counts keep sparsely populated bins visible.
    ax.set_yscale('log')

plt.tight_layout()

### Getting Documents from Across the Prediction Probability Distribution

In [None]:
sdg_keys = list(range(1, 17))

# Each SDG's probability range is split into equal-width bins and at most
# `max_per_bin` projects are taken from each bin.
n_bins = 4
max_per_bin = 200

# Maps SDG number -> list of sampled project ids (index values of `projects`).
project_ids = {}

for sdg in sdg_keys:
    probs = projects[sdg]
    edges = np.linspace(probs.min(), probs.max(), n_bins + 1)

    ids = []
    for bin_no, (lower, upper) in enumerate(zip(edges[:-1], edges[1:])):
        # Bins are (lower, upper], except the first, which also includes its
        # lower edge so the minimum-probability projects are not silently
        # excluded from sampling.
        above_lower = (probs >= lower) if bin_no == 0 else (probs > lower)
        in_bin = probs[above_lower & (probs <= upper)]

        # Down-sample crowded bins; the fixed seed keeps the draw reproducible.
        if in_bin.shape[0] > max_per_bin:
            in_bin = in_bin.sample(max_per_bin, random_state=0)
        ids.extend(in_bin.index.values)
    project_ids[sdg] = ids

### Getting Similar Documents that Are Not Positively Predicted

In [None]:
import tensorflow_hub as hub

In [None]:
# Location of the sentence-encoder model on disk. Set the USE_MODEL_PATH
# environment variable to point at a local copy; the default keeps the
# original author's path so existing setups continue to work.
use_model_path = os.environ.get(
    'USE_MODEL_PATH',
    '/Users/grichardson/models/universal-sentence-encoder_4',
)
embed = hub.load(use_model_path)

In [None]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of length ``n``.

    Slices are produced in order; the final slice is shorter when
    ``len(lst)`` is not a multiple of ``n``. An empty ``lst`` yields nothing.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [None]:
# Embed the project objectives in batches of 1000; missing objectives are
# embedded as empty strings.
objective_texts = projects['objective'].fillna('')

embeddings = []
for batch in chunks(objective_texts, 1000):
    batch_vectors = embed(batch).numpy()
    embeddings.extend(batch_vectors)

In [None]:
from annoy import AnnoyIndex

In [None]:
# Embedding matrix as a DataFrame indexed by the same ids as `projects`.
# It must be built before the index below; later cells also look vectors up
# through `vec_df.loc[...]`.
vec_df = pd.DataFrame(np.array(embeddings))
vec_df.index = projects.index.values

# Approximate nearest-neighbour index over the 512-dimensional embeddings,
# using angular distance. Items are keyed by project id so query results can
# be used directly with `projects.loc[...]`.
t = AnnoyIndex(512, 'angular')

for i, v in zip(vec_df.index.values, vec_df.values):
    t.add_item(i, v)

t.build(500)

In [None]:
# For each SDG, average the embeddings of the 100 projects with the highest
# predicted probability for that SDG.
mean_vecs = [
    vec_df.loc[
        projects.sort_values(sdg, ascending=False).index.values[:100]
    ].mean()
    for sdg in sdg_keys
]

In [None]:
# For each SDG, collect projects that sit close to the SDG's mean vector but
# are predicted negative (< .5), skipping ids that were already sampled.
extra_ids = {}

for sdg, vec in zip(sdg_keys, mean_vecs):
    neighbours = projects.loc[t.get_nns_by_vector(vec, 3000)]

    # Keep only the negatively predicted neighbours, highest probability first.
    negative_probs = neighbours.loc[neighbours[sdg] < .5, sdg]
    negative_probs = negative_probs.sort_values(ascending=False)

    # Take the 200 strongest near-misses and remove those already selected.
    candidate_ids = set(negative_probs.index.values[:200])
    already_selected = set(project_ids[sdg])
    extra_ids[sdg] = list(candidate_ids.difference(already_selected))
    

In [None]:
# Append the extra ids to each SDG's existing id list (the lists are mutated
# in place).
for sdg, selected_ids in project_ids.items():
    selected_ids.extend(extra_ids[sdg])

### Documents That Are Similar to Manually Labelled Documents but Not Included

In [None]:
def _after_last_separator(text):
    """Return the part of ``text`` that follows the final ' === ' separator."""
    return text.rsplit(' === ', 1)[-1]


# Keep only the trailing segment of each annotated text.
for annotated in dfs.values():
    annotated['text'] = annotated['text'].map(_after_last_separator)

In [None]:
# Maps SDG number -> mean embedding of the texts annotated as positive
# (label == 1) for that SDG.
correctly_labelled_vec = {}

# Iterate in ascending SDG order so the dict's insertion order follows the SDG
# number rather than the arbitrary order in which the files were listed.
for sdg in sorted(dfs):
    df = dfs[sdg]
    positives = df[df['label'] == 1]

    # Average only the positively labelled texts. The positives were previously
    # selected but never used, so the mean was taken over every annotated text.
    # If an SDG has no positives, fall back to all of its annotated texts so
    # that every SDG still gets a vector.
    texts = positives['text'] if len(positives) else df['text']
    correctly_labelled_vec[sdg] = embed(texts.values).numpy().mean(axis=0)

In [None]:
# For each SDG, collect the nearest neighbours of the annotated-document mean
# vector that have not already been selected for that SDG.
extra_ids = {}

for sdg in sdg_keys:
    # Look the vector up by SDG number. Pairing `sdg_keys` positionally with
    # the dict's values would assign vectors to the wrong SDG whenever the dict
    # is not in ascending SDG order or does not cover every SDG.
    vec = correctly_labelled_vec.get(sdg)
    if vec is None:
        # No annotated documents for this SDG: nothing to add, but keep the key
        # so the cell that extends `project_ids` finds every SDG.
        extra_ids[sdg] = []
        continue

    similar_ids = set(t.get_nns_by_vector(vec, 201))
    already_selected = set(project_ids[sdg])
    extra_ids[sdg] = list(similar_ids.difference(already_selected))
    

In [None]:
# Append the extra ids to each SDG's id list (the lists are mutated in place).
for sdg, selected_ids in project_ids.items():
    selected_ids.extend(extra_ids[sdg])

In [None]:
# Maps SDG number -> rows of `projects` selected for that SDG, in the order of
# the id list.
exports = {sdg: projects.loc[project_ids[sdg]] for sdg in sdg_keys}

In [None]:
# Write two CSVs per SDG: one pre-labelled from the model's prediction and one
# with the label column left blank.
for sdg in sdg_keys:
    labelled = (
        exports[sdg]
        # Build the text on a copy rather than mutating `exports[sdg]`.
        .assign(text=lambda d: '=== ' + d['title'] + ' === ' + d['objective'])
        .reset_index()
        # Use this SDG's own probability column. Column 1 was previously
        # hard-coded, so every file was labelled with SDG 1's predictions.
        .loc[:, ['rcn', 'text', sdg]]
        .rename(columns={'rcn': 'ID', 'text': 'Text', sdg: 'Label'})
    )

    labelled['Label'] = (labelled['Label'] > .5).map({True: 'Yes', False: 'No'})
    # Rows with a missing title or objective have NaN text and are dropped.
    labelled = labelled.dropna()

    labelled.to_csv(f'../../data/interim/smart_sdg_{sdg}.csv', index=False)

    # Same rows with the label blanked out.
    labelled.assign(Label='').to_csv(
        f'../../data/interim/smart_sdg_{sdg}_unlabelled.csv', index=False
    )