In [None]:
%run ../notebook_preamble.ipy

import spacy

from sdg_mapping.cordis import load_cordis_projects, load_cordis_project_sdgs
from sdg_mapping.cordis.cordis_utils import FRAMEWORK_PROGRAMMES
from sdg_mapping.utils.sdg_utils import sdg_hex_color_codes, sdg_names

import seaborn as sns
import os
import re

from spacy.matcher import Matcher
from collections import defaultdict
from collections import Counter

%load_ext line_profiler

In [None]:
# SDG numbers handled in this notebook: 1 through 16 (the range end is exclusive).
sdg_keys = list(range(1, 17))

In [None]:
# spaCy English pipeline; used further down to turn vocabulary keywords into
# matcher patterns and to parse the project objectives.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Collect one frame per framework programme, each indexed by `rcn`, then stack
# them into a single projects table and a single SDG-probability table.
project_frames, sdg_frames = [], []

for programme in FRAMEWORK_PROGRAMMES:
    project_frames.append(load_cordis_projects(programme).set_index('rcn'))
    sdg_frames.append(
        load_cordis_project_sdgs(programme, 'probability').set_index('rcn')
    )

projects = pd.concat(project_frames)
project_sdgs = pd.concat(sdg_frames)

In [None]:
# Right join on the shared index: every row of `project_sdgs` is kept and the
# matching project columns are attached to it.
projects = projects.merge(project_sdgs, left_index=True, right_index=True, how='right')

In [None]:
vocab_path = f'{data_path}/raw/sdg_vocabulary/siris_sdg_vocabulary_v1.2.xlsx'

# One worksheet per goal ("SDG 1" ... "SDG 16"); keep only its `keyword` column.
sdg_vocab = {
    goal: pd.read_excel(vocab_path, sheet_name=f'SDG {goal}')['keyword'].values
    for goal in range(1, 17)
}

## Annotated

In [None]:
annotated_dir = f'{data_path}/interim/doccano/results'
label_dir = f'{data_path}/interim/doccano/results/labels'

# Annotated results keyed by SDG number.
dfs = {}
# Sorted so the dictionary is filled in the same order on every run
# (os.listdir returns entries in arbitrary order).
for file in sorted(os.listdir(annotated_dir)):
    # Only genuine CSV files: a substring test would also accept names such as
    # `results.csv.bak`.
    if not file.endswith('.csv'):
        continue

    fin = os.path.join(annotated_dir, file)
    df = pd.read_csv(fin)

    # The SDG number follows a three-character prefix in the last
    # underscore-separated part of the file name (`..._sdg<N>.csv`). It is
    # parsed from the file name rather than the full path so underscores or
    # dots in the directory cannot interfere.
    n = int(file.split('_')[-1].split('.')[0][3:])

    label_path = os.path.join(label_dir, f'labels_sdg{n}.json')
    labels = pd.read_json(label_path)
    # label id -> `suffix_key` ('y' / 'n') -> binary label (1 / 0)
    label_map = dict(zip(labels['id'], labels['suffix_key']))
    df['label'] = df['label'].map(label_map).map({'y': 1, 'n': 0})

    df = df.rename(columns={'meta.rcn': 'rcn'}).set_index('rcn')

    dfs[n] = df

In [None]:
# Remove every project that has already been manually annotated for any SDG.
annotated_rcns = set()
for annotated in dfs.values():
    annotated_rcns.update(annotated.index.values)

projects = projects[~projects.index.isin(list(annotated_rcns))]

### Probability Distributions

In [None]:
# One histogram per SDG probability column (columns 1..16), on a log-scaled
# count axis. Each panel is titled so it can be identified when skimming.
fig, axs = plt.subplots(ncols=4, nrows=4, figsize=(15,13))

for sdg, ax in enumerate(axs.ravel(), start=1):
    projects[sdg].plot.hist(ax=ax)

    ax.set_yscale('log')
    ax.set_title(f'SDG {sdg}')
    ax.set_xlabel('Predicted probability')

plt.tight_layout()

### Getting Documents from Across the Prediction Probability Distribution

In [None]:
sdg_keys = list(range(1, 17))

project_ids = {}
# Maximum number of projects to draw from each probability bin, lowest bin first.
sizes = [100, 100, 100, 300]

for sdg in sdg_keys:
    probs = projects[sdg]

    # Five evenly spaced edges -> four equal-width bins over the observed range.
    steps = np.linspace(probs.min(), probs.max(), 5)
    ids = []
    for bin_no, (lower, upper, size) in enumerate(zip(steps[:-1], steps[1:], sizes)):
        # Bins are (lower, upper], except the first, which is closed on both
        # sides so projects sitting exactly at the minimum probability are not
        # silently excluded from every bin.
        above_lower = (probs >= lower) if bin_no == 0 else (probs > lower)
        probs_step = probs[above_lower & (probs <= upper)]
        # Sample only when the bin holds at least `size` projects; otherwise
        # take the whole bin.
        if probs_step.shape[0] >= size:
            probs_step = probs_step.sample(size, random_state=0)
        ids.extend(probs_step.index.values)
    project_ids[sdg] = ids

### Getting Similar Documents that Are Not Positively Predicted

In [None]:
import tensorflow_hub as hub

In [None]:
# Location of the locally stored sentence-encoder model. Set the
# USE_MODEL_PATH environment variable to point elsewhere instead of editing
# this cell; the previous hard-coded path remains the fallback.
embed = hub.load(
    os.environ.get(
        'USE_MODEL_PATH',
        '/Users/grichardson/models/universal-sentence-encoder_4',
    )
)

In [None]:
def chunks(lst, n):
    """Lazily yield consecutive slices of `lst`, each at most `n` items long.

    The final slice is shorter when `len(lst)` is not a multiple of `n`.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [None]:
# Embed the project objectives in batches of 1000 (missing objectives are
# embedded as empty strings) and collect one vector per project.
objective_texts = projects['objective'].fillna('')

embeddings = []
for batch in chunks(objective_texts, 1000):
    batch_vectors = embed(batch).numpy()
    embeddings.extend(batch_vectors)

In [None]:
from annoy import AnnoyIndex

In [None]:
# Embedding matrix as a frame: one row per project, labelled by the project index.
vec_df = pd.DataFrame(np.array(embeddings), index=projects.index.values)

In [None]:
# Approximate nearest-neighbour index over the embeddings: 512-dimensional
# vectors, angular distance, items keyed by the project index value.
t = AnnoyIndex(512, 'angular')

for rcn, vector in zip(projects.index.values, vec_df.to_numpy()):
    t.add_item(rcn, vector)

# Build the index with 500 trees.
t.build(500)

In [None]:
# For every SDG, average the embeddings of its 100 highest-probability projects.
mean_vecs = []

for sdg in sdg_keys:
    ranked = projects.sort_values(sdg, ascending=False)
    top_ids = ranked.index.values[:100]
    mean_vecs.append(vec_df.loc[top_ids].mean())

In [None]:
# For every SDG: take the 3000 projects nearest to its mean vector, keep those
# predicted below 0.5, and retain the 200 with the highest probability that are
# not already selected.
extra_ids = {}

for sdg, vec in zip(sdg_keys, mean_vecs):
    similar = t.get_nns_by_vector(vec, 3000)

    neighbour_probs = projects.loc[similar, sdg]
    similar_negative_projects = neighbour_probs[neighbour_probs < .5]
    similar_negative_projects = similar_negative_projects.sort_values(ascending=False)

    similar_negative_ids = set(similar_negative_projects.index.values[:200])
    sdg_ids = set(project_ids[sdg])

    extra_ids[sdg] = list(similar_negative_ids - sdg_ids)
    

In [None]:
# Append the similar-but-negative ids to each SDG's selection.
for sdg_key, selected in project_ids.items():
    selected.extend(extra_ids[sdg_key])

### Documents That are Similar to Manually Labelled Documents but Not Included

In [None]:
for df in dfs.values():
    df['text'] = df['text'].apply(lambda x: x.split(' === ')[-1])

In [None]:
# correctly_labelled_vec = {}

# for sdg, df in dfs.items():
#     positives = df[df['label'] == 1]
#     embedding = embed(df['text'].values).numpy().mean(axis=0)
#     correctly_labelled_vec[sdg] = embedding

In [None]:
# extra_ids = {}

# for sdg, vec in zip(sdg_keys, correctly_labelled_vec.values()):
#     similar = t.get_nns_by_vector(vec, 201)
#     similar_negative_ids = set(similar)
#     sdg_ids = set(project_ids[sdg])
    
#     extra_ids[sdg] = list(similar_negative_ids.difference(sdg_ids))
    

In [None]:
# This cell repeats an earlier extend with the same `extra_ids` (the cells that
# would have recomputed it are commented out). Only append ids that are not
# already selected, so the repeat cannot introduce duplicate ids, which would
# become duplicate rows in the export.
for k, selected in project_ids.items():
    already_selected = set(selected)
    selected.extend(i for i in extra_ids[k] if i not in already_selected)

In [None]:
# For each annotated SDG set, collect the ids of indexed projects that are
# nearest neighbours of the annotated texts.
# NOTE(review): this dict is never filled — its only assignment is commented out below.
correctly_labelled_vec = {}

# SDG number -> set of neighbour ids returned by the Annoy index `t`.
similar_negative_ids = {}

for sdg, df in dfs.items():
    # NOTE(review): `positives` is computed but never used; the embedding below
    # covers every annotated text, not only those labelled 1 — confirm intent.
    positives = df[df['label'] == 1]
#     embedding = embed(df['text'].values).numpy().mean(axis=0)
    embedding = embed(df['text'].values).numpy()
    similar = []
    for vec in embedding:
        # Ask for 20 neighbours and discard the first one returned.
        # NOTE(review): presumably meant to skip the query document itself, but
        # annotated projects were dropped from `projects` before the index was
        # built — confirm.
        similar.extend(t.get_nns_by_vector(vec, 20)[1:])
    similar_negative_ids[sdg] = set(similar)
#     correctly_labelled_vec[sdg] = embedding

In [None]:
# Add the neighbours of the annotated documents to each SDG's selection.
# The exclusion set is built from that SDG's own current selection; previously a
# variable left over from an earlier loop was used, so every SDG was compared
# against the last SDG's ids. The `extra_ids` dict is also no longer overwritten.
for k, selected in project_ids.items():
    already_selected = set(selected)
    # SDGs without an annotated file have no neighbours to add.
    neighbours = similar_negative_ids.get(k, set())
    # Sorted so the appended order is the same on every run.
    selected.extend(sorted(neighbours.difference(already_selected)))

## PhraseMatcher

In [None]:
from spacy.matcher import PhraseMatcher
from itertools import chain

In [None]:
def get_match_counts(doc, matcher):
    """Count how often each match id occurs when `matcher` is applied to `doc`.

    Returns a `Counter` keyed by the first element of every match.
    """
    id_counts = Counter()
    for match in matcher(doc):
        id_counts[match[0]] += 1
    return id_counts

In [None]:
def get_match_sdgs(texts, ids, matcher, batch_size=1000, n_process=3, disable=None):
    """Count matcher hits per match key for every text.

    Args:
        texts: iterable of strings to run through the global `nlp` pipeline.
        ids: index labels for the returned frame, one per text.
        matcher: callable applied to each parsed document (see `get_match_counts`).
        batch_size: forwarded to `nlp.pipe`.
        n_process: forwarded to `nlp.pipe`.
        disable: pipeline component names to disable; `None` disables nothing.

    Returns:
        DataFrame indexed by `ids` with one column per match key (the key's
        string form); cells are hit counts, missing where a text had no hit
        for that key.
    """
    # `None` instead of a mutable `[]` default, which would be shared by all calls.
    disabled = [] if disable is None else list(disable)

    match_counts = [
        get_match_counts(doc, matcher)
        for doc in nlp.pipe(texts, batch_size=batch_size,
                            disable=disabled, n_process=n_process)
    ]
    df = pd.DataFrame(match_counts, index=ids)
    # Match keys are hashes; translate them back to their string names.
    df.columns = [nlp.vocab[c].text for c in df.columns]
    return df

In [None]:
def print_sdg_matches(doc):
    """Return `(match name, matched span)` pairs found sentence by sentence.

    Uses the notebook-level `matcher` and `nlp`; nothing is printed.
    """
    found = []
    for sent in doc.sents:
        for match_id, start, end in matcher(sent.as_doc()):
            found.append((nlp.vocab[match_id].text, sent[start: end]))
    return found

In [None]:
# Extra single-word keywords per SDG: among the 20 most frequent words in that
# SDG's keyword phrases, keep those longer than 3 characters that occur more
# than twice.
sdg_vocab_extra = {}

for sdg in range(1, 17):
    words = chain.from_iterable(kw.split(' ') for kw in sdg_vocab[sdg])
    kw_counts = Counter(words).most_common(20)
    sdg_vocab_extra[sdg] = [
        word for word, count in kw_counts if len(word) > 3 and count > 2
    ]

In [None]:
# Words to strip from the automatically derived extra vocabulary, per SDG.
removes = {
    1: ['social', 'development', 'government', 'work'],
    # 'production' and 'growth' are separate entries: a single
    # 'production, growth' string contains a space and could never equal one of
    # the single-word tokens it is compared against.
    2: ['work', 'production', 'growth', 'productivity', 'income', 'quality'],
    3: ['carbon', 'child', 'quality'],
    4: ['discrimination', 'social', 'global', 'prejudice'],
    5: ['work', 'against', 'convention'],
    6: ['management', 'pollution', 'system', 'quality'],
    7: [],
    8: ['child', 'access'],
    9: ['development', 'research', 'social', 'sustainable', 'work'],
    10: ['social', 'country', 'financial', 'work'],
    11: ['design', 'climate', 'change', 'environmental'],
    12: ['sustainable'],
    13: ['change', 'management', 'global', 'sustainable'],
    14: ['sustainable', 'resource'],
    15: ['loss'],
    16: ['right', 'child', 'education', 'social', 'access']

}

# Words to add to the extra vocabulary, per SDG.
adds = {
    1: ['welfare', 'poverty'],
    2: ['hunger', 'malnutrition', 'nutrition'],
    3: [],
    4: ['childcare'],
    5: ['bride', 'female'],
    6: ['hygiene'],
    7: ['battery', 'photovoltaic'],
    8: [],
    9: [],
    10: ['lgbt', 'lgbtq', 'lgbtq+', 'homophobia', 'lesbian', 'gay', 'bisexual', 'transgender', 'intersex'],
    11: [],
    12: ['plastic'],
    13: [],
    14: ['sea'],
    15: [],
    16: ['corrupt', 'justice']
}

In [None]:
# Apply the manual removals and additions, leaving each SDG with a
# de-duplicated list of extra keywords.
for sdg in sdg_vocab_extra:
    dropped = removes[sdg]
    kept = {word for word in sdg_vocab_extra[sdg] if word not in dropped}
    kept.update(adds.get(sdg, []))

    sdg_vocab_extra[sdg] = list(kept)

In [None]:
# Lemma-based phrase matcher with one key per SDG ("sdg1" ... "sdg16"); the
# base vocabulary is registered first, then the extra single-word vocabulary.
matcher = PhraseMatcher(nlp.vocab, attr="LEMMA")

for sdg in range(1, 17):
    key = f"sdg{sdg}"
    for vocabulary in (sdg_vocab, sdg_vocab_extra):
        pattern_docs = [nlp(kw) for kw in vocabulary[sdg]]
        matcher.add(key, None, *pattern_docs)

In [None]:
# %lprun -f get_match_sdgs 
df_kws = get_match_sdgs(
    projects['objective'].fillna(''), 
    projects.index.values, matcher, 
    disable=['ner', 'parser']
)

In [None]:
# For every SDG column of the match-count table, take the 600 projects with the
# most matches and keep those that are not already selected for that SDG.
phrase_match_ids = {}

for col in df_kws.columns:
    sdg = int(col.replace('sdg', ''))

    top_matches = df_kws.sort_values(col, ascending=False).index.values[:600]

    phrase_match_ids[sdg] = list(set(top_matches) - set(project_ids[sdg]))

In [None]:
# Append the phrase-match ids to each SDG's selection.
for k in project_ids:
    project_ids[k] += phrase_match_ids[k]

### Negative Adjustment

Add some extra randomly sampled projects to each SDG's document set to balance the corpora.

In [None]:
# Scratch calculation: evaluates (2n - 20) / 20, which is 0.9 for n = 19.
# NOTE(review): presumably n is a count out of the 20 titles sampled in the
# next cell, used to choose the `negative_adjustment` fractions — confirm.
n = 19

((n * 2) - 20) / 20

In [None]:
# Print 20 titles from the SDG 16 phrase-match additions for manual inspection.
# Seeded like the other sampling cells so the same titles appear on every run.
for i in projects.loc[phrase_match_ids[16]].sample(20, random_state=0).title.values:
    print('>', i)

In [None]:
# Per-SDG fraction; the following cell multiplies it by the size of an id list
# to decide how many random projects to sample. 0 means no extra projects.
# NOTE(review): the values look hand-chosen after inspecting samples — confirm.
negative_adjustment = {
    1: 0,
    2: 0.6,
    3: 0.6,
    4: 0,
    5: 0,
    6: 0.4,
    7: 0.5,
    8: 0,
    9: 0,
    10: 0,
    11: 0.4,
    12: 0.3,
    13: 0.6,
    14: 0.3,
    15: 0.9,
    16: 0
}

In [None]:
# Top up each SDG's selection with randomly sampled projects. The number drawn
# is the SDG's adjustment fraction times the size of its current selection.
# Every step uses the loop's own `sdg`; previously a variable left over from an
# earlier loop was used, so sizes and exclusions all came from a single SDG.
for sdg, fraction in negative_adjustment.items():
    selected = project_ids[sdg]
    n = round(len(selected) * fraction)

    sampled_ids = projects.sample(n, random_state=0).index.values
    already_selected = set(selected)
    # dict.fromkeys de-duplicates while keeping the sampled order, so the
    # appended ids come out in the same order on every run.
    neg_ids = [rcn for rcn in dict.fromkeys(sampled_ids) if rcn not in already_selected]

    selected.extend(neg_ids)

### Export

In [None]:
# Rows of the selected projects for each SDG, shuffled with a fixed seed.
exports = {
    sdg: projects.loc[project_ids[sdg]].sample(frac=1, random_state=0)
    for sdg in sdg_keys
}

In [None]:
# Write two CSVs per SDG: one with a Yes/No label derived from that SDG's
# predicted probability, and one with the label column blanked.
for sdg in sdg_keys:
    # reset_index returns a new frame, so `exports[sdg]` itself is not modified.
    test = exports[sdg].reset_index()
    test['text'] = '===== ' + test['title'] + ' =====                    ' + test['objective']

    # Use this SDG's own probability column; previously column 1 was used for
    # every SDG, so all files carried SDG 1's labels.
    test = test[['rcn', 'text', sdg]]
    test = test.rename(columns={'rcn': 'ID', 'text': 'Text', sdg: 'Label'})

    test['Label'] = (test['Label'] > .5).map({True: 'Yes', False: 'No'})
    # Drops rows whose text is missing (title or objective was NaN).
    test = test.dropna()

    test.to_csv(f'../../data/interim/smart_sdg_{sdg}.csv', index=False)

    test = test.assign(Label='')
    test.to_csv(f'../../data/interim/smart_sdg_{sdg}_unlabelled.csv', index=False)

## Matcher (not used)

In [None]:
# Token-level patterns per SDG. Two-word keywords match in either order with up
# to `n` arbitrary tokens in between; all other keywords match as an exact
# lemma sequence.
n = 1
patterns = defaultdict(list)

for sdg in range(1, 17):
    for keyword in sdg_vocab[sdg]:
        words = keyword.split(' ')
        if len(words) != 2:
            patterns[sdg].append([{"LEMMA": word} for word in words])
            continue

        first, second = words
        for gap in range(n + 1):
            patterns[sdg].append([{"LEMMA": first}, *[{}] * gap, {"LEMMA": second}])
            patterns[sdg].append([{"LEMMA": second}, *[{}] * gap, {"LEMMA": first}])

# Note: this rebinds the notebook-level name `matcher`.
matcher = Matcher(nlp.vocab)

for sdg in range(1, 17):
    matcher.add(f'sdg{sdg}', None, *patterns[sdg])

In [None]:
%lprun -f get_match_counts get_match_counts(doc, matcher)

In [None]:
%lprun -f get_match_sdgs get_match_sdgs(projects['objective'].values[:100], df.index.values[:100], matcher, disable=['ner', 'parser'])