# Matching algorithm

This script implements functions to:
- represent the textual data of projects and SDGs in a vector space, by means of word embeddings;
- assign a list of relevance scores to each project, one for each goal, using similarity metrics;
- evaluate the accuracy of the matching algorithm by means of standard metrics.

In [None]:
# Notebook switches read by the loading and dumping cells further down.
# load_docs: when True, the project and goal docs are unpickled from
# data/pickles/ instead of being regenerated from the raw CSV/Excel files.
load_docs = True
# dump_docs: when True, the current project and goal docs are pickled to
# data/pickles/, overwriting the files that load_docs reads.
dump_docs = False

In [None]:
import pandas, pickle, random
import numpy as np
from sklearn.metrics import coverage_error, label_ranking_average_precision_score, label_ranking_loss
import matplotlib.pyplot as plt

import nlp_functions, explanation, utils, evaluation

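Computes the relevance scores to the SDGs for each spaCy document in a given list, using either the default or the custom similarity. Projects for which any computed score is NaN are skipped. Note that `apply_matching` takes no `only_projects_with_labels` argument: restricting the dataset to projects that have true labels is done when the documents are generated (`select_projects='labeled'`).

Returns two lists, where at each position there are the true labels and the estimated scores (not sorted) of the same project, respectively. As a side effect, the scores sorted in decreasing order are stored on each project document.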

In [None]:
def apply_matching(project_docs, goal_docs, similarity='custom'):
    """Score every project document against the goals and collect the results.

    For each document, a 14-element 0/1 indicator list is built from
    ``_.goal_labels`` and the goal scores are obtained from
    ``nlp_functions.compute_goal_scores``. Documents for which any score is
    NaN are skipped entirely. For every kept document the scores are also
    stored on the document itself, in ``_.predicted_goal_scores``, as
    ``(goal_number, score)`` pairs sorted by decreasing score.

    Args:
        project_docs: iterable of project documents exposing the
            ``_.goal_labels`` and ``_.predicted_goal_scores`` attributes.
        goal_docs: goal documents, forwarded to ``compute_goal_scores``.
        similarity: similarity variant, forwarded to ``compute_goal_scores``.

    Returns:
        A pair ``(y_true, y_score)`` of parallel lists with one entry per
        kept document: the 0/1 indicator list and the unsorted scores.
    """
    collected_true, collected_scores = [], []

    for doc in project_docs:
        # NOTE(review): position idx tests for the label str(idx) with idx in
        # 0..13, whereas the ranking stored below numbers goals from 1.
        # Confirm the label numbering used in the dataset.
        indicator = [int(str(idx) in doc._.goal_labels) for idx in range(14)]

        scores = nlp_functions.compute_goal_scores(doc, goal_docs, similarity=similarity)

        # A single NaN score discards the whole document from the evaluation.
        if any(map(np.isnan, scores)):
            continue

        # Pair each score with its 1-based goal number, best score first.
        ranking = list(enumerate(scores, start=1))
        ranking.sort(key=lambda pair: pair[-1], reverse=True)
        doc._.predicted_goal_scores = ranking

        collected_true.append(indicator)
        collected_scores.append(scores)

    return collected_true, collected_scores

In [None]:
# Presumably registers the custom `._` extension attributes (such as
# `goal_labels` and `predicted_goal_scores`, which apply_matching reads and
# writes) -- confirm in utils. Run this cell before loading or creating docs.
utils.set_spacy_extensions()

In [None]:
# Obtain the project and goal documents: either rebuild them from the raw
# data files, or restore the previously pickled versions.
if not load_docs:
    # Regenerate from the cleaned projects table (tab-separated) and the
    # goals spreadsheet, keeping only the labeled projects.
    projects_df = pandas.read_csv('data/ris3-mcat-projects-cleaned.csv', sep='\t')
    goals_df = pandas.read_excel('data/un-goals.xlsx')
    project_docs, goal_docs = nlp_functions.generate_project_and_goal_docs(
        projects_df,
        goals_df,
        select_projects='labeled',
    )
else:
    # Restore both pickles; only use pickle files produced by this project,
    # since unpickling executes arbitrary code.
    with open('data/pickles/project_docs_labeled_optimized.pkl', 'rb') as f:
        project_docs = pickle.load(f)
    with open('data/pickles/goal_docs_optimized.pkl', 'rb') as f:
        goal_docs = pickle.load(f)

In [None]:
# Optionally persist the current documents so that later runs can load them
# instead of regenerating them.
if dump_docs:
    pickle_targets = (
        ('data/pickles/project_docs_labeled_optimized.pkl', project_docs),
        ('data/pickles/goal_docs_optimized.pkl', goal_docs),
    )
    for pickle_path, docs_to_dump in pickle_targets:
        with open(pickle_path, 'wb') as f:
            pickle.dump(docs_to_dump, f, protocol=pickle.HIGHEST_PROTOCOL)

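Split the dataset into a training set and a test set: 200 projects are sampled at random (with a fixed seed) for the test set, and all the remaining projects form the training set.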

In [None]:
# Seed the global RNG so that the same 200 projects are drawn on every run.
random.seed(10)
test_project_docs = random.sample(project_docs, k=200)

# Every project that was not drawn for the test set goes to the training set;
# the original ordering of project_docs is preserved.
training_project_docs = list(
    filter(lambda doc: doc not in test_project_docs, project_docs)
)

Evaluate the performance of the matching algorithm.

For reference about the metrics:
- Label ranking average precision: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.label_ranking_average_precision_score.html#sklearn.metrics.label_ranking_average_precision_score
- Label ranking loss: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.label_ranking_loss.html#sklearn.metrics.label_ranking_loss
- Coverage error: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.coverage_error.html#sklearn.metrics.coverage_error

In [None]:
# Score the training projects against the goals and evaluate the matching.
y_true_train, y_score_train = apply_matching(training_project_docs, goal_docs, similarity='custom')

# NOTE(review): the return value of select_labels_from_scores is discarded
# here. The loop is kept in case the helper has side effects (e.g. printing
# or plotting) -- confirm in evaluation and drop it if it is debug-only.
for y in y_score_train:
    evaluation.select_labels_from_scores(y)

# Ranking metrics use the raw scores; classification metrics first select
# labels from the scores with the 'dcg' selection method.
ranking_metrics = evaluation.compute_ranking_metrics(y_true_train, y_score_train)
classification_metrics = evaluation.compute_classification_metrics(
    y_true_train,
    y_score_train,
    label_selection_method='dcg',
)

print('RESULTS ON TRAINING SET')
evaluation.print_metrics(ranking_metrics)
evaluation.print_metrics(classification_metrics)

In [None]:
# Score only the held-out test projects, so the figures printed below really
# describe the test set and are not contaminated by the training projects.
y_true_test, y_score_test = apply_matching(test_project_docs, goal_docs, similarity='custom')

ranking_metrics = evaluation.compute_ranking_metrics(y_true_test, y_score_test)
# NOTE(review): this call selects labels with label_selection_k='R', while the
# training-set cell uses label_selection_method='dcg'. Confirm against
# evaluation.compute_classification_metrics whether the two configurations
# are meant to differ; if not, the train/test results are not comparable.
classification_metrics = evaluation.compute_classification_metrics(
    y_true_test,
    y_score_test,
    label_selection_k='R',
)

print('RESULTS ON TEST SET')
evaluation.print_metrics(ranking_metrics)
evaluation.print_metrics(classification_metrics)

In [None]:
# Show the explanation output for all the project documents against the goals.
# Presumably relies on the `_.predicted_goal_scores` values stored by
# apply_matching -- confirm in explanation.visualize_output.
explanation.visualize_output(
    project_docs,
    goal_docs,
    percentile_highlighted_words=75,
    use_colors=False,
)