# Tarea 2: clasificar conceptos y fundamentos abiertos

## Consideramos los mismos modelos computados para la tarea 1

<p>Carga los modelos creados previamente (OJO: cargar los modelos de vectores preentrenados puede tardar mucho y usar mucha memoria porque pesan 1GB cada uno)</p>

In [85]:
# we assume models are already created
# these are the global variables used when computing all the metrics
# best models with no pretrained vectors are in _model_<tema>_best_at_<k>.bin
# best models with pretrained vectors are in _model_ptvec_<tema>_best_at_<k>.bin

import fasttext

# Locations of the task-2 data files and of the models computed for task 1.
_dataDir = "../../data/tarea2/"
_models_task1 = "../../task1/fasttext/"
# Prefix that precedes the category number in the predicted labels.
ftlabel = "__label__"
temas = [1, 2, 3, 4]

# Prefixes and suffixes that make up the names of the models to load.
model_no_ptvec = "_model_"
model_ptvec = "_model_ptvec_"
at_1 = "at_1"
at_5 = "at_5"

# Which models to consider. Both kinds are loaded here; to skip the models
# built on pretrained vectors use instead:
#     m_prefixes = [model_no_ptvec]
m_prefixes = [model_no_ptvec, model_ptvec]
m_suffixes = [at_1, at_5]

# Names of all considered models (every prefix combined with every suffix),
# used to iterate over the models.
model_names = [pre + suf for pre in m_prefixes for suf in m_suffixes]
        
def print_model_name(model):
    """Return `model` without its first 7 and its last 5 characters.

    For the model names built in this file this drops the leading "_model_"
    and the trailing "_at_<k>", e.g. "_model_ptvec_at_1" -> "ptvec" and
    "_model_at_1" -> "". Despite the name, nothing is printed.
    """
    head_len = 7  # len("_model_")
    tail_len = 5  # len("_at_1")
    short_name = model[head_len:-tail_len]
    return short_name

In [63]:
# Load the models: models[<model name>][<tema>] is the model read from
# <_models_task1><prefix><tema>_best_<suffix>.bin
models = {}
for prefix in m_prefixes:
    for suffix in m_suffixes:
        models[prefix + suffix] = {
            tema: fasttext.load_model(_models_task1 + prefix + str(tema) + "_best_" + suffix + ".bin")
            for tema in temas
        }
          

<p>Ahora carga los datos para evaluar los modelos. Cargamos solo el test set.</p>

In [143]:
# load data to predict

import string

def read_text_file_for_ft_input(filename):
    """Read a text file and normalise every line for use as model input.

    For each line of the file the line terminator is removed, every character
    in `string.punctuation` is deleted and the rest is lower-cased. Tabs are
    kept (they are not punctuation), so they can still be used to separate
    the original concept from its justification.

    #Arguments
        filename: path of the text file to read
    #Returns
        A list with one normalised string per line of the file. A line that
        ends up empty is returned as a single space ' '.
    '''
    """
    # One translation table deletes all punctuation in a single pass per line.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    out = []
    with open(filename) as f:
        for line in f:
            # Remove only the newline: the last line of a file may not end in
            # one, and blindly cutting the last character would then lose
            # real text.
            if line.endswith('\n'):
                line = line[:-1]
            strdata = line.translate(drop_punctuation).lower()
            # Never return an empty string (presumably so the predictor always
            # gets some input — TODO confirm).
            if strdata == '':
                strdata = ' '
            out.append(strdata)
    return out

def read_numbers_file_for_ft_input(filename):
    """Read a file holding one integer per line.

    #Arguments
        filename: path of the file to read
    #Returns
        A list with the integer parsed from every line, in file order.
        `int()` raises ValueError for a line that is not an integer.
    """
    with open(filename) as handle:
        return [int(row) for row in handle]

# Test data per tema:
#   test_x_concepto_fundamento[i]: normalised lines "<concept>\t<justification>"
#   test_x_concepto[i]:            text before the first tab of every line
#   test_x_fundamento[i]:          text between the first and second tab
#   test_y[i]:                     integer label of every line
test_x_concepto = {}
test_x_fundamento = {}
test_x_concepto_fundamento = {}
test_y = {}

for i in temas:
    x_path = _dataDir + "x_test_tema_" + str(i) + "_categorias_pnud_1.txt"
    y_path = _dataDir + "y_test_tema_" + str(i) + "_categorias_pnud_1.txt"
    test_x_concepto_fundamento[i] = read_text_file_for_ft_input(x_path)
    test_y[i] = read_numbers_file_for_ft_input(y_path)

for i in temas:
    conceptos = test_x_concepto[i] = []
    fundamentos = test_x_fundamento[i] = []
    for texto in test_x_concepto_fundamento[i]:
        # Split once and reuse both fields; a line without a tab raises
        # IndexError on the second field.
        campos = texto.split('\t')
        conceptos.append(campos[0])
        fundamentos.append(campos[1])

# categories[i]: the lines of the categories file of tema i, without their
# line terminator.
# NOTE(review): this reads the "*_pnud_0" file while the test data is read
# from "*_pnud_1" files — confirm that this is intended.
categories = {}
for i in temas:
    categories[i] = []
    categoriesFile = _dataDir + "categorias_tema_" + str(i) + "_pnud_0.txt"
    with open(categoriesFile) as f:
        for line in f:
            # Remove only the newline: the last line of a file may not end in
            # one, and cutting its last character would corrupt the name.
            if line.endswith('\n'):
                line = line[:-1]
            categories[i].append(line)

<p>Computa las predicciones para cada tema y con cada uno de los modelos</p>

In [144]:
# k passed to predict() for every tema (presumably the number of categories
# of that tema, so that the full ranking is returned — TODO confirm).
sizes = {1: 37, 2: 44, 3: 12, 4: 21}

# Input kinds, in the order they are predicted:
#   'cf' concept + justification, 'c' concept only, 'f' justification only
inputs_by_kind = [
    ('cf', test_x_concepto_fundamento),
    ('c', test_x_concepto),
    ('f', test_x_fundamento),
]

# predictions_with_labels[<model name>][<tema>][<input kind>] holds the raw
# output of predict() for that model, tema and input kind.
predictions_with_labels = {}

for model_name in model_names:
    predictions_with_labels[model_name] = {}
    for tema in temas:
        tema_model = models[model_name][tema]
        predictions_with_labels[model_name][tema] = {
            kind: tema_model.predict(texts[tema], k=sizes[tema])
            for kind, texts in inputs_by_kind
        }

# Turn predictions of the form <ftlabel><C> into the plain integer <C>:
# predictions[<model name>][<tema>][<input kind>] is a list with, for every
# test case, the list of predicted category numbers.
predictions = {}

label_start = len(ftlabel)
for model_name in model_names:
    predictions[model_name] = {}
    for tema in temas:
        raw_by_input = predictions_with_labels[model_name][tema]
        predictions[model_name][tema] = {
            inp: [
                [int(label[label_start:]) for label in pred_list_labels]
                for pred_list_labels in raw_by_input[inp]
            ]
            for inp in ['cf', 'c', 'f']
        }
            
# utility function to select the first prediction from a list of predictions and generate a 1D list of single predictions
def first_prediction(lists_of_predictions):
    """Return the first element of every list in `lists_of_predictions`.

    #Arguments
        lists_of_predictions: a list of non-empty prediction lists
    #Returns
        A flat list with element 0 of every inner list, in the same order.
        An empty inner list raises IndexError.
    """
    return [ranked[0] for ranked in lists_of_predictions]

## Genera reportes

In [150]:
from sklearn import metrics

# For every tema print one row per model ('npt'/'ptv') and input kind with
# accuracy and weighted recall, precision and F1 (as percentages) of the
# first prediction of each test case.
for i in temas:
    print("Tema " + str(i) + "\t\tacc\trec\tprec\tf1")
    for model, model_name in [('_model_at_1', 'npt'), ('_model_ptvec_at_1', 'ptv')]:
        for inp in ['f', 'c', 'cf']:
            prediction = first_prediction(predictions[model][i][inp])
            acc = round(100 * metrics.accuracy_score(test_y[i], prediction), 2)
            # Same evaluation order as the printed columns: rec, prec, f1.
            rec, prec, f1 = [
                round(100 * weighted_score(test_y[i], prediction, average='weighted'), 2)
                for weighted_score in (metrics.recall_score, metrics.precision_score, metrics.f1_score)
            ]
            row_values = "\t".join(str(value) for value in (acc, rec, prec, f1))
            print(model_name + "-" + inp + "\t\t" + row_values)
    print()

            

Tema 1		acc	rec	prec	f1
npt-f		46.12	46.12	45.09	43.84
npt-c		61.31	61.31	65.47	60.72
npt-cf		59.96	59.96	58.83	57.26
ptv-f		47.91	47.91	47.27	45.94
ptv-c		61.37	61.37	66.57	60.7
ptv-cf		60.72	60.72	60.96	58.38

Tema 2		acc	rec	prec	f1
npt-f		54.98	54.98	58.03	54.11
npt-c		71.68	71.68	75.46	71.34
npt-cf		69.56	69.56	70.21	67.77
ptv-f		57.77	57.77	60.57	57.03
ptv-c		73.31	73.31	76.17	72.9
ptv-cf		70.62	70.62	72.79	69.06

Tema 3		acc	rec	prec	f1
npt-f		60.19	60.19	65.7	61.55
npt-c		77.31	77.31	81.86	78.52
npt-cf		75.32	75.32	78.68	75.62
ptv-f		61.65	61.65	66.8	62.89
ptv-c		79.01	79.01	83.48	80.17
ptv-cf		75.5	75.5	79.27	75.94

Tema 4		acc	rec	prec	f1
npt-f		35.93	35.93	59.73	32.37
npt-c		52.11	52.11	74.14	49.74
npt-cf		49.7	49.7	69.79	45.09
ptv-f		37.35	37.35	59.4	34.25
ptv-c		55.29	55.29	76.51	54.27
ptv-cf		52.71	52.71	72.44	49.79



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


## Genera reporte para predicciones en listas (top_k)

In [152]:
# utility function to compute top k accuracy

def top_k_accuracy(gold,predicted,k):
    '''
    Compute the top-k accuracy of ranked predictions.

    #Arguments
        gold: the true labels of the test cases (size N = number of test cases)
        predicted: ranked list of label predictions for every test case (size N x L, where L is assumed to be >= k)
        k: the number of elements in the predicted lists that should be considered to compute the metric
    #Returns
        The portion of cases (between 0 and 1) in which the true label value was among the first k predicted labels.
        With no test cases (empty gold) the result is 0.0 instead of a division by zero.
    '''
    total = len(gold)
    if total == 0:
        # Nothing to evaluate: avoid ZeroDivisionError.
        return 0.0
    hits = sum(1 for g, pred_labels in zip(gold, predicted) if g in pred_labels[:k])
    return hits / total

In [154]:
# For every tema print one row per model ('ptv'/'npt') and input kind with the
# top-k accuracy (as a percentage) for every k in top_k_options.
top_k_options = [1, 3, 5, 7, 10]
for i in temas:
    print("Tema " + str(i) + "\t\t" + "\t".join(str(op) for op in top_k_options))
    for model, model_name in [('_model_ptvec_at_5', 'ptv'), ('_model_at_5', 'npt')]:
        # Only the test set is evaluated.
        prediction = predictions[model][i]
        gold = test_y[i]
        set_name = 'test'
        for inp in ['f', 'c', 'cf']:
            cells = [set_name + "-" + model_name + "-" + inp]
            cells.extend(
                str(round(100 * top_k_accuracy(gold, prediction[inp], k), 2))
                for k in top_k_options
            )
            print("\t".join(cells))
    print()

Tema 1		1	3	5	7	10
test-ptv-f	47.64	70.21	80.2	84.64	89.09
test-ptv-c	63.27	85.3	88.99	92.3	93.27
test-ptv-cf	60.72	83.72	89.91	92.4	94.63
test-npt-f	45.96	68.53	76.94	81.82	87.3
test-npt-c	60.12	78.24	83.18	86.65	87.68
test-npt-cf	59.74	81.33	87.52	90.94	93.71

Tema 2		1	3	5	7	10
test-ptv-f	57.71	77.78	83.62	86.77	89.95
test-ptv-c	72.77	89.24	91.81	93.1	95.12
test-ptv-cf	70.94	88.54	92.32	94.12	95.66
test-npt-f	55.2	74.21	81.31	84.84	87.99
test-npt-c	71.32	87.64	90.3	91.78	93.48
test-npt-cf	69.43	87.12	91.23	92.68	94.51

Tema 3		1	3	5	7	10
test-ptv-f	61.28	83.12	90.5	94.19	98.79
test-ptv-c	78.71	92.14	95.28	96.91	98.91
test-ptv-cf	75.5	91.11	95.52	97.34	99.58
test-npt-f	60.56	81.55	90.44	95.04	98.61
test-npt-c	77.13	90.32	94.13	96.55	98.73
test-npt-cf	75.62	91.29	95.52	98.0	99.46

Tema 4		1	3	5	7	10
test-ptv-f	37.81	57.88	68.92	77.24	85.27
test-ptv-c	55.68	77.81	86.44	88.88	90.44
test-ptv-cf	52.74	73.38	83.15	89.42	94.05
test-npt-f	35.93	54.41	64.88	72.88	82.65
test-npt-c	52.46	64.32	

In [156]:
import numpy as np

def ranking_sizes(gold,predicted):
    '''
    Compute, for every test case, the position of the true label in its ranking.

    #Arguments
        gold: the true labels of the test cases
        predicted: ranked list of label predictions for every test case
    #Returns
        A numpy array with the 1-based position of each true label in the
        corresponding predicted list
    #Raises
        Exception: if a true label does not appear in its predicted list
    '''
    positions = []
    for g, ranking in zip(gold, predicted):
        try:
            rank = ranking.index(g) + 1
        except ValueError:
            raise Exception('Label ' + str(g) + ' is not in the ranking.') from None
        positions.append(rank)
    return np.array(positions)

In [161]:
# For every tema print one row per model ('ptv'/'npt') and input kind with,
# for every percentile in percentile_options, that percentile (truncated to
# an int) of the positions of the true labels in the predicted rankings.
percentile_options = [80,85,90,95]
for i in temas:
    head_str = "Tema " + str(i) + "\t\t"
    head_str += "\t".join([str(op)+"%" for op in percentile_options])
    print(head_str)
    for model, model_name in zip(['_model_ptvec_at_5','_model_at_5'],['ptv','npt']):
        prediction = predictions[model][i]
        gold = test_y[i]
        set_name = "real"
        for inp in ['f','c','cf']:
            data_str = set_name + "-" + model_name + "-" + inp
            # The positions do not depend on the percentile: compute them once
            # per row instead of once per percentile.
            positions = ranking_sizes(gold,prediction[inp])
            for k in percentile_options:
                percentile = int(np.percentile(positions,k))
                data_str += "\t" + str(percentile)
            print(data_str)
    print()

Tema 1		80%	85%	90%	95%
real-ptv-f	5	8	11	20
real-ptv-c	3	3	6	16
real-ptv-cf	3	4	6	11
real-npt-f	7	9	13	21
real-npt-c	4	6	12	24
real-npt-cf	3	4	7	12

Tema 2		80%	85%	90%	95%
real-ptv-f	4	6	11	21
real-ptv-c	2	3	4	10
real-ptv-cf	2	3	4	9
real-npt-f	5	8	13	23
real-npt-c	2	3	5	17
real-npt-cf	2	3	5	11

Tema 3		80%	85%	90%	95%
real-ptv-f	3	4	5	8
real-ptv-c	2	2	3	5
real-ptv-cf	2	2	3	5
real-npt-f	3	4	5	7
real-npt-c	2	2	3	6
real-npt-cf	2	2	3	5

Tema 4		80%	85%	90%	95%
real-ptv-f	8	10	13	17
real-ptv-c	4	5	9	17
real-ptv-cf	5	6	8	11
real-npt-f	9	11	15	18
real-npt-c	7	11	17	17
real-npt-cf	7	8	11	17

