# Tarea 1: clasificar fundamentos

## Consideramos los modelos precomputados

<p>Carga los modelos creados previamente (OJO: cargar los modelos de vectores preentrenados puede tardar mucho y usar mucha memoria, porque cada uno pesa 1GB)</p>

In [1]:
# we assume models are already created
# these are the global variables used when computing all the metrics
# best models with no pretrained vectors are in _model_<tema>_best_at_<k>.bin
# best models with pretrained vectors are in _model_ptvec_<tema>_best_at_<k>.bin

import fasttext

# Locations of the input data and of the trained fastText models for task 1.
_dataDir = "../../data/"
_models_task1 = "../../task1/fasttext/"

# Prefix carried by the labels in the raw predictions; it is stripped from
# them before the metrics are computed.
ftlabel = "__label__"

# Identifiers of the topics ("temas") that are evaluated.
temas = [1, 2, 3, 4]

# Model families to evaluate, identified by the prefix of their file names.
# Trim this list to leave out the families with pretrained vectors ("ptvec").
m_prefixes = ["_model_", "_model_ptvec_", "_model_ng2_", "_model_ptvec_ng2_"]
# Every family is evaluated once per suffix.
m_suffixes = ["at_1", "at_5"]

# Names of all considered models: each prefix combined with each suffix,
# grouped by prefix.  Used to iterate over the models.
model_names = [family + variant for family in m_prefixes for variant in m_suffixes]
        

<p>Cargamos solo test set y dev set</p>

In [2]:
# load data to predict

import string

def read_text_file_for_ft_input(filename):
    """Read a text file and return its lines normalised for fastText input.

    Every line loses its line terminator and all characters found in
    ``string.punctuation`` (ASCII punctuation only), and is lower-cased.
    Tabs are kept because they separate the original concepts from the
    justifications.  A line that ends up empty is replaced by a single
    space, so the result has exactly one non-empty entry per input line.

    Args:
        filename: path of the text file to read.

    Returns:
        A list of strings, one per line of the file, in file order.
    """
    # string.punctuation does not contain the tab character, so deleting
    # exactly those characters leaves the tab separators untouched.
    drop_punctuation = str.maketrans("", "", string.punctuation)
    # NOTE(review): the file is opened with the platform default encoding;
    # confirm whether an explicit encoding should be passed here.
    out = []
    with open(filename) as f:
        for line in f:
            # Strip only the line terminator: slicing with [:-1] would also
            # cut the last character of a final line without a newline.
            strdata = line.rstrip("\n").translate(drop_punctuation).lower()
            out.append(strdata if strdata != '' else ' ')
    return out

def read_numbers_file_for_ft_input(filename):
    """Read a file that holds one integer per line.

    Args:
        filename: path of the file to read.

    Returns:
        A list with the integer parsed from each line, in file order.

    Raises:
        ValueError: if a line cannot be parsed by ``int`` (a blank line
            included).
    """
    with open(filename) as f:
        # int() tolerates the surrounding whitespace, newline included.
        return [int(line) for line in f]

# Inputs (x) and gold labels (y) of the test and dev sets, keyed by topic id.
test_x, test_y = {}, {}
dev_x, dev_y = {}, {}

for i in temas:
    # The four files of a topic share the same file-name ending.
    file_tail = "_tema_" + str(i) + "_categorias_pnud_0.txt"
    test_x[i] = read_text_file_for_ft_input(_dataDir + "x_test" + file_tail)
    test_y[i] = read_numbers_file_for_ft_input(_dataDir + "y_test" + file_tail)
    dev_x[i] = read_text_file_for_ft_input(_dataDir + "x_dev" + file_tail)
    dev_y[i] = read_numbers_file_for_ft_input(_dataDir + "y_dev" + file_tail)
    


# Lines of each topic's categories file, keyed by topic id.
categories = {}
for i in temas:
    categoriesFile = _dataDir + "categorias_tema_" + str(i) + "_pnud_0.txt"
    with open(categoriesFile) as f:
        # Strip only the line terminator: slicing with [:-1] would cut the
        # last character of a final line that does not end in a newline.
        categories[i] = [line.rstrip("\n") for line in f]

<p>Computa las predicciones para cada tema y con cada uno de los modelos</p>

In [5]:
# Value passed as k to predict() for each topic.
# NOTE(review): presumably the number of categories of the topic, so that a
# complete ranking is returned -- confirm against the category files.
sizes = {1: 37, 2: 44, 3: 12, 4: 21}

# Raw predictions indexed as [model name][topic], for the dev set and the
# test set respectively.  Labels are kept exactly as the model returns them.
predictions_dev_with_labels = {}
predictions_with_labels = {}

for prefix in m_prefixes:
    for suffix in m_suffixes:
        model_name = prefix + suffix
        test_by_tema = {}
        dev_by_tema = {}
        predictions_with_labels[model_name] = test_by_tema
        predictions_dev_with_labels[model_name] = dev_by_tema
        for tema in temas:
            # There is one model file per (prefix, topic, suffix) combination.
            model_path = _models_task1 + prefix + str(tema) + "_best_" + suffix + ".bin"
            # Rebinding `model` drops the reference to the previous model.
            model = fasttext.load_model(model_path)
            k_for_tema = sizes[tema]
            dev_by_tema[tema] = model.predict(dev_x[tema], k=k_for_tema)
            test_by_tema[tema] = model.predict(test_x[tema], k=k_for_tema)


#for model_name in model_names:
#    predictions_with_labels[model_name] = {}
#    predictions_dev_with_labels[model_name] = {}
#    for tema in temas:
#        predictions_dev_with_labels[model_name][tema] = models[model_name][tema].predict(
#            dev_x[tema],k = sizes[tema])
#        predictions_with_labels[model_name][tema] = models[model_name][tema].predict(
#            test_x[tema],k = sizes[tema])

# Predictions with every label of the form <ftlabel><C> turned into the
# integer C, indexed as [model name][topic]; each entry holds one list of
# labels per example, in the order returned by the model.
predictions = {}
predictions_dev = {}

prefix_len = len(ftlabel)
for model_name in model_names:
    # Test set.
    predictions[model_name] = {
        tema: [
            [int(label[prefix_len:]) for label in pred_list_labels]
            for pred_list_labels in predictions_with_labels[model_name][tema]
        ]
        for tema in temas
    }
    # Dev set.
    predictions_dev[model_name] = {
        tema: [
            [int(label[prefix_len:]) for label in pred_list_labels]
            for pred_list_labels in predictions_dev_with_labels[model_name][tema]
        ]
        for tema in temas
    }

            
# utility function to select the first prediction from a list of predictions and generate a 1D list of single predictions
def first_prediction(lists_of_predictions):
    """Return the first element of every prediction list as a flat list.

    Args:
        lists_of_predictions: iterable of non-empty, indexable prediction
            lists.

    Returns:
        A 1D list holding element 0 of each prediction list, in input order.

    Raises:
        IndexError: if one of the prediction lists is empty.
    """
    return [ranked[0] for ranked in lists_of_predictions]

## Genera reportes para primera predicción

In [6]:
from sklearn import metrics

# Models included in this report and the short label printed for each one.
reported_models = ["_model_at_1", "_model_ptvec_at_1","_model_ng2_at_1","_model_ptvec_ng2_at_1"]
reported_models_labels = ['nptv','ptv','npn2','ptn2']

# One table per topic; one row per (model, data set) with the weighted
# recall, weighted precision, weighted F1 and macro F1 of the first
# prediction, as percentages rounded to two decimals.
for i in temas:
    print("Tema " + str(i) + "\t\trecavg\tprecavg\tf1avg\tf1macro")
    for model, model_name in zip(reported_models, reported_models_labels):
        splits = (
            ('devs', predictions_dev[model][i], dev_y[i]),
            ('test', predictions[model][i], test_y[i]),
        )
        for set_name, pred, gold in splits:
            prediction = first_prediction(pred)
            scores = [
                metrics.recall_score(gold, prediction, average='weighted'),
                metrics.precision_score(gold, prediction, average='weighted'),
                metrics.f1_score(gold, prediction, average='weighted'),
                metrics.f1_score(gold, prediction, average='macro'),
            ]
            row = [set_name + "-" + model_name]
            row.extend(str(round(100 * score, 2)) for score in scores)
            print("\t".join(row))
    print()

            

Tema 1		recavg	precavg	f1avg	f1macro
devs-nptv	65.19	64.08	64.12	49.66
test-nptv	65.89	65.06	64.9	51.56
devs-ptv	67.34	66.26	66.39	52.12
test-ptv	67.1	66.24	66.24	53.46
devs-npn2	64.8	62.44	63.22	44.79
test-npn2	64.89	62.41	63.29	44.39
devs-ptn2	68.27	67.18	67.28	51.86
test-ptn2	68.03	67.12	67.04	51.88

Tema 2		recavg	precavg	f1avg	f1macro
devs-nptv	71.84	71.05	71.04	54.68
test-nptv	68.56	67.95	67.89	54.08
devs-ptv	72.27	71.72	71.66	58.49
test-ptv	70.78	70.49	70.32	59.95
devs-npn2	69.25	67.28	67.23	44.4


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


test-npn2	67.09	64.95	65.05	42.39
devs-ptn2	72.36	71.69	71.67	54.9
test-ptn2	71.08	70.45	70.34	55.2

Tema 3		recavg	precavg	f1avg	f1macro
devs-nptv	75.85	75.97	75.6	70.28
test-nptv	75.08	75.14	74.86	70.06
devs-ptv	76.38	76.43	76.23	71.87
test-ptv	75.73	75.72	75.63	72.14
devs-npn2	76.85	76.89	76.53	70.41
test-npn2	75.85	75.86	75.55	69.98
devs-ptn2	77.52	77.54	77.34	72.7
test-ptn2	76.9	76.74	76.63	71.13

Tema 4		recavg	precavg	f1avg	f1macro
devs-nptv	69.97	68.96	69.06	60.21
test-nptv	68.45	67.91	67.84	59.0
devs-ptv	70.84	69.95	70.06	61.61
test-ptv	69.34	68.89	68.83	61.23
devs-npn2	69.16	67.82	68.04	57.6
test-npn2	68.55	67.94	67.7	57.82
devs-ptn2	71.35	70.61	70.74	62.46
test-ptn2	69.44	68.84	68.89	59.89



## Genera reporte para predicciones en listas (top_k)

In [7]:
# utility function to compute top k accuracy

def top_k_accuracy(gold,predicted,k):
    '''Compute the top-k accuracy of ranked predictions.

    #Arguments
        gold: the true label of every test case (N entries)
        predicted: one ranked list of predicted labels per test case
            (N lists, each assumed to hold at least k labels)
        k: how many leading labels of each ranked list are considered
    #Returns
        The fraction of cases (between 0 and 1) whose true label is among
        the first k predicted labels
    #Raises
        ZeroDivisionError: if gold is empty
    '''
    hits = sum(
        1
        for true_label, ranking in zip(gold, predicted)
        if true_label in ranking[:k]
    )
    return hits / len(gold)

In [8]:
# Values of k for which the top-k accuracy is reported.
top_k_options = [3,5,7,10]

# Models included in this report and the short label printed for each one.
reported_models_at_5 = ["_model_at_5", "_model_ptvec_at_5","_model_ng2_at_5","_model_ptvec_ng2_at_5"]
reported_models_labels_at_5 = ['nptv','ptv','npn2','ptn2']

# One table per topic; one row per (model, data set) with the top-k accuracy
# for every k, as a percentage rounded to two decimals.
for i in temas:
    print("Tema " + str(i) + "\t\t" + "\t".join([str(op) for op in top_k_options]))
    for model, model_name in zip(reported_models_at_5, reported_models_labels_at_5):
        splits = (
            ('devs', predictions_dev[model][i], dev_y[i]),
            ('test', predictions[model][i], test_y[i]),
        )
        for set_name, prediction, gold in splits:
            cells = [set_name + "-" + model_name]
            for k in top_k_options:
                cells.append(str(round(100 * top_k_accuracy(gold, prediction, k), 2)))
            print("\t".join(cells))
    print()

Tema 1		3	5	7	10
devs-nptv	84.23	89.66	91.91	94.47
test-nptv	84.17	89.44	92.3	94.53
devs-ptv	85.34	90.76	93.3	95.45
test-ptv	85.64	90.72	93.26	95.58
devs-npn2	83.0	88.14	90.96	93.67
test-npn2	83.0	88.22	90.7	93.1
devs-ptn2	85.49	90.26	92.69	94.86
test-ptn2	85.14	90.22	92.52	94.64

Tema 2		3	5	7	10
devs-nptv	87.69	91.9	93.5	94.99
test-nptv	86.52	90.62	93.16	94.65
devs-ptv	88.86	92.68	94.1	95.7
test-ptv	88.44	92.33	94.6	95.95
devs-npn2	85.98	90.28	92.31	94.12
test-npn2	84.21	89.08	91.53	93.5
devs-ptn2	88.52	92.24	93.78	95.36
test-ptn2	87.85	91.81	93.82	95.63

Tema 3		3	5	7	10
devs-nptv	91.63	95.94	97.53	99.08
test-nptv	91.28	95.81	97.78	99.38
devs-ptv	92.25	96.61	97.73	99.1
test-ptv	92.37	96.36	98.03	99.35
devs-npn2	91.58	95.81	97.43	99.2
test-npn2	91.1	95.37	97.46	99.2
devs-ptn2	92.47	96.51	97.91	99.23
test-ptn2	91.78	95.79	97.76	99.38

Tema 4		3	5	7	10
devs-nptv	86.49	91.68	93.89	96.21
test-nptv	86.62	91.15	93.89	96.16
devs-ptv	87.35	92.24	94.53	96.69
test-ptv	87.38	92.52	94.76	97.0
de

In [9]:
import numpy as np

def ranking_sizes(gold,predicted):
    """Return the 1-based rank of the true label in every ranked prediction.

    Args:
        gold: the true label of every case.
        predicted: one ranked list of predicted labels per case.

    Returns:
        A numpy array with, for each case, the position (starting at 1) of
        the true label inside its ranked list.

    Raises:
        Exception: if a true label does not appear in its ranked list.
    """
    positions = []
    for true_label, ranking in zip(gold, predicted):
        try:
            position = ranking.index(true_label) + 1
        except ValueError:
            # Keep the original error type and message for a missing label.
            raise Exception('Label ' + str(true_label) + ' is not in the ranking.') from None
        positions.append(position)
    return np.array(positions)

In [10]:
# Percentiles of the rank of the true label that are reported.
percentile_options = [80,85,90,91,92,93,94,95]

# One table per topic; one row per (model, data set).  Each cell is the
# requested percentile of the rank of the true label, truncated to an int.
for i in temas:
    head_str = "Tema " + str(i) + "\t\t"
    head_str += "\t".join([str(op)+"%" for op in percentile_options])
    print(head_str)
    for model, model_name in zip(reported_models_at_5,reported_models_labels_at_5):
        for prediction,gold,set_name in zip(
                [predictions_dev[model][i],predictions[model][i]],
                [dev_y[i],test_y[i]],
                ['devs','test']):
            # The ranks do not depend on the percentile, so compute them once
            # per row instead of once per percentile.
            ranks = ranking_sizes(gold,prediction)
            data_str = set_name + "-" + model_name
            for q in percentile_options:
                data_str += "\t" + str(int(np.percentile(ranks,q)))
            print(data_str)
    print()

Tema 1		80%	85%	90%	91%	92%	93%	94%	95%
devs-nptv	3	4	6	7	8	8	10	11
test-nptv	3	4	6	7	7	8	9	11
devs-ptv	2	3	5	6	6	7	8	10
test-ptv	2	3	5	6	6	7	8	10
devs-npn2	3	4	7	8	8	10	11	13
test-npn2	3	4	7	8	9	10	12	15
devs-ptn2	2	3	5	6	7	8	9	11
test-ptn2	2	3	5	6	7	8	9	11

Tema 2		80%	85%	90%	91%	92%	93%	94%	95%
devs-nptv	2	3	4	5	6	7	9	10
test-nptv	2	3	5	6	6	7	9	12
devs-ptv	2	3	4	4	5	6	7	9
test-ptv	2	3	4	5	5	6	7	8
devs-npn2	2	3	5	6	7	8	10	12
test-npn2	3	4	6	7	8	10	11	14
devs-ptn2	2	3	4	4	5	6	8	10
test-ptn2	2	3	4	5	6	7	8	9

Tema 3		80%	85%	90%	91%	92%	93%	94%	95%
devs-nptv	2	2	3	3	4	4	4	5
test-nptv	2	2	3	3	4	4	4	5
devs-ptv	2	2	3	3	3	4	4	4
test-ptv	2	2	3	3	3	4	4	5
devs-npn2	2	2	3	3	4	4	4	5
test-npn2	2	2	3	3	4	4	5	5
devs-ptn2	2	2	3	3	3	4	4	5
test-ptn2	2	2	3	3	4	4	4	5

Tema 4		80%	85%	90%	91%	92%	93%	94%	95%
devs-nptv	2	3	5	5	6	7	8	9
test-nptv	2	3	5	5	6	7	8	9
devs-ptv	2	3	5	5	5	6	7	8
test-ptv	2	3	4	5	5	6	7	8
devs-npn2	2	3	5	5	6	7	8	9
test-npn2	2	3	5	6	6	7	8	9
devs-ptn2	2	3	4	5	5	6	7	8
test-ptn2	2	3	4	5	