In [9]:
import fasttext

In [10]:
# utility function for creating input files for fasttext

import string

def FTInputFile(categoriesFile,xFile,yFile,labelString,outFile):
    """Write a fastText supervised input file from parallel text/label files.

    Every output line has the form
    ``labelString + <category index> + " " + <text>``, where the text is the
    matching line of ``xFile`` lower-cased and with every character found in
    ``string.punctuation`` removed.

    Parameters
    ----------
    categoriesFile : path of a file with one category per line. Only the
        number of lines is used: category ``i`` is written as ``str(i)``.
    xFile : path of a file with one input text per line.
    yFile : path of a file with one integer category index per line, aligned
        line by line with ``xFile`` (extra lines in the longer file are
        ignored because the two files are zipped).
    labelString : prefix written in front of the category index.
    outFile : path of the file to create; an existing file is overwritten.

    Raises
    ------
    ValueError
        If a line of ``yFile`` cannot be parsed as an integer.
    IndexError
        If a parsed index is not a valid index into the category list.
    """
    with open(categoriesFile) as f:
        category = [str(index) for index, _ in enumerate(f)]

    with open(xFile) as xf, open(yFile) as yf, open(outFile,'w') as out:
        for x,y in zip(xf,yf):
            # Remove only the line terminator. Slicing with [:-1] would also
            # eat the last real character of a final line that has no
            # trailing newline.
            text = x.rstrip("\n")
            strX = "".join(c for c in text if c not in string.punctuation).lower()
            out.write(labelString + category[int(y)] + " " + strX + "\n")

In [11]:
# general vars

# Directory prefix prepended to every data file name built in this notebook.
_dataDir = "../../data/"

# Label prefix written into the fastText input files and passed to training
# as `label_prefix`.
ftlabel = "__label__"
# Pretrained word vectors handed to training as `pretrained_vectors`.
wVectors = "../../word_vectors/wiki.es.vec"
# NOTE(review): not referenced by the cells visible here (the training cells
# define their own `dim = 300`) — confirm before relying on or removing it.
dimensions = 300


In [12]:
# create input files for fasttext

# Split markers: they appear in the names of the source data files and are
# also used as the prefix of the generated temporary files.
_tmpTrainFile = "_train_"
_tmpDevFile = "_dev_"
_tmpTestFile = "_test_"

# Generate one fastText input file per split and per tema, named
# "<split><tema>.tmp" in the current working directory.
for tt in ("_train_", "_dev_", "_test_"):
    for i in range(1, 5):
        tema = str(i)
        data_suffix = tt + "tema_" + tema + "_categorias_pnud_0.txt"
        FTInputFile(
            _dataDir + "categorias_tema_" + tema + "_pnud_0.txt",
            _dataDir + "x" + data_suffix,
            _dataDir + "y" + data_suffix,
            ftlabel,
            tt + tema + ".tmp",
        )
        

In [18]:
# Train N models for every "tema" and keep the best models on the dev set
# according to recall @1 and @5, using the pretrained vectors wiki.es.vec.

import itertools
import os
import shutil
import sys

# number of repetitions
N = 5

# temas
temas = [1,2,3,4]

# prefix for the generated model
model_name = "_model_ptvec_"

# parameters
dim = 300
lrs = [0.05, 0.06, 0.065]
window_sizes = [5]
epochs = [5,10]
#neg_samples = [5,6,7]
#word_ngrams = [1,2,3]

# (recall@1, recall@5) reached by the best models of previous experiments.
# A model from this run is only saved when it beats these values; a tema that
# is not listed starts from 0.
previous_best_recall = {
    1: (0.65, 0.89),
    2: (0.71, 0.91),
    3: (0.71, 0.91),
    4: (0.70, 0.91),
}

# create a model for each "tema"
for t in temas:
    tema = str(t)

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    best_recall_at_1, best_recall_at_5 = previous_best_recall.get(t, (0, 0))

    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    # file produced by each training run; copied when it is a new best and
    # removed at the end of every repetition
    model_file = model_name + tema + ".bin"
    best_file_at_1 = model_name + tema + "_best_at_1.bin"
    best_file_at_5 = model_name + tema + "_best_at_5.bin"

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search over all combinations of parameters.
    # For every combination do N repetitions.
    for (lr,ws,e) in itertools.product(lrs,window_sizes,epochs):
        for i in range(0,N):
            if i == 0:
                print("Checking the combination " + str((lr,ws,e)))
            sys.stdout.write(str(i) + " ")
            classifier = fasttext.supervised(train_file,
                                             model_name + tema,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e,
                                             pretrained_vectors = wVectors
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (lr,ws,e)
                shutil.copyfile(model_file, best_file_at_1)
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (lr,ws,e)
                shutil.copyfile(model_file, best_file_at_5)
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            os.remove(model_file)
            if i == N-1:
                print(" (done)")

    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    if best_parameters_at_1 is None:
        # nothing was copied in this run, so do not claim a model was stored
        print("*** no model beat the previous best; " + best_file_at_1 + " was not updated")
    else:
        print("*** model stored at " + best_file_at_1)
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    if best_parameters_at_5 is None:
        print("*** no model beat the previous best; " + best_file_at_5 + " was not updated")
    else:
        print("*** model stored at " + best_file_at_5)
    print("********************")

*** BEGIN tema 1: 
Checking the combination (0.05, 5, 5)
0 * updating best recall at 1 with parameters (0.05, 5, 5) best recall at 1 so far: 0.6720884840598569
* updating best recall at 5 with parameters (0.05, 5, 5) best recall at 5 so far: 0.9054435046627629
1 * updating best recall at 5 with parameters (0.05, 5, 5) best recall at 5 so far: 0.9071784862285838
2 3 4  (done)
Checking the combination (0.05, 5, 10)
0 1 2 3 4  (done)
Checking the combination (0.06, 5, 5)
0 * updating best recall at 1 with parameters (0.06, 5, 5) best recall at 1 so far: 0.6731728475384949
1 2 3 4  (done)
Checking the combination (0.06, 5, 10)
0 1 2 3 4  (done)
Checking the combination (0.065, 5, 5)
0 1 2 3 4  (done)
Checking the combination (0.065, 5, 10)
0 1 2 3 4  (done)
********************
*** FINISH tema 1: 
*** best recall at 1: 0.6731728475384949 with parameters (0.06, 5, 5)
*** model stored at _model_ptvec_1_best_at_1.bin
*** best recall at 5: 0.9071784862285838 with dimensions (0.05, 5, 5)
*** mo

TypeError: 'tuple' object does not support item assignment

In [19]:
# Train N models for every "tema" and keep the best models on the dev set
# according to recall @1 and @5, using the pretrained vectors wiki.es.vec.
# This cell covers temas 2-4 only.

import itertools
import os
import shutil
import sys

# number of repetitions
N = 5

# temas
temas = [2,3,4]

# prefix for the generated model
model_name = "_model_ptvec_"

# parameters
dim = 300
lrs = [0.05, 0.06, 0.065]
window_sizes = [5]
epochs = [5,10]
#neg_samples = [5,6,7]
#word_ngrams = [1,2,3]

# (recall@1, recall@5) reached by the best models of previous experiments.
# A model from this run is only saved when it beats these values; a tema that
# is not listed starts from 0.
previous_best_recall = {
    1: (0.65, 0.89),
    2: (0.71, 0.91),
    3: (0.71, 0.91),
    4: (0.70, 0.91),
}

# create a model for each "tema"
for t in temas:
    tema = str(t)

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    best_recall_at_1, best_recall_at_5 = previous_best_recall.get(t, (0, 0))

    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    # file produced by each training run; copied when it is a new best and
    # removed at the end of every repetition
    model_file = model_name + tema + ".bin"
    best_file_at_1 = model_name + tema + "_best_at_1.bin"
    best_file_at_5 = model_name + tema + "_best_at_5.bin"

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search over all combinations of parameters.
    # For every combination do N repetitions.
    for (lr,ws,e) in itertools.product(lrs,window_sizes,epochs):
        for i in range(0,N):
            if i == 0:
                print("Checking the combination " + str((lr,ws,e)))
            sys.stdout.write(str(i) + " ")
            classifier = fasttext.supervised(train_file,
                                             model_name + tema,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e,
                                             pretrained_vectors = wVectors
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (lr,ws,e)
                shutil.copyfile(model_file, best_file_at_1)
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (lr,ws,e)
                shutil.copyfile(model_file, best_file_at_5)
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            os.remove(model_file)
            if i == N-1:
                print(" (done)")

    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    if best_parameters_at_1 is None:
        # nothing was copied in this run, so do not claim a model was stored
        print("*** no model beat the previous best; " + best_file_at_1 + " was not updated")
    else:
        print("*** model stored at " + best_file_at_1)
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    if best_parameters_at_5 is None:
        print("*** no model beat the previous best; " + best_file_at_5 + " was not updated")
    else:
        print("*** model stored at " + best_file_at_5)
    print("********************")

*** BEGIN tema 2: 
Checking the combination (0.05, 5, 5)
0 * updating best recall at 1 with parameters (0.05, 5, 5) best recall at 1 so far: 0.7188286433310456
* updating best recall at 5 with parameters (0.05, 5, 5) best recall at 5 so far: 0.9233584991992679
1 2 3 4  (done)
Checking the combination (0.05, 5, 10)
0 * updating best recall at 1 with parameters (0.05, 5, 10) best recall at 1 so far: 0.7218027911233127
1 2 3 * updating best recall at 1 with parameters (0.05, 5, 10) best recall at 1 so far: 0.7224891329215283
4  (done)
Checking the combination (0.06, 5, 5)
0 * updating best recall at 5 with parameters (0.06, 5, 5) best recall at 5 so far: 0.9242736215968886
1 * updating best recall at 5 with parameters (0.06, 5, 5) best recall at 5 so far: 0.9247311827956989
2 3 4  (done)
Checking the combination (0.06, 5, 10)
0 1 2 3 4  (done)
Checking the combination (0.065, 5, 5)
0 1 * updating best recall at 5 with parameters (0.065, 5, 5) best recall at 5 so far: 0.9251887439945092
2 

In [21]:
# best models
#
# Tema 1:
# recall at 1:
# recall at 5: 
#
# Tema 2:
# recall at 1:
# recall at 5: 
#
# Tema 3:
# recall at 1:
# recall at 5: 
#
# Tema 4:
# recall at 1:
# recall at 5: 
#

import fasttext

temas = [1, 2, 3, 4]

# prefix shared by the stored model files
model_name = "_model_ptvec_"

# Load the best model per tema for each criterion. Both lists are indexed by
# tema number, so slot 0 stays None.
models_at_1 = [None for _ in range(5)]
models_at_5 = [None for _ in range(5)]
for i in temas:
    best_prefix = model_name + str(i) + "_best_at_"
    models_at_1[i] = fasttext.load_model(best_prefix + "1.bin")
    models_at_5[i] = fasttext.load_model(best_prefix + "5.bin")

In [43]:
# utility function for reading list values for fasttext

import string

def read_text_file_for_ft_input(filename):
    """Read a text file and return its lines normalised for fastText.

    Each line is stripped of its line terminator, every character found in
    ``string.punctuation`` is removed, and the result is lower-cased. This is
    the same normalisation ``FTInputFile`` applies to the training texts.

    Parameters
    ----------
    filename : path of a file with one text per line.

    Returns
    -------
    list of str
        One normalised string per input line. A line that is empty after
        normalisation is returned as a single space instead of ``''``.
    """
    out = []
    with open(filename) as f:
        for line in f:
            # Remove only the line terminator. Slicing with [:-1] would also
            # eat the last real character of a final line that has no
            # trailing newline.
            text = line.rstrip("\n")
            strdata = "".join(c for c in text if c not in string.punctuation).lower()
            # A string in the data turned out to be empty after removing
            # punctuation; replace it with a single space so that no empty
            # string is returned.
            if strdata == '':
                strdata = ' '
            out.append(strdata)
    return out

def read_numbers_file_for_ft_input(filename):
    """Parse a file holding one integer per line and return the integers as a list.

    Raises ValueError when a line cannot be parsed by ``int``.
    """
    with open(filename) as numbers_file:
        return [int(raw_line) for raw_line in numbers_file]

In [44]:
# load data to predict

# Texts (x) and gold category indices (y) per split, indexed by tema number;
# slot 0 is never filled.
train_x, train_y = [None]*5, [None]*5
test_x, test_y = [None]*5, [None]*5
dev_x, dev_y = [None]*5, [None]*5

for i in temas:
    # every file of a tema shares this name ending
    tema_suffix = "_tema_" + str(i) + "_categorias_pnud_0.txt"

    train_x[i] = read_text_file_for_ft_input(_dataDir + "x_train" + tema_suffix)
    train_y[i] = read_numbers_file_for_ft_input(_dataDir + "y_train" + tema_suffix)

    dev_x[i] = read_text_file_for_ft_input(_dataDir + "x_dev" + tema_suffix)
    dev_y[i] = read_numbers_file_for_ft_input(_dataDir + "y_dev" + tema_suffix)

    test_x[i] = read_text_file_for_ft_input(_dataDir + "x_test" + tema_suffix)
    test_y[i] = read_numbers_file_for_ft_input(_dataDir + "y_test" + tema_suffix)



In [45]:
#sizes = [None,5,5,5,5]

# Prediction tables, one slot per tema (index 0 unused).
predict_at_1_train, predict_at_5_train = [None]*5, [None]*5
predict_at_1_dev, predict_at_5_dev = [None]*5, [None]*5
predict_at_1_test, predict_at_5_test = [None]*5, [None]*5

# Rankings of length 1, produced by the models selected on recall@1.
sizes = [None,1,1,1,1]
for i in temas:
    predict_at_1_train[i], predict_at_1_dev[i], predict_at_1_test[i] = [
        models_at_1[i].predict(texts, k=sizes[i])
        for texts in (train_x[i], dev_x[i], test_x[i])
    ]

# Rankings of length 5, produced by the models selected on recall@5.
sizes = [None,5,5,5,5]
for i in temas:
    predict_at_5_train[i], predict_at_5_dev[i], predict_at_5_test[i] = [
        models_at_5[i].predict(texts, k=sizes[i])
        for texts in (train_x[i], dev_x[i], test_x[i])
    ]


In [46]:
# transform labels to int values

def label_to_int(L, indices=None, prefix="__label__"):
    """Convert predicted label strings to integer category indices, in place.

    ``L`` is a list indexed by tema; ``L[i]`` is a sequence of rankings and
    each ranking is a sequence of labels such as ``"__label__12"``. After the
    call ``L[i]`` is a list of lists of ints for every ``i`` in ``indices``.

    Parameters
    ----------
    L : list whose selected slots hold the rankings to convert.
    indices : slots of ``L`` to convert. Defaults to the module-level
        ``temas`` list, which is what the original one-argument call used.
    prefix : label prefix to strip before parsing (9 characters by default,
        matching the original ``[9:]`` slice).

    Notes
    -----
    Values that are already ints are left unchanged, so running the
    conversion twice on the same table is harmless. Rankings are rebuilt as
    new lists rather than assigned item by item, so rankings that are tuples
    are accepted as well.
    """
    if indices is None:
        indices = temas

    def to_int(label):
        # already converted by an earlier call
        if isinstance(label, int):
            return label
        if label.startswith(prefix):
            label = label[len(prefix):]
        return int(label)

    for i in indices:
        L[i] = [[to_int(label) for label in ranking] for ranking in L[i]]

                                
# Convert every prediction table from label strings to integer indices.
for prediction_table in (
    predict_at_1_train,
    predict_at_5_train,
    predict_at_1_dev,
    predict_at_5_dev,
    predict_at_1_test,
    predict_at_5_test,
):
    label_to_int(prediction_table)


In [47]:
import random
# Seed the generator by CALLING random.seed. Assigning to `random.seed`
# replaces the function with an integer and leaves the generator unseeded.
random.seed(1284213)

def random_guess(list,k):
    """Build a random baseline ranking for every element of ``list``.

    The candidate pool is the distinct values of ``list`` in order of first
    appearance. For each element of ``list`` a ranking of ``k`` values is
    drawn from that pool with ``random.choice`` (repeats are possible).

    Returns a list with one ranking (a list of length ``k``) per element.
    """
    distinct_values = []
    for value in list:
        if value not in distinct_values:
            distinct_values.append(value)
    return [
        [random.choice(distinct_values) for _ in range(k)]
        for _ in list
    ]

def print_metrics(at_N, tema):
    """Print recall@``at_N`` on the train, dev and test splits of one tema.

    Reads the module-level prediction tables (``predict_at_1_*`` or
    ``predict_at_5_*``) and gold tables (``train_y``, ``dev_y``, ``test_y``),
    all indexed by tema. An example counts as a hit when its gold category
    appears in the predicted ranking.

    Parameters
    ----------
    at_N : 1 or 5; selects which prediction tables are evaluated.
    tema : index of the tema to report.

    Raises
    ------
    ValueError
        If ``at_N`` is neither 1 nor 5.
    """
    if at_N == 1:
        predicted_by_split = {
            'train': predict_at_1_train,
            'dev': predict_at_1_dev,
            'test': predict_at_1_test,
        }
    elif at_N == 5:
        predicted_by_split = {
            'train': predict_at_5_train,
            'dev': predict_at_5_dev,
            'test': predict_at_5_test,
        }
    else:
        raise ValueError("at_N must be 1 or 5, got " + repr(at_N))

    gold_by_split = {'train': train_y, 'dev': dev_y, 'test': test_y}

    print("Tema " + str(tema))
    print("---------")
    for t in ['train','dev', 'test']:
        # Index with the `tema` argument. Relying on a global loop variable
        # only works when the caller happens to loop with that exact name.
        gold = gold_by_split[t][tema]
        predicted = predicted_by_split[t][tema]

        # compute global recall at_N: count the examples whose gold category
        # is missing from the predicted ranking
        count = sum(1 for g,a in zip(gold,predicted) if g not in a)
        N=len(gold)
        if N == 0:
            # no examples in this split: recall is undefined
            print(t + ": \tn/a")
            continue

        print(t + ": \t" + str(round((1-count/N)*100,1)) + "%")
    print()
   

In [48]:
# print metrics for all temas
# first at 1

# Report recall@1 first, then recall@5, for every tema.
for at_N, title in (
    (1, "Metrics for best models Recall@1"),
    (5, "Metrics for best models Recall@5"),
):
    print("********************************")
    print(title)
    print("********************************")
    print()
    for i in temas:
        print_metrics(at_N, i)
    

********************************
Metrics for best models Recall@1
********************************

Tema 1
---------
train: 	76.7%
dev: 	67.3%
test: 	67.1%

Tema 2
---------
train: 	86.5%
dev: 	72.2%
test: 	70.7%

Tema 3
---------
train: 	84.4%
dev: 	76.4%
test: 	75.7%

Tema 4
---------
train: 	81.7%
dev: 	70.9%
test: 	69.4%

********************************
Metrics for best models Recall@5
********************************

Tema 1
---------
train: 	94.7%
dev: 	90.7%
test: 	90.5%

Tema 2
---------
train: 	97.0%
dev: 	92.6%
test: 	92.4%

Tema 3
---------
train: 	98.7%
dev: 	96.4%
test: 	96.1%

Tema 4
---------
train: 	97.5%
dev: 	92.2%
test: 	92.2%



In [49]:
from sklearn import metrics

# construct inputs for metrics.classification_report
def first_prediction(L):
    """Return a list with the first element of every sequence in ``L``."""
    return [ranking[0] for ranking in L]

for i in [1,2,3,4]:
    # load categories first
    categoriesFile = _dataDir + "categorias_tema_" + str(i) + "_pnud_0.txt"
    with open(categoriesFile) as f:
        # strip only the newline, so a final line without one keeps its
        # last character
        categories = [line.rstrip("\n") for line in f]

    # NOTE(review): the top-ranked label is taken from the recall@5 models'
    # predictions (predict_at_5_test), not from predict_at_1_test — confirm
    # this is intended.
    predicted = first_prediction(predict_at_5_test[i])
    print("Tema " + str(i))
    # Pass the label indices explicitly so that categories[k] always names
    # class k, even when some class is absent from both gold and predictions.
    print(metrics.classification_report(test_y[i],predicted,
                                        labels=list(range(len(categories))),
                                        target_names=categories))

Tema 1
                                                          precision    recall  f1-score   support

                                          Amistad cívica       0.25      0.04      0.06        27
                                    Autonomía / Libertad       0.68      0.65      0.67       168
                                  Bien Común / Comunidad       0.66      0.72      0.69       276
                                              Ciudadanía       0.00      0.00      0.00        16
                                              Democracia       0.60      0.74      0.66       380
                                              Desarrollo       0.51      0.38      0.44        52
                                       Descentralización       0.91      0.92      0.91       307
                                                Dignidad       0.62      0.67      0.65       193
                                              Diversidad       0.40      0.26      0.32        72
            

  'precision', 'predicted', average, warn_for)


In [50]:
### what if we concatenate the name of the class???

sizes = [None,1,1,1,1]

for i in temas:
    categoriesFile = _dataDir + "categorias_tema_" + str(i) + "_pnud_0.txt"
    with open(categoriesFile) as f:
        # strip only the newline, so a final line without one keeps its
        # last character
        categories = [line.rstrip("\n") for line in f]

    # Build the test inputs with the name of the GOLD category prepended,
    # normalised the same way as the texts (punctuation removed, lower-cased).
    # The model therefore sees the correct class name as part of its input.
    test_x_plus_category = []
    for text,cat in zip(test_x[i],test_y[i]):
        str_category = "".join(c for c in categories[cat] if c not in string.punctuation).lower()
        test_x_plus_category.append(str_category + " " + text)

    predicted = first_prediction(models_at_1[i].predict(test_x_plus_category,k= sizes[i]))
    # drop the 9-character "__label__" prefix and parse the category index
    predicted_ints = [int(l[9:]) for l in predicted]

    print("Tema " + str(i))
    # Pass the label indices explicitly so that categories[k] always names
    # class k, even when some class is absent from both gold and predictions.
    print(metrics.classification_report(test_y[i],predicted_ints,
                                        labels=list(range(len(categories))),
                                        target_names=categories))

Tema 1
                                                          precision    recall  f1-score   support

                                          Amistad cívica       1.00      0.63      0.77        27
                                    Autonomía / Libertad       0.91      0.98      0.94       168
                                  Bien Común / Comunidad       0.87      0.98      0.92       276
                                              Ciudadanía       0.00      0.00      0.00        16
                                              Democracia       0.75      0.90      0.82       380
                                              Desarrollo       0.90      0.67      0.77        52
                                       Descentralización       0.98      0.99      0.98       307
                                                Dignidad       0.90      0.89      0.89       193
                                              Diversidad       0.60      0.46      0.52        72
            

  'precision', 'predicted', average, warn_for)


Tema 3
                                                                         precision    recall  f1-score   support

                                     Cumplimiento de las leyes y normas       0.94      0.97      0.95       369
                                  Cumplimiento de obligaciones fiscales       0.95      0.97      0.96       161
                Cumplimiento de tratados y obligaciones internacionales       0.96      0.99      0.98       161
        De protección y conservación de patrimonio histórico y cultural       0.99      1.00      1.00       443
                                          De satisfacer cargas públicas       0.95      0.93      0.94        41
                 Deberes de protección de conservación de la naturaleza       0.99      0.99      0.99       551
                        Ejercicio legítimo y no abusivo de los derechos       0.93      0.95      0.94       346
Protección, promoción y respeto de los derechos humanos y fundamentales       0.96      

In [51]:
# Classify the (normalised) category names themselves: input number k is the
# name of category k, so the gold label of input k is k.
for i in [1,2,3,4]:
    categoriesFile = _dataDir + "categorias_tema_" + str(i) + "_pnud_0.txt"
    with open(categoriesFile) as f:
        # strip only the newline (a final line without one keeps its last
        # character), then remove punctuation and lower-case like the texts
        categories = [
            "".join(c for c in line.rstrip("\n") if c not in string.punctuation).lower()
            for line in f
        ]
    gold = list(range(len(categories)))

    # a single predict call; the original ran it twice and discarded the
    # first result
    predicted = first_prediction(models_at_1[i].predict(categories,k= 1))
    # drop the 9-character "__label__" prefix and parse the category index
    predicted_ints = [int(l[9:]) for l in predicted]

    print("Tema " + str(i))
    print(metrics.classification_report(gold,predicted_ints,target_names=categories))
    

Tema 1
                                                         precision    recall  f1-score   support

                                         amistad cívica       1.00      1.00      1.00         1
                                    autonomía  libertad       1.00      1.00      1.00         1
                                  bien común  comunidad       1.00      1.00      1.00         1
                                             ciudadanía       1.00      1.00      1.00         1
                                             democracia       0.50      1.00      0.67         1
                                             desarrollo       1.00      1.00      1.00         1
                                      descentralización       1.00      1.00      1.00         1
                                               dignidad       1.00      1.00      1.00         1
                                             diversidad       0.00      0.00      0.00         1
                      

  'precision', 'predicted', average, warn_for)
