In [1]:
import fasttext

In [6]:
# general vars

# Relative directory prefix prepended to every data file name read in this
# notebook (modos.txt, x_<split>_modo.txt, y_<split>_modo.txt).
_dataDir = "../../data/tarea3/"
# Label marker: written in front of each class index in the generated fasttext
# input files and passed to fasttext.supervised as label_prefix.
ftlabel = "__label__"

In [7]:
# utility function for creating input files for fasttext

import string

def FTInputFile(modosFile,xFile,yFile,labelString,outFile):
    """Write a fasttext-style labelled text file from parallel text/label files.

    Each output line has the form ``<labelString><class index> <clean text>``,
    where the clean text is the input line with every character found in
    ``string.punctuation`` removed and the remainder lower-cased.

    Parameters:
        modosFile:   path of the categories file; only its number of lines is
                     used, to build the list of valid class indices.
        xFile:       path of the text file, one example per line.
        yFile:       path of the label file, one integer class index per line,
                     aligned line by line with ``xFile``.
        labelString: prefix written in front of the class index.
        outFile:     path of the file to create (overwritten if it exists).

    Raises:
        ValueError: if a line of ``yFile`` is not an integer.
        IndexError: if a class index falls outside the range given by the
                    number of lines in ``modosFile``.

    If ``xFile`` and ``yFile`` differ in length, the extra lines of the longer
    one are ignored (``zip`` semantics).
    """
    # One entry per category line: the label written out is the stringified
    # 0-based position of the category, not its name.
    with open(modosFile) as f:
        modos = [str(i) for i, _ in enumerate(f)]

    with open(xFile) as xf, open(yFile) as yf, open(outFile,'w') as out:
        for x,y in zip(xf,yf):
            # Remove only the line terminator. Blindly slicing off the last
            # character would eat a real character when the final line of the
            # file has no trailing newline.
            text = x.rstrip("\n")
            strX = "".join(c for c in text if c not in string.punctuation).lower()
            out.write(labelString + modos[int(y)] + " " + strX + "\n")

# Generate one fasttext input file per data split.
# Each split tag is used both inside the source file names
# (x<tag>modo.txt / y<tag>modo.txt) and as the stem of the output (<tag>.tmp).

_tmpTrainFile = "_train_"
_tmpDevFile = "_dev_"
_tmpTestFile = "_test_"

for tt in (_tmpTrainFile, _tmpDevFile, _tmpTestFile):
    FTInputFile(
        modosFile=_dataDir + "modos.txt",
        xFile=_dataDir + "x" + tt + "modo.txt",
        yFile=_dataDir + "y" + tt + "modo.txt",
        labelString=ftlabel,
        outFile=tt + ".tmp",
    )

In [26]:
# grid search for parameters metric = recall
#
# Trains one model per (dim, lr, ws, epoch) combination, N times each, scores
# it on the dev file and keeps a copy of the best model seen so far as
# <model_name>best.bin.

import sys
import itertools
import os
import shutil


# prefix for the generated model
model_name = "_model_"

# Stays None unless some run of THIS search beats the starting recall. Without
# this initialisation the final summary raises NameError on a fresh kernel (or
# silently reports parameters left over from an earlier run) whenever no
# combination improves on the starting value.
best_parameters = None

# Starting value to beat: the dev recall of a previously saved best model if
# one exists on disk, otherwise 0.
if os.path.isfile(model_name + "best.bin"):
    print("Previous existent model")
    best_prev_classifier = fasttext.load_model(model_name + "best.bin")
    best = best_prev_classifier.test(_tmpDevFile + ".tmp", k=1).recall
    print("Best previous recall " + str(round(best,4)))
else:
    print("No previous model")
    best = 0.0

# number of repetitions
N = 10
# parameters
dimensions = [40,50]
# dimensions = [30,40]
lrs = [0.081,0.085,0.09]
#lrs = [0.05]
window_sizes = [3,4,5]
# window_sizes = [5]
epochs = [7,10]
#epochs = [5]


# Perform a grid search for all combinations of parameters.
# For every combination do N repetitions

for (dim,lr,ws,e) in itertools.product(dimensions,lrs,window_sizes,epochs):
    for i in range(0,N):
        if i == 0:
            print("Checking the combination " + str((dim,lr,ws,e)))
        # progress counter, written without a newline so the N repetitions
        # of one combination share a line
        sys.stdout.write(str(i)+" ")
        # The code below relies on this call leaving <model_name>.bin on disk.
        classifier = fasttext.supervised(_tmpTrainFile + ".tmp",
                                         model_name,
                                         label_prefix = ftlabel,
                                         dim = dim,
                                         lr = lr,
                                         ws = ws,
                                         epoch = e
                                        )

        result = classifier.test(_tmpDevFile + ".tmp", k=1)

        # strict comparison: a tie with the current best does not replace it
        if result.recall > best:
            best = result.recall
            best_classifier = classifier
            best_parameters = (dim,lr,ws,e)
            shutil.copyfile(model_name + ".bin", model_name + "best.bin")
            print("* updating best recall with parameters " + str(best_parameters) + 
                  " best recall so far: " + str(best))

        # the per-run model file is temporary; only the "best" copy is kept
        os.remove(model_name + ".bin")
        if i == N-1:
            print(" (done)")

print("********************")
print("*** FINISH ")
if best_parameters is None:
    print("*** best recall: " + str(best) + " (no combination improved on the starting value)")
else:
    print("*** best recall: " + str(best) + " with parameters " + str(best_parameters))
# only claim a stored model when the file is really there
if os.path.isfile(model_name + "best.bin"):
    print("*** model stored at " + model_name + "best.bin")
print("********************")


Previous existent model
Best previous recall 0.8062
Checking the combination (40, 0.081, 3, 7)
0 1 2 3 4 5 * updating best recall with parameters (40, 0.081, 3, 7) best recall so far: 0.8062229437229437
6 7 8 9  (done)
Checking the combination (40, 0.081, 3, 10)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (40, 0.081, 4, 7)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (40, 0.081, 4, 10)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (40, 0.081, 5, 7)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (40, 0.081, 5, 10)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (40, 0.085, 3, 7)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (40, 0.085, 3, 10)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (40, 0.085, 4, 7)
0 * updating best recall with parameters (40, 0.085, 4, 7) best recall so far: 0.8063852813852814
1 2 3 4 * updating best recall with parameters (40, 0.085, 4, 7) best recall so far: 0.8064935064935065
5 6 7 8 9  (done)
Checking the comb

In [31]:
def first_prediction(lists_of_predictions):
    """Return a flat list holding the first element of every inner sequence.

    Used to reduce a list of per-example prediction lists to a 1D list with a
    single prediction per example.

    Raises:
        IndexError: if any inner sequence is empty.
    """
    return [ranked[0] for ranked in lists_of_predictions]

In [44]:
# grid search for parameters metric = f1-macro
# change best if there are already better precomputed models for f1-macro
# (baseline to beat: in the search below a trained model is kept only when its
# dev-set f1-macro is strictly greater than this value)
best = 0.628

import sys
import itertools
import os
import shutil
from sklearn import metrics

# prefix for the generated model
model_name = "_model_f1_"


############# first load the data used to test the model according to f1

import string

def read_text_file_for_ft_input(filename):
    """Read a text file and return one cleaned string per line.

    Cleaning removes every character found in ``string.punctuation`` and
    lower-cases the rest. Tab characters are always kept. A line that ends up
    empty is replaced by a single space, so the result never contains an
    empty string.

    Parameters:
        filename: path of the text file, one example per line.

    Returns:
        list of str, in file order, one entry per input line.
    """
    with open(filename) as f:
        out = []
        for line in f:
            # Remove only the line terminator. Slicing off the last character
            # unconditionally would truncate the final line of a file that
            # does not end with a newline.
            text = line.rstrip("\n")
            # keep tab to separate original concepts from justifications
            strdata = "".join(c for c in text if c not in string.punctuation or c == '\t').lower()
            if strdata == '':
                strdata = ' '
            out.append(strdata)
    return out

def read_numbers_file_for_ft_input(filename):
    """Read a file with one integer per line and return them as a list.

    Parameters:
        filename: path of the file to read.

    Returns:
        list of int, in file order.

    Raises:
        ValueError: if a line cannot be parsed by ``int`` (this includes
                    blank lines).
    """
    with open(filename) as handle:
        return [int(row) for row in handle]

# Dev split used to score every trained model:
#   dev_x - list of cleaned text lines
#   dev_y - list of integer class indices, aligned with dev_x
dev_x = read_text_file_for_ft_input(
        _dataDir + "x_dev_modo.txt")
dev_y = read_numbers_file_for_ft_input(
        _dataDir + "y_dev_modo.txt")

# load categories first
modosFile = _dataDir + "modos.txt"
with open(modosFile) as f:
    # Strip only the line terminator so the last category name is not
    # truncated when the file has no trailing newline.
    modos = [line.rstrip("\n") for line in f]

#############################

# number of repetitions
N = 10
# parameters
dimensions = [30,40,50,55]
# dimensions = [30,40]
lrs = [0.085,0.086,0.0863]
#lrs = [0.05]
window_sizes = [6,7]
# window_sizes = [5]
epochs = [10,12]
#epochs = [5]

# Stays None unless some run of THIS search beats the starting f1-macro.
# Without this initialisation the final summary raises NameError on a fresh
# kernel (or silently reports parameters left over from an earlier search)
# whenever no combination improves on the starting value of `best`.
best_parameters = None


# Perform a grid search for all combinations of parameters.
# For every combination do N repetitions

for (dim,lr,ws,e) in itertools.product(dimensions,lrs,window_sizes,epochs):
    for i in range(0,N):
        if i == 0:
            print("Checking the combination " + str((dim,lr,ws,e)))
        # progress counter, written without a newline so the N repetitions
        # of one combination share a line
        sys.stdout.write(str(i)+" ")
        # The code below relies on this call leaving <model_name>.bin on disk.
        classifier = fasttext.supervised(_tmpTrainFile + ".tmp",
                                         model_name,
                                         label_prefix = ftlabel,
                                         dim = dim,
                                         lr = lr,
                                         ws = ws,
                                         epoch = e
                                        )

        ## check how well the model does with respect to f1
        predictions_dev_with_labels = classifier.predict(dev_x,k = 3)
        # predicted labels are converted to int so they can be compared with
        # the integer class indices in dev_y
        predictions_dev = []
        for pred_list in predictions_dev_with_labels:
            predictions_dev.append([int(p) for p in pred_list])
        # only the first prediction of each example enters the score
        f1 = metrics.f1_score(dev_y,first_prediction(predictions_dev),average='macro')

        # strict comparison: a tie with the current best does not replace it
        if f1 > best:
            best = f1
            best_classifier = classifier
            best_parameters = (dim,lr,ws,e)
            shutil.copyfile(model_name + ".bin", model_name + "best.bin")
            print("* updating best f1-macro with parameters " + str(best_parameters) + 
                  " best f1-macro so far: " + str(best))

        # the per-run model file is temporary; only the "best" copy is kept
        os.remove(model_name + ".bin")
        if i == N-1:
            print(" (done)")

print("********************")
print("*** FINISH ")
if best_parameters is None:
    print("*** best f1-macro: " + str(best) + " (no combination improved on the starting value)")
else:
    print("*** best f1-macro: " + str(best) + " with parameters " + str(best_parameters))
# only claim a stored model when the file is really there
if os.path.isfile(model_name + "best.bin"):
    print("*** model stored at " + model_name + "best.bin")
print("********************")


Checking the combination (30, 0.085, 6, 10)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.085, 6, 12)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.085, 7, 10)
0 1 * updating best f1-macro with parameters (30, 0.085, 7, 10) best f1-macro so far: 0.628383952168
2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.085, 7, 12)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.086, 6, 10)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.086, 6, 12)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.086, 7, 10)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.086, 7, 12)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.0863, 6, 10)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.0863, 6, 12)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.0863, 7, 10)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.0863, 7, 12)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (40, 0.085