In [2]:
import fasttext

In [4]:
# utility function for creating input files for fasttext

import string

def FTInputFile(categoriesFile,xFile,yFile,labelString,outFile):
    """Write a fastText-style labelled text file from parallel x/y files.

    Every output line has the form ``<labelString><category> <text>``, where
    ``<text>`` is the matching line of ``xFile`` with all characters of
    ``string.punctuation`` removed and the rest lower-cased.

    Parameters
    ----------
    categoriesFile : path of a text file with one category per line. Only the
        number of lines is used: the k-th category is written as ``str(k)``.
    xFile : path of the text file holding one example per line.
    yFile : path of the text file holding one integer category index per
        line, aligned line by line with ``xFile``.
    labelString : prefix written in front of every category.
    outFile : path of the file to write (opened with mode ``'w'``).

    Raises
    ------
    ValueError : if a line of ``yFile`` cannot be parsed by ``int``.
    IndexError : if an index read from ``yFile`` is out of range for the list
        of categories.

    Notes
    -----
    ``xFile`` and ``yFile`` are paired with ``zip``, so surplus lines of the
    longer file are ignored.
    """
    # Only the line count matters: category k is emitted as the string "k".
    with open(categoriesFile) as f:
        category = [str(k) for k, _ in enumerate(f)]

    with open(xFile) as xf, open(yFile) as yf, open(outFile,'w') as out:
        for x,y in zip(xf,yf):
            # Strip only the line terminator. An unconditional x[:-1] would
            # eat the last real character of a final line that does not end
            # with a newline.
            text = x[:-1] if x.endswith("\n") else x
            strX = "".join(c for c in text if c not in string.punctuation).lower()
            out.write(labelString + category[int(y)] + " " + strX + "\n")

In [None]:
# create input files for fasttext
#
# For every split prefix (train/dev/test) and every "tema" (1-4) this calls
# FTInputFile and writes "<prefix><tema>.tmp" (e.g. "_train_1.tmp") relative
# to the current working directory.
#
# NOTE: this cell reads `_dataDir` and `ftlabel`, which are assigned in the
# "general vars" cell; that cell has to be executed before this one.

_tmpTrainFile = "_train_"
_tmpDevFile = "_dev_"
_tmpTestFile = "_test_"

# iterate over the prefixes defined above instead of repeating the literals
for tt in [_tmpTrainFile, _tmpDevFile, _tmpTestFile]:
    for i in [1,2,3,4]:
        tema = str(i)
        # the x and y files of one split/tema differ only in their first letter
        suffix = tt + "tema_" + tema + "_categorias_pnud_0.txt"
        FTInputFile(_dataDir + "categorias_tema_" + tema + "_pnud_0.txt",
                    _dataDir + "x" + suffix,
                    _dataDir + "y" + suffix,
                    ftlabel,
                    tt + tema + ".tmp")

In [3]:
# general vars
# NOTE: the "create input files" cell earlier in the notebook also reads these
# two names, so this cell has to be executed before that one.

# prefix prepended to the names of the data files passed to FTInputFile
_dataDir = "../../data/"
# label prefix: handed to FTInputFile when the input files are written and to
# fasttext.supervised (as `label_prefix`) when the models are trained
ftlabel = "__label__"

In [2]:
# train N models for every "tema" and keep the best models on the dev set according to recall @1 and @5
# no pretrained vectors used
#
# Uses `fasttext` and `ftlabel` from earlier cells and the "_train_<tema>.tmp" /
# "_dev_<tema>.tmp" files written by the "create input files" cell.
# For every tema the best models are copied to
# "<model_name><tema>_best_at_1.bin" and "<model_name><tema>_best_at_5.bin".

import itertools
import os
import shutil

# number of repetitions
N = 10

# temas
temas = [1,2,3,4]

# prefix for the generated model
model_name = "_model_"

# parameters
dimensions = [20,30,40,50,60,70,80]
lrs = [0.06, 0.056, 0.053, 0.05]
window_sizes = [4,5,6,7]
epochs = [5,7,10,15]
#neg_samples = [5,6,7]
#word_ngrams = [1,2,3]

# create a model for each "tema"
for t in temas:
    tema = str(t)

    # file names that do not change during the search for this tema
    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    # the code below expects every training run to leave its model here
    model_file = model_name + tema + ".bin"

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    best_recall_at_1 = 0
    best_recall_at_5 = 0

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search over all combinations of parameters.
    # For every combination do N repetitions

    dim_ant = 0
    for (dim,lr,ws,e) in itertools.product(dimensions,lrs,window_sizes,epochs):
        # `dimensions` is the first factor of the product, so `dim` only
        # changes after all other combinations were tried: report it once
        if dim != dim_ant:
            print("Currently checking dimension " + str(dim))
            dim_ant = dim
        for i in range(0,N):
            classifier = fasttext.supervised(train_file,
                                             model_name + tema,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            # keep a copy of the model file whenever a run sets a new record
            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (dim,lr,ws,e)
                shutil.copyfile(model_file, model_name + tema + "_best_at_1.bin")
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (dim,lr,ws,e)
                shutil.copyfile(model_file, model_name + tema + "_best_at_5.bin")
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            # the per-run model file is no longer needed once it was (maybe) copied
            os.remove(model_file)

    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    print("*** model stored at " + model_name + tema + "_best_at_1.bin")
    # the tuple holds (dim, lr, ws, epoch), so it is labelled "parameters" like the @1 line
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    print("*** model stored at " + model_name + tema + "_best_at_5.bin")
    print("********************")

*** BEGIN tema 1: 
Currently checking dimension 20
* updating best recall at 1 with parameters (20, 0.06, 4, 5) best recall at 1 so far: 0.5775319887226198
* updating best recall at 5 with parameters (20, 0.06, 4, 5) best recall at 5 so far: 0.8306224246367382
* updating best recall at 5 with parameters (20, 0.06, 4, 5) best recall at 5 so far: 0.8336586423769248
* updating best recall at 5 with parameters (20, 0.06, 4, 5) best recall at 5 so far: 0.8345261331598351
* updating best recall at 1 with parameters (20, 0.06, 4, 5) best recall at 1 so far: 0.5781826068098026
* updating best recall at 5 with parameters (20, 0.06, 4, 5) best recall at 5 so far: 0.8360442420299284
* updating best recall at 1 with parameters (20, 0.06, 4, 7) best recall at 1 so far: 0.6052916937757536
* updating best recall at 5 with parameters (20, 0.06, 4, 7) best recall at 5 so far: 0.8581652569941445
* updating best recall at 1 with parameters (20, 0.06, 4, 7) best recall at 1 so far: 0.6102797657774887
* up

In [10]:
# best models always achieved with learning rate ~ 0.06, epochs >= 10, 
# dimensions >= 30 (until 50), and window size 5 or 6
# parameters for a new grid search for every tema

# try new parameters for tema = 1,2,4
#
# Uses `fasttext`, `ftlabel`, `itertools`, `os` and `shutil` from earlier cells.

import sys

# number of repetitions
N = 10

# temas
temas = [1,2,4]

# prefix for the generated model
model_name = "_model_"

# parameters
dimensions = [30,40,50]
lrs = [0.07, 0.08, 0.085, 0.09]
window_sizes = [5,6]
epochs = [15,20,25]

# recall (@1, @5) a run has to exceed before its model replaces the stored one
# (presumably the best values of the previous search - TODO confirm)
baseline_recalls = {
    1: (0.65, 0.89),
    2: (0.71, 0.91),
    4: (0.70, 0.91),
}

# create a model for each "tema"
for t in temas:
    tema = str(t)

    # file names that do not change during the search for this tema
    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    # the code below expects every training run to leave its model here
    model_file = model_name + tema + ".bin"

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    # A tema without an entry starts from 0 instead of silently reusing the
    # thresholds left over from the previous tema (or previous cell).
    best_recall_at_1, best_recall_at_5 = baseline_recalls.get(t, (0, 0))

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search over all combinations of parameters.
    # For every combination do N repetitions

    for (dim,lr,ws,e) in itertools.product(dimensions,lrs,window_sizes,epochs):
        for i in range(0,N):
            if i == 0:
                print("Checking the combination " + str((dim,lr,ws,e)))
            # progress indicator: repetition number, all on one line
            sys.stdout.write(str(i)+" ")
            classifier = fasttext.supervised(train_file,
                                             model_name + tema,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            # keep a copy of the model file whenever a run sets a new record
            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (dim,lr,ws,e)
                shutil.copyfile(model_file, model_name + tema + "_best_at_1.bin")
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (dim,lr,ws,e)
                shutil.copyfile(model_file, model_name + tema + "_best_at_5.bin")
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            # the per-run model file is no longer needed once it was (maybe) copied
            os.remove(model_file)
            if i == N-1:
                print(" (done)")

    # NOTE: when no run beat the baseline, the parameters below print as None
    # and this cell did not write the corresponding "_best_at_" file.
    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    print("*** model stored at " + model_name + tema + "_best_at_1.bin")
    # the tuple holds (dim, lr, ws, epoch), so it is labelled "parameters" like the @1 line
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    print("*** model stored at " + model_name + tema + "_best_at_5.bin")
    print("********************")


*** BEGIN tema 1: 
Checking the combination (30, 0.07, 5, 15)
0 * updating best recall at 5 with parameters (30, 0.07, 5, 15) best recall at 5 so far: 0.8909130340490132
1 * updating best recall at 5 with parameters (30, 0.07, 5, 15) best recall at 5 so far: 0.891563652136196
2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.07, 5, 20)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.07, 5, 25)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.07, 6, 15)
0 1 2 3 4 * updating best recall at 5 with parameters (30, 0.07, 6, 15) best recall at 5 so far: 0.8917805248319237
5 6 7 8 9  (done)
Checking the combination (30, 0.07, 6, 20)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.07, 6, 25)
0 1 2 3 4 5 6 7 8 9  (done)
Checking the combination (30, 0.08, 5, 15)
0 1 2 3 * updating best recall at 1 with parameters (30, 0.08, 5, 15) best recall at 1 so far: 0.6525699414443722
* updating best recall at 5 with parameters (30, 0.08, 5, 15) best recall at 5 so

In [11]:
# only tema 1 is still under 0.7 @ 1 and 0.9 @ 5
# parameters for a new grid search only for tema 1

# try new parameters for tema = 1
#
# Uses `fasttext`, `ftlabel`, `itertools`, `os`, `shutil` and `sys` from earlier cells.

# number of repetitions
N = 30

# temas
temas = [1]

# prefix for the generated model
model_name = "_model_"

# parameters
dimensions = [30,40,50]
lrs = [0.07, 0.073, 0.076, 0.08]
window_sizes = [4,5,6]
epochs = [13, 15, 17]

# recall (@1, @5) a run has to exceed before its model replaces the stored one
# (presumably the best values of the previous search - TODO confirm)
baseline_recalls = {
    1: (0.653, 0.893),
}


# create a model for each "tema"
for t in temas:
    tema = str(t)

    # file names that do not change during the search for this tema
    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    # the code below expects every training run to leave its model here
    model_file = model_name + tema + ".bin"

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    # A tema without an entry starts from 0 instead of silently reusing the
    # thresholds left over from a previous tema or cell.
    best_recall_at_1, best_recall_at_5 = baseline_recalls.get(t, (0, 0))

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search over all combinations of parameters.
    # For every combination do N repetitions

    for (dim,lr,ws,e) in itertools.product(dimensions,lrs,window_sizes,epochs):
        for i in range(0,N):
            if i == 0:
                print("Checking the combination " + str((dim,lr,ws,e)))
            # progress indicator: repetition number, all on one line
            sys.stdout.write(str(i)+" ")
            classifier = fasttext.supervised(train_file,
                                             model_name + tema,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            # keep a copy of the model file whenever a run sets a new record
            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (dim,lr,ws,e)
                shutil.copyfile(model_file, model_name + tema + "_best_at_1.bin")
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (dim,lr,ws,e)
                shutil.copyfile(model_file, model_name + tema + "_best_at_5.bin")
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            # the per-run model file is no longer needed once it was (maybe) copied
            os.remove(model_file)
            if i == N-1:
                print(" (done)")

    # NOTE: when no run beat the baseline, the parameters below print as None
    # and this cell did not write the corresponding "_best_at_" file.
    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    print("*** model stored at " + model_name + tema + "_best_at_1.bin")
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    print("*** model stored at " + model_name + tema + "_best_at_5.bin")
    print("********************")


*** BEGIN tema 1: 
Checking the combination (30, 0.07, 4, 13)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29  (done)
Checking the combination (30, 0.07, 4, 15)
0 1 2 3 4 5 6 7 8 9 10 * updating best recall at 5 with parameters (30, 0.07, 4, 15) best recall at 5 so far: 0.8932986337020169
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29  (done)
Checking the combination (30, 0.07, 4, 17)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29  (done)
Checking the combination (30, 0.07, 5, 13)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29  (done)
Checking the combination (30, 0.07, 5, 15)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29  (done)
Checking the combination (30, 0.07, 5, 17)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29  (done)
Checking the combination (30, 0.07, 6, 13)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18