In [9]:
import fasttext

In [10]:
# utility function for creating input files for fasttext

import string

def FTInputFile(categoriesFile,xFile,yFile,labelString,outFile):
    """Write a fastText-style labelled text file from parallel x/y files.

    Each output line has the form ``<labelString><category> <text>``, where
    ``<text>`` is the corresponding line of ``xFile`` with every character in
    ``string.punctuation`` removed and the rest lower-cased.

    Parameters
    ----------
    categoriesFile : path of a file with one category per line. Only the
        number of lines is used: category ``k`` is written as ``str(k)``.
    xFile : path of the file holding one input text per line.
    yFile : path of the file holding one integer category index per line,
        aligned line-by-line with ``xFile``.
    labelString : prefix written immediately before the category index.
    outFile : path of the file to create (overwritten if it exists).

    Raises
    ------
    ValueError : if a line of ``yFile`` cannot be parsed by ``int``.
    IndexError : if a category index is outside the range defined by
        ``categoriesFile``.

    Notes
    -----
    Lines are paired with ``zip``, so if the two files differ in length the
    extra lines of the longer one are silently ignored.
    """
    # Category k is represented by its own index as a string; the list is kept
    # so that out-of-range indices in yFile still fail loudly.
    with open(categoriesFile) as f:
        category = [str(index) for index, _ in enumerate(f)]

    # Built once: maps every punctuation character to "deleted".
    drop_punctuation = str.maketrans("", "", string.punctuation)

    with open(xFile) as xf, open(yFile) as yf, open(outFile,'w') as out:
        for x,y in zip(xf,yf):
            # Strip only the line terminator. Slicing with x[:-1] would eat the
            # last real character of a final line that has no trailing newline.
            strX = x.rstrip("\n").translate(drop_punctuation).lower()
            out.write(labelString + category[int(y)] + " " + strX + "\n")

In [11]:
# Shared settings read by the cells below

# --- fastText settings ---
# prefix that marks a label token in the generated input files
ftlabel = "__label__"
# vector size; NOTE(review): not referenced by the visible cells, which set
# their own `dim` -- confirm whether this is still needed
dimensions = 300
# pretrained word vectors handed to the trainer
wVectors = "../../word_vectors/wiki.es.vec"

# --- data location ---
# directory prefix prepended to the category / x / y file names
_dataDir = "../../data/"


In [12]:
# Build the fastText input files: one "<split><tema>.tmp" file per
# (split, tema) pair, written to the current working directory.

_tmpTrainFile = "_train_"
_tmpDevFile = "_dev_"
_tmpTestFile = "_test_"

for tt in (_tmpTrainFile, _tmpDevFile, _tmpTestFile):
    for i in range(1, 5):
        tema = str(i)
        # x and y files share everything after the leading "x"/"y" + split tag
        xy_suffix = "tema_" + tema + "_categorias_pnud_0.txt"
        categories_path = _dataDir + "categorias_tema_" + tema + "_pnud_0.txt"
        FTInputFile(categories_path,
                    _dataDir + "x" + tt + xy_suffix,
                    _dataDir + "y" + tt + xy_suffix,
                    ftlabel,
                    tt + tema + ".tmp")
        

In [18]:
# Train N models for every "tema" and keep the best models on the dev set
# according to recall @1 and @5, using the pretrained vectors wiki.es.vec.

import itertools
import os
import shutil
import sys

# number of repetitions
N = 5

# temas
temas = [1,2,3,4]

# prefix for the generated model
model_name = "_model_ptvec_"

# parameters
dim = 300
lrs = [0.05, 0.06, 0.065]
window_sizes = [5]
epochs = [5,10]
#neg_samples = [5,6,7]
#word_ngrams = [1,2,3]

# Best (recall@1, recall@5) reached in previous experiments, per tema.
# A model is only kept if it beats these values; temas missing from the
# table start from 0.
previous_best_recall = {
    1: (0.65, 0.89),
    2: (0.71, 0.91),
    3: (0.71, 0.91),
    4: (0.70, 0.91),
}

# create a model for each "tema"
for t in temas:
    tema = str(t)

    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    model_prefix = model_name + tema
    # the cell copies and deletes "<prefix>.bin" after every training run
    model_file = model_prefix + ".bin"
    best_file_at_1 = model_prefix + "_best_at_1.bin"
    best_file_at_5 = model_prefix + "_best_at_5.bin"

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    best_recall_at_1, best_recall_at_5 = previous_best_recall.get(t, (0, 0))

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search over all combinations of parameters.
    # For every combination do N repetitions.
    for (lr,ws,e) in itertools.product(lrs,window_sizes,epochs):
        for i in range(0,N):
            if i == 0:
                print("Checking the combination " + str((lr,ws,e)))
            # progress marker, kept on one line per combination
            sys.stdout.write(str(i) + " ")
            classifier = fasttext.supervised(train_file,
                                             model_prefix,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e,
                                             pretrained_vectors = wVectors
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (lr,ws,e)
                shutil.copyfile(model_file, best_file_at_1)
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (lr,ws,e)
                shutil.copyfile(model_file, best_file_at_5)
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            # the per-run model file is only a scratch copy; winners were copied above
            os.remove(model_file)
            if i == N-1:
                print(" (done)")

    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    # Only claim a stored model when this run actually copied one.
    if best_parameters_at_1 is not None:
        print("*** model stored at " + best_file_at_1)
    else:
        print("*** no run beat the previous best recall at 1; no model stored")
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    if best_parameters_at_5 is not None:
        print("*** model stored at " + best_file_at_5)
    else:
        print("*** no run beat the previous best recall at 5; no model stored")
    print("********************")

*** BEGIN tema 1: 
Checking the combination (0.05, 5, 5)
0 * updating best recall at 1 with parameters (0.05, 5, 5) best recall at 1 so far: 0.6720884840598569
* updating best recall at 5 with parameters (0.05, 5, 5) best recall at 5 so far: 0.9054435046627629
1 * updating best recall at 5 with parameters (0.05, 5, 5) best recall at 5 so far: 0.9071784862285838
2 3 4  (done)
Checking the combination (0.05, 5, 10)
0 1 2 3 4  (done)
Checking the combination (0.06, 5, 5)
0 * updating best recall at 1 with parameters (0.06, 5, 5) best recall at 1 so far: 0.6731728475384949
1 2 3 4  (done)
Checking the combination (0.06, 5, 10)
0 1 2 3 4  (done)
Checking the combination (0.065, 5, 5)
0 1 2 3 4  (done)
Checking the combination (0.065, 5, 10)
0 1 2 3 4  (done)
********************
*** FINISH tema 1: 
*** best recall at 1: 0.6731728475384949 with parameters (0.06, 5, 5)
*** model stored at _model_ptvec_1_best_at_1.bin
*** best recall at 5: 0.9071784862285838 with dimensions (0.05, 5, 5)
*** mo

TypeError: 'tuple' object does not support item assignment

In [19]:
# Train N models for every "tema" and keep the best models on the dev set
# according to recall @1 and @5, using the pretrained vectors wiki.es.vec.
# NOTE(review): same search as the previous cell but restricted to temas 2-4,
# presumably to resume after that cell stopped following tema 1 -- confirm.

import itertools
import os
import shutil
import sys

# number of repetitions
N = 5

# temas
temas = [2,3,4]

# prefix for the generated model
model_name = "_model_ptvec_"

# parameters
dim = 300
lrs = [0.05, 0.06, 0.065]
window_sizes = [5]
epochs = [5,10]
#neg_samples = [5,6,7]
#word_ngrams = [1,2,3]

# Best (recall@1, recall@5) reached in previous experiments, per tema.
# A model is only kept if it beats these values; temas missing from the
# table start from 0.
previous_best_recall = {
    1: (0.65, 0.89),
    2: (0.71, 0.91),
    3: (0.71, 0.91),
    4: (0.70, 0.91),
}

# create a model for each "tema"
for t in temas:
    tema = str(t)

    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    model_prefix = model_name + tema
    # the cell copies and deletes "<prefix>.bin" after every training run
    model_file = model_prefix + ".bin"
    best_file_at_1 = model_prefix + "_best_at_1.bin"
    best_file_at_5 = model_prefix + "_best_at_5.bin"

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    best_recall_at_1, best_recall_at_5 = previous_best_recall.get(t, (0, 0))

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search over all combinations of parameters.
    # For every combination do N repetitions.
    for (lr,ws,e) in itertools.product(lrs,window_sizes,epochs):
        for i in range(0,N):
            if i == 0:
                print("Checking the combination " + str((lr,ws,e)))
            # progress marker, kept on one line per combination
            sys.stdout.write(str(i) + " ")
            classifier = fasttext.supervised(train_file,
                                             model_prefix,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e,
                                             pretrained_vectors = wVectors
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (lr,ws,e)
                shutil.copyfile(model_file, best_file_at_1)
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (lr,ws,e)
                shutil.copyfile(model_file, best_file_at_5)
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            # the per-run model file is only a scratch copy; winners were copied above
            os.remove(model_file)
            if i == N-1:
                print(" (done)")

    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    # Only claim a stored model when this run actually copied one.
    if best_parameters_at_1 is not None:
        print("*** model stored at " + best_file_at_1)
    else:
        print("*** no run beat the previous best recall at 1; no model stored")
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    if best_parameters_at_5 is not None:
        print("*** model stored at " + best_file_at_5)
    else:
        print("*** no run beat the previous best recall at 5; no model stored")
    print("********************")

*** BEGIN tema 2: 
Checking the combination (0.05, 5, 5)
0 * updating best recall at 1 with parameters (0.05, 5, 5) best recall at 1 so far: 0.7188286433310456
* updating best recall at 5 with parameters (0.05, 5, 5) best recall at 5 so far: 0.9233584991992679
1 2 3 4  (done)
Checking the combination (0.05, 5, 10)
0 * updating best recall at 1 with parameters (0.05, 5, 10) best recall at 1 so far: 0.7218027911233127
1 2 3 * updating best recall at 1 with parameters (0.05, 5, 10) best recall at 1 so far: 0.7224891329215283
4  (done)
Checking the combination (0.06, 5, 5)
0 * updating best recall at 5 with parameters (0.06, 5, 5) best recall at 5 so far: 0.9242736215968886
1 * updating best recall at 5 with parameters (0.06, 5, 5) best recall at 5 so far: 0.9247311827956989
2 3 4  (done)
Checking the combination (0.06, 5, 10)
0 1 2 3 4  (done)
Checking the combination (0.065, 5, 5)
0 1 * updating best recall at 5 with parameters (0.065, 5, 5) best recall at 5 so far: 0.9251887439945092
2 