In [None]:
import fasttext

In [None]:
# utility function for creating input files for fasttext

import string

def FTInputFile(categoriesFile,xFile,yFile,labelString,outFile):
    """Write a fastText supervised-learning input file from parallel text/label files.

    Every output line has the form ``<labelString><category id> <text>``, where
    ``<text>`` is the matching line of ``xFile`` with all characters listed in
    ``string.punctuation`` removed and the remainder lower-cased.

    Args:
        categoriesFile: Path to a file with one category per line. Only the
            number of lines is used: line ``k`` (0-based) yields the id ``"k"``.
        xFile: Path to the example texts, one example per line.
        yFile: Path to the labels, one integer category index per line,
            aligned line by line with ``xFile``.
        labelString: Prefix written in front of each category id.
        outFile: Path of the file to write (truncated if it already exists).

    Raises:
        ValueError: If a line of ``yFile`` cannot be parsed as an integer.
        IndexError: If a label index is beyond the number of lines in
            ``categoriesFile``.

    Note:
        ``xFile`` and ``yFile`` are paired with ``zip``, so surplus lines in the
        longer of the two files are ignored.
    """
    # One id per line of the categories file: "0", "1", "2", ...
    with open(categoriesFile) as f:
        category = [str(idx) for idx, _ in enumerate(f)]

    # Translation table that deletes every ASCII punctuation character.
    drop_punctuation = str.maketrans("", "", string.punctuation)

    with open(xFile) as xf, open(yFile) as yf, open(outFile,'w') as out:
        for x,y in zip(xf,yf):
            # Strip only the line terminator. Slicing with x[:-1] would eat the
            # last real character of a final line that has no trailing newline.
            strX = x.rstrip("\n").translate(drop_punctuation).lower()
            out.write(labelString + category[int(y)] + " " + strX)
            out.write("\n")

In [None]:
# General settings shared by the cells below.

# Directory holding the category/x/y text files. The trailing slash matters
# because paths are built by plain string concatenation.
_dataDir = "../../data/"

# Label prefix written into the fastText input files and passed to the trainer.
ftlabel = "__label__"
# Pretrained word vectors handed to fastText through `pretrained_vectors`.
wVectors = "../../word_vectors/wiki.es.vec"
# Vector dimensionality. NOTE(review): the training cells visible in this
# notebook define their own `dim` and do not read this name.
dimensions = 300


In [None]:
# Build the fastText input files: one per split (train/dev/test) and per tema (1-4).

_tmpTrainFile = "_train_"
_tmpDevFile = "_dev_"
_tmpTestFile = "_test_"

for tt in (_tmpTrainFile, _tmpDevFile, _tmpTestFile):
    for i in range(1, 5):
        tema = str(i)
        # The x and y files of a split differ only in their first letter.
        split_suffix = tt + "tema_" + tema + "_categorias_pnud_0.txt"
        FTInputFile(
            _dataDir + "categorias_tema_" + tema + "_pnud_0.txt",
            _dataDir + "x" + split_suffix,
            _dataDir + "y" + split_suffix,
            ftlabel,
            tt + tema + ".tmp",
        )
        

In [None]:
#********************
#*** FINISH tema 1: 
#*** best recall at 1: 0.6731728475384949 with parameters (0.06, 5, 5)
#*** model stored at _model_ptvec_1_best_at_1.bin
#*** best recall at 5: 0.9071784862285838 with dimensions (0.05, 5, 5)
#*** model stored at _model_ptvec_1_best_at_5.bin
#********************


# train N models for every "tema" and keep the best models on the dev set according to recall @1 and @5
# using pretrained vectors wiki.es.vec

# Grid search over fastText hyper-parameters with pretrained vectors.
# Every (lr, ws, epoch) combination is trained N times per "tema". Whenever a
# run beats the best dev-set recall@1 (resp. recall@5) seen so far, its .bin
# file is copied to "<model_name><tema>_best_at_1.bin" (resp. "_best_at_5.bin").

import itertools
import os
import shutil
import sys

# number of repetitions
N = 5

# temas
temas = [1,2,3,4]

# prefix for the generated model
model_name = "_model_ptvec_"

# parameters
dim = 300
lrs = [0.05, 0.06, 0.065]
window_sizes = [5]
epochs = [5,10]
#neg_samples = [5,6,7]
#word_ngrams = [1,2,3]

# Best dev recalls (at 1, at 5) from previous experiments, per tema. They are
# the starting thresholds: a run only replaces the stored model when it beats them.
previous_best = {
    1: (0.65, 0.89),
    2: (0.71, 0.91),
    3: (0.71, 0.91),
    4: (0.70, 0.91),
}

# create a model for each "tema"
for t in temas:
    tema = str(t)

    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    # Training is given `model_output`; the model file handled below is
    # `model_output + ".bin"`.
    model_output = model_name + tema
    model_file = model_output + ".bin"

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    # Temas without a recorded baseline start from 0.
    best_recall_at_1, best_recall_at_5 = previous_best.get(t, (0, 0))

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search for all combinations of parameters.
    # For every combination do N repetitions
    for (lr,ws,e) in itertools.product(lrs,window_sizes,epochs):
        print("Checking the combination " + str((lr,ws,e)))
        for i in range(N):
            # Progress indicator: repetition index on the same output line.
            sys.stdout.write(str(i) + " ")
            classifier = fasttext.supervised(train_file,
                                             model_output,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e,
                                             pretrained_vectors = wVectors
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (lr,ws,e)
                shutil.copyfile(model_file, model_output + "_best_at_1.bin")
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (lr,ws,e)
                shutil.copyfile(model_file, model_output + "_best_at_5.bin")
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            # The per-run model file is disposable; only the "_best_at_*" copies are kept.
            os.remove(model_file)
        print(" (done)")

    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    print("*** model stored at " + model_output + "_best_at_1.bin")
    # The tuple is (lr, ws, epoch), so it is labelled "parameters", not "dimensions".
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    print("*** model stored at " + model_output + "_best_at_5.bin")
    print("********************")

In [None]:
#********************
#*** FINISH tema 2: 
#*** best recall at 1: 0.7224891329215283 with parameters (0.05, 5, 10)
#*** model stored at _model_ptvec_2_best_at_1.bin
#*** best recall at 5: 0.9256463051933196 with dimensions (0.065, 5, 5)
#*** model stored at _model_ptvec_2_best_at_5.bin
#********************

#********************
#*** FINISH tema 3: 
#*** best recall at 1: 0.7637677547969101 with parameters (0.05, 5, 5)
#*** model stored at _model_ptvec_3_best_at_1.bin
#*** best recall at 5: 0.9641166209818092 with dimensions (0.05, 5, 5)
#*** model stored at _model_ptvec_3_best_at_5.bin
#********************

#********************
#*** FINISH tema 4: 
#*** best recall at 1: 0.7086513994910941 with parameters (0.05, 5, 5)
#*** model stored at _model_ptvec_4_best_at_1.bin
#*** best recall at 5: 0.9223918575063613 with dimensions (0.05, 5, 5)
#*** model stored at _model_ptvec_4_best_at_5.bin
#********************


# train N models for every "tema" and keep the best models on the dev set according to recall @1 and @5
# using pretrained vectors wiki.es.vec

# Grid search over fastText hyper-parameters with pretrained vectors, temas 2-4.
# Every (lr, ws, epoch) combination is trained N times per "tema". Whenever a
# run beats the best dev-set recall@1 (resp. recall@5) seen so far, its .bin
# file is copied to "<model_name><tema>_best_at_1.bin" (resp. "_best_at_5.bin").

import itertools
import os
import shutil
import sys

# number of repetitions
N = 5

# temas
temas = [2,3,4]

# prefix for the generated model
model_name = "_model_ptvec_"

# parameters
dim = 300
lrs = [0.05, 0.06, 0.065]
window_sizes = [5]
epochs = [5,10]
#neg_samples = [5,6,7]
#word_ngrams = [1,2,3]

# Best dev recalls (at 1, at 5) from previous experiments, per tema. They are
# the starting thresholds: a run only replaces the stored model when it beats them.
# The entry for tema 1 is kept so the cell still works if `temas` is widened.
previous_best = {
    1: (0.65, 0.89),
    2: (0.71, 0.91),
    3: (0.71, 0.91),
    4: (0.70, 0.91),
}

# create a model for each "tema"
for t in temas:
    tema = str(t)

    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    # Training is given `model_output`; the model file handled below is
    # `model_output + ".bin"`.
    model_output = model_name + tema
    model_file = model_output + ".bin"

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    # Temas without a recorded baseline start from 0.
    best_recall_at_1, best_recall_at_5 = previous_best.get(t, (0, 0))

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search for all combinations of parameters.
    # For every combination do N repetitions
    for (lr,ws,e) in itertools.product(lrs,window_sizes,epochs):
        print("Checking the combination " + str((lr,ws,e)))
        for i in range(N):
            # Progress indicator: repetition index on the same output line.
            sys.stdout.write(str(i) + " ")
            classifier = fasttext.supervised(train_file,
                                             model_output,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e,
                                             pretrained_vectors = wVectors
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (lr,ws,e)
                shutil.copyfile(model_file, model_output + "_best_at_1.bin")
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (lr,ws,e)
                shutil.copyfile(model_file, model_output + "_best_at_5.bin")
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            # The per-run model file is disposable; only the "_best_at_*" copies are kept.
            os.remove(model_file)
        print(" (done)")

    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    print("*** model stored at " + model_output + "_best_at_1.bin")
    # The tuple is (lr, ws, epoch), so it is labelled "parameters", not "dimensions".
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    print("*** model stored at " + model_output + "_best_at_5.bin")
    print("********************")

In [None]:
# BIGRAMS
# train N models for every "tema" and keep the best models on the dev set according to recall @1 and @5
# using pretrained vectors wiki.es.vec

#********************
#*** FINISH tema 1: 
#*** best recall at 1: 0.6829321188462373 with parameters (0.08, 5, 5)
#*** model stored at _model_ptvec_ng2_1_best_at_1.bin
#*** best recall at 5: 0.9024072869225764 with dimensions (0.08, 5, 5)
#*** model stored at _model_ptvec_ng2_1_best_at_5.bin
#********************

#********************
#*** FINISH tema 2: 
#*** best recall at 1: 0.7236330359185541 with parameters (0.05, 5, 10)
#*** model stored at _model_ptvec_ng2_2_best_at_1.bin
#*** best recall at 5: 0.9233584991992679 with dimensions (0.08, 5, 5)
#*** model stored at _model_ptvec_ng2_2_best_at_5.bin
#********************

# STOPPED AFTER TEMA 2

# Bigram grid search over fastText hyper-parameters with pretrained vectors, temas 1-2.
# Every (lr, ws, epoch) combination is trained N times per "tema". Whenever a
# run beats the best dev-set recall@1 (resp. recall@5) seen so far, its .bin
# file is copied to "<model_name><tema>_best_at_1.bin" (resp. "_best_at_5.bin").

import itertools
import os
import shutil
import sys

# number of repetitions
N = 5

# temas
temas = [1,2]

word_ngrams = 2
# prefix for the generated model
model_name = "_model_ptvec_ng" + str(word_ngrams) + "_"

# parameters
dim = 300
lrs = [0.05, 0.08, 0.1]
window_sizes = [5]
epochs = [5,10]
#neg_samples = [5,6,7]
#word_ngrams = [1,2,3]

# create a model for each "tema"
for t in temas:
    tema = str(t)

    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    # Training is given `model_output`; the model file handled below is
    # `model_output + ".bin"`.
    model_output = model_name + tema
    model_file = model_output + ".bin"

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    best_recall_at_1 = 0
    best_recall_at_5 = 0

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search for all combinations of parameters.
    # For every combination do N repetitions
    for (lr,ws,e) in itertools.product(lrs,window_sizes,epochs):
        print("Checking the combination " + str((lr,ws,e)))
        for i in range(N):
            # Progress indicator: repetition index on the same output line.
            sys.stdout.write(str(i) + " ")
            classifier = fasttext.supervised(train_file,
                                             model_output,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e,
                                             pretrained_vectors = wVectors,
                                             word_ngrams = word_ngrams,
                                             bucket = 2000000
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (lr,ws,e)
                shutil.copyfile(model_file, model_output + "_best_at_1.bin")
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (lr,ws,e)
                shutil.copyfile(model_file, model_output + "_best_at_5.bin")
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            # The per-run model file is disposable; only the "_best_at_*" copies are kept.
            os.remove(model_file)
        print(" (done)")

    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    print("*** model stored at " + model_output + "_best_at_1.bin")
    # The tuple is (lr, ws, epoch), so it is labelled "parameters", not "dimensions".
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    print("*** model stored at " + model_output + "_best_at_5.bin")
    print("********************")

In [None]:
# BIGRAMS
# train N models for every "tema" and keep the best models on the dev set according to recall @1 and @5
# using pretrained vectors wiki.es.vec

#********************
#*** FINISH tema 3: 
#*** best recall at 1: 0.7754796910042362 with parameters (0.05, 5, 5)
#*** model stored at _model_ptvec_ng2_3_best_at_1.bin
#*** best recall at 5: 0.9613755295290306 with dimensions (0.05, 5, 5)
#*** model stored at _model_ptvec_ng2_3_best_at_5.bin
#********************

#********************
#*** FINISH tema 4: 
#*** best recall at 1: 0.7134860050890586 with parameters (0.08, 5, 5)
#*** model stored at _model_ptvec_ng2_4_best_at_1.bin
#*** best recall at 5: 0.9231552162849873 with dimensions (0.05, 5, 5)
#*** model stored at _model_ptvec_ng2_4_best_at_5.bin
#********************


# Bigram grid search over fastText hyper-parameters with pretrained vectors, temas 3-4.
# Every (lr, ws, epoch) combination is trained N times per "tema". Whenever a
# run beats the best dev-set recall@1 (resp. recall@5) seen so far, its .bin
# file is copied to "<model_name><tema>_best_at_1.bin" (resp. "_best_at_5.bin").

import itertools
import os
import shutil
import sys

# number of repetitions
N = 3

# temas
temas = [3,4]

word_ngrams = 2
# prefix for the generated model
model_name = "_model_ptvec_ng" + str(word_ngrams) + "_"

# parameters
dim = 300
lrs = [0.05, 0.08]
window_sizes = [5]
epochs = [5,10]
#neg_samples = [5,6,7]
#word_ngrams = [1,2,3]

# create a model for each "tema"
for t in temas:
    tema = str(t)

    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    # Training is given `model_output`; the model file handled below is
    # `model_output + ".bin"`.
    model_output = model_name + tema
    model_file = model_output + ".bin"

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    best_recall_at_1 = 0
    best_recall_at_5 = 0

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search for all combinations of parameters.
    # For every combination do N repetitions
    for (lr,ws,e) in itertools.product(lrs,window_sizes,epochs):
        print("Checking the combination " + str((lr,ws,e)))
        for i in range(N):
            # Progress indicator: repetition index on the same output line.
            sys.stdout.write(str(i) + " ")
            classifier = fasttext.supervised(train_file,
                                             model_output,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e,
                                             pretrained_vectors = wVectors,
                                             word_ngrams = word_ngrams,
                                             bucket = 2000000
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (lr,ws,e)
                shutil.copyfile(model_file, model_output + "_best_at_1.bin")
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (lr,ws,e)
                shutil.copyfile(model_file, model_output + "_best_at_5.bin")
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            # The per-run model file is disposable; only the "_best_at_*" copies are kept.
            os.remove(model_file)
        print(" (done)")

    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    print("*** model stored at " + model_output + "_best_at_1.bin")
    # The tuple is (lr, ws, epoch), so it is labelled "parameters", not "dimensions".
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    print("*** model stored at " + model_output + "_best_at_5.bin")
    print("********************")