In [None]:
import fasttext

In [None]:
# utility function for creating input files for fasttext

import string

def FTInputFile(categoriesFile,xFile,yFile,labelString,outFile):
    """Write a fastText-style labelled text file from parallel text/label files.

    Line k of ``xFile`` is paired with line k of ``yFile``. Every output line
    has the form ``<labelString><class index> <cleaned text>``, where the
    cleaned text is the input line lower-cased with all characters of
    ``string.punctuation`` removed.

    Parameters
    ----------
    categoriesFile : path of a text file with one category per line. Only the
        number of lines is used: it bounds the class indices that are accepted.
    xFile : path of the text file with one example per line.
    yFile : path of the text file with one integer class index per line.
    labelString : prefix written in front of every class index.
    outFile : path of the file to create (overwritten if it exists).

    Raises
    ------
    ValueError : a line of ``yFile`` is not an integer.
    IndexError : a class index is not covered by ``categoriesFile``.

    Pairing is done with ``zip``, so processing stops silently at the end of
    the shorter of ``xFile`` and ``yFile``.
    """
    # The label token of a category is just its zero-based line number; the
    # category text itself is never written out.
    with open(categoriesFile) as f:
        category = [str(index) for index, _ in enumerate(f)]

    # Translation table that deletes every ASCII punctuation character; built
    # once instead of filtering character by character for each line.
    drop_punctuation = str.maketrans("", "", string.punctuation)

    with open(xFile) as xf, open(yFile) as yf, open(outFile,'w') as out:
        for x,y in zip(xf,yf):
            # Strip only the line terminator: slicing with x[:-1] would also
            # cut the last real character of a final line that has no
            # trailing newline.
            strX = x.rstrip("\n").translate(drop_punctuation).lower()
            out.write(labelString + category[int(y)] + " " + strX + "\n")

In [None]:
# general vars

# Location of the category / x_* / y_* text files. It is used as a plain
# string prefix when the file names are built, so the trailing slash matters.
_dataDir = "../../data/"
# Prefix written in front of every class index in the generated training files
# and handed to fasttext as label_prefix.
ftlabel = "__label__"

In [None]:
# create input files for fasttext

# Name fragments identifying the three data splits; they appear both in the
# source file names and at the start of the generated "<split><tema>.tmp" files.
_tmpTrainFile = "_train_"
_tmpDevFile = "_dev_"
_tmpTestFile = "_test_"

# One fasttext input file per (split, tema) pair, written to the working directory.
for tt in (_tmpTrainFile, _tmpDevFile, _tmpTestFile):
    for i in range(1, 5):
        tema = str(i)
        suffix = tt + "tema_" + tema + "_categorias_pnud_0.txt"
        FTInputFile(
            _dataDir + "categorias_tema_" + tema + "_pnud_0.txt",
            _dataDir + "x" + suffix,
            _dataDir + "y" + suffix,
            ftlabel,
            tt + tema + ".tmp",
        )

In [None]:
#********************
#*** FINISH tema 3: 
#*** best recall at 1: 0.7587839521554947 with parameters (70, 0.056, 4, 10)
#*** model stored at _model_3_best_at_1.bin
#*** best recall at 5: 0.9583852479441815 with dimensions (30, 0.053, 6, 10)
#*** model stored at _model_3_best_at_5.bin
#********************


# train N models for every "tema" and keep the best models on the dev set according to recall @1 and @5
# no pretrained vectors used

import itertools
import os
import shutil

# number of repetitions
N = 10

# temas
temas = [1,2,3,4]

# prefix for the generated model
model_name = "_model_"

# parameters
dimensions = [20,30,40,50,60,70,80]
lrs = [0.06, 0.056, 0.053, 0.05]
window_sizes = [4,5,6,7]
epochs = [5,7,10,15]
#neg_samples = [5,6,7]
#word_ngrams = [1,2,3]

# create a model for each "tema"
for t in temas:
    tema = str(t)

    # Files used for this tema. Training is given the output prefix
    # `tmp_model`; the code below expects the trained model in `tmp_model_bin`.
    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    tmp_model = model_name + tema
    tmp_model_bin = tmp_model + ".bin"
    best_model_at_1 = tmp_model + "_best_at_1.bin"
    best_model_at_5 = tmp_model + "_best_at_5.bin"

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    best_recall_at_1 = 0
    best_recall_at_5 = 0

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search over all combinations of parameters.
    # For every combination do N repetitions and keep the best model seen so
    # far for each of the two metrics.
    # NOTE(review): fastText may ignore `ws` in supervised mode - confirm; if
    # so, the window-size axis only adds repetitions.

    dim_ant = 0
    for (dim,lr,ws,e) in itertools.product(dimensions,lrs,window_sizes,epochs):
        # dim is the slowest-varying axis of the product, so this announces
        # each dimension once.
        if dim != dim_ant:
            print("Currently checking dimension " + str(dim))
            dim_ant = dim
        for i in range(0,N):
            classifier = fasttext.supervised(train_file,
                                             tmp_model,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            # Strict comparison: a tie keeps the model that was found first.
            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (dim,lr,ws,e)
                shutil.copyfile(tmp_model_bin, best_model_at_1)
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (dim,lr,ws,e)
                shutil.copyfile(tmp_model_bin, best_model_at_5)
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            # The per-run model file is only a scratch file; the winners were
            # copied above.
            os.remove(tmp_model_bin)

    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    print("*** model stored at " + best_model_at_1)
    # The tuple holds (dim, lr, ws, epoch), so it is labelled "parameters".
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    print("*** model stored at " + best_model_at_5)
    print("********************")

# Captured output that had been pasted into the code of this cell (it made the
# cell invalid Python); kept here as a comment for reference:
# ********************
# *** FINISH tema 2: 
# *** best recall at 1: 0.7188286433310456 with parameters (40, 0.07, 6, 25)
# *** model stored at _model_2_best_at_1.bin
# *** best recall at 5: 0.9171814230153283 with dimensions (40, 0.08, 5, 20)
# *** model stored at _model_2_best_at_5.bin
# ********************

In [None]:
#********************
#*** FINISH tema 2: 
#*** best recall at 1: 0.7188286433310456 with parameters (40, 0.07, 6, 25)
#*** model stored at _model_2_best_at_1.bin
#*** best recall at 5: 0.9171814230153283 with dimensions (40, 0.08, 5, 20)
#*** model stored at _model_2_best_at_5.bin
#********************

#********************
#*** FINISH tema 4: 
#*** best recall at 1: 0.7002544529262087 with parameters (50, 0.07, 6, 15)
#*** model stored at _model_4_best_at_1.bin
#*** best recall at 5: 0.916030534351145 with dimensions (30, 0.07, 6, 15)
#*** model stored at _model_4_best_at_5.bin
#********************



# best models always achieved with learning rate ~ 0.06, epochs >= 10, 
# dimensions >= 30 (until 50), and window size 5 or 6
# parameters for a new grid search for every tema

# try new parameters for tema = 1,2,4

# itertools, os and shutil are imported here as well, so that this cell does
# not depend on the imports of an earlier grid-search cell.
import itertools
import os
import shutil
import sys

# number of repetitions
N = 10

# temas
temas = [1,2,4]

# prefix for the generated model
model_name = "_model_"

# parameters
dimensions = [30,40,50]
lrs = [0.07, 0.08, 0.085, 0.09]
window_sizes = [5,6]
epochs = [15,20,25]

# (recall at 1, recall at 5) that a new model has to beat before it replaces
# the stored best model of a tema.
# NOTE(review): presumably the recalls reached by an earlier search - confirm.
baseline_recalls = {
    1: (0.65, 0.89),
    2: (0.71, 0.91),
    4: (0.70, 0.91),
}

# create a model for each "tema"
for t in temas:
    tema = str(t)

    # Files used for this tema. Training is given the output prefix
    # `tmp_model`; the code below expects the trained model in `tmp_model_bin`.
    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    tmp_model = model_name + tema
    tmp_model_bin = tmp_model + ".bin"
    best_model_at_1 = tmp_model + "_best_at_1.bin"
    best_model_at_5 = tmp_model + "_best_at_5.bin"

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    # A tema without a recorded baseline starts from 0 instead of silently
    # inheriting the thresholds of the previous tema.
    best_recall_at_1, best_recall_at_5 = baseline_recalls.get(t, (0, 0))

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search over all combinations of parameters.
    # For every combination do N repetitions.
    # NOTE(review): fastText may ignore `ws` in supervised mode - confirm; if
    # so, the window-size axis only adds repetitions.

    for (dim,lr,ws,e) in itertools.product(dimensions,lrs,window_sizes,epochs):
        print("Checking the combination " + str((dim,lr,ws,e)))
        for i in range(0,N):
            # Progress indicator: repetition numbers on a single line.
            sys.stdout.write(str(i)+" ")
            classifier = fasttext.supervised(train_file,
                                             tmp_model,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            # Strict comparison: a tie keeps what is already stored.
            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (dim,lr,ws,e)
                shutil.copyfile(tmp_model_bin, best_model_at_1)
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (dim,lr,ws,e)
                shutil.copyfile(tmp_model_bin, best_model_at_5)
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            # The per-run model file is only a scratch file; the winners were
            # copied above.
            os.remove(tmp_model_bin)
        print(" (done)")

    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    # Parameters stay None when no run beat the starting threshold; in that
    # case this cell did not write the file.
    if best_parameters_at_1 is None:
        print("*** no run beat the starting recall at 1, " + best_model_at_1 + " was not written")
    else:
        print("*** model stored at " + best_model_at_1)
    # The tuple holds (dim, lr, ws, epoch), so it is labelled "parameters".
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    if best_parameters_at_5 is None:
        print("*** no run beat the starting recall at 5, " + best_model_at_5 + " was not written")
    else:
        print("*** model stored at " + best_model_at_5)
    print("********************")


In [None]:
# only tema 1 is still under 0.7 @ 1 and 0.9 @ 5
# parameters for a new grid search only for tema 1

# try new parameters for tema = 1

# number of repetitions
# Imported here so that this cell does not depend on the imports of earlier
# grid-search cells.
import itertools
import os
import shutil
import sys

# number of repetitions
N = 30

# temas
temas = [1]

# prefix for the generated model
model_name = "_model_"

# parameters
dimensions = [30,40,50]
lrs = [0.07, 0.073, 0.076, 0.08]
window_sizes = [4,5,6]
epochs = [13, 15, 17]

# (recall at 1, recall at 5) that a new model has to beat before it replaces
# the stored best model of a tema.
# NOTE(review): presumably the recalls reached by an earlier search - confirm.
baseline_recalls = {
    1: (0.653, 0.893),
}

# create a model for each "tema"
for t in temas:
    tema = str(t)

    # Files used for this tema. Training is given the output prefix
    # `tmp_model`; the code below expects the trained model in `tmp_model_bin`.
    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    tmp_model = model_name + tema
    tmp_model_bin = tmp_model + ".bin"
    best_model_at_1 = tmp_model + "_best_at_1.bin"
    best_model_at_5 = tmp_model + "_best_at_5.bin"

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    # A tema without a recorded baseline starts from 0 instead of reusing
    # whatever thresholds an earlier cell left behind.
    best_recall_at_1, best_recall_at_5 = baseline_recalls.get(t, (0, 0))

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search over all combinations of parameters.
    # For every combination do N repetitions.
    # NOTE(review): fastText may ignore `ws` in supervised mode - confirm; if
    # so, the window-size axis only adds repetitions.

    for (dim,lr,ws,e) in itertools.product(dimensions,lrs,window_sizes,epochs):
        print("Checking the combination " + str((dim,lr,ws,e)))
        for i in range(0,N):
            # Progress indicator: repetition numbers on a single line.
            sys.stdout.write(str(i)+" ")
            classifier = fasttext.supervised(train_file,
                                             tmp_model,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            # Strict comparison: a tie keeps what is already stored.
            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (dim,lr,ws,e)
                shutil.copyfile(tmp_model_bin, best_model_at_1)
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (dim,lr,ws,e)
                shutil.copyfile(tmp_model_bin, best_model_at_5)
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            # The per-run model file is only a scratch file; the winners were
            # copied above.
            os.remove(tmp_model_bin)
        print(" (done)")

    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    # Parameters stay None when no run beat the starting threshold; in that
    # case this cell did not write the file.
    if best_parameters_at_1 is None:
        print("*** no run beat the starting recall at 1, " + best_model_at_1 + " was not written")
    else:
        print("*** model stored at " + best_model_at_1)
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    if best_parameters_at_5 is None:
        print("*** no run beat the starting recall at 5, " + best_model_at_5 + " was not written")
    else:
        print("*** model stored at " + best_model_at_5)
    print("********************")


In [None]:
#BIGRAM RESULTS


#********************
#*** FINISH tema 2: 
#*** best recall at 1: 0.6922900938000458 with parameters (50, 0.1, 5, 10)
#*** model stored at _model_ng2_2_best_at_1.bin
#*** best recall at 5: 0.9032258064516129 with dimensions (40, 0.1, 5, 10)
#*** model stored at _model_ng2_2_best_at_5.bin
#********************

#********************
#*** FINISH tema 3: 
#*** best recall at 1: 0.7687515574383255 with parameters (50, 0.07, 6, 10)
#*** model stored at _model_ng2_3_best_at_1.bin
#*** best recall at 5: 0.9551457762272614 with dimensions (40, 0.07, 6, 10)
#*** model stored at _model_ng2_3_best_at_5.bin
#********************

#********************
#*** FINISH tema 4: 
#*** best recall at 1: 0.6926208651399491 with parameters (40, 0.1, 6, 10)
#*** model stored at _model_ng2_4_best_at_1.bin
#*** best recall at 5: 0.9127226463104325 with dimensions (30, 0.1, 6, 10)
#*** model stored at _model_ng2_4_best_at_5.bin
#********************

# train N models for every "tema" and keep the best models on the dev set according to recall @1 and @5
# no pretrained vectors used
# BIGRAMS!!!!!

import itertools
import os
import shutil

# number of repetitions
N = 10

# temas
temas = [1,2,3,4]

# prefix for the generated model (includes the word n-gram order)
word_ngrams = 2
model_name = "_model_ng" + str(word_ngrams) + "_"

# parameters
dimensions = [30,40,50]
lrs = [0.05, 0.06, 0.07, 0.1]
window_sizes = [5,6]
epochs = [5,10]

# create a model for each "tema"
for t in temas:
    tema = str(t)

    # Files used for this tema. Training is given the output prefix
    # `tmp_model`; the code below expects the trained model in `tmp_model_bin`.
    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    tmp_model = model_name + tema
    tmp_model_bin = tmp_model + ".bin"
    best_model_at_1 = tmp_model + "_best_at_1.bin"
    best_model_at_5 = tmp_model + "_best_at_5.bin"

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    # Starting thresholds: a model is only stored once it beats these recalls.
    best_recall_at_1 = 0.60
    best_recall_at_5 = 0.88

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search over all combinations of parameters.
    # For every combination do N repetitions.
    # NOTE(review): fastText may ignore `ws` in supervised mode - confirm; if
    # so, the window-size axis only adds repetitions.

    dim_ant = 0
    for (dim,lr,ws,e) in itertools.product(dimensions,lrs,window_sizes,epochs):
        # dim is the slowest-varying axis of the product, so this announces
        # each dimension once.
        if dim != dim_ant:
            print("Currently checking dimension " + str(dim))
            dim_ant = dim
        for i in range(0,N):
            classifier = fasttext.supervised(train_file,
                                             tmp_model,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e,
                                             word_ngrams = word_ngrams,
                                             bucket = 2000000
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            # Strict comparison: a tie keeps the model that was found first.
            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (dim,lr,ws,e)
                shutil.copyfile(tmp_model_bin, best_model_at_1)
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (dim,lr,ws,e)
                shutil.copyfile(tmp_model_bin, best_model_at_5)
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            # The per-run model file is only a scratch file; the winners were
            # copied above.
            os.remove(tmp_model_bin)

    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    # Parameters stay None when no run beat the starting threshold; in that
    # case this cell did not write the file.
    if best_parameters_at_1 is None:
        print("*** no run beat the starting recall at 1, " + best_model_at_1 + " was not written")
    else:
        print("*** model stored at " + best_model_at_1)
    # The tuple holds (dim, lr, ws, epoch), so it is labelled "parameters".
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    if best_parameters_at_5 is None:
        print("*** no run beat the starting recall at 5, " + best_model_at_5 + " was not written")
    else:
        print("*** model stored at " + best_model_at_5)
    print("********************")

In [None]:
# retrain model for tema 1

#********************
#*** FINISH tema 1: 
#*** best recall at 1: 0.6484493602255476 with parameters (40, 0.1, 6, 10)
#*** model stored at _model_ng2_1_best_at_1.bin
#*** best recall at 5: 0.8809368900455433 with dimensions (40, 0.1, 6, 10)
#*** model stored at _model_ng2_1_best_at_5.bin
#********************



# train N models for every "tema" and keep the best models on the dev set according to recall @1 and @5
# no pretrained vectors used
# BIGRAMS!!!!!

import itertools
import os
import shutil

# number of repetitions
N = 10

# temas
temas = [1]

# prefix for the generated model (includes the word n-gram order)
word_ngrams = 2
model_name = "_model_ng" + str(word_ngrams) + "_"

# parameters
dimensions = [30,40,50]
lrs = [0.05, 0.06, 0.07, 0.1]
window_sizes = [5,6]
epochs = [5,10]

# create a model for each "tema"
for t in temas:
    tema = str(t)

    # Files used for this tema. Training is given the output prefix
    # `tmp_model`; the code below expects the trained model in `tmp_model_bin`.
    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    tmp_model = model_name + tema
    tmp_model_bin = tmp_model + ".bin"
    best_model_at_1 = tmp_model + "_best_at_1.bin"
    best_model_at_5 = tmp_model + "_best_at_5.bin"

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    # Starting thresholds: a model is only stored once it beats these recalls.
    best_recall_at_1 = 0.60
    best_recall_at_5 = 0.80

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search over all combinations of parameters.
    # For every combination do N repetitions.
    # NOTE(review): fastText may ignore `ws` in supervised mode - confirm; if
    # so, the window-size axis only adds repetitions.

    dim_ant = 0
    for (dim,lr,ws,e) in itertools.product(dimensions,lrs,window_sizes,epochs):
        # dim is the slowest-varying axis of the product, so this announces
        # each dimension once.
        if dim != dim_ant:
            print("Currently checking dimension " + str(dim))
            dim_ant = dim
        for i in range(0,N):
            classifier = fasttext.supervised(train_file,
                                             tmp_model,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e,
                                             word_ngrams = word_ngrams,
                                             bucket = 2000000
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            # Strict comparison: a tie keeps the model that was found first.
            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (dim,lr,ws,e)
                shutil.copyfile(tmp_model_bin, best_model_at_1)
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (dim,lr,ws,e)
                shutil.copyfile(tmp_model_bin, best_model_at_5)
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            # The per-run model file is only a scratch file; the winners were
            # copied above.
            os.remove(tmp_model_bin)

    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    # Parameters stay None when no run beat the starting threshold; in that
    # case this cell did not write the file.
    if best_parameters_at_1 is None:
        print("*** no run beat the starting recall at 1, " + best_model_at_1 + " was not written")
    else:
        print("*** model stored at " + best_model_at_1)
    # The tuple holds (dim, lr, ws, epoch), so it is labelled "parameters".
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    if best_parameters_at_5 is None:
        print("*** no run beat the starting recall at 5, " + best_model_at_5 + " was not written")
    else:
        print("*** model stored at " + best_model_at_5)
    print("********************")

In [None]:
#####
#####
## TRIGRAMS did not help
#####
#####


# train N models for every "tema" and keep the best models on the dev set according to recall @1 and @5
# no pretrained vectors used
# TRIGRAMS!!!!!

import itertools
import os
import shutil

# number of repetitions
N = 5

# temas
temas = [1,2,3,4]

# prefix for the generated model (includes the word n-gram order)
word_ngrams = 3
model_name = "_model_ng" + str(word_ngrams) + "_"

# parameters
dimensions = [30,40,50]
lrs = [0.07, 0.1]
window_sizes = [5,6]
epochs = [10,12]

# create a model for each "tema"
for t in temas:
    tema = str(t)

    # Files used for this tema. Training is given the output prefix
    # `tmp_model`; the code below expects the trained model in `tmp_model_bin`.
    train_file = "_train_" + tema + ".tmp"
    dev_file = "_dev_" + tema + ".tmp"
    tmp_model = model_name + tema
    tmp_model_bin = tmp_model + ".bin"
    best_model_at_1 = tmp_model + "_best_at_1.bin"
    best_model_at_5 = tmp_model + "_best_at_5.bin"

    best_classifier_at_1 = None
    best_parameters_at_1 = None
    best_classifier_at_5 = None
    best_parameters_at_5 = None

    # Starting thresholds: a model is only stored once it beats these recalls.
    best_recall_at_1 = 0.60
    best_recall_at_5 = 0.80

    print("*** BEGIN tema " + tema + ": ")

    # Perform a grid search over all combinations of parameters.
    # For every combination do N repetitions.
    # NOTE(review): fastText may ignore `ws` in supervised mode - confirm; if
    # so, the window-size axis only adds repetitions.

    dim_ant = 0
    for (dim,lr,ws,e) in itertools.product(dimensions,lrs,window_sizes,epochs):
        # dim is the slowest-varying axis of the product, so this announces
        # each dimension once.
        if dim != dim_ant:
            print("Currently checking dimension " + str(dim))
            dim_ant = dim
        for i in range(0,N):
            classifier = fasttext.supervised(train_file,
                                             tmp_model,
                                             label_prefix = ftlabel,
                                             dim = dim,
                                             lr = lr,
                                             ws = ws,
                                             epoch = e,
                                             word_ngrams = word_ngrams,
                                             bucket = 2000000
                                            )

            result_at_1 = classifier.test(dev_file, k=1)
            result_at_5 = classifier.test(dev_file, k=5)

            # Strict comparison: a tie keeps the model that was found first.
            if result_at_1.recall > best_recall_at_1:
                best_recall_at_1 = result_at_1.recall
                best_classifier_at_1 = classifier
                best_parameters_at_1 = (dim,lr,ws,e)
                shutil.copyfile(tmp_model_bin, best_model_at_1)
                print("* updating best recall at 1 with parameters " + str(best_parameters_at_1) + " best recall at 1 so far: " + str(best_recall_at_1))

            if result_at_5.recall > best_recall_at_5:
                best_recall_at_5 = result_at_5.recall
                best_classifier_at_5 = classifier
                best_parameters_at_5 = (dim,lr,ws,e)
                shutil.copyfile(tmp_model_bin, best_model_at_5)
                print("* updating best recall at 5 with parameters " + str(best_parameters_at_5) + " best recall at 5 so far: " + str(best_recall_at_5))

            # The per-run model file is only a scratch file; the winners were
            # copied above.
            os.remove(tmp_model_bin)

    print("********************")
    print("*** FINISH tema " + tema + ": ")
    print("*** best recall at 1: " + str(best_recall_at_1) + " with parameters " + str(best_parameters_at_1))
    # Parameters stay None when no run beat the starting threshold; in that
    # case this cell did not write the file.
    if best_parameters_at_1 is None:
        print("*** no run beat the starting recall at 1, " + best_model_at_1 + " was not written")
    else:
        print("*** model stored at " + best_model_at_1)
    # The tuple holds (dim, lr, ws, epoch), so it is labelled "parameters".
    print("*** best recall at 5: " + str(best_recall_at_5) + " with parameters " + str(best_parameters_at_5))
    if best_parameters_at_5 is None:
        print("*** no run beat the starting recall at 5, " + best_model_at_5 + " was not written")
    else:
        print("*** model stored at " + best_model_at_5)
    print("********************")