# fastText runs from Python

In [1]:
import fasttext

In [2]:
# --- Directories -----------------------------------------------------------
# NOTE(review): BASEDIR is a hard-coded absolute path; adjust it to run the
# notebook on another machine.
BASEDIR = "/home/erikt/projects/newsgac/"
TRAINDIR = BASEDIR + "data/"                # train, test and metadata files
PRETRAINEDDIR = BASEDIR + "fasttext-runs/"  # directory of WIKIFILENAME
NEWSPAPERDIR = BASEDIR + "fastText/"        # directory of the newspaper .vec files

# --- Data files (relative to TRAINDIR) -------------------------------------
TRAINFILENAME = "link-articles-20190417-fasttext-train-1,2,7--collapsed9.txt"
TESTFILENAME = "link-articles-20190417-fasttext-test-1,2,7--collapsed9.txt"
# File from which the newspaper of every test item is read.
METADATATEST = "link-articles-20190417-fasttext-test-collapsed9.txt"

# --- Pretrained word vector files ------------------------------------------
WIKIFILENAME = "wiki.nl.vec"
NRCMODEL = "nrc-model.vec"
VOLKSKRANTMODEL = "volkskrant-model.vec"

# --- Column positions in a whitespace-split line ---------------------------
LABELCOLUMNID = 0      # token holding the class label
NEWSPAPERCOLUMNID = 2  # token holding the newspaper (in METADATATEST)

# --- Miscellaneous ----------------------------------------------------------
ALL = "ALL"  # row name used for the overall score printed by evaluate()
DIM = 300    # `dim` argument passed to fasttext.train_supervised
# `epoch` argument for the models trained with pretrained vectors.
# NOTE(review): the baseline cell passes a literal epoch value instead.
EPOCH = 50

def print_results(N, p, r):
    """Print a sample count, a precision and a recall as tab-separated rows.

    N is printed as-is; p and r are printed with three decimals under the
    row names "P@1" and "R@1".  The cells below call this with the unpacked
    result of a fastText model's test() method.
    """
    print("N\t" + str(N))
    # Both metric rows share the layout "<name>@1<TAB><value>".
    for template, value in (("P@{}\t{:.3f}", p), ("R@{}\t{:.3f}", r)):
        print(template.format(1, value))
    
def readNewspapers(inFileName):
    """Return the newspaper token of every line in a metadata file.

    Each line is split on whitespace and the token at position
    NEWSPAPERCOLUMNID is collected, in file order.

    Parameters:
        inFileName: path of the text file to read.

    Returns:
        A list with one string per line of the file.

    Raises:
        IndexError: if a line has fewer than NEWSPAPERCOLUMNID+1 tokens.
    """
    # The with-block closes the file even when a malformed line raises.
    with open(inFileName,"r") as inFile:
        return [line.split()[NEWSPAPERCOLUMNID] for line in inFile]

def readTexts(inFileName,fromColumn=2):
    """Read the labels and texts of a whitespace-tokenised data file.

    Each line is split on whitespace.  The token at position LABELCOLUMNID
    is taken as the label; the tokens from position fromColumn onwards are
    joined with single spaces to form the text.

    Parameters:
        inFileName: path of the text file to read.
        fromColumn: index of the first token that belongs to the text.

    Returns:
        A tuple (labels, texts) of two equally long lists of strings, in
        file order.

    Raises:
        IndexError: if a line has no token at position LABELCOLUMNID
            (for example an empty line).
    """
    labels = []
    texts = []
    # The with-block closes the file even when a malformed line raises.
    with open(inFileName,"r") as inFile:
        for line in inFile:
            # split() without arguments already ignores leading/trailing whitespace.
            words = line.split()
            labels.append(words[LABELCOLUMNID])
            texts.append(" ".join(words[fromColumn:]))
    return(labels,texts)

def evaluate(newspapers,labels,predictedLabels):
    """Print the accuracy per newspaper, followed by the overall accuracy.

    The three arguments are parallel sequences: item i has newspaper
    newspapers[i], gold label labels[i] and predicted label
    predictedLabels[i].  One line "<accuracy> <newspaper>" is printed per
    newspaper (in order of first appearance), then one line for all items
    together under the name ALL.  Accuracies are printed with three decimals.

    A newspaper without any correct prediction is reported as 0.000, and an
    empty input prints "nan" as the overall score instead of raising.

    Parameters:
        newspapers: newspaper name of every item.
        labels: gold label of every item; its length determines how many
            items are evaluated.
        predictedLabels: predicted label of every item.

    Raises:
        IndexError: if newspapers or predictedLabels is shorter than labels.
    """
    correct = {}
    total = {}
    for i, label in enumerate(labels):
        newspaper = newspapers[i]
        total[newspaper] = total.get(newspaper, 0) + 1
        # Always create the entry, so that a newspaper with zero hits can
        # still be looked up when the scores are printed.
        correct[newspaper] = correct.get(newspaper, 0) + int(predictedLabels[i] == label)
    for newspaper in total:
        print("{0:.3f}".format(correct[newspaper]/total[newspaper]),newspaper)
    nbrOfItems = sum(total.values())
    # Avoid a ZeroDivisionError when there is nothing to evaluate.
    overall = sum(correct.values())/nbrOfItems if nbrOfItems else float("nan")
    print("{0:.3f}".format(overall),ALL)

# Gold labels and texts of the test items.
labelsTest, textsTest = readTexts(TRAINDIR + TESTFILENAME)
# Newspaper of every test item, read from a separate metadata file.
# NOTE(review): evaluate() pairs these lists by position, so both files are
# assumed to list the same items in the same order — confirm.
newspapersTest = readNewspapers(TRAINDIR + METADATATEST)

## Baseline model, no dictionary

In [3]:
# Baseline: supervised classifier trained without pretrained word vectors,
# then scored on the test file.
# NOTE(review): epoch is the literal 100 here, whereas the models trained
# with pretrained vectors use EPOCH — confirm that this is intentional.
model = fasttext.train_supervised(
    TRAINDIR + TRAINFILENAME,
    dim=DIM,
    epoch=100,
)
print_results(*model.test(TRAINDIR + TESTFILENAME))

N	915
P@1	0.652
R@1	0.652


## Model using dictionary from Wikipedia

In [4]:
# Same training data, with the word vectors initialised from the file
# PRETRAINEDDIR+WIKIFILENAME; scored on the test file.
modelWiki = fasttext.train_supervised(
    TRAINDIR + TRAINFILENAME,
    dim=DIM,
    epoch=EPOCH,
    pretrainedVectors=PRETRAINEDDIR + WIKIFILENAME,
)
print_results(*modelWiki.test(TRAINDIR + TESTFILENAME))

N	915
P@1	0.673
R@1	0.673


## Model using dictionary from Wikipedia and Volkskrant

In [5]:
# Same training data, with the word vectors initialised from the file
# NEWSPAPERDIR+VOLKSKRANTMODEL; scored on the test file.
modelVolkskrant = fasttext.train_supervised(
    TRAINDIR + TRAINFILENAME,
    dim=DIM,
    epoch=EPOCH,
    pretrainedVectors=NEWSPAPERDIR + VOLKSKRANTMODEL,
)
print_results(*modelVolkskrant.test(TRAINDIR + TESTFILENAME))

N	915
P@1	0.671
R@1	0.671


## Model using dictionary from Wikipedia and NRC

In [6]:
# Same training data, with the word vectors initialised from the file
# NEWSPAPERDIR+NRCMODEL; scored on the test file.
modelNRC = fasttext.train_supervised(
    TRAINDIR + TRAINFILENAME,
    dim=DIM,
    epoch=EPOCH,
    pretrainedVectors=NEWSPAPERDIR + NRCMODEL,
)
print_results(*modelNRC.test(TRAINDIR + TESTFILENAME))

N	915
P@1	0.681
R@1	0.681


## Evaluation

In [7]:
def predictTopLabels(classifier, texts):
    """Return the first predicted label for every text.

    classifier.predict(texts) is expected to return a pair whose first
    element holds, per text, the sequence of predicted labels.  The first
    label of each sequence is kept; None is stored for a text that received
    no label at all, so the result always stays aligned with texts.

    Parameters:
        classifier: a trained model offering predict(list_of_texts).
        texts: list of input strings.

    Returns:
        A list with one label (or None) per input text.
    """
    labelsPerText = classifier.predict(texts)[0]
    return [labels[0] if len(labels) > 0 else None for labels in labelsPerText]

# Top predicted label per test text, for each of the four models.
labelsPredicted = predictTopLabels(model, textsTest)
labelsPredictedWiki = predictTopLabels(modelWiki, textsTest)
labelsPredictedVolkskrant = predictTopLabels(modelVolkskrant, textsTest)
labelsPredictedNRC = predictTopLabels(modelNRC, textsTest)

In [8]:
# Number of test items whose predicted label equals the gold label, per model.
correct = sum(labelsPredicted[i] == gold
              for i, gold in enumerate(labelsTest))
correctWiki = sum(labelsPredictedWiki[i] == gold
                  for i, gold in enumerate(labelsTest))
correctVolkskrant = sum(labelsPredictedVolkskrant[i] == gold
                        for i, gold in enumerate(labelsTest))
correctNRC = sum(labelsPredictedNRC[i] == gold
                 for i, gold in enumerate(labelsTest))

# Accuracies in the order: baseline, Wikipedia, Volkskrant, NRC.
tuple(hits/len(labelsTest)
      for hits in (correct, correctWiki, correctVolkskrant, correctNRC))

(0.6524590163934426, 0.673224043715847, 0.6710382513661202, 0.6808743169398908)

In [9]:
# Accuracy per newspaper for the baseline model.
evaluate(newspapers=newspapersTest,
         labels=labelsTest,
         predictedLabels=labelsPredicted)

0.638 NEWSPAPER=05NRC_Handelsblad
0.673 NEWSPAPER=06De_Telegraaf
0.639 NEWSPAPER=08De_Volkskrant
0.652 ALL


In [10]:
# Accuracy per newspaper for the model trained with WIKIFILENAME vectors.
evaluate(newspapers=newspapersTest,
         labels=labelsTest,
         predictedLabels=labelsPredictedWiki)

0.683 NEWSPAPER=05NRC_Handelsblad
0.681 NEWSPAPER=06De_Telegraaf
0.655 NEWSPAPER=08De_Volkskrant
0.673 ALL


In [11]:
# Accuracy per newspaper for the model trained with VOLKSKRANTMODEL vectors.
evaluate(newspapers=newspapersTest,
         labels=labelsTest,
         predictedLabels=labelsPredictedVolkskrant)

0.679 NEWSPAPER=05NRC_Handelsblad
0.662 NEWSPAPER=06De_Telegraaf
0.676 NEWSPAPER=08De_Volkskrant
0.671 ALL


In [12]:
# Accuracy per newspaper for the model trained with NRCMODEL vectors.
evaluate(newspapers=newspapersTest,
         labels=labelsTest,
         predictedLabels=labelsPredictedNRC)

0.695 NEWSPAPER=05NRC_Handelsblad
0.665 NEWSPAPER=06De_Telegraaf
0.689 NEWSPAPER=08De_Volkskrant
0.681 ALL


## Conclusions

Using pretrained word vectors from Wikipedia improves the overall score (0.652 &rarr; 0.673). Using pretrained word vectors from Volkskrant or NRC texts further improves the score for the corresponding newspaper compared with the Wikipedia vectors (Volkskrant: 0.655 &rarr; 0.676; NRC: 0.683 &rarr; 0.695).

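Caution: these conclusions are based on single runs of fastText, which uses random initialization, so scores can differ between runs with the same parameters. In addition, the baseline model was trained for 100 epochs while the other models were trained for 50, so the comparison with the baseline is not strictly like-for-like.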