In [3]:
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
#  The senteval_example.ipynb, modified to work with our skip-gram models.

from __future__ import absolute_import, division, unicode_literals

import sys
import numpy as np
import logging
import sklearn
import examples.data as data
import pickle

# Set PATHs
# Path to senteval. It is inserted at the front of sys.path below, so the
# empty string makes Python resolve `senteval` from the current working directory.
PATH_TO_SENTEVAL = ''
# path to the NLP datasets (handed to SentEval as 'task_path')
PATH_TO_DATA = 'data'
# Suffix identifying the trained model; it is interpolated into both file
# names below.
# NOTE(review): presumably encodes the training hyper-parameters (the leading
# 300 matches the embedding size set in prepare()) -- confirm.
model_info = "_300_5_20_0.025_0.001_2_5"
# path to skipgram embeddings
PATH_TO_VEC = 'skipgram_data_trained/word_vectors/wordvec{}.txt'.format(model_info)
# path to word2id (read with pickle in prepare(), despite the .txt extension)
PATH_TO_ID = 'skipgram_data_trained/word2id/word2id{}.txt'.format(model_info)

# import SentEval
sys.path.insert(0, PATH_TO_SENTEVAL)
import senteval

def prepare(params, samples):
    """
    Attach the skip-gram vocabulary and embeddings to ``params``.

    This is the ``prepare`` callback given to ``senteval.engine.SE``.
    It sets three attributes on ``params``:

    * ``word2id``  -- object unpickled from PATH_TO_ID
    * ``word_vec`` -- result of ``data.get_wordvec(PATH_TO_VEC, word2id)``
    * ``wvec_dim`` -- embedding dimensionality (fixed at 300)

    ``samples`` is accepted to satisfy the callback signature but is not used.
    """
    # NOTE: pickle must only be used on files you trust.
    with open(PATH_TO_ID, 'rb') as id_file:
        vocabulary = pickle.load(id_file)
    params.word2id = vocabulary

    # embeddings are stored in skipgram/word2vec format
    params.word_vec = data.get_wordvec(PATH_TO_VEC, params.word2id)
    # size of a single skipgram embedding
    params.wvec_dim = 300

def batcher(params, batch):
    """
    Encode a batch of sentences as averaged word embeddings.

    Each sentence is a list of words. Words that are missing from
    ``params.word_vec`` are skipped; the remaining vectors are averaged
    to give one vector per sentence.

    Special cases:
      * an empty sentence (``[]``) is treated as the single token ``'.'``
        (change it to NULL or similar if that suits your model better);
      * a sentence with no known word is represented by a zero vector
        of length ``params.wvec_dim``.

    Returns an array with one row per sentence:
    [batch size, embedding dimensionality].
    """
    sentence_rows = []

    for sent in batch:
        # substitute the placeholder token for empty sentences
        if sent != []:
            tokens = sent
        else:
            tokens = ['.']

        # [number of known words, embedding dimensionality]
        known_vectors = [params.word_vec[token]
                         for token in tokens
                         if token in params.word_vec]
        if len(known_vectors) == 0:
            # nothing to average: fall back to a single all-zero vector
            known_vectors = [np.zeros(params.wvec_dim)]

        # averaging over the word axis gives [embedding dimensionality]
        sentence_rows.append(np.mean(known_vectors, 0))

    # stack the per-sentence vectors: [batch size, embedding dimensionality]
    return np.vstack(sentence_rows)


# SentEval parameters
# The evaluation uses logistic regression (usepytorch: False) with kfold 10.
# Any extra information your model needs for initialization can be added to
# this dictionary, for example the path to a dictionary of indices or
# hyper-parameters: it is passed to both the batcher and the prepare functions.
params_senteval = {}
params_senteval['task_path'] = PATH_TO_DATA
params_senteval['usepytorch'] = False
params_senteval['kfold'] = 10
# Config for the NN classifier, kept for reference only: with
# usepytorch = False scikit-learn logistic regression with 10 kfold is used.
#params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
#                                 'tenacity': 3, 'epoch_size': 2}

# Logger: timestamped messages, DEBUG level and above
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s : %(message)s')

if __name__ == "__main__":
    # NLP tasks on which the embedding model is evaluated.
    # In (https://arxiv.org/abs/1802.05883) we use the following.
    # SICKRelatedness (Sick-R) needs torch cuda to work (even when using
    # logistic regression), but STS14 (semantic textual similarity) is a
    # similar type of semantic task, so it is used here.
    transfer_tasks = [
        'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC',
        'MRPC', 'SICKEntailment', 'STS14',
    ]

    # evaluation engine wired to the callbacks defined above
    se = senteval.engine.SE(params_senteval, batcher, prepare)

    # senteval prints the results and returns a dictionary with the scores
    results = se.eval(transfer_tasks)
    print(results)


2018-05-31 19:13:20,471 : ***** Transfer task : MR *****


2018-05-31 19:13:26,690 : Found 70427 words with word vectors, out of         70427 words
2018-05-31 19:13:26,712 : Generating sentence embeddings
2018-05-31 19:13:27,447 : Generated sentence embeddings
2018-05-31 19:13:27,449 : Training sklearn-LogReg with (inner) 10-fold cross-validation
2018-05-31 19:13:57,338 : Best param found at split 1: l2reg = 0.25                 with score 70.93
2018-05-31 19:14:26,094 : Best param found at split 2: l2reg = 0.25                 with score 70.8
2018-05-31 19:14:54,239 : Best param found at split 3: l2reg = 0.5                 with score 71.29
2018-05-31 19:15:21,462 : Best param found at split 4: l2reg = 0.25                 with score 70.84
2018-05-31 19:15:49,255 : Best param found at split 5: l2reg = 0.25                 with score 71.25
2018-05-31 19:16:17,648 : Best param found at split 6: l2reg = 0.25                 with score 71.22
2018-05-31 19:16:45,619 : Best param found at 

2018-05-31 19:31:51,537 : Found 70427 words with word vectors, out of         70427 words
2018-05-31 19:31:51,770 : Computing embedding for test
2018-05-31 19:31:52,187 : Computed test embeddings
2018-05-31 19:31:52,188 : Computing embedding for dev
2018-05-31 19:31:52,235 : Computed dev embeddings
2018-05-31 19:31:52,236 : Computing embedding for train
2018-05-31 19:31:52,615 : Computed train embeddings
2018-05-31 19:31:52,694 : Training sklearn-LogReg with standard validation..
2018-05-31 19:32:00,432 : [('reg:0.25', 67.8), ('reg:0.5', 67.6), ('reg:1', 68.2), ('reg:2', 67.2), ('reg:4', 68.2), ('reg:8', 67.4)]
2018-05-31 19:32:00,433 : Validation : best param found is reg = 1 with score             68.2
2018-05-31 19:32:00,434 : Evaluating...
2018-05-31 19:32:01,603 : 
Dev acc : 68.2 Test acc : 69.35 for                        SICK entailment

2018-05-31 19:32:01,617 : ***** Transfer task : STS14 *****


2018-05-31 19:32:06,708 : Found 70427 words with word vectors, out of         704

{'MRPC': {'ntest': 1725, 'acc': 71.71, 'f1': 81.22, 'ndev': 4076, 'devacc': 70.49}, 'SUBJ': {'ntest': 10000, 'acc': 85.27, 'ndev': 10000, 'devacc': 85.3}, 'SICKEntailment': {'ntest': 4927, 'acc': 69.35, 'ndev': 500, 'devacc': 68.2}, 'SST2': {'ntest': 1821, 'acc': 72.98, 'ndev': 872, 'devacc': 73.85}, 'TREC': {'ntest': 500, 'acc': 62.4, 'ndev': 5452, 'devacc': 63.37}, 'CR': {'ntest': 3775, 'acc': 75.55, 'ndev': 3775, 'devacc': 75.63}, 'MR': {'ntest': 10662, 'acc': 70.91, 'ndev': 10662, 'devacc': 71.0}, 'STS14': {'images': {'spearman': SpearmanrResult(correlation=0.731063787213996, pvalue=2.6200347805747797e-126), 'pearson': (0.7636180264787633, 2.941828355536969e-144), 'nsamples': 750}, 'all': {'spearman': {'mean': 0.6317067629922892, 'wmean': 0.6424190545421659}, 'pearson': {'mean': 0.6617292181491062, 'wmean': 0.6720056528400011}}, 'deft-news': {'spearman': SpearmanrResult(correlation=0.6871623024175811, pvalue=2.98346157189256e-43), 'pearson': (0.7384544393947768, 6.220629104998448e-