In [1]:
from __future__ import absolute_import, division, unicode_literals
import sys
import io
import numpy as np
import os.path
from os import path
import logging
import hashlib
import json
import pandas as pd

In [2]:
import tensorflow as tf
import torch
from transformers import BertTokenizer, BertModel, TFBertModel

In [3]:
# Set PATHs
# Root of the SentEval checkout; also put on sys.path so `import senteval` resolves.
PATH_TO_SENTEVAL = './senteval/SentEval'
# Passed to SentEval as 'task_path' in params_senteval.
PATH_TO_DATA = PATH_TO_SENTEVAL + '/data'
# Directory holding the STS test sets read by Job.get_sentences.
STS_path = PATH_TO_SENTEVAL + '/data/downstream/STS'
# Directory where Job stores embedding files and results.json.
EMBEDDINGS_path = PATH_TO_SENTEVAL + '/examples/embeddings'

In [4]:
# import SentEval
sys.path.insert(0, PATH_TO_SENTEVAL)
import senteval

In [5]:
# Notebook-level registries, filled in by later cells:
# `dictionaries` is keyed by task name, `results` by job name.
dictionaries = {}
results = {}

In [6]:
# Get word vectors from vocabulary (glove, word2vec, fasttext ..)
def get_wordvec(path_to_vec, word2id):
    """Load the vectors of the requested keys from a text embedding file.

    Every line of the file is read as ``<key> <v1> <v2> ...``: the key is
    everything before the first space and the rest is parsed as a
    space-separated float vector.

    Args:
        path_to_vec: Path of the UTF-8 text file holding the vectors.
        word2id: Container of the keys to keep. Only membership (``in``) and
            ``len()`` are used, so a dict, list, tuple or set all work.

    Returns:
        dict mapping each key found in both the file and ``word2id`` to a
        1-D numpy array.
    """
    # TODO: organise the words hierarchically by sentence.
    word_vec = {}

    # Membership tests on a list/tuple are linear; use a set when the
    # elements are hashable, otherwise fall back to the container itself.
    wanted = word2id
    if isinstance(word2id, (list, tuple)):
        try:
            wanted = set(word2id)
        except TypeError:
            wanted = word2id

    with io.open(path_to_vec, 'r', encoding='utf-8') as f:
        # if word2vec or fasttext file : skip first line "next(f)"
        for line in f:
            key, sep, vec = line.rstrip('\n').partition(' ')
            if not sep:
                # Blank line or a key without a vector: nothing to load.
                continue
            if key in wanted:
                word_vec[key] = np.fromstring(vec, sep=' ')

    logging.info('Found {0} words with word vectors, out of {1} words'.format(
        len(word_vec), len(word2id)))
    return word_vec


# SentEval prepare and batcher
class Prepare:
    """SentEval ``prepare`` hook that loads precomputed vectors from a file.

    The stock SentEval engine invokes the hook as ``prepare(params, samples)``,
    so instances are callable; ``run`` remains available under its own name.

    Args:
        path_to_vec: Text file with one ``<key> <v1> <v2> ...`` entry per line.
        wvec_dim: Dimensionality of the stored vectors (used by ``batcher``
            to build the zero vector for missing keys).
    """

    def __init__(self, path_to_vec, wvec_dim=768):
        self.path_to_vec = path_to_vec
        self.wvec_dim = wvec_dim

    def run(self, params, sentences):
        """Attach the samples, their vectors and the vector size to ``params``.

        Sets ``params.sent2id``, ``params.word_vec`` and ``params.wvec_dim``;
        returns ``None``.
        """
        params.sent2id = sentences
        params.word_vec = get_wordvec(self.path_to_vec, params.sent2id)
        params.wvec_dim = self.wvec_dim
        return

    # Lets the instance itself be used wherever a prepare callable is expected.
    __call__ = run

def batcher(params, batch):
    """SentEval ``batcher`` hook: look up one stored vector per sentence.

    Each item of ``batch`` is used directly as a key into ``params.word_vec``.
    Items without a stored vector, and empty-list sentences, are encoded as a
    zero vector of length ``params.wvec_dim``.

    Args:
        params: Object exposing ``word_vec`` (dict key -> 1-D array) and
            ``wvec_dim`` (int), as set up by ``Prepare.run``.
        batch: Sequence of sentence keys.

    Returns:
        2-D numpy array with one row per item of ``batch``.

    NOTE(review): a non-empty *list* sentence is still unhashable and raises
    TypeError in the lookup; the keys are assumed to be strings.
    """
    batch = list(batch)
    if not batch:
        # np.vstack rejects an empty sequence; return an empty matrix instead.
        return np.zeros((0, params.wvec_dim))

    embeddings = []
    for sent in batch:
        # TODO: select the vector by sentence, not only by word.
        found = None
        # An empty list cannot be used as a dict key, so treat it as a miss
        # rather than letting the lookup raise TypeError.
        is_empty_list = isinstance(sent, list) and not sent
        if not is_empty_list and sent in params.word_vec:
            found = params.word_vec[sent]
        if found is None:
            found = np.zeros(params.wvec_dim)
        # Mean over a single vector keeps the original float conversion.
        embeddings.append(np.mean([found], 0))

    return np.vstack(embeddings)


# Set params for SentEval
# Shared configuration handed to senteval.engine.SE by Job.run_evaluation.
params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
# Settings of SentEval's classifier, stored under the 'classifier' key.
params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
                                 'tenacity': 3, 'epoch_size': 2}

In [7]:
def torch_embedding(token, model, tokenizer):
    """Encode ``token`` with a PyTorch model and return its last hidden states.

    Args:
        token: Text to encode (passed to ``tokenizer.encode``).
        model: Callable taking a ``(1, seq_len)`` tensor of ids and returning
            a sequence whose first element is the last hidden state.
        tokenizer: Object with an ``encode(text)`` method returning token ids.

    Returns:
        numpy array holding the first element of the model output.
    """
    input_ids = torch.tensor(tokenizer.encode(token)).unsqueeze(0)  # Batch size 1
    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    return last_hidden_states.detach().numpy()


def tf_embedding(token, model, tokenizer):
    """Encode ``token`` with a TensorFlow model and return its last hidden states.

    Args:
        token: Text to encode (passed to ``tokenizer.encode``).
        model: Callable taking the batched id tensor and returning a sequence
            whose first element is the last hidden state.
        tokenizer: Object with an ``encode(text)`` method returning token ids.

    Returns:
        numpy array holding the first element of the model output.
    """
    encoded = tokenizer.encode(token)
    batch_ids = tf.constant(encoded)[None, :]  # prepend the batch axis (batch size 1)
    hidden = model(batch_ids)[0]  # first output element is the last hidden state
    return hidden.numpy()

In [8]:
def torch_mean_pooling_embedding(token, model, tokenizer):
    """Encode ``token`` with a PyTorch model and mean-pool over its tokens.

    Args:
        token: Text to encode (passed to ``tokenizer.encode``).
        model: Callable taking a ``(1, seq_len)`` tensor of ids and returning
            a sequence whose first element is the last hidden state.
        tokenizer: Object with an ``encode(text)`` method returning token ids.

    Returns:
        numpy array: the mean over axis 0 of the first (only) batch entry of
        the last hidden state, i.e. one vector per input text.
    """
    input_ids = torch.tensor(tokenizer.encode(token)).unsqueeze(0)  # Batch size 1
    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    arr = last_hidden_states.detach().numpy()
    return np.mean(arr[0], axis=0)

In [9]:
class Job():
    """One embed-and-evaluate run of a model on a SentEval STS-style task.

    A job lazily (1) reads the task sentences, (2) embeds each sentence and
    caches the vectors in a text file keyed by the MD5 of the sentence, and
    (3) runs SentEval on the task using that file.

    Args:
        test: Name of the SentEval task; also the prefix of the data directory.
        model_spec: Identifier passed to ``from_pretrained``.
        technology: ``'torch'`` selects the PyTorch model/embedding function;
            any other value selects the TensorFlow ones.
        name: Job name; defaults to ``test_modelspec_technology``.
        sentences: Pre-loaded sentences; when given, no data file is read.
        lang: ``'CZ'`` reads the ``-cz-test`` directory, anything else
            ``-en-test``.
        datasets: Dataset names inside the task directory.
        embed_with: Callable ``(sentence, model, tokenizer) -> array``
            overriding the default embedding function.
        embeddings: Pre-computed ``{md5: vector}`` mapping.
        path_to_embeddings: Cache file; defaults to
            ``EMBEDDINGS_path/<name>``.
        tokenizer: Pre-loaded tokenizer (loaded on demand otherwise).
        model: Pre-loaded model (loaded on demand otherwise).
        wvec_dim: Vector size handed to ``Prepare``.

    NOTE(review): the cache file stores one flat vector per line, so the
    embedding function should return a 1-D array. ``torch_embedding`` and
    ``tf_embedding`` return the full hidden-state array, which this format
    does not appear to round-trip -- confirm and pass a pooling ``embed_with``.
    """

    def __init__(self,
                 test='STSCZ2',
                 model_spec='bert-base-multilingual-cased',
                 technology='torch',
                 name=None,
                 sentences=None,
                 lang='CZ',
                 datasets=['headlines', 'images'],
                 embed_with=None,
                 embeddings=None,
                 path_to_embeddings=None,
                 tokenizer=None,
                 model=None,
                 wvec_dim=768):
        self.test = test
        self.model_spec = model_spec
        self.technology = technology
        self.name = name if name is not None else '_'.join([test, model_spec, technology])
        self.sentences = sentences
        self.lang = lang
        # Copy the list so that mutating one job's datasets can never alter
        # the shared default argument (or the caller's own list).
        self.datasets = list(datasets) if datasets is not None else ['headlines', 'images']
        self.embeddings = embeddings
        if path_to_embeddings is not None:
            self.path_to_embeddings = path_to_embeddings
        else:
            self.path_to_embeddings = EMBEDDINGS_path + '/' + self.name
        if embed_with is not None:
            self.embed = embed_with
        elif technology == 'torch':
            self.embed = torch_embedding
        else:
            self.embed = tf_embedding
        self.tokenizer = tokenizer
        self.model = model
        self.wvec_dim = wvec_dim
        self.results = None

    def get_sentences(self):
        """Return the task sentences, reading the dataset files on first use.

        Each line of ``STS.input.<dataset>.txt`` is split on tabs into a
        sentence pair; all first sentences of a dataset are appended before
        all second sentences.
        """
        if self.sentences is not None:
            return self.sentences

        print('Getting sentences for ' + self.name)
        sentences = []

        fpath = STS_path + '/' + self.test + ('-cz-test' if self.lang == 'CZ' else '-en-test')
        for dataset in self.datasets:
            # Context manager so the file handle is closed deterministically.
            with io.open(fpath + '/STS.input.%s.txt' % dataset, encoding='utf8') as f:
                pairs = [l.split("\t") for l in f.read().splitlines()]
            if not pairs:
                # An empty file has no pairs to unzip.
                continue
            sent1, sent2 = zip(*pairs)
            sentences += sent1 + sent2

        self.sentences = sentences
        return self.sentences

    def get_embeddings(self):
        """Return ``{md5(sentence): vector}``, from memory, file or the model.

        When neither an in-memory mapping nor the cache file exists, the
        tokenizer and model are loaded if needed, every sentence is embedded,
        and the result is written to ``path_to_embeddings``.
        """
        if self.embeddings is not None:
            return self.embeddings
        if path.exists(self.path_to_embeddings):
            self.embeddings = self.load_embeddings()
            return self.embeddings

        if self.tokenizer is None:
            print('Loading tokenizer for ' + self.name)
            self.tokenizer = BertTokenizer.from_pretrained(self.model_spec)
        if self.model is None:
            print('Loading model for ' + self.name)
            if self.technology == 'torch':
                self.model = BertModel.from_pretrained(self.model_spec)
            else:
                self.model = TFBertModel.from_pretrained(self.model_spec)
        print('Processing embeddings for ' + self.name)
        sentences = list(self.get_sentences())
        total = len(sentences)
        embeddings = {}
        # Count from 1 so the last progress line reads total/total.
        for i, sent in enumerate(sentences, 1):
            hashed = hashlib.md5(sent.encode('utf-8')).hexdigest()
            embeddings[hashed] = self.embed(sent, self.model, self.tokenizer)
            print("%i/%i:%s" % (i, total, sent))
        self.embeddings = embeddings
        self._write_embeddings(embeddings)
        return self.embeddings

    def load_embeddings(self):
        """Read the cache file at ``path_to_embeddings`` into a dict."""
        print('Loading embeddings for ' + self.name)
        embeddings = {}
        # Read from the same path that get_embeddings checks and that
        # save_embeddings writes, so a custom path_to_embeddings is honoured.
        with io.open(self.path_to_embeddings, 'r', encoding='utf-8') as f:
            for line in f:
                word, sep, vec = line.rstrip('\n').partition(' ')
                if not sep:
                    # Blank line or key without a vector.
                    continue
                embeddings[word] = np.fromstring(vec, sep=' ')
        return embeddings

    def _write_embeddings(self, embeddings):
        """Write ``embeddings`` to ``path_to_embeddings``, one entry per line."""
        print('Saving embeddings for ' + self.name)
        parent = path.dirname(self.path_to_embeddings)
        if parent and not path.isdir(parent):
            # Create the target directory instead of failing after the
            # embeddings have already been computed.
            os.makedirs(parent)
        with io.open(self.path_to_embeddings, 'w', encoding='utf-8') as f:
            for token in embeddings:
                tokens = [token] + [str(x) for x in embeddings[token]]
                f.write(' '.join(tokens) + '\n')

    def save_embeddings(self):
        """Persist the embeddings, computing them first when necessary."""
        # When nothing is cached yet, get_embeddings computes the vectors and
        # writes the file itself; writing it a second time would be redundant.
        written_by_getter = self.embeddings is None and not path.exists(self.path_to_embeddings)
        embeddings = self.get_embeddings()
        if written_by_getter:
            return
        self._write_embeddings(embeddings)

    def run_evaluation(self):
        """Run SentEval on ``self.test`` and cache/return its results."""
        if not path.exists(self.path_to_embeddings):
            self.save_embeddings()
        print('Running evaluation for ' + self.name)
        se = senteval.engine.SE(params_senteval, batcher, Prepare(self.path_to_embeddings, wvec_dim=self.wvec_dim))
        transfer_tasks = [self.test]
        self.results = se.eval(transfer_tasks)
        return self.results

    def get_results(self):
        """Return cached results, running the evaluation on first use."""
        if self.results is not None:
            return self.results
        return self.run_evaluation()

    @staticmethod
    def _json_default(obj):
        """Fallback used by json.dumps for numpy values it cannot encode."""
        if isinstance(obj, np.generic):
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        raise TypeError('Object of type %s is not JSON serializable' % type(obj).__name__)

    def save_results(self):
        """Append ``{name: results}`` as one JSON line to ``results.json``.

        Returns:
            The JSON string that was written.
        """
        results = self.get_results()
        print('Saving results for ' + self.name)
        if not path.isdir(EMBEDDINGS_path):
            os.makedirs(EMBEDDINGS_path)
        with io.open(EMBEDDINGS_path + '/results.json', 'a', encoding='utf-8') as f:
            data = {self.name: results}
            # `default` is only consulted for values json cannot encode itself.
            json_data = json.dumps(data, default=self._json_default)
            f.write(json_data + '\n')
            return json_data


In [6]:
# Job with every default: task 'STSCZ2', model 'bert-base-multilingual-cased', torch.
# NOTE(review): the variable name says STSCZ while the default task is 'STSCZ2' -- confirm which is intended.
STSCZ_bert_base_multilingual_cased_torch_job = Job()

In [7]:
# Job defines no get_dictionary(); get_sentences() is the method that returns
# the task's sentences, so it is used to fill the per-task "dictionary".
STSCZ_dictionary = STSCZ_bert_base_multilingual_cased_torch_job.get_sentences()

Getting dictionary for STSCZ_bert-base-multilingual-cased_torch


In [None]:
# Returns the in-memory embeddings, else loads the cache file, else embeds every sentence and writes the file.
STSCZ_bert_base_multilingual_cased_torch_job.get_embeddings()

In [9]:
# Returns the cached results, running the SentEval evaluation first if none are cached.
STSCZ_bert_base_multilingual_cased_torch_job.get_results()

Running evaluation for STSCZ_bert-base-multilingual-cased_torch


{'STSCZ': {'headlines': {'pearson': (0.0017701255986185087,
    0.9662166676605177),
   'spearman': SpearmanrResult(correlation=-0.020812356817649466, pvalue=0.61846238688042),
   'nsamples': 575},
  'images': {'pearson': (0.043712113796052915, 0.20296506069815348),
   'spearman': SpearmanrResult(correlation=0.06488102600778692, pvalue=0.05865110464755204),
   'nsamples': 850},
  'all': {'pearson': {'mean': 0.02274111969733571,
    'wmean': 0.02678815364621096},
   'spearman': {'mean': 0.022034334595068726, 'wmean': 0.030302994341382766}}}}

In [12]:
# Record this job's sentences and results in the notebook-level registries.
dictionaries['STSCZ'] = STSCZ_dictionary
results[STSCZ_bert_base_multilingual_cased_torch_job.name] = STSCZ_bert_base_multilingual_cased_torch_job.get_results()

In [23]:
# results.json holds one JSON object per line (appended by Job.save_results);
# decode every line into a list of {job_name: results} dicts.
raw_results = []
with io.open(EMBEDDINGS_path + '/results.json', 'r') as f:
    raw_results.extend(json.loads(line) for line in f)
raw_results

[{'STSCZ_bert-base-multilingual-cased_torch': {'STSCZ': {'headlines': {'pearson': [0.45181167547936035,
      2.854290863573569e-30],
     'spearman': [0.509163722179652, 2.9853923460035995e-39],
     'nsamples': 575},
    'images': {'pearson': [0.43734476852886184, 5.049647454736709e-41],
     'spearman': [0.539174813744424, 2.845041547860588e-65],
     'nsamples': 850},
    'all': {'pearson': {'mean': 0.4445782220041111,
      'wmean': 0.44318229238608053},
     'spearman': {'mean': 0.524169267962038, 'wmean': 0.5270650750428493}}}}},
 {'STSCZ2_bert-base-multilingual-cased_torch': {'STSCZ2': {'headlines': {'pearson': [0.38382098407064996,
      1.272624384364285e-21],
     'spearman': [0.4257876270230112, 1.0026060356177619e-26],
     'nsamples': 575},
    'images': {'pearson': [0.3982399674501556, 1.088782977184562e-33],
     'spearman': [0.4543518361367437, 1.5890018878238387e-44],
     'nsamples': 850},
    'all': {'pearson': {'mean': 0.3910304757604028,
      'wmean': 0.392421781

In [24]:
# Flatten the per-line dicts into one {job_name: results} mapping;
# a job name that occurs on several lines keeps its last entry.
results = {}
for res in raw_results:
    results.update(res)
results

{'STSCZ_bert-base-multilingual-cased_torch': {'STSCZ': {'headlines': {'pearson': [0.45181167547936035,
     2.854290863573569e-30],
    'spearman': [0.509163722179652, 2.9853923460035995e-39],
    'nsamples': 575},
   'images': {'pearson': [0.43734476852886184, 5.049647454736709e-41],
    'spearman': [0.539174813744424, 2.845041547860588e-65],
    'nsamples': 850},
   'all': {'pearson': {'mean': 0.4445782220041111,
     'wmean': 0.44318229238608053},
    'spearman': {'mean': 0.524169267962038, 'wmean': 0.5270650750428493}}}},
 'STSCZ2_bert-base-multilingual-cased_torch': {'STSCZ2': {'headlines': {'pearson': [0.38382098407064996,
     1.272624384364285e-21],
    'spearman': [0.4257876270230112, 1.0026060356177619e-26],
    'nsamples': 575},
   'images': {'pearson': [0.3982399674501556, 1.088782977184562e-33],
    'spearman': [0.4543518361367437, 1.5890018878238387e-44],
    'nsamples': 850},
   'all': {'pearson': {'mean': 0.3910304757604028,
     'wmean': 0.39242178117421467},
    'spea

In [25]:
# Re-index the results from job -> task -> dataset
# to task -> dataset -> job, so that jobs can be compared per dataset.
res_by_task = {}
for res, per_task in results.items():
    for task, per_dataset in per_task.items():
        task_results = res_by_task.setdefault(task, {})
        for dataset, scores in per_dataset.items():
            dataset_results = task_results.setdefault(dataset, {})
            dataset_results[res] = scores
res_by_task

{'STSCZ': {'headlines': {'STSCZ_bert-base-multilingual-cased_torch': {'pearson': [0.45181167547936035,
     2.854290863573569e-30],
    'spearman': [0.509163722179652, 2.9853923460035995e-39],
    'nsamples': 575},
   'STSCZ_multi_cased_L-12_H-768_A-12_torch': {'pearson': [0.5101076658989366,
     2.0534831136245208e-39],
    'spearman': [0.5523454236645796, 3.105297653554739e-47],
    'nsamples': 575},
   'STSCZ_bg_cs_pl_ru_cased_L-12_H-768_A-12_v1_torch': {'pearson': [0.5250992742990885,
     4.59294727664545e-42],
    'spearman': [0.5596413291539634, 1.0599526893408327e-48],
    'nsamples': 575},
   'STSCZ_multi_cased_L-12_H-768_A-12_mean_pooled_torch': {'pearson': [0.5020967154907711,
     4.738714900955174e-38],
    'spearman': [0.5593170350255514, 1.2338383169105442e-48],
    'nsamples': 575},
   'STSCZ_bert-base-multilingual-cased_torch_headlines': {'pearson': [0.45181167547936035,
     2.854290863573569e-30],
    'spearman': [0.509163722179652, 2.9853923460035995e-39],
    'nsa

In [26]:
# Keys used inside the SentEval result dictionaries.
PEARSON = 'pearson'
SPEARMAN = 'spearman'
ALL_DATASETS = 'all'
MEAN = 'wmean'  # the aggregate entry read from the 'all' block

# pearson_results / spearman_results: task -> dataset -> job -> correlation
# all_results: task -> metric -> job -> aggregate over all datasets
pearson_results = {}
spearman_results = {}
all_results = {}
for task, by_dataset in res_by_task.items():
    # Individual datasets carry [correlation, p-value]; keep the correlation.
    single = [(name, jobs) for name, jobs in by_dataset.items() if name != ALL_DATASETS]
    pearson_results[task] = {
        name: {job: scores[PEARSON][0] for job, scores in jobs.items()}
        for name, jobs in single
    }
    spearman_results[task] = {
        name: {job: scores[SPEARMAN][0] for job, scores in jobs.items()}
        for name, jobs in single
    }
    # The 'all' entry stores aggregates keyed by name instead of a pair.
    overall = by_dataset[ALL_DATASETS]
    all_results[task] = {
        PEARSON: {job: scores[PEARSON][MEAN] for job, scores in overall.items()},
        SPEARMAN: {job: scores[SPEARMAN][MEAN] for job, scores in overall.items()},
    }

pearson_results

{'STSCZ': {'headlines': {'STSCZ_bert-base-multilingual-cased_torch': 0.45181167547936035,
   'STSCZ_multi_cased_L-12_H-768_A-12_torch': 0.5101076658989366,
   'STSCZ_bg_cs_pl_ru_cased_L-12_H-768_A-12_v1_torch': 0.5250992742990885,
   'STSCZ_multi_cased_L-12_H-768_A-12_mean_pooled_torch': 0.5020967154907711,
   'STSCZ_bert-base-multilingual-cased_torch_headlines': 0.45181167547936035,
   'STSCZ_bert-base-multilingual-cased_torch_all': 0.45181167547936035,
   'STSCZ_multi_cased_L-12_H-768_A-12_mean_pooled_torch_all': 0.5020967154907711,
   'STSCZ_cc_cs_300_fasttext_sentence': 0.5820461373692433,
   'STSCZ_cc_cs_300_fasttext_word': 0.45243410576300863,
   'STSCZ_bg_cs_pl_ru_cased_L-12_H-768_A-12_v1_mean_pooled_torch_all': 0.5125391564169962,
   'STSCZ_wiki_cs_300_fasttext_sentence': 0.6458306696096886,
   'STSCZ_wiki_cs_300_fasttext_word': 0.6286271983412663,
   'STSCZ_elmo_139': 0.5147392972800048},
  'images': {'STSCZ_bert-base-multilingual-cased_torch': 0.43734476852886184,
   'STSCZ_m

In [27]:
# One table per task: rows are jobs, columns are datasets,
# ordered by the Pearson correlation on 'headlines' (best first).
for task, per_dataset in pearson_results.items():
    display(task)
    df = pd.DataFrame(per_dataset).sort_values(by='headlines', ascending=False)
    display(df)

'STSCZ'

Unnamed: 0,headlines,images,headlines2013-2015CZ_Lemma,headlines2013-2015CZ_POSTag,headlines2013-2015CZ_Stem,imagesCZ2014-2015shuffled,imagesCZ2014-2015shuffled_Lemma,imagesCZ2014-2015shuffled_POSTag,imagesCZ2014-2015shuffled_Stem
STSCZ_wiki_cs_300_fasttext_sentence,0.645831,0.701858,0.61205,0.386879,0.630623,0.701858,0.736,0.453499,0.757723
STSCZ_wiki_cs_300_fasttext_word,0.628627,0.729674,0.628071,0.27558,0.671686,0.729674,0.780287,0.250657,0.790972
STSCZ_cc_cs_300_fasttext_sentence,0.582046,0.704813,0.587486,0.362638,0.573576,0.704813,0.739374,0.442313,0.735375
STSCZ_bg_cs_pl_ru_cased_L-12_H-768_A-12_v1_torch,0.525099,0.518032,,,,,,,
STSCZ_elmo_139,0.514739,0.623943,0.531189,0.138582,0.511911,0.623943,0.680458,0.260029,0.623689
STSCZ_bg_cs_pl_ru_cased_L-12_H-768_A-12_v1_mean_pooled_torch_all,0.512539,0.515134,0.516721,0.251757,0.497297,0.515134,0.582982,0.408983,0.599791
STSCZ_multi_cased_L-12_H-768_A-12_torch,0.510108,0.530078,,,,,,,
STSCZ_multi_cased_L-12_H-768_A-12_mean_pooled_torch,0.502097,0.55895,,,,,,,
STSCZ_multi_cased_L-12_H-768_A-12_mean_pooled_torch_all,0.502097,0.55895,0.482988,0.279447,0.503367,0.55895,0.634224,0.431461,0.613311
STSCZ_cc_cs_300_fasttext_word,0.452434,0.552362,0.439307,0.239384,0.418486,0.552362,0.636856,0.171691,0.64285


'STSCZ2'

Unnamed: 0,headlines,images,headlines2013-2015CZ_Lemma,headlines2013-2015CZ_POSTag,headlines2013-2015CZ_Stem,imagesCZ2014-2015shuffled,imagesCZ2014-2015shuffled_Lemma,imagesCZ2014-2015shuffled_POSTag,imagesCZ2014-2015shuffled_Stem
STSCZ2_wiki_cs_300_fasttext_sentence,0.645831,0.701858,0.61205,0.386879,0.630623,0.701858,0.736,0.453499,0.757723
STSCZ2_wiki_cs_300_fasttext_word,0.604203,0.677268,0.611361,0.390843,0.610813,0.67729,0.675058,0.483375,0.711391
STSCZ2_cc_cs_300_fasttext_sentence,0.582046,0.704813,0.587486,0.362638,0.573576,0.704813,0.739374,0.442313,0.735375
STSCZ2_cc_cs_300_fasttext_word,0.576794,0.55488,0.620098,0.404856,0.591322,0.554866,0.607401,0.48461,0.602833
STSCZ2_multi_cased_L-12_H-768_A-12_mean_pooled_torch,0.55853,0.627369,,,,,,,
STSCZ2_multi_cased_L-12_H-768_A-12_mean_pooled_torch_all,0.55853,0.627369,0.567745,0.316137,0.550061,0.627369,0.619582,0.455821,0.6079
STSCZ2_multi_cased_L-12_H-768_A-12_torch,0.554428,0.61289,,,,,,,
STSCZ2_bg_cs_pl_ru_cased_L-12_H-768_A-12_v1_mean_pooled_torch_all,0.537396,0.581298,0.571214,0.286464,0.553849,0.581298,0.592106,0.451115,0.607814
STSCZ2_bg_cs_pl_ru_cased_L-12_H-768_A-12_v1_torch,0.534047,0.572633,,,,,,,
STSCZ2_bert-base-multilingual-cased_torch,0.383821,0.39824,,,,,,,


'STS16'

Unnamed: 0,answer-answer,headlines,plagiarism,postediting,question-question
STS16_cc_en_300_fasttext_sentence,0.348939,0.640158,0.566509,0.588734,0.300979
STS16_wiki_cs_300_fasttext_word,0.209975,0.612075,0.504599,0.549974,0.210506
STS16_wiki_cs_300_fasttext_sentence,0.239957,0.599148,0.488628,0.534806,0.184559
STS16_bert-base-cased_torch,0.353111,0.589201,0.435743,0.421822,0.24392
STS16_bert-base-cased_mean_pooled_torch,0.321103,0.542163,0.490023,0.396938,0.242072
STS16_cc_en_300_fasttext_word,0.184348,0.440647,0.287512,0.425261,-0.007292


'STS162'

Unnamed: 0,answer-answer,headlines,plagiarism,postediting,question-question
STS162_cc_en_300_fasttext_sentence,0.348939,0.640158,0.566509,0.588734,0.300979
STS162_bert-base-cased_mean_pooled_torch,0.555094,0.633482,0.765528,0.802286,0.456705
STS162_wiki_cs_300_fasttext_word,0.32601,0.626984,0.662303,0.766695,0.429011
STS162_wiki_cs_300_fasttext_sentence,0.239957,0.599148,0.488628,0.534806,0.184559
STS162_cc_en_300_fasttext_word,0.360372,0.54808,0.643174,0.740356,0.086159
STS162_bert-base-cased_torch,0.376031,0.503646,0.613159,0.701976,0.069374


In [28]:
# One table per task with the aggregate Pearson/Spearman score of every job,
# ordered by the Pearson column (best first).
for task in pearson_results.keys():
    display(task)
    df = pd.DataFrame(all_results[task]).sort_values(by=PEARSON, ascending=False)
    display(df)

'STSCZ'

Unnamed: 0,pearson,spearman
STSCZ_wiki_cs_300_fasttext_sentence,0.634602,0.645182
STSCZ_wiki_cs_300_fasttext_word,0.61929,0.625226
STSCZ_cc_cs_300_fasttext_sentence,0.616563,0.624195
STSCZ_multi_cased_L-12_H-768_A-12_mean_pooled_torch,0.536009,0.589423
STSCZ_multi_cased_L-12_H-768_A-12_torch,0.52202,0.576311
STSCZ_bg_cs_pl_ru_cased_L-12_H-768_A-12_v1_torch,0.520884,0.571077
STSCZ_multi_cased_L-12_H-768_A-12_mean_pooled_torch_all,0.518153,0.566517
STSCZ_elmo_139,0.513847,0.531696
STSCZ_bg_cs_pl_ru_cased_L-12_H-768_A-12_v1_mean_pooled_torch_all,0.496374,0.553772
STSCZ_cc_cs_300_fasttext_word,0.467745,0.486335


'STSCZ2'

Unnamed: 0,pearson,spearman
STSCZ2_wiki_cs_300_fasttext_sentence,0.634602,0.645156
STSCZ2_cc_cs_300_fasttext_sentence,0.616563,0.624252
STSCZ2_wiki_cs_300_fasttext_word,0.613073,0.618057
STSCZ2_multi_cased_L-12_H-768_A-12_mean_pooled_torch,0.599592,0.612282
STSCZ2_multi_cased_L-12_H-768_A-12_torch,0.5893,0.605837
STSCZ2_bg_cs_pl_ru_cased_L-12_H-768_A-12_v1_torch,0.557063,0.577191
STSCZ2_cc_cs_300_fasttext_word,0.556476,0.572473
STSCZ2_multi_cased_L-12_H-768_A-12_mean_pooled_torch_all,0.556184,0.573161
STSCZ2_bg_cs_pl_ru_cased_L-12_H-768_A-12_v1_mean_pooled_torch_all,0.536216,0.556067
STSCZ2_bert-base-multilingual-cased_torch,0.392422,0.442826


'STS16'

Unnamed: 0,pearson,spearman
STS16_cc_en_300_fasttext_sentence,0.493155,0.542878
STS16_wiki_cs_300_fasttext_word,0.421575,0.486014
STS16_wiki_cs_300_fasttext_sentence,0.414491,0.481977
STS16_bert-base-cased_torch,0.413597,0.527323
STS16_bert-base-cased_mean_pooled_torch,0.401948,0.512354
STS16_cc_en_300_fasttext_word,0.273957,0.345844


'STS162'

Unnamed: 0,pearson,spearman
STS162_bert-base-cased_mean_pooled_torch,0.645878,0.655573
STS162_wiki_cs_300_fasttext_word,0.563231,0.590259
STS162_cc_en_300_fasttext_sentence,0.493155,0.542878
STS162_cc_en_300_fasttext_word,0.484478,0.498747
STS162_bert-base-cased_torch,0.461828,0.498818
STS162_wiki_cs_300_fasttext_sentence,0.414491,0.481977
