In [1]:
#import annoy

import numpy as np
import gensim, zipfile

from scipy.stats import spearmanr, kendalltau, rankdata
from scipy.spatial.distance import cosine, euclidean, norm

import matplotlib.pyplot as plt
from collections import Counter



In [2]:
def preprop(filename, model):

    with zipfile.ZipFile(model, 'r') as archive:
        stream = archive.open('model.bin')
        m = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)
    print(len(m.index_to_key))
    
    triplets=[]
    with open(filename, 'r') as f:
        ff=f.readlines()
        for i in ff[1:]:
            fff=i.split(',')
            if fff[0] in m and fff[1] in m and fff[0] != fff[1]: #tiger, tiger(frozenset for one element is bad)
                        triplets.append([fff[0], fff[1], fff[4][:-1]])
            else:
                pass
    print(len(triplets))
    
    return triplets, m

In [3]:
def validate(g, m, y_pred_iv, sim_func):
    
    gold_pairs=[]
    w2v_pairs=[]
    ivis_pairs=[]

    for i,j in g.dict.keys():
        gold_pairs.append(g.sim(i, j))
        w2v_pairs.append(sim_func(m[i], m[j]))#m.distance(i, j))

        dI = y_pred_iv[list(g.vocab).index(i)]
        dII = y_pred_iv[list(g.vocab).index(j)]
        result = sim_func(dI, dII)#1-
        ivis_pairs.append(result)

    results=(spearmanr(gold_pairs, w2v_pairs), 
             spearmanr(gold_pairs, ivis_pairs), 
             kendalltau(gold_pairs, w2v_pairs), 
             kendalltau(gold_pairs, ivis_pairs))
    return results

In [4]:
class Gold():
    
    def __init__(self, triplets):
        self.dict={frozenset([i[0], i[1]]): float(i[2]) for i in triplets} #problem if i[0]==i[1], bypassed in preprop
        self.vocab=np.array(list(set.union(set(list(zip(*triplets))[0]), set(list(zip(*triplets))[1])))) #nice hack:)
        
    def __getitem__(self, index):
        return self.vocab[index]
    
    def sim(self, first_word, second_word):
        return self.dict[frozenset((first_word, second_word))]
        
    def sim_matrix(self, k):
        res=np.zeros((len(self.vocab), k), dtype="int64")
        for ind_i, obj_i in enumerate(self.vocab):
            cur_res=[]
            for ind_j, obj_j in enumerate(self.vocab):
                try:
                    cur_res.append(self.sim(obj_i, obj_j))
                except:
                    pass
            cur_res=sorted(cur_res, reverse=True) #по убыванию
            if len(cur_res) < k:
                cur_res=cur_res+[-1]*(k-len(cur_res))
            res[ind_i,:]=cur_res[:k]
        return res
    
    def nei_matrix(self, k):
        res=np.zeros((len(self.vocab), k), dtype="int64")
        for ind_i, obj_i in enumerate(self.vocab):
            cur_res=[]
            for ind_j, obj_j in enumerate(self.vocab):
                try:
                    cur_res.append((self.sim(obj_i, obj_j), ind_j))
                except:
                    pass
            cur_res.sort(key=lambda x:x[0], reverse=True) #по убыванию
            cur_list=list(zip(*cur_res))[1]
            if len(cur_list) < k:
                cur_list=list(cur_list)+[ind_i]*(k-len(cur_list))
            res[ind_i,:]=cur_list[:k]
        return res
    
    def nei_list(self):
        res=[]
        for ind_i, obj_i in enumerate(self.vocab):
            cur_res=[]
            for ind_j, obj_j in enumerate(self.vocab):
                try:
                    cur_res.append((self.sim(obj_i, obj_j), ind_j))
                except:
                    pass
            cur_res.sort(key=lambda x:x[0], reverse=True) #по убыванию
            cur_list=list(zip(*cur_res))[1]
            #res[ind_i,:]=cur_list
            res.append(cur_list)
        return res

# Ivis based part

In [6]:
from tensorflow.keras.utils import Sequence
from abc import ABC#, abstractmethod

In [7]:
from sklearn.base import BaseEstimator

In [8]:
class TripletGenerator(Sequence, ABC):
    def __init__(self, X, neighbour_matrix, batch_size=32):
        print(batch_size, X)
        if batch_size > X.shape[0]:
            raise Exception('''batch_size value larger than num_rows in dataset
                            (batch_size={}, rows={}). Lower batch_size to a
                            smaller value.'''.format(batch_size, X.shape[0]))
        self.X = X
        self.neighbour_matrix = neighbour_matrix
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(self.X.shape[0] / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_indices = range(idx * self.batch_size,
                              min((idx + 1) * self.batch_size, self.X.shape[0]))

        label_batch = self.get_labels(batch_indices)
        triplet_batch = [self.get_triplet(row_index)
                         for row_index in batch_indices]

        if issparse(self.X):
            triplet_batch = [[e.toarray()[0] for e in t] for t in triplet_batch]

        triplet_batch = np.array(triplet_batch)

        return self.output_triplets(triplet_batch, label_batch)

    def get_triplet(self, idx):
        triplet = []
        neighbour_list = self.get_neighbours(idx)
        neighbour_list = np.array(neighbour_list, dtype=np.uint32)
        #print(neighbour_list)

        if len(neighbour_list) > 1:
            # Take a random neighbour as positive
            positive_ind = np.random.randint(0, len(neighbour_list)-1)

            # Take a random non-neighbour as negative
            # Pick a random index until one fits constraint, usually faster.
            negative_ind = np.random.randint(positive_ind+1, len(neighbour_list))
        else:
            positive_ind=0
            negative_ind=0

        positive_ind=neighbour_list[positive_ind]
        negative_ind=neighbour_list[negative_ind]

        triplet = [self.X[idx], self.X[positive_ind], self.X[negative_ind]] #add #changed
        return triplet

    def get_neighbours(self, idx):
        return self.neighbour_matrix[idx]
    '''
    @abstractmethod
    def get_labels(self, batch_indices):
        raise NotImplementedError("Override this method with a concrete implementation")

    @abstractmethod
    def output_triplets(self, triplet_batch, label_batch):
        raise NotImplementedError("Override this method with a concrete implementation")
    '''

In [9]:
class UnsupervisedTripletGenerator(TripletGenerator):
    def __init__(self, X, neighbour_matrix, batch_size=32):
        super().__init__(X, neighbour_matrix, batch_size)
        self.placeholder_labels = np.empty(batch_size, dtype=np.uint8)

    def get_labels(self, batch_indices):
        return self.placeholder_labels[:len(batch_indices)]

    def output_triplets(self, triplet_batch, label_batch):
        return tuple([triplet_batch[:, 0], triplet_batch[:, 1], triplet_batch[:, 2]]), label_batch

In [10]:
def generator_from_neighbour_matrix(X, Y, neighbour_matrix, batch_size):
     return UnsupervisedTripletGenerator(X, neighbour_matrix, batch_size=batch_size)

In [11]:
loss_dict = {}

In [12]:
def register_loss(loss_fn=None, *, name=None):
    """Registers a class definition or Callable as an ivis loss function.
    A mapping will be created between the name and the loss function passed.
    If a class definition is provided, an instance will be created, passing the name
    as an argument.

    If no name is provided to this function, the name of the passed function will be used
    as a key.

    The loss function must have two parameters, (y_true, y_pred)
    and calculates the loss for a batch of triplet inputs (y_pred).
    y_pred is expected to be of shape: (3, batch_size, embedding_dims).

    Usage:
        .. code-block:: python

            @register_loss
            def custom_loss(y_true, y_pred):
                pass
            model = Ivis(distance='custom_loss')"""

    if loss_fn is None:
        return functools.partial(register_loss, name=name)

    key = name or loss_fn.__name__
    if isinstance(loss_fn, type):
        loss_dict[key] = loss_fn(name=key)
    else:
        loss_dict[key] = loss_fn
    return loss_fn

In [13]:
def triplet_loss(distance='pn'):
    """Returns a previously registered triplet loss function associated
    with the string 'distance'. If passed a callable, just returns it."""
    if callable(distance):
        return distance
    try:
        loss_fn = loss_dict[distance]
        return loss_fn
    except KeyError:
        raise ValueError("Loss function {} not registered with ivis".format(distance))

In [14]:
import functools

In [15]:
@register_loss(name='euclidean')
class EuclideanTripletLoss:
    """Calculates the standard triplet loss between anchor, positive and negative
    examples in a triplet based on euclidean distance."""
    def __init__(self, margin=0, name=None):
        self.margin = margin
        name = name or self.__class__.__name__
        self.__name__ = name
    def __call__(self, y_true, y_pred):
        anchor, positive, negative = tf.unstack(y_pred)
        return K.mean(K.maximum(euclidean_distance(anchor, positive) - euclidean_distance(anchor, negative) + self.margin, 0))

In [16]:
@register_loss(name='cosine')
class CosineTripletLoss:
    """Calculates the standard triplet loss between anchor, positive and negative
    examples in a triplet based on cosine distance."""
    def __init__(self, margin=0, name=None):
        self.margin = margin
        name = name or self.__class__.__name__
        self.__name__ = name
    def __call__(self, y_true, y_pred):
        anchor, positive, negative = tf.unstack(y_pred)
        return K.mean(K.maximum(cosine_distance(anchor, positive) - cosine_distance(anchor, negative) + self.margin, 0))

In [17]:
from tensorflow.keras.callbacks import EarlyStopping

In [18]:
def triplet_network(base_network, embedding_dims=2, embedding_l2=0.0):
    """ Creates a triplet Siamese Neural Network from a base_network.
    The base network will have an extra Dense layer of the requested embedding_dims added to
    the end if embedding_dims is not None.

    The outputs of the three network heads will be stacked into the shape:
    (3, batch_size, embedding_dims) unless embedding_dims is None (in which case the existing dims
    of last base_network layer will be used).

    Outputs: tuple(
        model: tf.keras.models.Model. The constructed triplet Siamese network
        processed_a: tf.keras.layers.Dense. Result of applying the base_network to anchor input.
        processed_p: tf.keras.layers.Dense. Result of applying the base_network to positive input.
        processed_n: tf.keras.layers.Dense. Result of applying the base_network to negative input.
    ) """

    def output_shape(shapes):
        shape1, _, _ = shapes
        return (3, shape1[0],)
   
    input_a = Input(shape=base_network.input_shape[1:])
    input_p = Input(shape=base_network.input_shape[1:])
    input_n = Input(shape=base_network.input_shape[1:])
    '''
    if embedding_dims is None:
        embeddings = base_network.output
    else:
        embeddings = Dense(embedding_dims,
                           kernel_regularizer=l2(embedding_l2))(base_network.output)
    '''
    embeddings = base_network.output
    network = Model(base_network.input, embeddings)

    processed_a = network(input_a)
    processed_p = network(input_p)
    processed_n = network(input_n)

    triplet = Lambda(K.stack,
                     output_shape=output_shape,
                     name='stacked_triplets')([processed_a,
                                               processed_p,
                                               processed_n],)
    model = Model([input_a, input_p, input_n], triplet)

    return model, processed_a, processed_p, processed_n

In [19]:
from scipy.sparse import issparse

In [20]:
from tensorflow.keras import backend as K

In [21]:
def euclidean_distance(x, y):
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=-1, keepdims=True), K.epsilon()))

In [22]:
def cosine_distance(x, y):
    return 1 - tf.math.reduce_sum(tf.nn.l2_normalize(x, axis=1) * tf.nn.l2_normalize(y, axis=1))

In [23]:
import tensorflow as tf

In [24]:
class KerasSequence(tf.keras.utils.Sequence):
    """Wraps inputs into a Keras Sequence to allow Keras models to predict on
    arbitrary inputs which may be out of memory."""
    def __init__(self, X, batch_size=32):
        self.X = X
        self.batch_size = batch_size
        self.placeholder_labels = np.empty(batch_size, dtype=np.uint8)
    def __len__(self):
        return int(np.ceil(self.X.shape[0] / float(self.batch_size)))
    def __getitem__(self, index):
        batch_indices = range(index * self.batch_size, min((index + 1) * self.batch_size, self.X.shape[0]))
        batch = np.array([self.X[i] for i in batch_indices])
        placeholder_labels = self.placeholder_labels[:len(batch_indices)]
        return batch, placeholder_labels

In [25]:
class Corrector(BaseEstimator):

    def __init__(self, 
                 embedding_dims=2, 
                 k=150, 
                 distance='pn', 
                 batch_size=128,
                 epochs=1000, 
                 n_epochs_without_progress=20, 
                 model=None,
                 neighbour_matrix=None, 
                 verbose=1):

        self.embedding_dims = embedding_dims
        self.k = k
        self.distance = distance
        self.batch_size = batch_size
        self.epochs = epochs
        self.n_epochs_without_progress = n_epochs_without_progress
        
        self.model = model
        self.model_ = None
                
        self.neighbour_matrix = neighbour_matrix
        self.verbose = verbose
        
        self.encoder = None
        self.loss_history_ = []
        self.callbacks = []

    def _fit(self, X, Y=None, shuffle_mode=True):

        datagen = generator_from_neighbour_matrix(X, Y,
                                                  neighbour_matrix=self.neighbour_matrix,
                                                  batch_size=self.batch_size)

        loss_monitor = 'loss'
        try:
            triplet_loss_func = triplet_loss(distance=self.distance)
        except KeyError:
            raise ValueError('Loss function `{}` not implemented.'.format(self.distance))

        self.model_, anchor_embedding, _, _ = \
            triplet_network(self.model,
                            embedding_dims=self.embedding_dims)

        self.model_.compile(optimizer='adam', loss=triplet_loss_func)
            
        self.encoder = self.model_.layers[3]

        if self.verbose > 0:
            print('Training neural network')

        hist = self.model_.fit(
            datagen,
            epochs=self.epochs,
            callbacks=self.callbacks + [EarlyStopping(monitor=loss_monitor,
                                                      patience=self.n_epochs_without_progress)],
            shuffle=shuffle_mode,
            steps_per_epoch=int(np.ceil(X.shape[0] / self.batch_size)),
            verbose=self.verbose)
        self.loss_history_ += hist.history['loss']

    '''
    def fit(self, X, Y=None, shuffle_mode=True):
        self._fit(X, Y, shuffle_mode)
        return self

    def fit_transform(self, X, Y=None, shuffle_mode=True):
        self.fit(X, Y, shuffle_mode)
        return self.transform(X)

    def transform(self, X):
        embedding = self.encoder.predict(KerasSequence(X, batch_size=self.batch_size),
                                         verbose=self.verbose)
        return embedding
    '''
 
    def fit_transform(self, X, Y=None, shuffle_mode=True):
        self._fit(X, Y, shuffle_mode)
        embedding = self.encoder.predict(KerasSequence(X, batch_size=self.batch_size),
                                         verbose=self.verbose)
        return embedding

# Эксперименты

In [26]:
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, AlphaDropout, Lambda, Dropout
from tensorflow.keras import backend as K

from tensorflow.keras.regularizers import l2

In [27]:
def do_experiment(lemm, expert, func):
    
    inputs = Input(shape=300)
    q = Dense(300,# activation=None,
              kernel_initializer='lecun_normal')(inputs)
    p=Model(inputs, q)
    
    model='../'+lemm
    dataset='../'+expert
    t, m = preprop(dataset, model)
    g=Gold(t)
    x = m[g.vocab]
    
    iv = Corrector(
            embedding_dims=300, 
            distance=func, 
            batch_size = 200,
            epochs=100, 
            neighbour_matrix=g.nei_list(), 
            model=p)#, margin=0)
    
    y_pred_iv = iv.fit_transform(x)
    
    print("aaa")
    if func == 'euclidean':
        return validate(g, m, y_pred_iv, euclidean)
    elif func == 'cosine':
        return validate(g, m, y_pred_iv, cosine)

## 1. lemmatize wordsim euclidean

In [31]:
res = do_experiment('5.zip', 'WordSim-353-standartized.csv', 'euclidean')
res

273992
349
200 [[ 0.041929  0.063848  0.003732 ...  0.142079 -0.027583  0.069721]
 [ 0.082989 -0.022708  0.104097 ...  0.046328 -0.084275  0.020042]
 [ 0.069058 -0.006184 -0.089413 ... -0.082803 -0.013278  0.022188]
 ...
 [-0.051873 -0.052932  0.086663 ...  0.011292  0.003633 -0.019811]
 [ 0.053438  0.051815  0.001467 ... -0.048915  0.069824 -0.068646]
 [ 0.119095  0.09691   0.044006 ...  0.045905 -0.042312 -0.002537]]
Training neural network
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epo

(SpearmanrResult(correlation=-0.7084941993379477, pvalue=1.797352350373244e-54),
 SpearmanrResult(correlation=-0.7551552839322319, pvalue=1.2995209699129477e-65),
 KendalltauResult(correlation=-0.5129792004348761, pvalue=2.8047085167872706e-46),
 KendalltauResult(correlation=-0.560144394070951, pvalue=7.760110477155038e-55))

In [32]:
res

(SpearmanrResult(correlation=-0.7084941993379477, pvalue=1.797352350373244e-54),
 SpearmanrResult(correlation=-0.7551552839322319, pvalue=1.2995209699129477e-65),
 KendalltauResult(correlation=-0.5129792004348761, pvalue=2.8047085167872706e-46),
 KendalltauResult(correlation=-0.560144394070951, pvalue=7.760110477155038e-55))

## 2. lemmatize wordsim cosine

In [28]:
res = do_experiment('5.zip', 'WordSim-353-standartized.csv', 'cosine')
res

273992
349
200 [[ 0.113458 -0.062431  0.060432 ...  0.007231 -0.013264  0.024604]
 [-0.018223 -0.062071 -0.054039 ... -0.039914  0.055739 -0.067869]
 [ 0.032351 -0.025752  0.024376 ...  0.031781 -0.002401 -0.024485]
 ...
 [ 0.014075 -0.035853  0.069998 ... -0.014092 -0.032933 -0.051645]
 [ 0.110389  0.050713 -0.005143 ... -0.046878  0.013409 -0.004632]
 [ 0.046215  0.093834  0.078839 ... -0.011977 -0.044008 -0.076616]]
Training neural network
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
aaa


(SpearmanrResult(correlation=-0.7085277934443607, pvalue=1.7677073163502675e-54),
 SpearmanrResult(correlation=-0.6461645694394698, pvalue=1.2431906986455948e-42),
 KendalltauResult(correlation=-0.5130121830877685, pvalue=2.7679842373359427e-46),
 KendalltauResult(correlation=-0.45634798541869387, pvalue=5.480159384148773e-37))

In [30]:
res

(SpearmanrResult(correlation=-0.7085277934443607, pvalue=1.7677073163502675e-54),
 SpearmanrResult(correlation=-0.6461645694394698, pvalue=1.2431906986455948e-42),
 KendalltauResult(correlation=-0.5130121830877685, pvalue=2.7679842373359427e-46),
 KendalltauResult(correlation=-0.45634798541869387, pvalue=5.480159384148773e-37))

## 3. lemmatize MEN euclidean

In [31]:
res = do_experiment('5.zip', 'MEN-standartized.csv', 'euclidean')
res

273992
2995
200 [[ 0.043955 -0.042658  0.096431 ... -0.036177  0.005109  0.081497]
 [-0.04143   0.044345  0.036994 ... -0.014002 -0.048183  0.024704]
 [ 0.064861 -0.031078  0.021033 ... -0.011793  0.049018 -0.036995]
 ...
 [ 0.014758 -0.101612  0.017391 ... -0.073066  0.097357 -0.034404]
 [ 0.013813  0.05582   0.082128 ...  0.044952  0.020532  0.083864]
 [ 0.049241  0.004803  0.044597 ...  0.043673 -0.02894   0.010062]]
Training neural network
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Ep

(SpearmanrResult(correlation=-0.760625728585474, pvalue=0.0),
 SpearmanrResult(correlation=-0.9207950550718719, pvalue=0.0),
 KendalltauResult(correlation=-0.5554079065856331, pvalue=0.0),
 KendalltauResult(correlation=-0.7578220602855057, pvalue=0.0))

In [32]:
res

(SpearmanrResult(correlation=-0.760625728585474, pvalue=0.0),
 SpearmanrResult(correlation=-0.9207950550718719, pvalue=0.0),
 KendalltauResult(correlation=-0.5554079065856331, pvalue=0.0),
 KendalltauResult(correlation=-0.7578220602855057, pvalue=0.0))

## 4. lemmatize MEN cosine

In [33]:
res = do_experiment('5.zip', 'MEN-standartized.csv', 'cosine')
res

273992
2995
200 [[ 0.043955 -0.042658  0.096431 ... -0.036177  0.005109  0.081497]
 [-0.04143   0.044345  0.036994 ... -0.014002 -0.048183  0.024704]
 [ 0.064861 -0.031078  0.021033 ... -0.011793  0.049018 -0.036995]
 ...
 [ 0.014758 -0.101612  0.017391 ... -0.073066  0.097357 -0.034404]
 [ 0.013813  0.05582   0.082128 ...  0.044952  0.020532  0.083864]
 [ 0.049241  0.004803  0.044597 ...  0.043673 -0.02894   0.010062]]
Training neural network
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
aaa


(SpearmanrResult(correlation=-0.7606257987339587, pvalue=0.0),
 SpearmanrResult(correlation=-0.7300099849425212, pvalue=0.0),
 KendalltauResult(correlation=-0.5554083579388052, pvalue=0.0),
 KendalltauResult(correlation=-0.5260564098110365, pvalue=0.0))

In [34]:
res

(SpearmanrResult(correlation=-0.7606257987339587, pvalue=0.0),
 SpearmanrResult(correlation=-0.7300099849425212, pvalue=0.0),
 KendalltauResult(correlation=-0.5554083579388052, pvalue=0.0),
 KendalltauResult(correlation=-0.5260564098110365, pvalue=0.0))

## 5. lemmatize simlex euclidean

In [36]:
res = do_experiment('5.zip', 'SimLex-999-standartized.csv', 'euclidean')
res

273992
996
200 [[ 0.081944  0.070888  0.012175 ... -0.027376 -0.043142 -0.04819 ]
 [ 0.043955 -0.042658  0.096431 ... -0.036177  0.005109  0.081497]
 [ 0.031562 -0.068909 -0.016195 ...  0.054254  0.057617  0.044717]
 ...
 [ 0.064343  0.041916 -0.069158 ...  0.059846 -0.029133  0.082279]
 [ 0.028022  0.005436  0.070518 ...  0.148056 -0.011683  0.057725]
 [-0.058465 -0.02852   0.065594 ...  0.016635  0.000731  0.02499 ]]
Training neural network
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epo

(SpearmanrResult(correlation=-0.41030069393086, pvalue=9.982710446211431e-42),
 SpearmanrResult(correlation=-0.6088200032688278, pvalue=4.605823534215415e-102),
 KendalltauResult(correlation=-0.2868649444143602, pvalue=1.0235961422640682e-41),
 KendalltauResult(correlation=-0.4332655427732197, pvalue=7.876411855231044e-93))

In [37]:
res

(SpearmanrResult(correlation=-0.41030069393086, pvalue=9.982710446211431e-42),
 SpearmanrResult(correlation=-0.6088200032688278, pvalue=4.605823534215415e-102),
 KendalltauResult(correlation=-0.2868649444143602, pvalue=1.0235961422640682e-41),
 KendalltauResult(correlation=-0.4332655427732197, pvalue=7.876411855231044e-93))

## 6. lemmatize simlex cosine

In [38]:
res = do_experiment('5.zip', 'SimLex-999-standartized.csv', 'cosine')
res

273992
996
200 [[ 0.081944  0.070888  0.012175 ... -0.027376 -0.043142 -0.04819 ]
 [ 0.043955 -0.042658  0.096431 ... -0.036177  0.005109  0.081497]
 [ 0.031562 -0.068909 -0.016195 ...  0.054254  0.057617  0.044717]
 ...
 [ 0.064343  0.041916 -0.069158 ...  0.059846 -0.029133  0.082279]
 [ 0.028022  0.005436  0.070518 ...  0.148056 -0.011683  0.057725]
 [-0.058465 -0.02852   0.065594 ...  0.016635  0.000731  0.02499 ]]
Training neural network
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
aaa


(SpearmanrResult(correlation=-0.410309572203932, pvalue=9.93912834716579e-42),
 SpearmanrResult(correlation=-0.39960135466712693, pvalue=1.7698736971824294e-39),
 KendalltauResult(correlation=-0.2868689881829592, pvalue=1.0209434354198436e-41),
 KendalltauResult(correlation=-0.2774631824216426, pvalue=3.870934663169294e-39))

In [39]:
res

(SpearmanrResult(correlation=-0.410309572203932, pvalue=9.93912834716579e-42),
 SpearmanrResult(correlation=-0.39960135466712693, pvalue=1.7698736971824294e-39),
 KendalltauResult(correlation=-0.2868689881829592, pvalue=1.0209434354198436e-41),
 KendalltauResult(correlation=-0.2774631824216426, pvalue=3.870934663169294e-39))

## 7. NON lemmatize wordsim euclidean

In [40]:
res = do_experiment('6.zip', 'WordSim-353-standartized.csv', 'euclidean')
res

302866
350
200 [[ 0.000587 -0.084463 -0.01506  ...  0.000696  0.019693  0.043772]
 [ 0.033611  0.023678  0.025785 ...  0.031816  0.091718  0.044066]
 [ 0.062105  0.0978   -0.069382 ...  0.034753  0.087597 -0.003909]
 ...
 [ 0.041623 -0.001717 -0.007894 ... -0.000554  0.067371 -0.008428]
 [ 0.14492  -0.013378  0.096322 ... -0.072303  0.088542  0.056065]
 [-0.025886  0.072847 -0.041639 ...  0.040265 -0.06546   0.064321]]
Training neural network
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epo

(SpearmanrResult(correlation=-0.7149844011090453, pvalue=4.828088525235992e-56),
 SpearmanrResult(correlation=-0.76228687922488, pvalue=9.963420483917114e-68),
 KendalltauResult(correlation=-0.5201605756056357, pvalue=1.1475990675828784e-47),
 KendalltauResult(correlation=-0.5642027943767713, pvalue=9.1635421972056e-56))

In [41]:
res

(SpearmanrResult(correlation=-0.7149844011090453, pvalue=4.828088525235992e-56),
 SpearmanrResult(correlation=-0.76228687922488, pvalue=9.963420483917114e-68),
 KendalltauResult(correlation=-0.5201605756056357, pvalue=1.1475990675828784e-47),
 KendalltauResult(correlation=-0.5642027943767713, pvalue=9.1635421972056e-56))

## 8. NON lemmatize wordsim cosine

In [42]:
res = do_experiment('6.zip', 'WordSim-353-standartized.csv', 'cosine')
res

302866
350
200 [[ 0.000587 -0.084463 -0.01506  ...  0.000696  0.019693  0.043772]
 [ 0.033611  0.023678  0.025785 ...  0.031816  0.091718  0.044066]
 [ 0.062105  0.0978   -0.069382 ...  0.034753  0.087597 -0.003909]
 ...
 [ 0.041623 -0.001717 -0.007894 ... -0.000554  0.067371 -0.008428]
 [ 0.14492  -0.013378  0.096322 ... -0.072303  0.088542  0.056065]
 [-0.025886  0.072847 -0.041639 ...  0.040265 -0.06546   0.064321]]
Training neural network
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
aaa


(SpearmanrResult(correlation=-0.7149844011090453, pvalue=4.828088525235992e-56),
 SpearmanrResult(correlation=-0.6417422290411958, pvalue=5.1859191910230406e-42),
 KendalltauResult(correlation=-0.5201605756056357, pvalue=1.1475990675828784e-47),
 KendalltauResult(correlation=-0.45024396323031074, pvalue=3.768619169459944e-36))

In [44]:
res

(SpearmanrResult(correlation=-0.7149844011090453, pvalue=4.828088525235992e-56),
 SpearmanrResult(correlation=-0.6417422290411958, pvalue=5.1859191910230406e-42),
 KendalltauResult(correlation=-0.5201605756056357, pvalue=1.1475990675828784e-47),
 KendalltauResult(correlation=-0.45024396323031074, pvalue=3.768619169459944e-36))

## 9. NON lemmatize MEN euclidean

In [45]:
res = do_experiment('6.zip', 'MEN-standartized.csv', 'euclidean')
res

302866
2990
200 [[ 0.186102 -0.01899   0.043054 ...  0.011151  0.066     0.015289]
 [ 0.077927  0.00074   0.035034 ...  0.041995 -0.001263 -0.019273]
 [ 0.100433 -0.048706  0.009059 ... -0.066203  0.045879 -0.01523 ]
 ...
 [ 0.169143 -0.029529  0.095029 ... -0.075348  0.14589  -0.045791]
 [ 0.070053  0.033203  0.066227 ...  0.023616 -0.073134 -0.032462]
 [ 0.152246 -0.070508 -0.035468 ...  0.023608  0.031194 -0.013409]]
Training neural network
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Ep

(SpearmanrResult(correlation=-0.740759892454674, pvalue=0.0),
 SpearmanrResult(correlation=-0.9189012016876532, pvalue=0.0),
 KendalltauResult(correlation=-0.5344762699724819, pvalue=0.0),
 KendalltauResult(correlation=-0.7563675319674827, pvalue=0.0))

In [46]:
res

(SpearmanrResult(correlation=-0.740759892454674, pvalue=0.0),
 SpearmanrResult(correlation=-0.9189012016876532, pvalue=0.0),
 KendalltauResult(correlation=-0.5344762699724819, pvalue=0.0),
 KendalltauResult(correlation=-0.7563675319674827, pvalue=0.0))

## 10. NON lemmatize MEN cosine

In [47]:
res = do_experiment('6.zip', 'MEN-standartized.csv', 'cosine')
res

302866
2990
200 [[ 0.186102 -0.01899   0.043054 ...  0.011151  0.066     0.015289]
 [ 0.077927  0.00074   0.035034 ...  0.041995 -0.001263 -0.019273]
 [ 0.100433 -0.048706  0.009059 ... -0.066203  0.045879 -0.01523 ]
 ...
 [ 0.169143 -0.029529  0.095029 ... -0.075348  0.14589  -0.045791]
 [ 0.070053  0.033203  0.066227 ...  0.023616 -0.073134 -0.032462]
 [ 0.152246 -0.070508 -0.035468 ...  0.023608  0.031194 -0.013409]]
Training neural network
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
aaa


(SpearmanrResult(correlation=-0.7407603538008262, pvalue=0.0),
 SpearmanrResult(correlation=-0.7102571241219192, pvalue=0.0),
 KendalltauResult(correlation=-0.5344766032338761, pvalue=0.0),
 KendalltauResult(correlation=-0.506733352037184, pvalue=0.0))

In [49]:
res

(SpearmanrResult(correlation=-0.7407603538008262, pvalue=0.0),
 SpearmanrResult(correlation=-0.7102571241219192, pvalue=0.0),
 KendalltauResult(correlation=-0.5344766032338761, pvalue=0.0),
 KendalltauResult(correlation=-0.506733352037184, pvalue=0.0))

## 11. NON lemmatize simlex euclidean

In [50]:
res = do_experiment('6.zip', 'SimLex-999-standartized.csv', 'euclidean')
res

302866
994
200 [[ 0.071775  0.019376  0.092361 ...  0.007572  0.144104 -0.051339]
 [ 0.186102 -0.01899   0.043054 ...  0.011151  0.066     0.015289]
 [ 0.032881 -0.033576  0.039352 ... -0.023929  0.057046  0.126246]
 ...
 [ 0.136787  0.020429 -0.111552 ... -0.028137  0.0664   -0.054412]
 [ 0.006196 -0.03032   0.088371 ...  0.151074 -0.009712  0.067667]
 [-0.037721  0.054703  0.135236 ...  0.015538  0.046769  0.046268]]
Training neural network
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epo

(SpearmanrResult(correlation=-0.39672429489470695, pvalue=8.194700471508046e-39),
 SpearmanrResult(correlation=-0.6191010230400267, pvalue=3.1092436402981607e-106),
 KendalltauResult(correlation=-0.27664369440557113, pvalue=7.649997002013556e-39),
 KendalltauResult(correlation=-0.4424614177329303, pvalue=1.542306862157365e-96))

In [51]:
res

(SpearmanrResult(correlation=-0.39672429489470695, pvalue=8.194700471508046e-39),
 SpearmanrResult(correlation=-0.6191010230400267, pvalue=3.1092436402981607e-106),
 KendalltauResult(correlation=-0.27664369440557113, pvalue=7.649997002013556e-39),
 KendalltauResult(correlation=-0.4424614177329303, pvalue=1.542306862157365e-96))

## 12. NON lemmatize simlex cosine

In [52]:
res = do_experiment('6.zip', 'SimLex-999-standartized.csv', 'cosine')
res

302866
994
200 [[ 0.071775  0.019376  0.092361 ...  0.007572  0.144104 -0.051339]
 [ 0.186102 -0.01899   0.043054 ...  0.011151  0.066     0.015289]
 [ 0.032881 -0.033576  0.039352 ... -0.023929  0.057046  0.126246]
 ...
 [ 0.136787  0.020429 -0.111552 ... -0.028137  0.0664   -0.054412]
 [ 0.006196 -0.03032   0.088371 ...  0.151074 -0.009712  0.067667]
 [-0.037721  0.054703  0.135236 ...  0.015538  0.046769  0.046268]]
Training neural network
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
aaa


(SpearmanrResult(correlation=-0.39672429489470695, pvalue=8.194700471508046e-39),
 SpearmanrResult(correlation=-0.38096873447343776, pvalue=1.0984456903073025e-35),
 KendalltauResult(correlation=-0.27664369440557113, pvalue=7.649997002013556e-39),
 KendalltauResult(correlation=-0.2639437644683027, pvalue=1.637063841100438e-35))

In [53]:
res

(SpearmanrResult(correlation=-0.39672429489470695, pvalue=8.194700471508046e-39),
 SpearmanrResult(correlation=-0.38096873447343776, pvalue=1.0984456903073025e-35),
 KendalltauResult(correlation=-0.27664369440557113, pvalue=7.649997002013556e-39),
 KendalltauResult(correlation=-0.2639437644683027, pvalue=1.637063841100438e-35))