# Evaluation of Poincare Embeddings

This notebook demonstrates how well Poincaré embeddings trained using this [implementation](https://github.com/TatsuyaShirakawa/poincare-embedding) perform on the tasks detailed in the [original paper](https://arxiv.org/pdf/1705.08039.pdf).

The tasks are:
1. WordNet reconstruction
2. WordNet link prediction
3. Link prediction in collaboration networks
4. Lexical entailment on HyperLex

A more detailed explanation of the tasks and the evaluation methodology is present in the individual evaluation subsections.

## 1. Setup

TODO

## 2. Training
TODO

## 3. Loading the embeddings

### 3.1 C++ embeddings

In [60]:
%load_ext line_profiler

In [1]:
% cd ../..

/home/jayant/projects/gensim


In [2]:
import os

# Directory holding the trained embedding files and the evaluation data read by later cells.
# Set the POINCARE_EMBEDDINGS_DIR environment variable to point at your own copy;
# the fallback below is the original author's local path.
embeddings_dir = os.environ.get(
    'POINCARE_EMBEDDINGS_DIR',
    '/home/jayant/projects/poincare-embedding/work',  # TODO: put model files into repo?
)

In [69]:
import os

from gensim.models.keyedvectors import KeyedVectors
import numpy as np
from scipy.spatial.distance import euclidean, pdist
from smart_open import smart_open

def transform_cpp_embedding_to_kv(input_file, output_file, encoding='utf8'):
    """Given a C++ embedding tsv filepath, converts it to a KeyedVector-supported file

    Every non-blank input line is expected to hold a term followed by its vector components,
    all tab-separated. The output starts with a `<vocab_size> <vector_size>` header line,
    followed by the input lines with tabs replaced by single spaces.

    Args:
        input_file (str): Path to tsv file containing embeddings (opened through smart_open)
        output_file (str): Path the converted file is written to
        encoding (str): Encoding used to decode the input and to encode the output

    Raises:
        ValueError: if the input file contains no non-blank lines

    """
    with smart_open(input_file, 'rb') as f:
        decoded_lines = (raw_line.decode(encoding) for raw_line in f)
        # Drop blank lines (e.g. a stray newline at EOF): counting them as vocabulary
        # entries would make the header disagree with the number of actual vectors.
        lines = [line for line in decoded_lines if line.strip()]
    if not lines:
        raise ValueError("file is empty")
    # First column is the term, the remaining columns are the vector components
    model_size = len(lines[0].rstrip().split("\t")) - 1
    vocab_size = len(lines)
    # Write bytes in the requested encoding rather than relying on the platform/interpreter
    # default text encoding, so non-ASCII terms round-trip on both Python 2 and 3.
    with open(output_file, 'wb') as f:
        f.write(('%d %d\n' % (vocab_size, model_size)).encode(encoding))
        for line in lines:
            f.write(line.replace('\t', ' ').encode(encoding))
    
        
class PoincareEmbedding(object):
    """Load and perform distance operations on poincare embeddings"""

    def __init__(self, keyed_vectors):
        """Initialize PoincareEmbeddings via a KeyedVectors instance

        Args:
            keyed_vectors: object holding the vectors; this class uses its `word_vec`,
                `vocab`, `syn0` and item-lookup (`kv[term]`) members

        """
        self.kv = keyed_vectors

    @staticmethod
    def poincare_dist(vector_1, vector_2):
        """Return poincare distance between two vectors

        Computes arccosh(1 + 2 * |u - v|^2 / ((1 - |u|^2) * (1 - |v|^2))).

        Args:
            vector_1 (1-D array-like)
            vector_2 (1-D array-like)

        Returns:
            Poincare distance between the two vectors (float)

        Note:
            The denominator is zero or negative when either vector has norm >= 1,
            so inputs are expected to lie strictly inside the unit ball.

        """
        norm_1 = np.linalg.norm(vector_1)
        norm_2 = np.linalg.norm(vector_2)
        euclidean_dist = euclidean(vector_1, vector_2)
        return np.arccosh(
            1 + 2 * (
                (euclidean_dist ** 2) / ((1 - norm_1 ** 2) * (1 - norm_2 ** 2))
            )
        )

    @classmethod
    def load_poincare_cpp(cls, input_filename):
        """Load embeddings trained via C++ Poincare model

        The tsv file is first converted to a temporary `<input_filename>.kv` file, which is
        loaded through `KeyedVectors.load_word2vec_format` and then removed.

        Args:
            input_filename (str): Path to tsv file containing embeddings

        Returns:
            PoincareEmbedding instance

        """
        keyed_vectors_filename = input_filename + '.kv'
        try:
            transform_cpp_embedding_to_kv(input_filename, keyed_vectors_filename)
            keyed_vectors = KeyedVectors.load_word2vec_format(keyed_vectors_filename)
        finally:
            # Remove the temporary file even when conversion/loading fails; it may not
            # exist at all if the conversion failed before writing anything.
            if os.path.exists(keyed_vectors_filename):
                os.unlink(keyed_vectors_filename)
        return cls(keyed_vectors)

    def get_distances(self, term_1, terms):
        """Returns distance between vector for term and vectors for given terms

        Vectorized equivalent of calling `poincare_dist` once per term.

        Args:
            term_1 (str)
            terms (list/tuple/set): terms for which distance from vector for term_1 is to be returned

        Returns:
            List of Poincare distances between term_1 and all terms in `terms` (list of floats),
            in the iteration order of `terms`

        """
        term_1_vector = self.kv.word_vec(term_1)

        vocab = self.kv.vocab
        # Map each term to its row in the embedding matrix so all vectors are fetched at once
        terms_indices = [vocab[term].index for term in terms]
        other_vectors = self.kv.syn0[terms_indices]

        euclidean_dists = np.linalg.norm(term_1_vector - other_vectors, axis=1)
        norm_1 = np.linalg.norm(term_1_vector)
        other_norms = np.linalg.norm(other_vectors, axis=1)

        return list(np.arccosh(
            1 + 2 * (
                (euclidean_dists ** 2) / ((1 - norm_1 ** 2) * (1 - other_norms ** 2))
            )
        ))

    def get_distance(self, term_1, term_2):
        """Returns distance between vectors for input terms

        Args:
            term_1 (str)
            term_2 (str)

        Returns:
            Poincare distance between the two terms (float)

        Note:
            Raises KeyError if either term_1 or term_2 is absent from vocabulary

        """
        vector_1, vector_2 = self.kv[term_1], self.kv[term_2]
        return self.poincare_dist(vector_1, vector_2)

In [70]:
# Embedding files to load from `embeddings_dir`; uncomment entries to load more of them.
# NOTE(review): the numeric suffix presumably is the embedding dimensionality - confirm.
filenames = [
#     'wordnet_embeddings_2.tsv',
#     'wordnet_embeddings_5.tsv',
    'wordnet_embeddings_10.tsv',
#     'wordnet_embeddings_20.tsv',
#     'wordnet_embeddings_50.tsv',
#     'wordnet_embeddings_100.tsv',
]
embeddings = {fname: PoincareEmbedding.load_poincare_cpp(os.path.join(embeddings_dir, fname)) for fname in filenames}

# The evaluation cells below reference `test_embedding`, which was never defined in the
# notebook (it only existed as leftover kernel state). Define it explicitly so that the
# notebook survives Restart & Run All.
# NOTE(review): assumes the first loaded file is the embedding that was meant - confirm.
test_embedding = embeddings[filenames[0]]

## 4. Evaluation

### 4.1 WordNet reconstruction

In [74]:
import bisect
import csv
import itertools
from collections import defaultdict

def get_rank(sorted_list, given_value):
    """Return rank of given value in sorted list (sorted in increasing order)

    The rank is the 1-based position of the first element strictly greater than
    `given_value`, or `len(sorted_list) + 1` when no element is greater. Elements equal
    to `given_value` are therefore ranked ahead of it.

    Args:
        sorted_list (list): values sorted in increasing order
        given_value: value to be ranked, comparable with the list elements

    Returns:
        Rank of `given_value` (int, starting at 1)

    """
    # bisect_right gives the 0-based index of the first element > given_value,
    # found by binary search instead of scanning the whole list.
    return bisect.bisect_right(sorted_list, given_value) + 1

class ReconstructionEvaluation(object):
    """Evaluating reconstruction on given network for any embeddings"""
    def __init__(self, filepath):
        """Initialize evaluation instance with tsv file containing relation pairs

        Each row must hold exactly two tab-separated items; the second item is recorded
        as related to the first.

        Args:
            filepath (str): path to tsv file containing relation pairs

        Returns
            ReconstructionEvaluation instance

        """
        items = set()
        relation_pairs = defaultdict(set)
        with smart_open(filepath, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            for row in reader:
                assert len(row) == 2, 'Hypernym pair has more than two items'
                relation_pairs[row[0]].add(row[1])
                items.update(row)
        # All items seen in either column of the file
        self.items = items
        # Maps first-column item -> set of second-column items it is related to
        self.relation_pairs = relation_pairs

    def get_negative_instances(self, item, max_n=None):
        """Get all item that don't have a relation with given item

        Note that `item` itself is part of the result unless it is related to itself.

        Args:
            item (str): item for which negative instances are to be returned
            max_n (int or None): Return at most max_n negative instances. Return all if None

        Returns:
            set of items which don't have a relation with input item

        """
        # Use .get rather than indexing: `relation_pairs` is a defaultdict, so indexing an
        # unknown item would silently insert an empty entry and change which items
        # `evaluate_reconstruction` skips afterwards.
        related_items = self.relation_pairs.get(item, ())
        negative_items = {item_ for item_ in self.items if item_ not in related_items}
        if max_n is not None:
            return set(itertools.islice(negative_items, max_n))
        return negative_items

    def evaluate_reconstruction(self, embeddings, max_item_index=100):
        """Compute ranks of positive relations among negative items for given embeddings

        For every evaluated item, each related (positive) item is ranked by its distance
        from the item relative to the sorted distances of all negative items.

        Args:
            embeddings (PoincareEmbedding instance): embeddings for which evaluation is to be done
            max_item_index (int or None): stop after evaluating the first item whose position
                in the iteration over `self.items` exceeds this value. The default keeps the
                original quick-run cut-off; pass None to evaluate every item.

        Returns:
            List of ranks (list of ints), one per positive relation of the evaluated items

        """
        ranks = []
        for i, item in enumerate(self.items):
            if not i % 1000:
                print('Evaluating item %d %s' % (i, item))
            # Items that never appear in the first column have no positive relations to rank
            if item not in self.relation_pairs:
                continue
            positive_items = self.relation_pairs[item]
            negative_items = self.get_negative_instances(item)
            negative_item_distances = sorted(embeddings.get_distances(item, negative_items))
            positive_item_distances = embeddings.get_distances(item, positive_items)
            ranks += [
                get_rank(negative_item_distances, positive_item_distance)
                for positive_item_distance in positive_item_distances
            ]
            if max_item_index is not None and i > max_item_index:
                break
        return ranks
        

In [75]:
# Build the reconstruction evaluation from the 'wordnet_noun_hypernyms.tsv' relation pairs file in `embeddings_dir`
eval_instance = ReconstructionEvaluation(os.path.join(embeddings_dir, 'wordnet_noun_hypernyms.tsv'))

In [76]:
%lprun -f PoincareEmbedding.get_distances eval_instance.evaluate_reconstruction(test_embedding)

Evaluating item 0 pica.n.01




In [77]:
%%time
print(np.mean(eval_instance.evaluate_reconstruction(test_embedding)))

Evaluating item 0 pica.n.01




84.080472103
CPU times: user 12.6 s, sys: 0 ns, total: 12.6 s
Wall time: 12.6 s


### 4.2 WordNet link prediction

### 4.3 HyperLex lexical entailment

### 4.4 Link Prediction for collaboration networks