# Evaluation of Poincare Embeddings

This notebook demonstrates how well poincare embeddings trained using this [implementation](https://github.com/TatsuyaShirakawa/poincare-embedding) perform on the tasks detailed in the [original paper](https://arxiv.org/pdf/1705.08039.pdf).

This is the list of tasks - 
1. WordNet reconstruction
2. WordNet link prediction
3. Link prediction in collaboration networks
4. Lexical entailment on HyperLex

A more detailed explanation of the tasks and the evaluation methodology is present in the individual evaluation subsections.

## 1. Setup

TODO

## 2. Training
TODO

## 3. Loading the embeddings

### 3.1 C++ embeddings

In [2]:
%load_ext line_profiler
%load_ext snakeviz

In [3]:
% cd ../..

/home/jayant/projects/gensim


In [4]:
embeddings_dir = '/home/jayant/projects/poincare-embedding/work'  # TODO: put model files into repo?

In [68]:
import os

from gensim.models.keyedvectors import KeyedVectors
import numpy as np
from pygtrie import Trie
from scipy.spatial.distance import euclidean, pdist
from smart_open import smart_open

def transform_cpp_embedding_to_kv(input_file, output_file, encoding='utf8'):
    """Given a C++ embedding tsv filepath, converts it to a KeyedVector-supported file"""
    with smart_open(input_file, 'rb') as f:
        lines = [line.decode(encoding) for line in f]
    if not len(lines):
         raise ValueError("file is empty")
    first_line = lines[0]
    parts = first_line.rstrip().split("\t")
    model_size = len(parts) - 1
    vocab_size = len(lines)
    with open(output_file, 'w') as f:
        f.write('%d %d\n' % (vocab_size, model_size))
        for line in lines:
            f.write(line.replace('\t', ' '))
    
        
class PoincareEmbedding(object):
    """Load and perform distance operations on poincare embeddings"""

    def __init__(self, keyed_vectors):
        """Initialize PoincareEmbeddings via a KeyedVectors instance"""
        self.kv = keyed_vectors
        self.init_key_trie()
        
    def init_key_trie(self):
        """Setup trie containing vocab keys for quick prefix lookups"""
        self.key_trie = Trie()
        for key in self.kv.vocab:
            self.key_trie[key] = True
    
    @staticmethod
    def poincare_dist(vector_1, vector_2):
        """Return poincare distance between two vectors"""
        norm_1 = np.linalg.norm(vector_1)
        norm_2 = np.linalg.norm(vector_2)
        euclidean_dist = euclidean(vector_1, vector_2)
        return np.arccosh(
            1 + 2 * (
                (euclidean_dist ** 2) / ((1 - norm_1 ** 2) * (1 - norm_2 ** 2))
            )
        )
        
    @classmethod
    def load_poincare_cpp(cls, input_filename):
        """Load embeddings trained via C++ Poincare model

        Args:
            filepath (str): Path to tsv file containing embeddings

        Returns:
            PoincareEmbedding instance

        """
        keyed_vectors_filename = input_filename + '.kv'
        transform_cpp_embedding_to_kv(input_filename, keyed_vectors_filename)
        keyed_vectors = KeyedVectors.load_word2vec_format(keyed_vectors_filename)
        os.unlink(keyed_vectors_filename)
        return cls(keyed_vectors)
    
    def find_matching_keys(self, word):
        """Find all senses of given word in embedding vocabulary"""
        matches = self.key_trie.items('%s.' % word)
        matching_keys = [''.join(key_chars) for key_chars, value in matches]
        return matching_keys

    def get_vector(self, term):
        """Return vector for given term"""
        return self.kv.word_vec(term)
        
    def get_all_distances(self, term):
        """Return distances to all terms for given term, including itself"""
        term_vector = self.kv.word_vec(term)
        all_vectors = self.kv.syn0
        
        euclidean_dists = np.linalg.norm(term_vector - all_vectors, axis=1)
        norm = np.linalg.norm(term_vector)
        all_norms = np.linalg.norm(all_vectors, axis=1)
        return np.arccosh(
            1 + 2 * (
                (euclidean_dists ** 2) / ((1 - norm ** 2) * (1 - all_norms ** 2))
            )
        )
        
    def get_distance(self, term_1, term_2):
        """Returns distance between vectors for input terms

        Args:
            term_1 (str)
            term_2 (str)

        Returns:
            Poincare distance between the two terms (float)
        
        Note:
            Raises KeyError if either term_1 or term_2 is absent from vocabulary

        """
        vector_1, vector_2 = self.kv[term_1], self.kv[term_2]
        return self.poincare_dist(vector_1, vector_2)

In [85]:
filenames = [
#     'wordnet_embeddings_2.tsv',
#     'wordnet_embeddings_5.tsv',
#     'wordnet_embeddings_10.tsv',
    'wordnet_embeddings_20.tsv',
    'wordnet_embeddings_20_ep50.tsv',
    'wordnet_embeddings_50.tsv',
#     'wordnet_embeddings_50_ep100.tsv',
    'wordnet_embeddings_100.tsv',
]
embeddings = {fname: PoincareEmbedding.load_poincare_cpp(os.path.join(embeddings_dir, fname)) for fname in filenames}

In [100]:
test_embedding = embeddings['wordnet_embeddings_20_ep50.tsv']

In [101]:
%%time
for i, term in enumerate(test_embedding.kv.vocab.keys(), start=1):
    if i > 1000:
        break
    dists = test_embedding.get_all_distances(term)

CPU times: user 7.11 s, sys: 0 ns, total: 7.11 s
Wall time: 7.11 s


## 4. Evaluation

### 4.1 WordNet reconstruction

In [102]:
import csv
from collections import defaultdict
import itertools


class ReconstructionEvaluation(object):
    """Evaluating reconstruction on given network for any embeddings"""
    def __init__(self, filepath, embedding):
        """Initialize evaluation instance with tsv file containing relation pairs and embedding to be evaluated
        
        Args:
            filepath (str): path to tsv file containing relation pairs
            embedding (PoincareEmbedding instance): embedding to be evaluated
        
        Returns
            ReconstructionEvaluation instance

        """
        items = set()
        embedding_vocab = embedding.kv.vocab
        positive_relations = defaultdict(set)
        with smart_open(filepath, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            for row in reader:
                assert len(row) == 2, 'Hypernym pair has more than two items'
                item_1_index = embedding_vocab[row[0]].index
                item_2_index = embedding_vocab[row[1]].index
                positive_relations[item_1_index].add(item_2_index)
                items.update([item_1_index, item_2_index])
        self.items = items
        self.positive_relations = positive_relations
        self.embedding = embedding
    
    
    @staticmethod
    def get_positive_item_ranks(distances, positive_item_indices):
        """Given a numpy array of distances and indices of positive items, compute ranks of positive item distances
        
        Args:
            distances (numpy float array): np array of all distances for a specific item
            positive_item_indices (list): list of indices of positive items
        
        Returns:
            list of ranks of positive items in the same order as `positive_indices`
        """
        positive_item_distances = distances[positive_item_indices]
        negative_item_distances = np.ma.array(distances, mask=False)
        negative_item_distances.mask[positive_item_indices] = True
        # Compute how many negative item distances are less than each positive item distance, plus 1 for rank
        ranks = (negative_item_distances < positive_item_distances[:, np.newaxis]).sum(axis=1) + 1
        return list(ranks) 

    def evaluate_reconstruction(self, max_n=None):
        """Evaluate mean rank and MAP for reconstruction
            
        Args:
            max_n (int or None): Maximum number of positive relations to evaluate, all if max_n is None
        
        Returns:
            ??

        """
        ranks = []
        for i, item in enumerate(self.items, start=1):
            if not i % 1000:
                print('Evaluating item number %d: %s' % (i, item))
            if item not in self.positive_relations:
                continue
            positive_items = list(self.positive_relations[item])
            item_term = self.embedding.kv.index2word[item]
            item_distances = self.embedding.get_all_distances(item_term)
            positive_item_ranks = self.get_positive_item_ranks(item_distances, positive_items)
            ranks += positive_item_ranks
            if max_n is not None and i > max_n:
                break
        return np.mean(ranks)
        

In [103]:
eval_instance = ReconstructionEvaluation(os.path.join(embeddings_dir, 'wordnet_noun_hypernyms.tsv'), test_embedding)

In [104]:
%%time
print(np.mean(eval_instance.evaluate_reconstruction(max_n=1000)))

Evaluating item number 1000: 999
74.4394798266
CPU times: user 9.34 s, sys: 0 ns, total: 9.34 s
Wall time: 9.34 s


### 4.2 WordNet link prediction
TODO (tricky)

### 4.3 HyperLex lexical entailment

In [105]:
from scipy.stats import spearmanr

class LexicalEntailmentEvaluation(object):
    """Evaluating reconstruction on given network for any embeddings"""
    def __init__(self, filepath, embedding):
        """Initialize evaluation instance with HyperLex text file containing relation pairs
        
        Args:
            filepath (str): path to HyperLex text file
            embedding (PoincareEmbedding instance): embedding to be evaluated
        
        Returns
            LexicalEntailmentEvaluation instance

        """
        expected_scores = {}
        with smart_open(filepath, 'r') as f:
            reader = csv.DictReader(f, delimiter=' ')
            for row in reader:
                word_1, word_2 = row['WORD1'], row['WORD2']
                expected_scores[(word_1, word_2)] = float(row['AVG_SCORE'])
        self.scores = expected_scores
        self.embedding = embedding
        self.alpha = 1000
    
    def score_function(self, word_1, word_2):
        """Given two terms, return the predicted score for them (extent to which term_1 is a type of term_2)"""
        try:
            word_1_terms = self.embedding.find_matching_keys(word_1)
            word_2_terms = self.embedding.find_matching_keys(word_2)
        except KeyError:
            raise ValueError("No matching terms found for either %s or %s" % (word_1, word_2))
        min_distance = np.inf
        min_term_1, min_term_2 = None, None
        for term_1 in word_1_terms:
            for term_2 in word_2_terms:
                distance = self.embedding.get_distance(term_1, term_2)
                if distance < min_distance:
                    min_term_1, min_term_2 = term_1, term_2
                    min_distance = distance
        try:
            assert min_term_1 is not None and min_term_2 is not None
        except AssertionError:
            import pdb
            pdb.set_trace()
        vector_1, vector_2 = self.embedding.get_vector(min_term_1), self.embedding.get_vector(min_term_2)
        norm_1, norm_2 = np.linalg.norm(vector_1), np.linalg.norm(vector_2)
        return -1 * (1 + self.alpha * (norm_2 - norm_1)) * distance
        
    def evaluate_spearman(self, embeddings):
        """Evaluate spearman scores for lexical entailment for given embeddings
            
        Args:
            embeddings (PoincareEmbedding instance): embeddings for which evaluation is to be done
        
        Returns:
            ??

        """
        predicted_scores = []
        expected_scores = []
        skipped = 0
        count = 0
        for (word_1, word_2), expected_score in self.scores.items():
            try:
                predicted_score = self.score_function(word_1, word_2)
            except ValueError:
                skipped += 1
                continue
            count += 1
            predicted_scores.append(predicted_score)
            expected_scores.append(expected_score)
        print('Skipped pairs: %d out of %d' % (skipped, len(self.scores)))
        spearman = spearmanr(expected_scores, predicted_scores)
        return spearman


In [106]:
eval_instance = LexicalEntailmentEvaluation(os.path.join(embeddings_dir, 'nouns-verbs', 'hyperlex-nouns.txt'), test_embedding)

In [107]:
eval_instance.evaluate_spearman(test_embedding)

Skipped pairs: 182 out of 2163


SpearmanrResult(correlation=0.44053139956543591, pvalue=7.7820211828687839e-95)

### 4.1 Link Prediction for collaboration networks
TODO (tricky)

### 4.1 Link Prediction for collaboration networks
TODO (tricky)