## Import data

In [22]:
import numpy as np
import numpy.linalg as la
from reach import Reach
import os
import scipy.stats as stats
import sklearn.preprocessing
from tqdm.notebook import tqdm
import random
import matplotlib.pyplot as plt

In [3]:
#pathnames
relpron_path = './relpron_translation.txt'
verbs_path = './verbmatrices'
vectorspath = './embeddings/sonar-160.txt'

Import the RELPRON test sentences. This dataset consists of pairs of nouns and relative clauses with a transtitive verb, describing some property or action related to them. An example:

> SBJ vliegtuig: vaartuig dat hoogte bereikt/bereik
> [SBJ airplaine: craft that reaches height]

Note that items also list whether the clause is subject or object relative, and gives the root for inflected verbs (and nouns).

In [4]:
#import relpron data
relpron_file = open(relpron_path,'r', encoding='latin-1')
items_raw = relpron_file.readlines()
relpron_file.close()

In [6]:
#store relpron data

class Sentence:
    def __init__(self, string):
        key, self.propertystr = string.strip().split(': ')
        
        self.relation, self.termN = key.split(' ')
        self.rel = self.relation[0]
        
        propertylst = self.propertystr.split(' ')
        self.headN = propertylst[0]
        self.argN = propertylst[2].split('/')[0]
        _, self.V = propertylst[-1].split('/')
        
    def __str__(self):
        return self.termN + ': ' + self.propertystr
        
verbfiles = [filename[:-4] for filename in os.listdir(verbs_path)]

items = dict()
for item in items_raw:
    sent = Sentence(item)
    if sent.V + '|O' in verbfiles and sent.V + '|S' in verbfiles:
        if sent.termN not in items:
            items[sent.termN] = [sent]
        else:
            items[sent.termN].append(sent)

Loading the noun embeddings and the verb matrices

In [7]:
#load vectors
r = Reach.load(vectorspath)

#load matrices
verbmatrices = dict()
for verbfile in os.listdir(verbs_path):
    verbmatrices[verbfile[:-4]] = np.load(verbs_path+'/'+verbfile)

## Generate composed vectors

In [9]:
variance_control = True
mean_std = 0.08

class PhraseComposer:
    def __init__(self, sentence):
        """Load vectors and matrices for the phrase"""
        self.relation = sentence.rel
        if self.relation == 'O':
            self.inverse_relation = 'S'
        else:
            self.inverse_relation = 'O'
        
        try:
            self.arg_v = r[sentence.argN]
            self.arg_v = self.Scale(self.arg_v)
            
            self.head_v = r[sentence.headN]
            self.head_v = self.Scale(self.head_v)
            
            self.V_v = r[sentence.V.split('_')[0]]
            self.VO_m =  verbmatrices[sent.V+'|O']
            self.VS_m = verbmatrices[sent.V+'|S']
            
            self.valid = True
        
        except KeyError:
            self.valid = False
        
    def Scale(self, v):
        """Apply standard scaling to argument vector as was done in training."""
        scaler = sklearn.preprocessing.StandardScaler()
        scaler.set_params(with_std=variance_control)
        v = scaler.fit_transform(v[:, np.newaxis])
        v = np.squeeze(v)
        if variance_control:
            v = mean_std * v
        return v
    
    def Compose(self, method):
        """Direct to proper composition method based on input string"""
        if method == 'Addition':
            return self.Addition()
        if method == 'PLF':
            return self.PLF(self.relation)
        if method == 'VArg':
            return self.VArg(self.relation)
        if method == 'VHn':
            return self.VHn(self.relation)
        if method == 'iPLF':
            return self.PLF(self.inverse_relation)
        
    def Addition(self):
        return np.add(self.head_v, self.arg_v, self.V_v)

    def VHn(self, relation):
        if not relation:
            relation = self.relation  #cant put a property of the object as a default argument so keep it like this
        if relation == 'O':
            return np.dot(self.head_v, self.VO_m)
        else:
            return np.dot(self.head_v, self.VS_m)
    
    def VArg(self, relation):
        if relation == 'O':
            return np.dot(self.arg_v, self.VS_m)
        else:
            return np.dot(self.arg_v, self.VO_m)
    
    def PLF(self, relation):
        return np.add(self.VArg(relation), self.VHn(relation))
    
    def iPLF(self):
        return(np.add(self.VArg(self.inverse_relation), self.VHn(self.inverse_relation)))         
        

In [10]:
methods = ['Addition', 'VHn', 'VArg', 'PLF', 'iPLF']
methods_S = ['VHn', 'VArg', 'PLF', 'iPLF'] #methods with their result in the S space
methods_N = ['Addition'] #methods with the results in the N space

terms = items.keys()

results = dict()

for term in tqdm(terms):
    sents = items[term]
    for sent in sents:
        composer = PhraseComposer(sent)
        if composer.valid:
            sentresults = dict()
            for method in methods:
                property_v = composer.Compose(method)
                sentresults[method] = property_v

            sentresults['target'] = r[sent.termN]
            results[sent] = sentresults

HBox(children=(IntProgress(value=0, max=111), HTML(value='')))




## Check some sizes

More of a technicality. 

Checking the number of trained verbs is a bit cumbersome, because subject and object transformations are trained separately, and were only included if they had enough data. So some verbs have a subject but no object representation, or vice versa.

In [66]:
print('Number of terms in corpus:', len(items))

total_clauses = sum(len(items[term]) for term in items)
print('Total number of relative clauses:', total_clauses)

print('Relative clauses discarded:', total_clauses - len(results))

print('Relative clauses included in results:', len(results))

print('Average number of clauses per term:', round(len(results) / len(items), 2))

roles_for_verb = lambda key : set(verb[-1] for verb in verbmatrices if verb[:-2] == key[:-2])
verbs = {verbkey[:-2] : roles_for_verb(verbkey) for verbkey in verbmatrices}
print('Number of verbs:', len([verb for verb in verbs if len(verbs[verb]) == 2]))

Number of terms in corpus: 111
Total number of relative clauses: 411
Relative clauses discarded: 17
Relative clauses included in results: 394
Average number of clauses per term: 3.55
Number of verbs: 122


## Evaluate methods by ranking sentences based on term similarity

In [14]:
#similarity functions

def cosine(v1, v2):
    product = np.dot(v1, v2)
    norm = np.linalg.norm(v1) * np.linalg.norm(v2)
    return product / norm

def correlation(v1, v2):
    return stats.pearsonr(v1, v2)

In [15]:
#rank sentences based on similarity

def rankSentences (target_v, results_dict, method='Addition', sim_function=cosine):
    """Returns a list of the results in the results dictionary, ranked by their similarity 
    with the target vector. Method and similarity function are parameters."""

    allsents = list(results_dict.keys())
    similarity = lambda sent: sim_function(target_v, results_dict[sent][method])
    return(sorted(allsents, key=similarity, reverse=True))

In [17]:
def evaluate_ranking(target_term, sorted_list):
    """Rate the ranking (output of rankSentences) for a sorted list using MAP. 
    The ideal ranking puts all sentences with the target term on the top. 
    Returns 1 for perfect sorting, 0 for worst sorting."""
    
    #count how many sentences in the list use the target term
    target_sents = list(filter(lambda sent: sent.termN == target_term, sorted_list))
    count = len(target_sents)
    
    errors = []
    
    for i in range(len(sorted_list)):
        term = sorted_list[i].termN
        if i < count:
            correct = term == target_term
            error = int(not correct) * (count - i) / count
        else:
            correct = term != target_term
            error = int(not correct) * (i + 1 - count) / (count)
        errors.append(error)
    
    return 1 - (sum(errors) / len(sorted_list))
   

Now we can apply the rankings to evaluate composition methods. If we evaluate a representation of a sentence, we rank all sentences in the corpus based on their similarity to the sentence vector. Ideally, all items with the same term are ranked on top.

To make things a bit more spicy, some the functions below allow you to pass both a base method and a "corpus method". The *base method* is used to represent the sentence, while the *corpus method* is used to represent the rest of the corpus. This way we can ask all of the following questions:
* Does PLF represent similar relative clauses close to each other?
* Does inverted PLF still represent similar clauses close to each other?
* If you invert the argument structure of a single sentence, does it still rank similarly to other, non-inverted sentences describing the same term?
Note that cross-comparing methods like this is only possible if they represent clauses in the same vector space. All methods represent clauses in the S space except for addition and elementwise multiplication, which represent it in the N space.

In [19]:
def evaluate_sent(sent, method, corpus_method, sim_function=cosine):
    """Evaluate the ranking based on the representation of a given sentence. It should rank sentence with 
    the same term on top."""
    term = sent.termN
    target_v = results[sent][method]
    ranking =  rankSentences(target_v, results, corpus_method, sim_function)
    evaluation = evaluate_ranking(term, ranking)
    return evaluation

def evaluate_method(sentences, method, corpus_method, sim_function=cosine):
    """Evaluate a method by looping through all sentences."""
    if not corpus_method:
        corpus_method = method
        
    evaluations = [evaluate_sent(sent, method, corpus_method, sim_function) for sent in sentences]
    return round(sum(evaluations) / len(sentences), 3)

So how do the methods compare?

In [20]:
print('\t\t\tranking')

for method in methods:
    print(' ' * (16 - len(method)) + method, 
          round(evaluate_method(results.keys(), method), 3),
          sep='\t')

			ranking
        Addition	0.744
             VHn	0.819
            VArg	0.779
             PLF	0.802
            iPLF	0.798


In [68]:
#just of a bit of fun:
#evaluate based on similarity between property vectors and term vectors (only useful for addition)

terms = {sent.termN : results[sent]['target'] for sent in results.keys()}

def evaluate_term(term, target_v, method='Addition', sim_function=cosine):
    ranking = rankSentences(target_v, results, method, sim_function)
    evaluation = evaluate_ranking(term, ranking)
    return evaluation

def evaluate_method_by_term(terms, method='Addition', sim_function=cosine):
    evaluations = [evaluate_term(term, terms[term], method, sim_function) for term in terms]
    return round(sum(evaluations) / len(terms), 3)

## Similarity to PLF

In the last section, we looked at composition methods in isolation. We notice that most of them represent semantic distance pretty well! But does that mean that their results are similar to the PLF results? Or are they preserving distance in their own space?

One way to see this: how do they rank PLF results? PLF will give the same results as above (hopefully!). The other methods essentially represent some loss of information: we only consider the head noun, we only consider the argument, or we mess up the argument structure.

In [33]:
print('\t\t\tranking')

for method in methods_S:
    print(' ' * (16 - len(method)) + method, 
          round(evaluate_method(results.keys(), method = method, corpus_method = 'PLF'), 3),
          sep='\t')

			ranking
             VHn	0.8
            VArg	0.783
             PLF	0.802
            iPLF	0.797


These numbers are still really high! Let's look at the direct similarity with the PLF results.

In [35]:
def compare_methods(method_1, method_2, sim_function = cosine):
    similarities = [sim_function(results[sent][method_1], results[sent][method_2]) for sent in results]
    return similarities

methods_compared_to_PLF = {method : compare_methods(method, 'PLF') for method in methods_S}

In [36]:
for method in methods_compared_to_PLF:
    print(method, sum(methods_compared_to_PLF[method])/len(methods_compared_to_PLF[method]))

VHn 0.9582530229166685
VArg 0.949044962419609
PLF 1.0
iPLF 0.9682177219853254


0.0