In [1]:
import pylab as pl
%matplotlib inline
# Default size for every figure drawn below (matplotlib's figsize is width, height in inches).
pl.rcParams['figure.figsize'] = 15, 15

In [2]:
import os.path
import csv
import numpy as np
import scipy.spatial
import pandas as pd

In [3]:

# Every data file below is opened by bare filename, so it is resolved against this directory.
# NOTE(review): the path is relative, so re-running this cell without restarting the
# kernel presumably fails (the process is already inside the directory) -- confirm.
os.chdir("prepared_corpora/msrp_ns_va_nophrase_mfcwsd/")

In [4]:
def embedding_distance(embeddings, id1, id2):
    """
    Cosine distance between the embedding rows of two phrases.

    embeddings : 2-D array with one embedding per row
    id1, id2   : 1-indexed phrase ids; id k refers to row k-1 of `embeddings`
    """
    # Shift each 1-indexed id down to its 0-indexed row before looking it up.
    first_vec, second_vec = (embeddings[phrase_id - 1, :] for phrase_id in (id1, id2))
    return scipy.spatial.distance.cosine(first_vec, second_vec)

In [5]:
def load_link_distances(link_filename, embeddings):
    """
    Read a file of phrase links and return the embedding distance of each link.

    link_filename : comma-separated file with one header row (skipped) followed by
                    rows of two integers: phrase id, id of the linked variant phrase
    embeddings    : array of embeddings, passed through to embedding_distance

    Returns a float pd.Series indexed by the first id of each link.  If the same
    first id occurs on several rows, the value from the last such row is kept.
    """
    # `np.int` was only an alias of the builtin and has been removed from NumPy,
    # so use `int` directly.  ndmin=2 keeps a file with a single link as one row
    # of two ids instead of a flat array of two scalars.
    links = np.loadtxt(link_filename, delimiter=",", skiprows=1, dtype=int, ndmin=2)
    assert links.size == 0 or links.shape[1] == 2

    # Collect into a dict and build the Series once, rather than growing a Series
    # one .loc assignment at a time.
    dists = {}
    for phrase_id, var_phrase_id in links:
        dists[int(phrase_id)] = embedding_distance(embeddings, phrase_id, var_phrase_id)
    return pd.Series(dists, dtype=float)



In [6]:
def get_dists_table(embedding_filename, max_corruption = 10):
    """
    Build one table of link distances for a single embedding file.

    embedding_filename : comma-separated file of embeddings, read with np.loadtxt
    max_corruption     : highest corruption level whose link files are read

    Returns a DataFrame with a "paraphrase" column followed, for each level n in
    1..max_corruption, by "verb_anto_<n>" and "noun_sym_<n>".  Each column holds
    the Series produced by load_link_distances for the matching link file.
    """
    embeddings = np.loadtxt(embedding_filename, delimiter=",")

    def distances_for(link_filename):
        return load_link_distances(link_filename, embeddings)

    # (column prefix, link-file suffix) for each kind of corruption, in column order.
    corruption_kinds = (
        ("verb_anto_", "verb_anto_semantic_corruptions.txt"),
        ("noun_sym_", "noun_sym_semantic_corruptions.txt"),
    )

    dists = pd.DataFrame()
    dists["paraphrase"] = distances_for("paraphrases.txt")
    for corruption_level in range(1, max_corruption + 1):
        level = str(corruption_level)
        for column_prefix, file_suffix in corruption_kinds:
            dists[column_prefix + level] = distances_for(level + file_suffix)
    return dists
    

In [69]:
# One distance table per embedding model, each read from that model's vector file.
rae_dists = get_dists_table("outVectors_RAE2011.csv")
wiki_doc2vec_dists = get_dists_table("outVectors_wiki_doc2vec.csv")
hansard_doc2vec_dists = get_dists_table("outVectors_hansard_doc2vec.csv")
wiki_sentence_doc2vec_dists = get_dists_table("outVectors_wiki_sentence_doc2vec.csv")
# FIXME(review): this reads the same file as wiki_sentence_doc2vec_dists on the line
# above, so the two tables are identical. Presumably a separate "concat" vector
# file was intended -- confirm the correct filename.
wiki_sentence_concat_doc2vec_dists = get_dists_table("outVectors_wiki_sentence_doc2vec.csv")



In [67]:
def get_comparative_distances_table(dists, max_changes = 10):
    """
    Summarise, per corruption level, how the corrupted-phrase distances compare.

    dists       : DataFrame with a "paraphrase" column plus "verb_anto_<n>" and
                  "noun_sym_<n>" columns for every n in 1..max_changes
    max_changes : highest corruption level to report (one output row per level)

    Returns a DataFrame with the columns
        n_changes             : the corruption level n
        noun_sym_lt_para      : fraction of the non-null noun_sym_<n> distances that
                                are smaller than the paraphrase distance in the same row
        verb_anto_lt_para     : the same for verb_anto_<n>
        noun_sym_lt_verb_anto : of the rows where both distances are non-null, the
                                fraction where noun_sym_<n> is smaller than verb_anto_<n>
    A fraction with nothing to count is NaN.
    """
    # Both helpers iterate over the same levels, so the columns always have
    # max_changes entries (previously one helper was hard-coded to 10 levels).
    levels = list(range(1, max_changes + 1))

    def fraction(hits, total):
        # Return NaN explicitly instead of relying on numpy's 0/0 behaviour.
        if not total:
            return float("nan")
        return float(hits) / float(total)

    def n_closer_than_paraphrase(prefix):
        """
        prefix : eg "verb_anto_" or  "noun_sym_"
        """
        fractions = []
        for nchanges in levels:
            corrupted = dists[prefix + str(nchanges)]
            # A comparison against NaN is False, so missing rows never count as hits.
            hits = (dists.paraphrase > corrupted).sum()
            fractions.append(fraction(hits, corrupted.count()))
        return fractions

    def compare_sym_anto_distances():
        fractions = []
        for nchanges in levels:
            verb_antos = dists["verb_anto_" + str(nchanges)]
            noun_syms = dists["noun_sym_" + str(nchanges)]
            # Only rows that have both kinds of corruption form the denominator.
            both_present = np.logical_and(pd.notnull(verb_antos), pd.notnull(noun_syms))
            hits = (verb_antos > noun_syms).sum()
            fractions.append(fraction(hits, both_present.sum()))
        return fractions

    comparative_distances = pd.DataFrame()
    comparative_distances["n_changes"] = levels
    comparative_distances["noun_sym_lt_para"] = n_closer_than_paraphrase("noun_sym_")
    comparative_distances["verb_anto_lt_para"] = n_closer_than_paraphrase("verb_anto_")
    comparative_distances["noun_sym_lt_verb_anto"] = compare_sym_anto_distances()
    return comparative_distances
    
    
    



In [72]:
# Comparative table for the wiki-sentence doc2vec distances.
get_comparative_distances_table(wiki_sentence_doc2vec_dists)

Unnamed: 0,n_changes,noun_sym_lt_para,verb_anto_lt_para,noun_sym_lt_verb_anto
0,1,0.913365,0.917674,0.493107
1,2,0.839179,0.866102,0.498886
2,3,0.761414,0.780488,0.391304
3,4,0.675444,0.833333,0.25
4,5,0.584256,1.0,
5,6,0.517799,,
6,7,0.355769,,
7,8,0.363636,,
8,9,0.555556,,
9,10,,,


In [73]:
# NOTE(review): the saved output of this cell is identical to the saved output for
# wiki_sentence_doc2vec_dists; both tables were loaded from
# "outVectors_wiki_sentence_doc2vec.csv". Presumably a loading mistake -- confirm.
get_comparative_distances_table(wiki_sentence_concat_doc2vec_dists)

Unnamed: 0,n_changes,noun_sym_lt_para,verb_anto_lt_para,noun_sym_lt_verb_anto
0,1,0.913365,0.917674,0.493107
1,2,0.839179,0.866102,0.498886
2,3,0.761414,0.780488,0.391304
3,4,0.675444,0.833333,0.25
4,5,0.584256,1.0,
5,6,0.517799,,
6,7,0.355769,,
7,8,0.363636,,
8,9,0.555556,,
9,10,,,


In [74]:
# Comparative table for the distances loaded from "outVectors_RAE2011.csv".
get_comparative_distances_table(rae_dists)

Unnamed: 0,n_changes,noun_sym_lt_para,verb_anto_lt_para,noun_sym_lt_verb_anto
0,1,0.949637,0.936949,0.607635
1,2,0.905506,0.89661,0.581292
2,3,0.866274,0.792683,0.434783
3,4,0.827132,0.833333,0.0
4,5,0.788438,1.0,
5,6,0.754045,,
6,7,0.75,,
7,8,0.727273,,
8,9,0.888889,,
9,10,,,


In [75]:
# Comparative table for the distances loaded from "outVectors_wiki_doc2vec.csv".
get_comparative_distances_table(wiki_doc2vec_dists)

Unnamed: 0,n_changes,noun_sym_lt_para,verb_anto_lt_para,noun_sym_lt_verb_anto
0,1,0.898019,0.896439,0.488158
1,2,0.813375,0.855932,0.447661
2,3,0.718704,0.817073,0.521739
3,4,0.593017,0.916667,0.25
4,5,0.473555,0.5,
5,6,0.38835,,
6,7,0.365385,,
7,8,0.318182,,
8,9,0.111111,,
9,10,,,


In [76]:
# Comparative table for the distances loaded from "outVectors_hansard_doc2vec.csv".
get_comparative_distances_table(hansard_doc2vec_dists)

Unnamed: 0,n_changes,noun_sym_lt_para,verb_anto_lt_para,noun_sym_lt_verb_anto
0,1,0.901646,0.907873,0.489219
1,2,0.819371,0.835593,0.518931
2,3,0.733432,0.719512,0.5
3,4,0.60103,0.583333,0.25
4,5,0.531365,0.0,
5,6,0.453074,,
6,7,0.317308,,
7,8,0.318182,,
8,9,0.111111,,
9,10,,,


In [None]:
# NOTE(review): this cell has no execution count and uses names that are not defined
# at notebook level in the visible source: `comparative_distances`, `max_changes`,
# and two-argument forms of `n_closer_than_paraphrase` / `compare_sym_anto_distances`
# (the visible versions are nested inside get_comparative_distances_table and take
# one and zero arguments respectively). On a fresh kernel it would presumably raise
# NameError; it looks like a leftover from before that function existed -- confirm
# whether it can be removed.
comparative_distances["n_changes"] = list(range(1,11))
#comparative_distances["RAE_noun_sym_lt_para"] = n_closer_than_paraphrase("noun_sym_", rae_dists)
#comparative_distances["RAE_verb_anto_lt_para"] = n_closer_than_paraphrase("verb_anto_", rae_dists)
comparative_distances["wiki_doc2vec_noun_sym_lt_para"] = n_closer_than_paraphrase("noun_sym_", wiki_doc2vec_dists)
comparative_distances["wiki_doc2vec_verb_anto_lt_para"] = n_closer_than_paraphrase("verb_anto_", wiki_doc2vec_dists)
#comparative_distances["hansard_doc2vec_noun_sym_lt_para"] = n_closer_than_paraphrase("noun_sym_", hansard_doc2vec_dists)
#comparative_distances["hansard_doc2vec_verb_anto_lt_para"] = n_closer_than_paraphrase("verb_anto_", hansard_doc2vec_dists)
comparative_distances

# Adds one noun_sym-vs-verb_anto column per model to rows 0..max_changes-2 of the same table.
comparative_distances.loc[0:max_changes-2,"rae_noun_sym_lt_verb_anto"] = compare_sym_anto_distances(rae_dists,max_changes)
comparative_distances.loc[0:max_changes-2,"wiki_doc2vec_noun_sym_lt_verb_anto"] = compare_sym_anto_distances(wiki_doc2vec_dists,max_changes)
comparative_distances.loc[0:max_changes-2,"hansard_doc2vec_noun_sym_lt_verb_anto"] = compare_sym_anto_distances(hansard_doc2vec_dists,max_changes)
comparative_distances

In [None]:
def drop_null_cols(df):
    """
    Return `df` restricted to the columns that are not entirely null.

    Columns keep their original order; the rows are left as they are.
    """
    surviving = []
    for name in df.columns:
        entirely_null = all(pd.isnull(df.loc[:, name]))
        if entirely_null:
            continue
        surviving.append(name)
    return df.loc[:, surviving]
    


In [None]:
# One histogram per non-empty distance column of the Hansard doc2vec table; sharex=True is passed so the panels share an x axis.
drop_null_cols(hansard_doc2vec_dists).hist(sharex=True)

In [None]:
# One histogram per non-empty distance column of the wiki doc2vec table; sharex=True is passed so the panels share an x axis.
drop_null_cols(wiki_doc2vec_dists).hist(sharex=True)

In [None]:
# One histogram per non-empty distance column of the RAE table; sharex=True is passed so the panels share an x axis.
drop_null_cols(rae_dists).hist(sharex=True)