In [1]:
import pylab as pl
%matplotlib inline
# Default size for every figure in this notebook (matplotlib reads this as width, height in inches).
pl.rcParams['figure.figsize'] = 15, 15

In [2]:
import os.path
import csv
import numpy as np
import scipy.spatial
import pandas as pd

In [3]:

# Work from inside the prepared corpus directory so the bare file names used by the
# loaders below ("phrases.txt", "paraphrases.txt", ...) resolve.
# NOTE(review): the path is relative, so re-running this cell in the same kernel will
# look for the directory inside itself — restart the kernel before re-running.
os.chdir("prepared_corpora/msrp_ns_va_nophrase_mfcwsd/")

In [4]:
def embedding_distance(embeddings, id1, id2):
    """Return the cosine distance between the embeddings of two ids.

    embeddings : 2-D array indexed as ``embeddings[row, :]``.
    id1, id2   : 1-indexed ids; id ``k`` refers to row ``k - 1``.
    """
    # Ids are 1-indexed, embedding rows are 0-indexed.
    first_row = embeddings[id1 - 1, :]
    second_row = embeddings[id2 - 1, :]
    return scipy.spatial.distance.cosine(first_row, second_row)

In [5]:

def load_links(link_filename):
    """Load a comma-separated link file as a 2-D integer array.

    The first row of the file is skipped (presumably a header line) and each
    remaining row becomes one row of the result.

    ``dtype=int`` replaces the ``np.int`` alias, which was deprecated in
    NumPy 1.20 and removed in 1.24.  ``ndmin=2`` keeps the result 2-D even
    when the file holds a single data row, so callers can always iterate
    over rows.
    """
    return np.loadtxt(link_filename, delimiter=",", skiprows=1, dtype=int, ndmin=2)

def load_link_distances(link_filename, embeddings):
    """Compute the embedding distance for every link in a link file.

    link_filename : file readable by ``load_links``; each row is a pair
                    ``(phrase_id, var_phrase_id)``.
    embeddings    : embedding matrix passed through to ``embedding_distance``.

    Returns a float ``pd.Series`` indexed by ``phrase_id``.  If a
    ``phrase_id`` occurs on several rows, the last row wins.
    """
    # Collect into a dict and build the Series once: growing a dtype-less
    # ``pd.Series()`` through ``.loc`` is quadratic and gives it object dtype.
    distance_by_phrase = {}
    for link in load_links(link_filename):
        assert len(link) == 2
        phrase_id, var_phrase_id = link
        distance_by_phrase[int(phrase_id)] = embedding_distance(
            embeddings, phrase_id, var_phrase_id)
    return pd.Series(distance_by_phrase, dtype=float)



In [6]:
def get_dists_table(embedding_filename, max_corruption = 10):
    """Build a table of embedding distances for one embedding file.

    embedding_filename : comma-delimited matrix loaded with ``np.loadtxt``.
    max_corruption     : highest corruption level to load (levels start at 1).

    The result has a "paraphrase" column, followed for each level ``k`` by
    "verb_anto_k" and "noun_sym_k", each filled by ``load_link_distances``
    from the corresponding link file in the working directory.
    """
    embeddings = np.loadtxt(embedding_filename, delimiter=",")

    # (column prefix, link-file suffix), in the column order of the table.
    corruption_kinds = (
        ("verb_anto_", "verb_anto_semantic_corruptions.txt"),
        ("noun_sym_", "noun_sym_semantic_corruptions.txt"),
    )

    table = pd.DataFrame()
    table["paraphrase"] = load_link_distances("paraphrases.txt", embeddings)
    for level in range(1, max_corruption + 1):
        level_tag = str(level)
        for column_prefix, file_suffix in corruption_kinds:
            table[column_prefix + level_tag] = load_link_distances(
                level_tag + file_suffix, embeddings)
    return table
    

In [51]:
# Counter is otherwise only imported inside get_bow_dists_table, so import it
# here to keep this cell runnable on a fresh kernel.
from collections import Counter

# Scratch check of the count-vector construction: one entry per word in the
# union of both bags, in whatever order the set yields the keys.
a = Counter(["aye", "bee", "bee", "cee"])
b = Counter(["aye", "bee", "bee", "see"])
keys = list(set(a.keys()).union(b.keys()))
avec = [a[key] for key in keys]
avec

[2, 1, 1, 0]

In [54]:
def get_bow_dists_table(max_corruption = 10):
    """Build a table of bag-of-words cosine distances between linked sentences.

    Each line of "phrases.txt" is split on whitespace into a word-count bag.
    Ids in the link files are 1-indexed line numbers of that file.

    max_corruption : highest corruption level to load (levels start at 1).

    The result has a "paraphrase" column, followed for each level ``k`` by
    "verb_anto_k" and "noun_sym_k", indexed by the first id of each link.
    """
    from collections import Counter

    # Read all bags up front; the context manager closes the file.
    with open("phrases.txt", 'r') as phrases_file:
        bows = [Counter(sent.split()) for sent in phrases_file]

    def get_dist(linenum1, linenum2):
        """Cosine distance between the count vectors of two 1-indexed lines."""
        bow1 = bows[linenum1-1]
        bow2 = bows[linenum2-1]
        # Put both bags on a shared vocabulary; missing words count as 0.
        keys = list(set(bow1.keys()).union(bow2.keys()))
        vec1 = [bow1[key] for key in keys]
        vec2 = [bow2[key] for key in keys]
        return scipy.spatial.distance.cosine(vec1, vec2)

    def get_dists(link_filename):
        """Float Series of distances, indexed by the first id of each link."""
        # Build the Series once from a dict rather than growing a dtype-less
        # ``pd.Series()`` via ``.loc`` (quadratic, object dtype).
        by_phrase = {}
        for link in load_links(link_filename):
            by_phrase[int(link[0])] = get_dist(link[0], link[1])
        return pd.Series(by_phrase, dtype=float)

    dists = pd.DataFrame()
    dists["paraphrase"] = get_dists("paraphrases.txt")
    for corruption_level in range(1, max_corruption+1):
        level = str(corruption_level)
        dists["verb_anto_" + level] = get_dists(level + "verb_anto_semantic_corruptions.txt")
        dists["noun_sym_" + level] = get_dists(level + "noun_sym_semantic_corruptions.txt")
    return dists
    


In [27]:
def get_sow_jaccard_dists_table(max_corruption = 10):
    """Build a table of set-of-words Jaccard distances between linked sentences.

    Each line of "phrases.txt" is split on whitespace into a set of words.
    Ids in the link files are 1-indexed line numbers of that file.

    max_corruption : highest corruption level to load (levels start at 1).

    The result has a "paraphrase" column, followed for each level ``k`` by
    "verb_anto_k" and "noun_sym_k", indexed by the first id of each link.
    """
    # Third-party ``distance`` package; imported once here rather than on
    # every get_dist call.
    from distance import jaccard

    # Read all word sets up front; the context manager closes the file.
    with open("phrases.txt", 'r') as phrases_file:
        bows = [set(sent.split()) for sent in phrases_file]

    def get_dist(linenum1, linenum2):
        """Jaccard distance between the word sets of two 1-indexed lines."""
        s1 = bows[linenum1-1]
        s2 = bows[linenum2-1]
        return jaccard(s1, s2)

    def get_dists(link_filename):
        """Float Series of distances, indexed by the first id of each link."""
        # Build the Series once from a dict rather than growing a dtype-less
        # ``pd.Series()`` via ``.loc`` (quadratic, object dtype).
        by_phrase = {}
        for link in load_links(link_filename):
            by_phrase[int(link[0])] = get_dist(link[0], link[1])
        return pd.Series(by_phrase, dtype=float)

    dists = pd.DataFrame()
    dists["paraphrase"] = get_dists("paraphrases.txt")
    for corruption_level in range(1, max_corruption+1):
        level = str(corruption_level)
        dists["verb_anto_" + level] = get_dists(level + "verb_anto_semantic_corruptions.txt")
        dists["noun_sym_" + level] = get_dists(level + "noun_sym_semantic_corruptions.txt")
    return dists
    


In [55]:
# Bag-of-words cosine distances for the paraphrase links and all 10 corruption levels.
bow_dists = get_bow_dists_table()



In [None]:
# One distance table per embedding model, each loaded from its own vector CSV.
rae_dists = get_dists_table("outVectors_RAE2011.csv")
wiki_doc2vec_dists = get_dists_table("outVectors_wiki_doc2vec.csv")
hansard_doc2vec_dists = get_dists_table("outVectors_hansard_doc2vec.csv")
wiki_sentence_doc2vec_dists = get_dists_table("outVectors_wiki_sentence_doc2vec.csv")
# NOTE(review): this loads the same CSV as wiki_sentence_doc2vec_dists on the line above,
# so both tables are identical. It looks like a copy-paste slip; presumably a separate
# "concat" vectors file was intended — confirm the file name and fix.
wiki_sentence_concat_doc2vec_dists = get_dists_table("outVectors_wiki_sentence_doc2vec.csv")
dbow_dists = get_dists_table("outVectors_wiki_sentence_model_dbow.csv")
random_dists = get_dists_table("outVectors_random.csv")



In [24]:
def get_comparative_distances_table(dists, max_changes = 10):
    """Summarise, per corruption level, how the distance columns compare.

    dists       : DataFrame with a "paraphrase" column plus "verb_anto_k" and
                  "noun_sym_k" columns for every k in 1..max_changes.
    max_changes : highest corruption level to summarise.

    Returns a DataFrame with one row per level and the columns
      n_changes             : the level k
      noun_sym_lt_para      : fraction of non-null noun_sym_k distances that
                              are smaller than the paraphrase distance
      verb_anto_lt_para     : the same for verb_anto_k
      noun_sym_lt_verb_anto : among rows where both are non-null, the fraction
                              where noun_sym_k is smaller than verb_anto_k
    A fraction with no rows to count over is NaN.
    """
    def fraction(numerator, denominator):
        # Explicit NaN for an empty denominator instead of relying on
        # numpy's 0/0 behaviour (which also emits a RuntimeWarning).
        return numerator / denominator if denominator else np.nan

    def n_closer_than_paraphrase(prefix):
        """
        prefix : eg "verb_anto_" or  "noun_sym_"
        """
        fractions = []
        # Honour max_changes here; this range used to be hard-coded to 1..10.
        for nchanges in range(1, max_changes+1):
            corrupted = dists[prefix+str(nchanges)]
            # Comparisons against NaN are False, so null rows never count as closer.
            n_closer = (dists.paraphrase > corrupted).sum()
            fractions.append(fraction(n_closer, corrupted.count()))
        return fractions

    def compare_sym_anto_distances():
        fractions = []
        for nchanges in range(1, max_changes+1):
            verb_antos = dists["verb_anto_"+str(nchanges)]
            noun_syms = dists["noun_sym_"+str(nchanges)]
            # Only rows with both distances present form the denominator.
            n_valid = (pd.notnull(verb_antos) & pd.notnull(noun_syms)).sum()
            fractions.append(fraction((verb_antos > noun_syms).sum(), n_valid))
        return fractions

    comparative_distances = pd.DataFrame()
    comparative_distances["n_changes"] = list(range(1, max_changes+1))
    comparative_distances["noun_sym_lt_para"] = n_closer_than_paraphrase("noun_sym_")
    comparative_distances["verb_anto_lt_para"] = n_closer_than_paraphrase("verb_anto_")
    comparative_distances["noun_sym_lt_verb_anto"] = compare_sym_anto_distances()
    return comparative_distances
    
    
    



In [56]:
# Display the bag-of-words distance table (rows are indexed by the first id of each link).
bow_dists

Unnamed: 0,paraphrase,verb_anto_1,noun_sym_1,verb_anto_2,noun_sym_2,verb_anto_3,noun_sym_3,verb_anto_4,noun_sym_4,verb_anto_5,...,verb_anto_6,noun_sym_6,verb_anto_7,noun_sym_7,verb_anto_8,noun_sym_8,verb_anto_9,noun_sym_9,verb_anto_10,noun_sym_10
1,0.667036,,0.024390,,,,,,,,...,,,,,,,,,,
3,0.667036,,,,,,,,,,...,,,,,,,,,,
4,0.436116,0.027027,0.027027,,0.054054,,0.081081,,0.108108,,...,,,,,,,,,,
11,0.436116,0.029412,0.029412,,0.058824,,0.088235,,,,...,,,,,,,,,,
16,0.206425,,0.028571,,0.057143,,0.085714,,,,...,,,,,,,,,,
20,0.206425,,0.041667,,0.083333,,,,,,...,,,,,,,,,,
23,0.228900,,0.045455,,0.090909,,0.136364,,,,...,,,,,,,,,,
27,0.228900,,0.027027,,0.054054,,0.081081,,,,...,,,,,,,,,,
31,0.599108,,0.063618,,0.106181,,0.148743,,0.191306,,...,,,,,,,,,,
36,0.599108,0.047619,0.047619,,0.095238,,0.142857,,0.190476,,...,,,,,,,,,,


In [57]:
# Per-level comparison fractions for the bag-of-words baseline.
get_comparative_distances_table(bow_dists)

Unnamed: 0,n_changes,noun_sym_lt_para,verb_anto_lt_para,noun_sym_lt_verb_anto
0,1,0.997628,0.996733,0.009544
1,2,0.987643,0.984746,0.044543
2,3,0.94757,0.939024,0.065217
3,4,0.832856,1.0,0.25
4,5,0.654367,0.0,
5,6,0.491909,,
6,7,0.317308,,
7,8,0.090909,,
8,9,0.111111,,
9,10,,,


In [11]:
# Per-level comparison fractions for the wiki-sentence doc2vec embeddings.
get_comparative_distances_table(wiki_sentence_doc2vec_dists)

Unnamed: 0,n_changes,noun_sym_lt_para,verb_anto_lt_para,noun_sym_lt_verb_anto
0,1,0.913365,0.917674,0.493107
1,2,0.839179,0.866102,0.498886
2,3,0.761414,0.780488,0.391304
3,4,0.675444,0.833333,0.25
4,5,0.584256,1.0,
5,6,0.517799,,
6,7,0.355769,,
7,8,0.363636,,
8,9,0.555556,,
9,10,,,


In [12]:
# NOTE(review): wiki_sentence_concat_doc2vec_dists is loaded from the same CSV as
# wiki_sentence_doc2vec_dists in this notebook, so this table repeats the previous one.
get_comparative_distances_table(wiki_sentence_concat_doc2vec_dists)

Unnamed: 0,n_changes,noun_sym_lt_para,verb_anto_lt_para,noun_sym_lt_verb_anto
0,1,0.913365,0.917674,0.493107
1,2,0.839179,0.866102,0.498886
2,3,0.761414,0.780488,0.391304
3,4,0.675444,0.833333,0.25
4,5,0.584256,1.0,
5,6,0.517799,,
6,7,0.355769,,
7,8,0.363636,,
8,9,0.555556,,
9,10,,,


In [13]:
# Per-level comparison fractions for the RAE2011 embeddings.
get_comparative_distances_table(rae_dists)

Unnamed: 0,n_changes,noun_sym_lt_para,verb_anto_lt_para,noun_sym_lt_verb_anto
0,1,0.949637,0.936949,0.607635
1,2,0.905506,0.89661,0.581292
2,3,0.866274,0.792683,0.434783
3,4,0.827132,0.833333,0.0
4,5,0.788438,1.0,
5,6,0.754045,,
6,7,0.75,,
7,8,0.727273,,
8,9,0.888889,,
9,10,,,


In [None]:
# Per-level comparison fractions for the wiki doc2vec embeddings.
get_comparative_distances_table(wiki_doc2vec_dists)

In [None]:
# Per-level comparison fractions for the hansard doc2vec embeddings.
get_comparative_distances_table(hansard_doc2vec_dists)

In [None]:
# Combined summary across embedding models.
#
# This cell used to call n_closer_than_paraphrase / compare_sym_anto_distances and
# read comparative_distances / max_changes at top level, but those names only exist
# inside get_comparative_distances_table, so it failed on a fresh kernel.  It now
# builds the same columns from get_comparative_distances_table.
# NOTE(review): the "*_noun_sym_lt_verb_anto" columns were previously written only to
# rows 0..max_changes-2; they now cover every level — confirm that is what is wanted.
max_changes = 10
model_tables = {
    "rae": get_comparative_distances_table(rae_dists, max_changes),
    "wiki_doc2vec": get_comparative_distances_table(wiki_doc2vec_dists, max_changes),
    "hansard_doc2vec": get_comparative_distances_table(hansard_doc2vec_dists, max_changes),
}

comparative_distances = pd.DataFrame()
comparative_distances["n_changes"] = list(range(1, max_changes + 1))
# Only wiki_doc2vec gets "lt_para" columns; the RAE and hansard ones were commented
# out in the earlier version of this cell and are still left out.
comparative_distances["wiki_doc2vec_noun_sym_lt_para"] = model_tables["wiki_doc2vec"]["noun_sym_lt_para"]
comparative_distances["wiki_doc2vec_verb_anto_lt_para"] = model_tables["wiki_doc2vec"]["verb_anto_lt_para"]

for model_name in ("rae", "wiki_doc2vec", "hansard_doc2vec"):
    comparative_distances[model_name + "_noun_sym_lt_verb_anto"] = model_tables[model_name]["noun_sym_lt_verb_anto"]
comparative_distances

In [None]:
def drop_null_cols(df):
    """Return ``df`` restricted to the columns that hold at least one non-null value.

    Column order is preserved; columns that are entirely null are left out.
    """
    populated = []
    for name in df.columns:
        # "not all null" is the same test as "any non-null".
        if pd.notnull(df.loc[:, name]).any():
            populated.append(name)
    return df.loc[:, populated]
    


In [None]:
# Histogram of every non-empty distance column for hansard doc2vec, on a shared x axis.
drop_null_cols(hansard_doc2vec_dists).hist(sharex=True)

In [None]:
# Histogram of every non-empty distance column for wiki doc2vec, on a shared x axis.
drop_null_cols(wiki_doc2vec_dists).hist(sharex=True)

In [None]:
# Histogram of every non-empty distance column for RAE2011, on a shared x axis.
drop_null_cols(rae_dists).hist(sharex=True)