In [6]:
import xml.etree.ElementTree as ET
import copy
import collections
import pandas as pd

In [7]:
# Paraphrase-phenomenon type tags, in the order used for the count columns
# of the corpus DataFrame (must stay a list: it is concatenated with a list).
paraphrase_types = [
    "mor_inflectional",         # inflectional changes
    "mor_modal_verb",           # modal verb changes
    "mor_derivational",         # derivational changes
    "lex_spelling_and_format",  # spelling and format changes
    "lex_same_polarity",        # same polarity substitutions
    "lex_synt_ana",             # synthetic/analytic substitutions
    "lex_opposite_polarity",    # opposite polarity substitutions
    "lex_converse",             # converse substitutions
    "syn_diathesis",            # diathesis alternations
    "syn_negation",             # negation switching
    "syn_ellipsis",             # ellipsis
    "syn_coordination",         # coordination changes
    "syn_subord_nesting",       # subordination and nesting changes
    "dis_punct_format",         # punctuation and format changes
    "dis_direct_indirect",      # direct/indirect style alternations
    "dis_sent_modality",        # sentence modality changes
    "syn_dis_structure",        # syntax/discourse structure changes
    "semantic",                 # semantic based changes
    "order",                    # change of order
    "addition_deletion",        # addition/deletion
    "identical",                # identical
    "non_paraphrases",          # non-paraphrases
]





In [8]:
def proc_relation(rel):
    """Pull the phrase pair and the per-type phenomenon counts out of one element.

    Parameters
    ----------
    rel : xml.etree.ElementTree.Element
        Element whose descendants include exactly two ``frase`` elements and
        any number of ``fenomen`` elements carrying a ``type`` attribute.

    Returns
    -------
    tuple
        ``(phrase1, phrase2, counts)`` where the phrases are the ``.text`` of
        the two ``frase`` elements (document order) and ``counts`` is a
        ``collections.Counter`` keyed by phenomenon type.

    Raises
    ------
    ValueError
        If the number of ``frase`` descendants is not exactly two.
    KeyError
        If a ``fenomen`` element has no ``type`` attribute.
    """
    phrase1, phrase2 = (frase.text for frase in rel.findall(".//frase"))

    # Start every known type at zero so the counter always exposes the full,
    # consistently ordered key set, even for types absent from this element.
    paraphrase_types_count = collections.Counter(dict.fromkeys(paraphrase_types, 0))
    for fenomen in rel.findall(".//fenomen"):
        paraphrase_types_count[fenomen.attrib["type"]] += 1

    return phrase1, phrase2, paraphrase_types_count
    

In [9]:
def load_corpus(filename= "corpora/P4P/P4P_corpus_v1.xml"):
    """Load the paraphrase corpus XML into a DataFrame, one row per relation.

    Parameters
    ----------
    filename : str
        Path of the XML file to parse.

    Returns
    -------
    pandas.DataFrame
        Columns ``phrase1``, ``phrase2``, then one count column per entry of
        ``paraphrase_types`` (in that order). Any phenomenon type found in the
        file but missing from ``paraphrase_types`` is appended as an extra
        column, with NaN in the rows that do not contain it. Rows are indexed
        0..n-1 in document order.

    Notes
    -----
    Rows are collected as dicts and the frame is built once, instead of
    enlarging the frame cell by cell with ``df.loc``; this avoids repeated
    reallocation and gives the count columns a numeric dtype instead of
    ``object``.
    """
    xml = ET.ElementTree(file=filename)

    records = []
    # Each element matched by "./" is handed to proc_relation as one relation.
    for rel in xml.findall("./"):
        phrase1, phrase2, paraphrase_types_count = proc_relation(rel)
        records.append({"phrase1": phrase1, "phrase2": phrase2, **paraphrase_types_count})

    base_columns = ["phrase1", "phrase2"] + paraphrase_types
    df = pd.DataFrame(records)
    # Guarantee the expected columns (also for an empty corpus) and keep any
    # unexpected phenomenon types after them rather than silently dropping them.
    extra_columns = [col for col in df.columns if col not in base_columns]
    return df.reindex(columns=base_columns + extra_columns)
    


In [10]:
corpus = load_corpus()



In [16]:
import nltk
import pickle
import scipy.spatial
import gensim

In [13]:
# Load the pickled model object; later cells call its `infer_vector` method.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only point this at a file from a trusted source.
model = None

with open("../QuestionTime/hansard_model_1000.pickle", "br") as pickled_model:
    model = pickle.load(pickled_model)

In [14]:

def get_phrase_vec(phrase):
    """Return the vector the global ``model`` infers for ``phrase``.

    The phrase is lower-cased, split into tokens with ``nltk.word_tokenize``
    and the token list is passed to ``model.infer_vector``.
    """
    tokens = nltk.word_tokenize(str.lower(phrase))
    return model.infer_vector(tokens)

# Embed both phrases of every pair; each cell stores what get_phrase_vec returns.
corpus["phrase_1_embeddings"] = corpus["phrase1"].apply(get_phrase_vec)
corpus["phrase_2_embeddings"] = corpus["phrase2"].apply(get_phrase_vec)

In [21]:
# Row-wise cosine distance between the two phrase embeddings.
corpus["dist"] = list(
    map(
        scipy.spatial.distance.cosine,
        corpus.phrase_1_embeddings,
        corpus.phrase_2_embeddings,
    )
)

In [22]:
# Display the corpus frame, which now carries the embedding and distance columns.
corpus

Unnamed: 0,phrase1,phrase2,mor_inflectional,mor_modal_verb,mor_derivational,lex_spelling_and_format,lex_same_polarity,lex_synt_ana,lex_opposite_polarity,lex_converse,...,dis_sent_modality,syn_dis_structure,semantic,order,addition_deletion,identical,non_paraphrases,phrase_1_embeddings,phrase_2_embeddings,dist
0,All art is imitation of nature. One does not n...,"In order to move us, it needs no reference to ...",1,0,2,0,6,2,0,1,...,0,0,0,1,5,0,0,"[-0.00519385, -0.037466, 0.0448609, -0.0461574...","[0.0776415, 0.0298453, -0.0294426, 0.0797808, ...",6.344981e-01
1,He has selected a personage for his drama with...,He has selected a personage for his drama with...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,"[0.00393154, -0.106649, -0.0544358, -0.0755887...","[0.00393154, -0.106649, -0.0544358, -0.0755887...",8.689791e-08
2,This question is linked closely to the often-d...,"This Query is, of course, intimately connected...",0,0,0,1,14,2,0,0,...,0,0,0,1,4,0,0,"[0.00309913, 0.0814331, 0.0372751, -0.0159998,...","[-0.0544547, -0.0322061, 0.0716397, 0.0685044,...",4.052371e-01
3,Since the principles regulating the constituti...,Having thus laid down and discussed the princi...,0,0,1,0,8,0,0,0,...,0,0,0,0,2,0,0,"[-0.0481721, 0.086038, 0.0788518, 0.0594807, 0...","[0.0552344, -0.0469779, -0.0159258, 0.0917185,...",4.198920e-01
4,"I've got some things to take care of up-town, ...","""I've got a few errands up-town, and you just ...",0,0,0,1,9,1,0,1,...,0,0,0,1,1,0,0,"[0.0740122, 0.0315192, -0.0795063, 0.051549, 0...","[0.000614293, 0.12436, -0.0023657, 0.0531562, ...",4.557178e-01
5,The seeds of the aging trees blew by the thous...,In countless thousands the winged seeds float ...,1,0,0,0,6,0,0,0,...,0,0,1,3,1,0,0,"[-0.106994, -0.0996661, 0.0438744, 0.0784698, ...","[-0.0442645, -0.0783047, 0.0185897, 0.111879, ...",3.445491e-01
6,never influenced or attempted to inspire a bal...,I never influenced or attempted to influence a...,0,0,0,0,12,0,0,0,...,0,0,0,0,0,0,0,"[0.175471, 0.0955723, 0.185727, 0.118799, 0.24...","[0.046249, -0.0139775, 0.0354229, -0.0301752, ...",4.951028e-01
7,I am asked a lot of times to listen to people ...,I am asked to hear many who have voices with p...,0,0,1,0,11,0,0,0,...,0,0,1,0,1,0,0,"[-0.074084, 0.0301939, -0.0122511, -0.0886285,...","[-0.0461552, 0.0255917, 0.07684, -0.0457375, 0...",6.177608e-01
8,We got to some rather biggish palm trees first...,First we came to the tall palm trees on the ed...,0,0,0,0,6,0,0,0,...,0,0,2,1,3,0,0,"[0.0749598, -0.0719321, -0.136984, 0.013523, 0...","[0.026862, -0.189241, -0.18385, -0.0618037, 0....",4.988237e-01
9,The viewpoint of these lands had been altered ...,The whole aspect of the land had changed and t...,1,0,1,0,7,0,1,0,...,0,0,1,0,1,0,0,"[0.0609872, -0.0368344, -0.0476999, 0.00508734...","[0.0984612, -0.078033, 0.0494705, 0.0560214, 0...",6.981884e-01
