In [1]:
import xml.etree.ElementTree as ET
import copy
import collections
import pandas as pd
import numpy as np
import itertools

In [2]:
paraphrase_types =["mor_inflectional",		#inflectional changes  
"mor_modal_verb",				#modal verb changes
"mor_derivational",			#derivational changes
"lex_spelling_and_format",	#spelling and format changes
"lex_same_polarity",			#same polarity substitutions
"lex_synt_ana",				#synthetic/analytic substitutions 
"lex_opposite_polarity",		#opposite polarity substitutions
"lex_converse",				#converse substitutions
"syn_diathesis",				#diathesis alternations
"syn_negation",				#negation switching
"syn_ellipsis",				#ellipsis
"syn_coordination",			#coordination changes
"syn_subord_nesting",			#subordination and nesting changes
"dis_punct_format",			#punctuation and format changes
"dis_direct_indirect",		#direct/indirect style alternations
"dis_sent_modality",			#sentence modality changes
"syn_dis_structure",			#syntax/discourse structure changes
"semantic",					#semantic based changes
"order",						#change of order
"addition_deletion",			#addition/deletion
"identical",					#identical
"non_paraphrases"]			#non-paraphrases





In [3]:
def proc_relation(rel):
    phrase1, phrase2 = map(lambda e:e.text, rel.findall(".//frase"))
    
    paraphrase_types_count = collections.Counter( {ptype:0 for ptype in paraphrase_types})
    paraphrase_types_count.update([fenomen.attrib["type"] for fenomen in rel.findall(".//fenomen")])
    return phrase1, phrase2, paraphrase_types_count
    

In [4]:
def load_corpus(filename= "corpora/P4P/P4P_corpus_v1.xml"):
    df = pd.DataFrame(columns=["phrase1", "phrase2"]+paraphrase_types) #depending on Order of set=order of counter keys
    
    xml= ET.ElementTree(file=filename)
    
    for ii,rel in enumerate(xml.findall("./")):
        phrase1, phrase2, paraphrase_types_count = proc_relation(rel)
        df.loc[ii,"phrase1"] = phrase1
        df.loc[ii,"phrase2"] = phrase2
        for ptype,count in paraphrase_types_count.items():
            df.loc[ii,ptype]=count
    
    return df.convert_objects()
    


In [5]:
corpus = load_corpus()



In [6]:
import nltk
import pickle
import scipy.spatial
import gensim

In [7]:
model = None

with open("../QuestionTime/hansard_model_1000.pickle", "br") as model_fs:
    model = pickle.load(model_fs)

In [8]:
def get_phrase_vec(phrase):
    return model.infer_vector(nltk.word_tokenize(str.lower(phrase)))

corpus["phrase_1_embeddings"] = corpus.phrase1.apply(get_phrase_vec)
corpus["phrase_2_embeddings"] = corpus.phrase2.apply(get_phrase_vec)

In [9]:
corpus["dist"] = [scipy.spatial.distance.cosine(pv1,pv2) for pv1,pv2 in zip(corpus.phrase_1_embeddings, corpus.phrase_2_embeddings)]


In [10]:
corpus.to_csv("P4P.csv")
corpus

Unnamed: 0,phrase1,phrase2,mor_inflectional,mor_modal_verb,mor_derivational,lex_spelling_and_format,lex_same_polarity,lex_synt_ana,lex_opposite_polarity,lex_converse,...,dis_sent_modality,syn_dis_structure,semantic,order,addition_deletion,identical,non_paraphrases,phrase_1_embeddings,phrase_2_embeddings,dist
0,All art is imitation of nature. One does not n...,"In order to move us, it needs no reference to ...",1,0,2,0,6,2,0,1,...,0,0,0,1,5,0,0,"[0.00606818, -0.0333632, -0.036308, -0.0304761...","[0.0490966, -0.0142498, 0.0209724, 0.0779864, ...",5.603627e-01
1,He has selected a personage for his drama with...,He has selected a personage for his drama with...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,"[0.0600559, -0.0155101, 0.0665919, -0.0502336,...","[0.0600559, -0.0155101, 0.0665919, -0.0502336,...",-2.060469e-07
2,This question is linked closely to the often-d...,"This Query is, of course, intimately connected...",0,0,0,1,14,2,0,0,...,0,0,0,1,4,0,0,"[-0.0664917, 0.0025552, 0.0440153, 0.00252179,...","[0.0362215, -0.0161281, 0.0459165, 0.0683729, ...",3.220783e-01
3,Since the principles regulating the constituti...,Having thus laid down and discussed the princi...,0,0,1,0,8,0,0,0,...,0,0,0,0,2,0,0,"[0.00905898, 0.0304128, 0.0384496, -0.0509892,...","[-0.00253073, -0.0587331, 0.0188974, 0.0549476...",4.725104e-01
4,"I've got some things to take care of up-town, ...","""I've got a few errands up-town, and you just ...",0,0,0,1,9,1,0,1,...,0,0,0,1,1,0,0,"[0.047276, 0.120079, -0.0345881, 0.0876063, 0....","[-0.0497227, 0.118621, 0.0672782, 0.0607433, 0...",4.804805e-01
5,The seeds of the aging trees blew by the thous...,In countless thousands the winged seeds float ...,1,0,0,0,6,0,0,0,...,0,0,1,3,1,0,0,"[-0.0762051, -0.0431955, 0.136492, 0.0700147, ...","[-0.047439, -0.0219717, -0.00084193, 0.0785036...",3.437177e-01
6,never influenced or attempted to inspire a bal...,I never influenced or attempted to influence a...,0,0,0,0,12,0,0,0,...,0,0,0,0,0,0,0,"[0.13105, 0.0672765, 0.235167, 0.0709527, 0.12...","[0.0416506, -0.0238976, 0.0527522, 0.0589938, ...",4.273360e-01
7,I am asked a lot of times to listen to people ...,I am asked to hear many who have voices with p...,0,0,1,0,11,0,0,0,...,0,0,1,0,1,0,0,"[-0.130958, 0.0195034, -0.0435477, -0.0839956,...","[-0.016956, 0.0111931, 0.0806737, -0.037502, -...",5.339877e-01
8,We got to some rather biggish palm trees first...,First we came to the tall palm trees on the ed...,0,0,0,0,6,0,0,0,...,0,0,2,1,3,0,0,"[0.0504721, -0.0127587, -0.121118, 0.0135203, ...","[-0.0613059, -0.0442682, -0.0870859, 0.0004533...",4.247490e-01
9,The viewpoint of these lands had been altered ...,The whole aspect of the land had changed and t...,1,0,1,0,7,0,1,0,...,0,0,1,0,1,0,0,"[0.0434078, -0.0489495, 0.0259091, 0.0369797, ...","[0.0925998, -0.117256, 0.0331559, 0.0566558, 0...",6.552379e-01


In [11]:
for col in paraphrase_types:
    print(col)
    print(corpus.loc[:,col].value_counts())
    print("------------------")

mor_inflectional
0    654
1    159
2     36
3      6
5      1
dtype: int64
------------------
mor_modal_verb
0    751
1     94
2     11
dtype: int64
------------------
mor_derivational
0    644
1    173
2     30
3      8
4      1
dtype: int64
------------------
lex_spelling_and_format
0     613
1     151
2      55
3      14
4       8
5       7
8       3
7       2
6       2
17      1
dtype: int64
------------------
lex_same_polarity
6     97
4     91
3     87
5     81
2     75
7     71
8     68
9     67
10    48
0     43
1     39
11    25
12    20
13    17
14    12
15     7
16     4
18     2
17     1
20     1
dtype: int64
------------------
lex_synt_ana
0    436
1    249
2    109
3     51
4      8
5      2
7      1
dtype: int64
------------------
lex_opposite_polarity
0    801
1     45
2     10
dtype: int64
------------------
lex_converse
0    825
1     29
2      2
dtype: int64
------------------
syn_diathesis
0    739
1    105
2     11
3      1
dtype: int64
------------------
syn_negat

In [12]:
ptypes = corpus.loc[:,paraphrase_types]
corpus["primary_type"] = ((ptypes - ptypes.mean())/ptypes.std()).idxmax(axis=1)


In [13]:
def r_squared(actual,pred):
    SS_tot = ((actual - actual.mean())**2).sum()
    SS_res = ((actual - pred)**2).sum()
    return 1-(SS_res/SS_tot)

def fit(active_paraphrase_types):
    nonactive_paraphrase_types=set(paraphrase_types).difference(active_paraphrase_types)
    subcorpus = corpus.loc[(corpus.loc[:,nonactive_paraphrase_types]==0).all(axis=1)]
    subcorpus = subcorpus.loc[(corpus.loc[:,active_paraphrase_types]!=0).any(axis=1)]
    ptypes = subcorpus.loc[:,active_paraphrase_types]
    contributions = np.linalg.pinv(ptypes.as_matrix()).dot(subcorpus.dist.as_matrix())
    pred = ptypes.as_matrix().dot(contributions)
    return r_squared(subcorpus.dist, pred), subcorpus, pred,contributions

In [14]:

ptypes = corpus.loc[:,paraphrase_types]
contributions = np.linalg.pinv(ptypes.as_matrix()).dot(corpus.dist.as_matrix())
pred_dist= ptypes.as_matrix().dot(contributions)

In [15]:
help(np.correlate)

Help on function correlate in module numpy.core.numeric:

correlate(a, v, mode='valid', old_behavior=False)
    Cross-correlation of two 1-dimensional sequences.
    
    This function computes the correlation as generally defined in signal
    processing texts::
    
        c_{av}[k] = sum_n a[n+k] * conj(v[n])
    
    with a and v sequences being zero-padded where necessary and conj being
    the conjugate.
    
    Parameters
    ----------
    a, v : array_like
        Input sequences.
    mode : {'valid', 'same', 'full'}, optional
        Refer to the `convolve` docstring.  Note that the default
        is `valid`, unlike `convolve`, which uses `full`.
    old_behavior : bool
        If True, uses the old behavior from Numeric,
        (correlate(a,v) == correlate(v,a), and the conjugate is not taken
        for complex arrays). If False, uses the conventional signal
        processing definition.
    
    Returns
    -------
    out : ndarray
        Discrete cross-correlation 

In [16]:
def m_cov(a,b):
    return ((a-a.mean())*(b-b.mean())).mean()

def m_cor(a,b):
    return m_cov(a,b)/(a.std()*b.std())

#dist_measure = np.asarray([scipy.spatial.distance.braycurtis(pv1,pv2) for pv1,pv2 in zip(corpus.phrase_1_embeddings, corpus.phrase_2_embeddings)])
dist_measure = corpus.dist
c = np.asarray([m_cor(ptypes.loc[:,col],dist_measure) for col in ptypes.columns])
r = np.asarray([[m_cor(ptypes.loc[:,col1],ptypes.loc[:,col2]) for col1 in ptypes.columns] for col2 in ptypes.columns])
c.T.dot(np.linalg.inv(r).dot(c))

0.20657165206672243

In [17]:
pvs=list(zip(ptypes.columns, c))
pvs.sort(key=lambda kv:-1*np.abs(kv[1]))
for p,r in pvs:
    print(p,":",r)

lex_same_polarity : 0.254511675293
identical : -0.227411354148
semantic : 0.218266975074
addition_deletion : 0.124017813505
mor_derivational : 0.123204842253
order : -0.113865572455
lex_converse : 0.0862967176006
lex_synt_ana : 0.0819125738811
syn_dis_structure : 0.0605312313604
non_paraphrases : 0.0582215854703
dis_sent_modality : 0.0555490289284
syn_ellipsis : -0.0540185729796
lex_spelling_and_format : -0.0491770134316
lex_opposite_polarity : 0.0428745170102
dis_direct_indirect : 0.0397269166708
syn_negation : 0.0367373425209
mor_modal_verb : 0.0343191767672
dis_punct_format : 0.0237163559135
syn_diathesis : 0.0226943474827
syn_subord_nesting : 0.0172651139549
syn_coordination : -0.0147405897858
mor_inflectional : -0.00803544219378


In [23]:
keep_para_types = set(paraphrase_types)
r,subcorpus, pred,contributions = fit(keep_para_types )
r

-0.65386186774716193

In [24]:
for ptype, contribution in zip(keep_para_types, contributions):
    print(ptype, ":", contribution)

identical : 0.0915620597874
non_paraphrases : 0.150069181212
mor_modal_verb : 0.0160958987216
syn_negation : -0.00396163821699
dis_punct_format : 0.0189235726052
lex_same_polarity : 0.0316344128344
syn_diathesis : 0.022525343416
mor_derivational : 0.0415837383995
semantic : 0.0711355380791
lex_spelling_and_format : 0.02502375608
order : 0.0040617378985
dis_sent_modality : 0.0457431881925
lex_synt_ana : 0.0276492884988
syn_subord_nesting : 0.0329910736332
mor_inflectional : 0.0196751106794
syn_ellipsis : 0.0227726675986
syn_dis_structure : 0.0372893700484
dis_direct_indirect : 0.0279923098171
lex_opposite_polarity : 0.0360234004482
syn_coordination : 0.0128560148284
addition_deletion : 0.0332615292248
lex_converse : 0.070957568753


In [25]:
len(subcorpus)

847

In [26]:
m_cor(subcorpus.dist, pred)

0.38704847347794763

In [33]:
from sklearn.svm import SVR

svr = SVR(epsilon=0.1*corpus.dist.std())
svr.fit(corpus.loc[:,paraphrase_types], corpus.dist)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.011770711740398681,
  gamma=0.0, kernel='rbf', max_iter=-1, shrinking=True, tol=0.001,
  verbose=False)

In [34]:
svr.score(corpus.loc[:,paraphrase_types], corpus.dist)

0.61929452825464049