# Imports

In [1]:
import copy
import pandas as pd
import numpy as np

# Reading the ETPC

This is the ETPC dataset compiled by Wahle and posted on HuggingFace

In [2]:
# Unpickle etpc_raw
etpc = pd.read_pickle('datasets/etpc_raw.pkl')

These are the XML files from the ETPC github repo.

The first one contains all pairs marked as paraphrases by the MRPC:

In [3]:
textual_paraphrases = pd.read_xml('datasets/etpc/textual_paraphrases.xml')
# Convert scopes from strings to lists of ints
textual_paraphrases['s1_scope'] = textual_paraphrases['s1_scope'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)
textual_paraphrases['s2_scope'] = textual_paraphrases['s2_scope'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)

The second one contains the text and pair ids for *all* sentence pairs (paraphrases or not). It doesn't contain any data on whether they're paraphrases or not, or what EPT types are in them.

In [4]:
pairs = pd.read_xml('datasets/etpc/text_pairs.xml')
pairs.drop(columns=['negation'], inplace=True)
pairs.set_index('pair_id', inplace=True)

# Cleanup

## Cleaning up Columns

In [5]:
etpc.rename(columns={'paraphrase_type_ids': 'ept_ids', 'paraphrase_types': 'ept_names'}, inplace=True)
etpc.drop(columns={'negation'}, axis=1, inplace=True)
etpc

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_segment_location,sentence2_segment_location,sentence1_segment_location_indices,sentence2_segment_location_indices,sentence1_segment_text,sentence2_segment_text
0,1_0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]","[26, 26, 26, 26, 0, 5, 0, 6, 25, 25, 25, 25, 2...","[6, 5, 5, 0, 25, 0, 0, 0, 0, 0, 26, 26, 26, 26...","[[5], [7], [0, 1, 2, 3], [8, 9, 10, 11, 12, 13...","[[1, 2], [0], [10, 11, 12, 13], [4]]","[whom, called, Amrozi accused his brother, `` ...","[to him, Referring, Amrozi accused his brother..."
1,2_1,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...,"[Yucaipa, owned, Dominick, 's, before, selling...","[Yucaipa, bought, Dominick, 's, in, 1995, for,...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],[],[],[]
2,3_2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]","[6, 0, 0, 0, 0, 0, 0, 0, 26, 26, 26, 0, 0, 0, ...","[26, 26, 26, 26, 6, 6, 6, 6, 25, 25, 25, 25, 2...","[[0], [14], [8, 9, 10], [17, 18, 19]]","[[4, 5, 6, 7], [18], [0, 1, 2, 3], [8, 9, 10, ...","[They, cargo, on June 10, , he added, had publ...","[the ship 's owners, explosives, On June 10 ,,..."
3,4_3,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ...","[Around, 0335, GMT, ,, Tab, shares, were, up, ...","[Tab, shares, jumped, 20, cents, ,, or, 4.6, %...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],[],[],[]
4,5_4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...",0,1,"[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 6, 5, 29, 30, 0, 29, 25, 30, 29, 0, 6, 6, ...","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3...","[[0, 1], [2], [11, 12, 14], [13], [13], [7], [...","[[0, 1, 2, 3, 4], [5], [11], [20, 21], [20, 21...","[The stock, rose, to close at, Friday, Friday,...","[PG & E Corp. shares, jumped, to, on Friday, o..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5796,5797_5796,"After Hughes refused to rehire Hernandez, he c...",Hernandez filed an Equal Employment Opportunit...,"[After, Hughes, refused, to, rehire, Hernandez...","[Hernandez, filed, an, Equal, Employment, Oppo...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[],[],[],[]
5797,5798_5797,There are 103 Democrats in the Assembly and 47...,Democrats dominate the Assembly while Republic...,"[There, are, 103, Democrats, in, the, Assembly...","[Democrats, dominate, the, Assembly, while, Re...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[],[],[],[]
5798,5799_5798,Bethany Hamilton remained in stable condition ...,"Bethany, who remained in stable condition afte...","[Bethany, Hamilton, remained, in, stable, cond...","[Bethany, ,, who, remained, in, stable, condit...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],[],[],[]
5799,5800_5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[26, 26, 0, 7, 7, 6, 1, 25, 25, 4, 25, 25, 25,...","[25, 25, 25, 25, 7, 0, 6, 1, 0, 0, 4, 25, 0, 2...","[[9], [3, 4], [5], [6], [0, 1], [24], [7, 8, 1...","[[10], [4], [6], [7], [13, 14], [0, 1, 2, 3], ...","[Corp, power station’s, US, owners, Last week,...","[Corp., Drax, American, owner, last week, The ..."


# Remapping paraphrase IDs


First, make a list with paraphrase types and IDs from the ETPC:

In [6]:
id_map = pd.read_xml('https://raw.githubusercontent.com/venelink/ETPC/master/Corpus/paraphrase_types.xml')
# Rename columns for clarity
id_map.rename(columns={'type_id': 'ept_id', 'type_name': 'ept_name'}, inplace=True)
# Drop unused data
id_map = id_map[['ept_id', 'ept_name']] # No use for type_category column
id_map.drop(id_map.tail(2).index,inplace=True) # Types don't appear in ETPC
id_map.style.hide(axis="index")
id_map

Unnamed: 0,ept_id,ept_name
0,1,Inflectional Changes
1,2,Modal Verb Changes
2,3,Derivational Changes
3,4,Spelling changes
4,5,Same Polarity Substitution (habitual)
5,6,Same Polarity Substitution (contextual)
6,7,Same Polarity Substitution (named ent.)
7,8,Change of format
8,9,Opposite polarity substitution (habitual)
9,10,Opposite polarity substitution (contextual)


Now, make a list with paraphrase names and IDs for ParaOp types

In [7]:
data = [[0, 'No change'],
        [1, 'Addition/Deletion - Function Word'],
        [2, 'Addition/Deletion - Content Word'],
        [3, 'Change of Order'],
        [4, 'Substitution - Synonym'],
        [5, 'Substitution - Contextual Synonym'],
        [6, 'Substitution - Morphological'],
        [7, 'Substitution - Spelling and Format']
       ]
paraop_map = pd.DataFrame(data, columns = ['paraop_id', 'paraop_name'])
paraop_map.set_index('paraop_id', inplace=True)
paraop_map

Unnamed: 0_level_0,paraop_name
paraop_id,Unnamed: 1_level_1
0,No change
1,Addition/Deletion - Function Word
2,Addition/Deletion - Content Word
3,Change of Order
4,Substitution - Synonym
5,Substitution - Contextual Synonym
6,Substitution - Morphological
7,Substitution - Spelling and Format


## Mapping

We'll use the dataframe below for mapping. Each row will contain the name and ID of a paraphrase type in the ETPC, and the name and ID of the correspondent ParaOp type.

In [8]:
id_map['paraop_id'] = ''
id_map['paraop_name'] = ''
id_map

Unnamed: 0,ept_id,ept_name,paraop_id,paraop_name
0,1,Inflectional Changes,,
1,2,Modal Verb Changes,,
2,3,Derivational Changes,,
3,4,Spelling changes,,
4,5,Same Polarity Substitution (habitual),,
5,6,Same Polarity Substitution (contextual),,
6,7,Same Polarity Substitution (named ent.),,
7,8,Change of format,,
8,9,Opposite polarity substitution (habitual),,
9,10,Opposite polarity substitution (contextual),,


Here's where we do the mapping:

In [9]:
# Helper function to map an ETPC id to a Paraop id
def map_id(ept_id, paraop_id):
    """Given an EPT id and a Paraop id, look up the name of the Paraop id and 
    fill in the rows of id_map with paraop_id and the name."""
    id_map.loc[id_map['ept_id'] == ept_id, 'paraop_id'] = paraop_id
    id_map.loc[id_map['ept_id'] == ept_id, 'paraop_name'] = paraop_map.loc[paraop_id, 'paraop_name']

In [10]:
map_id(ept_id=1, paraop_id=6)
map_id(ept_id=3, paraop_id=6)
map_id(ept_id=26, paraop_id=3)
map_id(ept_id=29, paraop_id=0)
map_id(4, 7)
map_id(5, 4)
map_id(6, 5)
map_id(8, 7)
map_id(9, 4)
map_id(2, 5)
map_id(7, 4)
id_map.style.hide(axis="index")

ept_id,ept_name,paraop_id,paraop_name
1,Inflectional Changes,6.0,Substitution - Morphological
2,Modal Verb Changes,5.0,Substitution - Contextual Synonym
3,Derivational Changes,6.0,Substitution - Morphological
4,Spelling changes,7.0,Substitution - Spelling and Format
5,Same Polarity Substitution (habitual),4.0,Substitution - Synonym
6,Same Polarity Substitution (contextual),5.0,Substitution - Contextual Synonym
7,Same Polarity Substitution (named ent.),4.0,Substitution - Synonym
8,Change of format,7.0,Substitution - Spelling and Format
9,Opposite polarity substitution (habitual),4.0,Substitution - Synonym
10,Opposite polarity substitution (contextual),,


TODO: Figure out a way to hide index of map_id throughout whole notebook. For some reason this seems harder than it needs to be...

Helper function to convert an ETPC ID to a Paraop ID

In [11]:
# Helper function to get a Paraop id from an ETPC id
def ept_to_paraop(ept_id):
    return id_map.loc[id_map['ept_id'] == ept_id, 'paraop_id'].iloc[0]

ept_to_paraop(3)

6

# Reannotation

## Creating positives dataframe

In [12]:
positives = etpc.loc[etpc['mrpc_label'] == 1]
positives.rename(columns={'sentence1_segment_location': 'sentence1_scope_etpc', 
                          'sentence2_segment_location': 'sentence2_scope_etpc'}, inplace=True)
positives.drop(columns=['sentence1_segment_location_indices', 'sentence2_segment_location_indices'],inplace=True)
positives['idx'] = positives.index.to_series()
positives

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives.rename(columns={'sentence1_segment_location': 'sentence1_scope_etpc',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives.drop(columns=['sentence1_segment_location_indices', 'sentence2_segment_location_indices'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['idx'] = positives.index.to_series()


Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_scope_etpc,sentence2_scope_etpc,sentence1_segment_text,sentence2_segment_text
0,0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]","[26, 26, 26, 26, 0, 5, 0, 6, 25, 25, 25, 25, 2...","[6, 5, 5, 0, 25, 0, 0, 0, 0, 0, 26, 26, 26, 26...","[whom, called, Amrozi accused his brother, `` ...","[to him, Referring, Amrozi accused his brother..."
2,2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]","[6, 0, 0, 0, 0, 0, 0, 0, 26, 26, 26, 0, 0, 0, ...","[26, 26, 26, 26, 6, 6, 6, 6, 25, 25, 25, 25, 2...","[They, cargo, on June 10, , he added, had publ...","[the ship 's owners, explosives, On June 10 ,,..."
4,4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...",0,1,"[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 6, 5, 29, 30, 0, 29, 25, 30, 29, 0, 6, 6, ...","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3...","[The stock, rose, to close at, Friday, Friday,...","[PG & E Corp. shares, jumped, to, on Friday, o..."
5,5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...",1,1,"[Synthetic/analytic substitution, Addition/Del...","[11, 25, 29]","[25, 11, 11, 11, 11, 11, 11, 11, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 11, 11...","[in the first quarter of the year, Revenue dro...","[the first quarter of the year, With the scand..."
7,7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 5, 25, 25, 29]","[25, 4, 25, 25, 25, 25, 5, 25, 25, 25]","[25, 4, 4, 25, 25, 25, 25, 25, 5, 25, 25, 25]","[DVD-CCA, state, then, The appealed to the Sup...","[DVD CCA, U.S., that decision, The appealed to..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5792,5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...",1,1,"[Addition/Deletion, Identity, Punctuation chan...","[25, 29, 21, 6, 11, 14, 26, 25]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[29, 29, 29, 29, 29, 29, 29, 26, 26, 26, 26, 2...","[authorities said, Gehring waived extradition ...",[Gehring waived extradition Monday during a he...
5793,5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...",1,1,"[Same Polarity Substitution (contextual), Chan...","[6, 26, 25, 29, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[Silver, Silver, `` I am advised that certain ...","[the Silver statement, the Silver statement, ,..."
5795,5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...",0,1,"[Same Polarity Substitution (habitual), Same P...","[5, 5, 18, 29, 30]","[29, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[29, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[deal, be completed, The deal , approved by bo...","[acquisition, close, The acquisition has been ..."
5799,5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[26, 26, 0, 7, 7, 6, 1, 25, 25, 4, 25, 25, 25,...","[25, 25, 25, 25, 7, 0, 6, 1, 0, 0, 4, 25, 0, 2...","[Corp, power station’s, US, owners, Last week,...","[Corp., Drax, American, owner, last week, The ..."


## Why we cannot use the ETPC from Wahle et al.

Here's a fundamental part of the ETPC that I hadn't realized until now: each token in a sentence can have *more than one* paraphrase type. Here's an example--note how, in sentence 2, token 5 appears in the scopes both of inflectional and derivational changes.

In [13]:
ric = textual_paraphrases.loc[(textual_paraphrases['pair_id'] == 4205+1) & (textual_paraphrases['type_id'].isin([3,1]))]
ric[:2]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
15963,4206,1,Inflectional Changes,yes,[3],"[3, 5]",completed,had inspected,,,,
15964,4206,3,Derivational Changes,yes,[4],[5],inspections,inspected,,,,


It seems that this issue also wasn't noticed by Wahle et al: some paraphrase scopes consist of only a single number repeated for the entirety of the list:

In [14]:
positives[positives['sentence1_scope_etpc'].apply(lambda x: (len(np.unique(x)) == 1))][:10]

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_scope_etpc,sentence2_scope_etpc,sentence1_segment_text,sentence2_segment_text
14,14,He told The Sun newspaper that Mr. Hussein's d...,"""Saddam's daughters had British schools and ho...","[He, told, The, Sun, newspaper, that, Mr., Hus...","[``, Saddam, 's, daughters, had, British, scho...",1,1,"[Same Polarity Substitution (named ent.), Same...","[7, 6, 7, 26, 25, 29, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[0, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26...","[Hussein, The Sun newspaper, Mr. Hussein, Mr. ...","[Saddam, The Sun, Saddam, Saddam 's daughters ..."
22,22,But tropical storm warnings and watches were p...,Tropical storm warnings were in place Thursday...,"[But, tropical, storm, warnings, and, watches,...","[Tropical, storm, warnings, were, in, place, T...",0,1,"[Addition/Deletion, Addition/Deletion, Identit...","[25, 25, 29, 30, 4, 6, 11, 17]","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[But, tropical storm warnings watches , the so...","[Jamaica and, storm warnings watches , the sou..."
35,35,Trading in Loral was halted yesterday; the sha...,The New York Stock Exchange suspended trading ...,"[Trading, in, Loral, was, halted, yesterday, ;...","[The, New, York, Stock, Exchange, suspended, t...",0,1,"[Same Polarity Substitution (habitual), Diathe...","[5, 14, 18, 29, 30, 21]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[halted, Trading in Loral was halted, Trading ...","[suspended, The New York Stock Exchange suspen..."
40,40,Last year the court upheld Cleveland's school ...,"Last year, the court ruled 5-4 in an Ohio case...","[Last, year, the, court, upheld, Cleveland, 's...","[Last, year, ,, the, court, ruled, 5-4, in, an...",1,1,"[Same Polarity Substitution (contextual), Infl...","[6, 1, 25, 25, 29, 28, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[29, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29, 25, 29...","[provide, choice, Last year that vouchers are ...","[provide with, choices, government, among a ra..."
63,63,Contrary to what PeopleSoft management would h...,Ellison said that contrary to the contentions ...,"[Contrary, to, what, PeopleSoft, management, w...","[Ellison, said, that, contrary, to, the, conte...",1,1,"[Addition/Deletion, Identity, Semantic based, ...","[25, 29, 28, 21]","[28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 2...","[25, 25, 0, 29, 29, 28, 28, 28, 28, 28, 29, 29...","[Contrary to , Oracle intends to fully support...","[Ellison said, contrary to , Oracle intends to..."
72,72,Also demonstrating box-office strength _ and g...,Also demonstrating box-office strength -- and ...,"[Also, demonstrating, box-office, strength, _,...","[Also, demonstrating, box-office, strength, --...",1,1,"[Spelling changes, Spelling changes, Identity,...","[4, 4, 29, 21, 21]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[O'Neill 's, Day 's, Also demonstrating box-of...","[ONeills, Days, Also demonstrating box-office ..."
86,86,Sales - a figure watched closely as a baromete...,It also disclosed that sales -- a figure close...,"[Sales, -, a, figure, watched, closely, as, a,...","[It, also, disclosed, that, sales, --, a, figu...",1,1,"[Same Polarity Substitution (habitual), Synthe...","[5, 11, 26, 25, 25, 25, 25, 29, 28, 21]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[25, 25, 25, 0, 25, 0, 25, 25, 26, 25, 25, 25,...","[rose, many industry experts, closely, 5 perce...","[were higher, industry experts, closely, by an..."
111,111,The suite comes complete with a word processor...,"The suite includes a word processor, spreadshe...","[The, suite, comes, complete, with, a, word, p...","[The, suite, includes, a, word, processor, ,, ...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 5, 6, 5, 11, 18, 25, 25, 29, 21, 21]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[comes complete with, software, utilizing, an,...","[includes, application, built around, the, XML..."
124,124,"Powell fired back: ""He's accusing the presiden...","If so, Powell said, he's calling the president...","[Powell, fired, back, :, ``, He, 's, accusing,...","[If, so, ,, Powell, said, ,, he, 's, calling, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 25, 25, 25, 29, 21]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[accusing, he, , he said, Powell fired back :,...","[calling, Powell, , Powell said ,, , too, If s..."
126,126,The memo on protecting sales of Windows and ot...,"The memo specifically mentioned Linux, a still...","[The, memo, on, protecting, sales, of, Windows...","[The, memo, specifically, mentioned, Linux, ,,...",1,1,"[Addition/Deletion, Addition/Deletion, Identity]","[25, 25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...",[on protecting sales of Windows and other desk...,"[specifically, The memo mentioned Linux , a st..."


The issue also exists in part in the original ETPC: some paraphrase types have scopes annotated as pretty much the entire sentence. This seems especially prevalent among 'Punctuation changes'.

TODO: rewrite this, show examples 

While this is certainly an issue for the original ETPC, it's at least partly offset there since their annotation scheme has separate scopes for each paraphrase type. So even if the annotated scope of some given type isn't very informative, the entire sentence isn't lost: you'd still have other paraphrase types, which are most likely annotated correctly. But Wahle's dataset (and consequently his training pipeline) doesn't account for this. Whatever process Wahle et al. used for generating that dataset on Huggingface seems to have an especially hard time with sentences in the original ETPC as exemplified above, but the issue happens throughout *all* their dataset.

## Getting paraphrases from the original ETPC

Let's first clean up the dataset

In [15]:
positives.drop(columns=['idx', 'etpc_label', 'mrpc_label', 
                                       'sentence1_scope_etpc', 
                                       'sentence2_scope_etpc', 
                                       'sentence1_segment_text', 
                                       'sentence2_segment_text'])

Unnamed: 0,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,ept_names,ept_ids
0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...","[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]"
2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...","[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]"
4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...","[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]"
5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...","[Synthetic/analytic substitution, Addition/Del...","[11, 25, 29]"
7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...","[Spelling changes, Same Polarity Substitution ...","[4, 5, 25, 25, 29]"
...,...,...,...,...,...,...
5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...","[Addition/Deletion, Identity, Punctuation chan...","[25, 29, 21, 6, 11, 14, 26, 25]"
5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...","[Same Polarity Substitution (contextual), Chan...","[6, 26, 25, 29, 21]"
5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...","[Same Polarity Substitution (habitual), Same P...","[5, 5, 18, 29, 30]"
5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...","[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]"


We'll need a column to house the new scopes. Let's initialize that column with empty arrays for each token in the sentence. That way, we can easily tell which tokens haven't been annotated yet.

In [16]:
#TODO: get rid of SettingWithCopyWarning
positives['sentence1_scope'] = positives['sentence1_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))
positives['sentence2_scope'] = positives['sentence2_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))
positives['sentence1_scope'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['sentence1_scope'] = positives['sentence1_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['sentence2_scope'] = positives['sentence2_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))


array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', ''], dtype='<U64')

### Populating types

Helper function to populate type

In [17]:
# TODO: Convert to df apply (rather than series apply on idx)
# TODO: Figure out if 64 char limit will be an issue

def populate_type(idx, ept_id, lookup_df=textual_paraphrases):
    """Given a paraphrase pair (idx) and an EPT paraphrase type (ept_id), convert the EPT type to Paraop, look up the 
    scopes for both sentences in the pair, and fill in the scopes with the Paraop type. Returns a pair of arrays with
    the newly annotated scopes."""
    paraop_id = ept_to_paraop(ept_id)

    # Copy array to avoid messing up the originals
    array1 = np.copy(positives['sentence1_scope'][idx])
    array2 = np.copy(positives['sentence2_scope'][idx])
    
    # Create a subset of the lookup array containing only the paraphrase types
    # we are interested in (ept_id)
    subset = lookup_df[(lookup_df['pair_id'] == idx+1) & (lookup_df['type_id'] == int(ept_id))]
    subset.reset_index(drop=True, inplace=True)
    instances = len(subset['type_id'].values) # Count how many discrete instances of that type are there in this pair

    def fill(sentence_n, instance, array, scope):
        """Helper function for filling in ids"""
        # Identify which indices in the array have not been filled yet
        empty = np.where(array == '')[0]
        nonempty = np.where(array != '')[0]

        # Fill in empty entries
        if len(scope) > 0:
            empty_intersect = np.intersect1d(scope, empty)
            array[empty_intersect] = f'{paraop_id}_{instance}'
            
        # Append to non-empty entries
        nonempty_intersect = np.intersect1d(scope, nonempty)
        if len(nonempty_intersect) > 0:
            # TODO: Log this in a better way (save to a file instead of just printing)
            print(f'Double check type overwriting: row {idx}, sentence {sentence_n}')
            print(f'Common indices: {list(nonempty_intersect)}')
            array[nonempty_intersect] = np.char.add(array[nonempty_intersect], f' & {paraop_id}_{instance}')

    # Filling in
    for i in range(instances):
        # Get scopes from lookup df
        s1_scope = np.array(subset.loc[subset['type_id'] == ept_id, 's1_scope'].iloc[i])
        s2_scope = np.array(subset.loc[subset['type_id'] == ept_id, 's2_scope'].iloc[i])
        
        fill(1, i, array1, s1_scope)
        fill(2, i, array2, s2_scope)
    
    return array1, array2

Here's a demo of how the outputs to that function look like:

In [18]:
populate_type(0, 26)

(array(['3_0', '3_0', '3_0', '3_0', '', '', '', '', '', '', '', '', '', '',
        '', '', '', '', ''], dtype='<U64'),
 array(['', '', '', '', '', '', '', '', '', '', '3_0', '3_0', '3_0', '3_0',
        '', '', '', '', '', ''], dtype='<U64'))

`populate_type` returns new arrays, it doesn't modify the original df. Use the function below to actually modify the df

In [19]:
def substitute(ept_id, lookup_df=textual_paraphrases):
    series = positives['idx'].apply(populate_type, ept_id=ept_id, lookup_df=lookup_df)
    cols = pd.DataFrame(series.tolist(), columns=['sentence1', 'sentence2'])
    positives.loc[:, 'sentence1_scope'] = cols['sentence1'].values
    positives.loc[:, 'sentence2_scope'] = cols['sentence2'].values

### Performing the reannotation

#### Change of order

In [20]:
substitute(26)

Double check type overwriting: row 196, sentence 1
Common indices: [2]
Double check type overwriting: row 196, sentence 2
Common indices: [9]
Double check type overwriting: row 411, sentence 1
Common indices: [0, 1, 2, 3, 4]
Double check type overwriting: row 411, sentence 2
Common indices: [12, 13, 14, 15]
Double check type overwriting: row 1014, sentence 1
Common indices: [5]
Double check type overwriting: row 1014, sentence 2
Common indices: [10, 11]
Double check type overwriting: row 1543, sentence 1
Common indices: [19]
Double check type overwriting: row 1543, sentence 2
Common indices: [5]
Double check type overwriting: row 1864, sentence 1
Common indices: [5, 6]
Double check type overwriting: row 1864, sentence 2
Common indices: [11, 12]
Double check type overwriting: row 2309, sentence 1
Common indices: [0, 1, 2, 3, 4, 5]
Double check type overwriting: row 2309, sentence 2
Common indices: [10, 11, 12, 13, 14]
Double check type overwriting: row 2336, sentence 1
Common indices: [

Double check type overwriting: row 2920, sentence 1
Common indices: [25, 26]
Double check type overwriting: row 2920, sentence 2
Common indices: [5, 6]
Double check type overwriting: row 3164, sentence 1
Common indices: [6]
Double check type overwriting: row 3164, sentence 2
Common indices: [19]
Double check type overwriting: row 4346, sentence 1
Common indices: [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26]
Double check type overwriting: row 4346, sentence 2
Common indices: [0, 1, 2, 3, 4, 5, 6, 7, 8]
Double check type overwriting: row 4346, sentence 1
Common indices: [11]
Double check type overwriting: row 4346, sentence 2
Common indices: [18]
Double check type overwriting: row 4514, sentence 1
Common indices: [12, 13, 14, 15]
Double check type overwriting: row 4514, sentence 2
Common indices: [3, 4, 5, 6, 7]


#### Same Polarity Substitution (Habitual)

In [21]:
substitute(5)

Double check type overwriting: row 75, sentence 1
Common indices: [5, 6]
Double check type overwriting: row 75, sentence 2
Common indices: [0]
Double check type overwriting: row 152, sentence 1
Common indices: [13, 14]
Double check type overwriting: row 152, sentence 2
Common indices: [10]
Double check type overwriting: row 172, sentence 1
Common indices: [22]
Double check type overwriting: row 172, sentence 2
Common indices: [17]
Double check type overwriting: row 226, sentence 1
Common indices: [0, 1, 2, 3]
Double check type overwriting: row 226, sentence 2
Common indices: [2]
Double check type overwriting: row 310, sentence 1
Common indices: [9]
Double check type overwriting: row 310, sentence 2
Common indices: [8]
Double check type overwriting: row 315, sentence 1
Common indices: [2]
Double check type overwriting: row 315, sentence 2
Common indices: [25]
Double check type overwriting: row 339, sentence 1
Common indices: [5]
Double check type overwriting: row 339, sentence 2
Common 

#### Same Polarity Substitution (Contextual)

In [22]:
substitute(6)

Double check type overwriting: row 56, sentence 1
Common indices: [4]
Double check type overwriting: row 56, sentence 2
Common indices: [0, 1]
Double check type overwriting: row 112, sentence 1
Common indices: [22]
Double check type overwriting: row 112, sentence 2
Common indices: [7, 8]
Double check type overwriting: row 124, sentence 1
Common indices: [16]
Double check type overwriting: row 124, sentence 2
Common indices: [3]
Double check type overwriting: row 191, sentence 1
Common indices: [19, 20, 21]
Double check type overwriting: row 191, sentence 2
Common indices: [14]
Double check type overwriting: row 235, sentence 1
Common indices: [3]
Double check type overwriting: row 235, sentence 2
Common indices: [8, 9, 10, 11]
Double check type overwriting: row 276, sentence 1
Common indices: [10]
Double check type overwriting: row 276, sentence 2
Common indices: [15, 16]
Double check type overwriting: row 332, sentence 1
Common indices: [5]
Double check type overwriting: row 332, sent

#### Derivational Changes

In [23]:
substitute(3)

Double check type overwriting: row 254, sentence 1
Common indices: [23]
Double check type overwriting: row 254, sentence 2
Common indices: [21]
Double check type overwriting: row 433, sentence 1
Common indices: [17]
Double check type overwriting: row 433, sentence 2
Common indices: [17]
Double check type overwriting: row 449, sentence 1
Common indices: [5]
Double check type overwriting: row 449, sentence 2
Common indices: [22]
Double check type overwriting: row 480, sentence 1
Common indices: [4]
Double check type overwriting: row 480, sentence 2
Common indices: [16]
Double check type overwriting: row 608, sentence 1
Common indices: [5]
Double check type overwriting: row 608, sentence 2
Common indices: [5]
Double check type overwriting: row 810, sentence 1
Common indices: [14]
Double check type overwriting: row 810, sentence 2
Common indices: [20]
Double check type overwriting: row 1484, sentence 1
Common indices: [3]
Double check type overwriting: row 1484, sentence 2
Common indices: 

#### Inflectional Changes

In [24]:
substitute(1)

Double check type overwriting: row 47, sentence 1
Common indices: [3]
Double check type overwriting: row 47, sentence 2
Common indices: [3]
Double check type overwriting: row 76, sentence 1
Common indices: [2]
Double check type overwriting: row 76, sentence 2
Common indices: [20]
Double check type overwriting: row 120, sentence 1
Common indices: [13]
Double check type overwriting: row 120, sentence 2
Common indices: [11]
Double check type overwriting: row 164, sentence 1
Common indices: [7]
Double check type overwriting: row 164, sentence 2
Common indices: [11]
Double check type overwriting: row 194, sentence 1
Common indices: [15]
Double check type overwriting: row 194, sentence 2
Common indices: [19]
Double check type overwriting: row 261, sentence 1
Common indices: [2]
Double check type overwriting: row 261, sentence 2
Common indices: [1]
Double check type overwriting: row 357, sentence 1
Common indices: [9]
Double check type overwriting: row 357, sentence 2
Common indices: [7, 8, 9

#### Spelling Changes

In [25]:
substitute(4)

Double check type overwriting: row 155, sentence 1
Common indices: [5]
Double check type overwriting: row 155, sentence 2
Common indices: [8]
Double check type overwriting: row 449, sentence 1
Common indices: [9, 10]
Double check type overwriting: row 449, sentence 2
Common indices: [4]
Double check type overwriting: row 458, sentence 1
Common indices: [15]
Double check type overwriting: row 458, sentence 2
Common indices: [1]
Double check type overwriting: row 780, sentence 1
Common indices: [25]
Double check type overwriting: row 780, sentence 2
Common indices: [11]
Double check type overwriting: row 882, sentence 1
Common indices: [24]
Double check type overwriting: row 882, sentence 2
Common indices: [20]
Double check type overwriting: row 1090, sentence 1
Common indices: [9]
Double check type overwriting: row 1090, sentence 2
Common indices: [9]
Double check type overwriting: row 1496, sentence 1
Common indices: [10]
Double check type overwriting: row 1496, sentence 2
Common indic

#### Change of format

In [26]:
substitute(8)

Double check type overwriting: row 418, sentence 1
Common indices: [9]
Double check type overwriting: row 418, sentence 2
Common indices: [12]
Double check type overwriting: row 508, sentence 1
Common indices: [4]
Double check type overwriting: row 508, sentence 2
Common indices: [9]
Double check type overwriting: row 508, sentence 1
Common indices: [5]
Double check type overwriting: row 508, sentence 2
Common indices: [10]
Double check type overwriting: row 586, sentence 1
Common indices: [13]
Double check type overwriting: row 586, sentence 2
Common indices: [6]
Double check type overwriting: row 1322, sentence 1
Common indices: [25]
Double check type overwriting: row 1322, sentence 2
Common indices: [23]
Double check type overwriting: row 1974, sentence 1
Common indices: [16]
Double check type overwriting: row 1974, sentence 2
Common indices: [16]
Double check type overwriting: row 1974, sentence 1
Common indices: [22]
Double check type overwriting: row 1974, sentence 2
Common indic

#### Opposite Polarity Substitution (Habitual)

In [27]:
substitute(9)

#### Modal Verb Changes

In [28]:
# TODO: Check overlapped words between (e.g.) derivational & inflectional changes
# The way this works right now, you'd have something like ['6_0 & 6_0'] for those
# Make sure this doesn't happen. Probably do a function that does a pass on the
# array of strings later and removes any duplicates

In [29]:
def trim_duplicates(s1_scope, s2_scope, s1_text, s2_text):
    s1_newtext = s1_text.split()
    s2_newtext = s2_text.split()
    
    in1 = np.where(np.in1d(s1_newtext, s2_newtext))[0]
    in2 = np.where(np.in1d(s2_newtext, s1_newtext))[0]

    s1_newscope = np.delete(s1_scope, in1)
    s2_newscope = np.delete(s2_scope, in2)
    s1_newtext = ' '.join(np.delete(s1_newtext, in1))
    s2_newtext = ' '.join(np.delete(s2_newtext, in2))

    return s1_newscope, s2_newscope, s1_newtext, s2_newtext

In [30]:
ric = textual_paraphrases.loc[textual_paraphrases['type_id'].isin([2])]
ric

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
120,40,2,Modal Verb Changes,yes,"[7, 8, 9]","[4, 5]",intend to seek,will seek,,,,
135,45,2,Modal Verb Changes,yes,"[4, 5, 6, 7]","[3, 4]",is expected to decline,will decline,,,,
197,57,2,Modal Verb Changes,yes,"[5, 6, 7]","[2, 3, 4, 5]",would shut down,plans to shut down,,,,
381,108,2,Modal Verb Changes,yes,"[6, 7]","[6, 7, 8]",were dispatched,will be sent,,,,
393,110,2,Modal Verb Changes,yes,"[11, 12]","[10, 11]",may issue,might issue,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
21449,5623,2,Modal Verb Changes,yes,"[8, 9]","[10, 11, 12]",will address,would participate in,,,,
21593,5665,2,Modal Verb Changes,yes,"[3, 4, 5]",[4],could have been,was,,,,
21679,5691,2,Modal Verb Changes,yes,"[4, 5]",[10],could bring,bringing,,,,
21762,5712,2,Modal Verb Changes,yes,"[3, 4]",[2],would give,gives,,,,


In [31]:
trimmed = pd.DataFrame(columns = ric.columns, data = copy.deepcopy(ric.values))
trimmed[['s1_scope', 's2_scope', 's1_text', 's2_text']] = trimmed.apply(lambda x: trim_duplicates(x.s1_scope, x.s2_scope, x.s1_text, x.s2_text), axis=1, result_type='expand')
trimmed

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
0,40,2,Modal Verb Changes,yes,"[7, 8]",[4],intend to,will,,,,
1,45,2,Modal Verb Changes,yes,"[4, 5, 6]",[3],is expected to,will,,,,
2,57,2,Modal Verb Changes,yes,[5],"[2, 3]",would,plans to,,,,
3,108,2,Modal Verb Changes,yes,"[6, 7]","[6, 7, 8]",were dispatched,will be sent,,,,
4,110,2,Modal Verb Changes,yes,[11],[10],may,might,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
179,5623,2,Modal Verb Changes,yes,"[8, 9]","[10, 11, 12]",will address,would participate in,,,,
180,5665,2,Modal Verb Changes,yes,"[3, 4, 5]",[4],could have been,was,,,,
181,5691,2,Modal Verb Changes,yes,"[4, 5]",[10],could bring,bringing,,,,
182,5712,2,Modal Verb Changes,yes,"[3, 4]",[2],would give,gives,,,,


In [32]:
substitute(2, trimmed)

Double check type overwriting: row 56, sentence 1
Common indices: [5]
Double check type overwriting: row 56, sentence 2
Common indices: [2, 3]
Double check type overwriting: row 107, sentence 1
Common indices: [7]
Double check type overwriting: row 107, sentence 2
Common indices: [8]
Double check type overwriting: row 601, sentence 1
Common indices: [2, 3, 4, 5, 6]
Double check type overwriting: row 601, sentence 2
Common indices: [3]
Double check type overwriting: row 651, sentence 1
Common indices: [17, 18, 19]
Double check type overwriting: row 651, sentence 2
Common indices: [1]
Double check type overwriting: row 676, sentence 1
Common indices: [8]
Double check type overwriting: row 676, sentence 2
Common indices: [4, 5, 6, 7]


Double check type overwriting: row 733, sentence 1
Common indices: [3, 4]
Double check type overwriting: row 733, sentence 2
Common indices: [2]
Double check type overwriting: row 799, sentence 1
Common indices: [8]
Double check type overwriting: row 799, sentence 2
Common indices: [8, 9, 10]
Double check type overwriting: row 896, sentence 1
Common indices: [22]
Double check type overwriting: row 896, sentence 2
Common indices: [23, 24]
Double check type overwriting: row 1390, sentence 1
Common indices: [6]
Double check type overwriting: row 1390, sentence 2
Common indices: [10]
Double check type overwriting: row 1484, sentence 1
Common indices: [2, 3]
Double check type overwriting: row 1484, sentence 2
Common indices: [11]
Double check type overwriting: row 1599, sentence 1
Common indices: [13]
Double check type overwriting: row 1599, sentence 2
Common indices: [8, 9, 10, 11, 12]
Double check type overwriting: row 1724, sentence 1
Common indices: [4, 5, 6]
Double check type overwriti

#### Named Entity Substitution

### Diagnosing

Run these cells to make sure everything looks OK after reannotating

In [33]:
positives.loc[positives['idx'] == 2953, 'sentence2_scope'].iloc[0]

array(['', '', '', '', '', '', '', '', '5_0 & 6_0', '5_0 & 6_0', '',
       '5_1', '', '', '5_2', '', '5_3', '5_3', ''], dtype='<U64')

In [34]:
subset = textual_paraphrases[(textual_paraphrases['pair_id'] == 449+1) & (textual_paraphrases['type_id'] == int(5))]
subset

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
1728,450,5,Same Polarity Substitution (habitual),yes,"[20, 21]","[13, 14]",Web sites,Web pages,,,,
1729,450,5,Same Polarity Substitution (habitual),yes,[5],[22],warns,notice,,,,


In [35]:
# For diagnosing
textual_paraphrases.loc[(textual_paraphrases['pair_id'] == 449+1) & (textual_paraphrases['type_id'].isin([1,5,3,26]))]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
1728,450,5,Same Polarity Substitution (habitual),yes,"[20, 21]","[13, 14]",Web sites,Web pages,,,,
1729,450,5,Same Polarity Substitution (habitual),yes,[5],[22],warns,notice,,,,
1730,450,3,Derivational Changes,yes,[5],[22],warns,notice,,,,
1732,450,26,Change of order,yes,"[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...",changes to Internet Explorer may affect a `` l...,changes to IE `` may affect a large number of ...,,,,


In [36]:
#TODO: Modal verb needs trimming.
#TODO: So does some other one that I forget rn
#TODO: Numbers on named entity substitution
#TODO: Figure out what's the matter with punctuation changes -- it's the key! Maybe I can use those as indices...

In [37]:
def print_sents(idx):
    print(positives.loc[positives['idx'] == idx, 'sentence1'].iloc[0])
    print(positives.loc[positives['idx'] == idx, 'sentence2'].iloc[0])

In [38]:
print_sents(112)
print_sents(96)

Downstream at Mount Vernon, the Skagit River was expected to crest at 36 feet -- 8 feet above flood stage -- tonight, Burke said.
The Skagit was expected to crest during the night at 38 feet at Mount Vernon, 10 feet above flood stage, the National Weather Service said.
Shares of Hartford rose $2.88 to $46.50 in New York Stock Exchange composite trading.
Shares of Hartford were up $2.28, or 5.2 percent, to $45.90 in midday trading.


In [39]:
textual_paraphrases[(textual_paraphrases['pair_id'] == 113)]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
412,113,7,Same Polarity Substitution (named ent.),yes,"[6, 7]",[1],Skagit River,Skagit,,,,
413,113,7,Same Polarity Substitution (named ent.),yes,[16],[16],8,10,,,,
414,113,6,Same Polarity Substitution (contextual),yes,[22],"[7, 8]",tonight,the night,,,,
415,113,7,Same Polarity Substitution (named ent.),yes,[24],"[22, 23, 24, 25]",Burke,the National Weather Service,,,,
416,113,26,Change of order,yes,"[1, 2, 3, 4]","[12, 13, 14]","at Mount Vernon ,",at Mount Vernon,,,,
417,113,26,Change of order,yes,[22],"[6, 7, 8]",tonight,during the night,,,,
418,113,25,Addition/Deletion,yes,[0],,Downstream,,,,,
419,113,29,Identity,yes,"[5, 8, 9, 10, 11, 12, 14, 17, 18, 19, 20, 23, ...","[0, 2, 3, 4, 5, 9, 11, 17, 18, 19, 20, 21, 26,...",the was expected to crest at feet feet above f...,The was expected to crest at feet feet above f...,,,,
420,113,30,Non-paraphrase,yes,[13],[10],36,38,,,,
421,113,21,Punctuation changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","Downstream at Mount Vernon , the Skagit River ...",The Skagit was expected to crest during the ni...,1521.0,15.0,-- --,","


In [40]:
textual_paraphrases[(textual_paraphrases['type_id'] == 7)][:50]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
40,15,7,Same Polarity Substitution (named ent.),yes,[7],[1],Hussein,Saddam,,,,
42,15,7,Same Polarity Substitution (named ent.),yes,"[6, 7]",[1],Mr. Hussein,Saddam,,,,
115,39,7,Same Polarity Substitution (named ent.),yes,[6],[5],JCP,JCP.N,,,,
116,39,7,Same Polarity Substitution (named ent.),yes,[12],[9],WAG,WAG.N,,,,
134,45,7,Same Polarity Substitution (named ent.),yes,[1],[1],US,Americas,,,,
152,49,7,Same Polarity Substitution (named ent.),yes,[11],"[12, 13, 14, 15, 16, 17, 18]",PEP.N,nyse : PEP - news - people,,,,
206,60,7,Same Polarity Substitution (named ent.),yes,[7],"[7, 8, 9]",770,at least 767,,,,
263,76,7,Same Polarity Substitution (named ent.),yes,[0],"[4, 5]",Thomas,Mr. Thomas,,,,
264,76,7,Same Polarity Substitution (named ent.),yes,[2],"[7, 8]",Tauzin,Mr. Tauzin,,,,
296,83,7,Same Polarity Substitution (named ent.),yes,[21],"[17, 18]",3km,two miles,,,,


In [41]:
positives['sentence1_scope'][0]

array(['3_0', '3_0', '3_0', '3_0', '', '4_0', '', '5_0', '', '', '', '',
       '', '', '', '', '', '', ''], dtype='<U64')

Flagged rows:

2432, 5074, 12186


Sentence modality changes have zero ocurrences among paraphrases

# The garbage pail

In [42]:
auxiliaries = ['are', 'am', 'be', 'been', 'being', 'had', 'has', 'have', 'having', 'is', 'was', 'were']

Code that may or may not be useful will remain here for a while

Change of Order > Identity

Game plan:

Same Polarity Substitution > Derivational Changes > Inflectional Changes > ...Modal Verb Changes? > Change of Order (modified)

## Filtering

Helper methods for filtering the ETPC dataframe based on paraphrase types

In [43]:
def filter_contains(df, search_ids):
  """Returns an ETPC dataframe with rows where paraphrase_types_ids contains
  the search_ids. Use this to search for paraphrase pairs containing specific
  ids"""
  return df[df['ept_ids'].apply(lambda x: np.isin(search_ids, x))]

def filter_equals(df, search_ids):
  """Returns an ETPC dataframe with rows where paraphrase_types_ids EXACTLY 
  MATCHES the search_ids."""
  return df[df['ept_ids'].apply(lambda x: np.array_equal(x, search_ids))]

In [44]:
filter_contains(etpc, '3')

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_segment_location,sentence2_segment_location,sentence1_segment_location_indices,sentence2_segment_location_indices,sentence1_segment_text,sentence2_segment_text
142,143_142,Tyco later said the loan had not been forgiven...,"Tyco has said the loan was not forgiven, but t...","[Tyco, later, said, the, loan, had, not, been,...","[Tyco, has, said, the, loan, was, not, forgive...",1,1,"[Same Polarity Substitution (contextual), Infl...","[6, 1, 1, 3, 24, 25, 29, 21]","[0, 25, 1, 29, 29, 1, 29, 1, 1, 29, 6, 29, 29,...","[25, 1, 1, 25, 25, 1, 25, 1, 25, 6, 24, 29, 29...","[[10], [2], [5, 7, 8], [15], [11, 12, 13, 14, ...","[[9], [1, 2], [5, 7], [12], [10, 11, 12, 13, 1...","[and, said, had been forgiven, full, Swartz re...","[but, has said, was forgiven, fully, that Swar..."
149,150_149,She estimated it would take three months and w...,She said it would take an estimated three mont...,"[She, estimated, it, would, take, three, month...","[She, said, it, would, take, an, estimated, th...",1,1,"[Synthetic/analytic substitution, Derivational...","[11, 3, 16, 25, 25, 29]","[25, 3, 25, 16, 25, 25, 25, 25, 16, 25, 11, 25...","[0, 0, 0, 16, 16, 0, 3, 0, 0, 25, 25, 0, 16, 1...","[[10], [1], [3, 4, 8, 9], [0, 2, 4, 5, 6, 7, 9...","[[13, 14], [6], [3, 4, 12], [9, 10]]","[cancellation, estimated, would take would req...","[the cancellation, estimated, would take requi..."
238,239_238,Saddam loyalists have been blamed for sabotagi...,Hussein loyalists have been blamed for sabotag...,"[Saddam, loyalists, have, been, blamed, for, s...","[Hussein, loyalists, have, been, blamed, for, ...",1,1,"[Same Polarity Substitution (named ent.), Deri...","[7, 3, 8, 25, 29, 21]","[7, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 0,...","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[[0], [16], [18], [12, 13, 14], [1, 2, 3, 4, 5...","[[0], [12], [13], [1, 2, 3, 4, 5, 6, 7, 8, 9, ...","[Saddam, attacks, U.S., as well as, loyalists ...","[Hussein, attacking, US, loyalists have been b..."
254,255_254,"""It's amazing to be part of an industry that r...","""It's amazing to be part of an industry that r...","[``, It, 's, amazing, to, be, part, of, an, in...","[``, It, 's, amazing, to, be, part, of, an, in...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 5, 3, 29]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[[24], [22, 23], [23], [0, 1, 2, 3, 4, 5, 6, 7...","[[22], [20, 21, 27, 28], [21], [0, 1, 2, 3, 4,...","[of, recent graduate, graduate, `` It 's amazi...","[from, only graduated last May, graduated, `` ..."
286,287_286,The search was concentrated in northeast Penns...,The search was concentrated in northeastern Pe...,"[The, search, was, concentrated, in, northeast...","[The, search, was, concentrated, in, northeast...",1,1,"[Derivational Changes, Addition/Deletion, Iden...","[3, 25, 29, 28]","[29, 29, 29, 29, 29, 3, 29, 29, 29, 29, 29, 29...","[25, 25, 25, 25, 25, 3, 25, 25, 25, 25, 25, 25...","[[5], [23, 24], [0, 1, 2, 3, 4, 6, 7, 8, 9, 10...","[[5], [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, ...","[northeast, by now, The search was concentrate...","[northeastern, The search was concentrated in ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5621,5622_5621,Palm Wednesday announced plans to acquire Hand...,Palm said on Wednesday it plans to buy Handspr...,"[Palm, Wednesday, announced, plans, to, acquir...","[Palm, said, on, Wednesday, it, plans, to, buy...",0,1,"[Same Polarity Substitution (habitual), Same P...","[5, 5, 6, 11, 3, 26, 29, 30]","[29, 26, 5, 3, 29, 5, 29, 29, 29, 29, 6, 29, 2...","[29, 5, 26, 26, 0, 3, 29, 5, 29, 29, 29, 29, 6...","[[2], [5], [10], [1], [3], [1], [0, 4, 6, 7, 8...","[[1], [7], [12], [2, 3], [5], [2, 3], [0, 6, 8...","[announced, acquire, started, Wednesday, plans...","[said, buy, created, on Wednesday, plans, on W..."
5702,5703_5702,Some opposition leaders said they would reserv...,Some opposition leaders called for withdrawing...,"[Some, opposition, leaders, said, they, would,...","[Some, opposition, leaders, called, for, withd...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 3, 26, 25, 29]","[26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 2...","[6, 6, 6, 25, 25, 3, 25, 0, 0, 26, 26, 26, 26,...","[[0, 1, 2], [14], [19], [0, 1, 2, 3, 4, 5, 6, ...","[[9], [0, 1, 2], [5], [9, 10, 11, 12, 13, 14, ...","[Some opposition leaders, others, withdrawal, ...","[others, Some opposition leaders, withdrawing,..."
5709,5710_5709,Women who eat potatoes and other tuberous vege...,Australian researchers believe they have found...,"[Women, who, eat, potatoes, and, other, tubero...","[Australian, researchers, believe, they, have,...",1,1,"[Same Polarity Substitution (named ent.), Same...","[7, 5, 3, 24, 18, 26, 25, 29]","[26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 2...","[7, 0, 0, 25, 25, 25, 26, 26, 26, 26, 26, 26, ...","[[23], [0], [15], [0, 1, 2, 3, 4, 5, 6, 7, 8, ...","[[0], [16], [7], [6, 7, 8, 9, 10, 11, 12, 13, ...","[Melbourne, Women, triggering, Women who eat p...","[Australian, mothers, trigger, a trigger of ty..."
5712,5713_5712,There is only one drug on the market for macul...,There is only one drug on the market for macul...,"[There, is, only, one, drug, on, the, market, ...","[There, is, only, one, drug, on, the, market, ...",1,1,"[Derivational Changes, Subordination and nesti...","[3, 18, 25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[17], [19, 20, 21, 22, 23, 24, 25, 26], [18]]","[[18], [20, 21, 22, 23, 24, 25, 26], [0, 1, 2,...","[treat, one subtype that represents a minority...","[treatment, one subtype representing a minorit..."


In [45]:
filter_equals(etpc, ['25', '29'])

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_segment_location,sentence2_segment_location,sentence1_segment_location_indices,sentence2_segment_location_indices,sentence1_segment_text,sentence2_segment_text
167,168_167,U.S. law enforcement officials are sneering at...,U.S. law enforcement officials are sneering at...,"[U.S., law, enforcement, officials, are, sneer...","[U.S., law, enforcement, officials, are, sneer...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[[15, 16, 17, 18, 19, 20, 21, 22, 23]]",[U.S. law enforcement officials are sneering a...,[-- including a police conspiracy to discredit...
645,646_645,I called the number and the lady told me she w...,I called the number and the lady told me she w...,"[I, called, the, number, and, the, lady, told,...","[I, called, the, number, and, the, lady, told,...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[18, 20, 21, 22, 23, 24]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[, Sherry Studabaker told BBC television, I ca...",[I called the number and the lady told me she ...
1017,1018_1017,He said the problem needs to be corrected befo...,He said the prob lem needs to be corrected bef...,"[He, said, the, problem, needs, to, be, correc...","[He, said, the, prob, lem, needs, to, be, corr...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[13, 14, 15]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[is cleared to, He said the problem needs to b...",[He said the prob lem needs to be corrected be...
2046,2047_2046,Other recommendations included a special couns...,Other recommendations included the creation of...,"[Other, recommendations, included, a, special,...","[Other, recommendations, included, the, creati...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, 25, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...",[a special counsel on oceans in the White Hous...,[Other recommendations included the creation o...
2063,2064_2063,"""For me, the Lewinsky imbroglio seemed like ju...","""For me, the Lewinsky imbroglio seemed like ju...","[``, For, me, ,, the, Lewinsky, imbroglio, see...","[``, For, me, ,, the, Lewinsky, imbroglio, see...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[17, 19, 20, 21, 22, 23]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[, according to extracts leaked yesterday, `` ...","[`` For me , the Lewinsky imbroglio seemed lik..."
2180,2181_2180,"And in the Muslim world, Osama bin Laden is be...","And in the Muslim world, Osama bin Laden, the ...","[And, in, the, Muslim, world, ,, Osama, bin, L...","[And, in, the, Muslim, world, ,, Osama, bin, L...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[[9, 10, 11, 12, 13, 14, 15, 16, 17, 18]]","[And in the Muslim world , Osama bin Laden is ...","[, the missing leader of the al-Qaida terroris..."
2229,2230_2229,This is a process and there will be other oppo...,This is a process and there will be other oppo...,"[This, is, a, process, and, there, will, be, o...","[This, is, a, process, and, there, will, be, o...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[21, 22, 23]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[he told reporters, This is a process and ther...",[This is a process and there will be other opp...
2282,2283_2282,"""Right from the beginning, we didn't want to s...","But Mr. Crosby told The Associated Press: ""Rig...","[``, Right, from, the, beginning, ,, we, did, ...","[But, Mr., Crosby, told, The, Associated, Pres...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[[0, 1, 2, 3, 4, 5, 6, 7]]","[`` Right from the beginning , we did n't want...","[But Mr. Crosby told The Associated Press :, `..."
2703,2704_2703,It's almost as if they (Russians) hit an x-mar...,It's almost as if they (Russians) hit an x-mar...,"[It, 's, almost, as, if, they, (, Russians, ),...","[It, 's, almost, as, if, they, (, Russians, ),...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[15, 17, 18, 19, 20, 21]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[, NASA spokesman Robert Navias said, It 's al...",[It 's almost as if they ( Russians ) hit an x...
2786,2787_2786,"""This puts telemarketers on notice that we wil...","""This puts telemarketers on notice that we wil...","[``, This, puts, telemarketers, on, notice, th...","[``, This, puts, telemarketers, on, notice, th...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[[25, 27, 28, 29, 30, 31]]",[`` This puts telemarketers on notice that we ...,"[, FCC chairman Michael Powell said, `` This p..."
