# Imports

In [1]:
import copy
import pandas as pd
import numpy as np

# Reading the ETPC

This is the ETPC dataset compiled by Wahle and posted on HuggingFace

In [2]:
# Unpickle etpc_raw
etpc = pd.read_pickle('datasets/etpc_raw.pkl')

These are the XML files from the ETPC github repo.

The first one contains all pairs marked as paraphrases by the MRPC:

In [3]:
textual_paraphrases = pd.read_xml('datasets/etpc/textual_paraphrases.xml')
# Convert scopes from strings to lists of ints
textual_paraphrases['s1_scope'] = textual_paraphrases['s1_scope'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)
textual_paraphrases['s2_scope'] = textual_paraphrases['s2_scope'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)
textual_paraphrases['key_s1'] = textual_paraphrases['key_s1'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)
textual_paraphrases['key_s2'] = textual_paraphrases['key_s2'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)

The second one contains the text and pair ids for *all* sentence pairs (paraphrases or not). It doesn't contain any data on whether they're paraphrases or not, or what EPT types are in them.

In [4]:
pairs = pd.read_xml('datasets/etpc/text_pairs.xml')
pairs.drop(columns=['negation'], inplace=True)
pairs.set_index('pair_id', inplace=True)

# Cleanup

## Cleaning up Columns

In [5]:
etpc.rename(columns={'paraphrase_type_ids': 'ept_ids', 'paraphrase_types': 'ept_names'}, inplace=True)
etpc.drop(columns={'negation'}, axis=1, inplace=True)
etpc

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_segment_location,sentence2_segment_location,sentence1_segment_location_indices,sentence2_segment_location_indices,sentence1_segment_text,sentence2_segment_text
0,1_0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]","[26, 26, 26, 26, 0, 5, 0, 6, 25, 25, 25, 25, 2...","[6, 5, 5, 0, 25, 0, 0, 0, 0, 0, 26, 26, 26, 26...","[[5], [7], [0, 1, 2, 3], [8, 9, 10, 11, 12, 13...","[[1, 2], [0], [10, 11, 12, 13], [4]]","[whom, called, Amrozi accused his brother, `` ...","[to him, Referring, Amrozi accused his brother..."
1,2_1,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...,"[Yucaipa, owned, Dominick, 's, before, selling...","[Yucaipa, bought, Dominick, 's, in, 1995, for,...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],[],[],[]
2,3_2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]","[6, 0, 0, 0, 0, 0, 0, 0, 26, 26, 26, 0, 0, 0, ...","[26, 26, 26, 26, 6, 6, 6, 6, 25, 25, 25, 25, 2...","[[0], [14], [8, 9, 10], [17, 18, 19]]","[[4, 5, 6, 7], [18], [0, 1, 2, 3], [8, 9, 10, ...","[They, cargo, on June 10, , he added, had publ...","[the ship 's owners, explosives, On June 10 ,,..."
3,4_3,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ...","[Around, 0335, GMT, ,, Tab, shares, were, up, ...","[Tab, shares, jumped, 20, cents, ,, or, 4.6, %...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],[],[],[]
4,5_4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...",0,1,"[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 6, 5, 29, 30, 0, 29, 25, 30, 29, 0, 6, 6, ...","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3...","[[0, 1], [2], [11, 12, 14], [13], [13], [7], [...","[[0, 1, 2, 3, 4], [5], [11], [20, 21], [20, 21...","[The stock, rose, to close at, Friday, Friday,...","[PG & E Corp. shares, jumped, to, on Friday, o..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5796,5797_5796,"After Hughes refused to rehire Hernandez, he c...",Hernandez filed an Equal Employment Opportunit...,"[After, Hughes, refused, to, rehire, Hernandez...","[Hernandez, filed, an, Equal, Employment, Oppo...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[],[],[],[]
5797,5798_5797,There are 103 Democrats in the Assembly and 47...,Democrats dominate the Assembly while Republic...,"[There, are, 103, Democrats, in, the, Assembly...","[Democrats, dominate, the, Assembly, while, Re...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[],[],[],[]
5798,5799_5798,Bethany Hamilton remained in stable condition ...,"Bethany, who remained in stable condition afte...","[Bethany, Hamilton, remained, in, stable, cond...","[Bethany, ,, who, remained, in, stable, condit...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],[],[],[]
5799,5800_5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[26, 26, 0, 7, 7, 6, 1, 25, 25, 4, 25, 25, 25,...","[25, 25, 25, 25, 7, 0, 6, 1, 0, 0, 4, 25, 0, 2...","[[9], [3, 4], [5], [6], [0, 1], [24], [7, 8, 1...","[[10], [4], [6], [7], [13, 14], [0, 1, 2, 3], ...","[Corp, power station’s, US, owners, Last week,...","[Corp., Drax, American, owner, last week, The ..."


# Remapping paraphrase IDs


First, make a list with paraphrase types and IDs from the ETPC:

In [6]:
id_map = pd.read_xml('https://raw.githubusercontent.com/venelink/ETPC/master/Corpus/paraphrase_types.xml')
# Rename columns for clarity
id_map.rename(columns={'type_id': 'ept_id', 'type_name': 'ept_name'}, inplace=True)
# Drop unused data
id_map = id_map[['ept_id', 'ept_name']] # No use for type_category column
id_map.drop(id_map.tail(2).index,inplace=True) # Types don't appear in ETPC
id_map.style.hide(axis="index")
id_map

Unnamed: 0,ept_id,ept_name
0,1,Inflectional Changes
1,2,Modal Verb Changes
2,3,Derivational Changes
3,4,Spelling changes
4,5,Same Polarity Substitution (habitual)
5,6,Same Polarity Substitution (contextual)
6,7,Same Polarity Substitution (named ent.)
7,8,Change of format
8,9,Opposite polarity substitution (habitual)
9,10,Opposite polarity substitution (contextual)


Now, make a list with paraphrase names and IDs for ParaOp types

In [7]:
data = [[0, 'No change'],
        [1, 'Addition/Deletion - Function Word'],
        [2, 'Addition/Deletion - Content Word'],
        [3, 'Change of Order'],
        [4, 'Substitution - Synonym'],
        [5, 'Substitution - Contextual Synonym'],
        [6, 'Substitution - Morphological'],
        [7, 'Substitution - Spelling and Format'],
        [8, 'Addition/Deletion - Punctuation']
       ]
paraop_map = pd.DataFrame(data, columns = ['paraop_id', 'paraop_name'])
paraop_map.set_index('paraop_id', inplace=True)
paraop_map

Unnamed: 0_level_0,paraop_name
paraop_id,Unnamed: 1_level_1
0,No change
1,Addition/Deletion - Function Word
2,Addition/Deletion - Content Word
3,Change of Order
4,Substitution - Synonym
5,Substitution - Contextual Synonym
6,Substitution - Morphological
7,Substitution - Spelling and Format
8,Addition/Deletion - Punctuation


## Mapping

We'll use the dataframe below for mapping. Each row will contain the name and ID of a paraphrase type in the ETPC, and the name and ID of the correspondent ParaOp type.

In [8]:
id_map['paraop_id'] = ''
id_map['paraop_name'] = ''
id_map

Unnamed: 0,ept_id,ept_name,paraop_id,paraop_name
0,1,Inflectional Changes,,
1,2,Modal Verb Changes,,
2,3,Derivational Changes,,
3,4,Spelling changes,,
4,5,Same Polarity Substitution (habitual),,
5,6,Same Polarity Substitution (contextual),,
6,7,Same Polarity Substitution (named ent.),,
7,8,Change of format,,
8,9,Opposite polarity substitution (habitual),,
9,10,Opposite polarity substitution (contextual),,


Here's where we do the mapping:

In [9]:
# Helper function to map an ETPC id to a Paraop id
def map_id(ept_id, paraop_id):
    """Given an EPT id and a Paraop id, look up the name of the Paraop id and 
    fill in the rows of id_map with paraop_id and the name."""
    id_map.loc[id_map['ept_id'] == ept_id, 'paraop_id'] = paraop_id
    id_map.loc[id_map['ept_id'] == ept_id, 'paraop_name'] = paraop_map.loc[paraop_id, 'paraop_name']

In [10]:
map_id(ept_id=1, paraop_id=6)
map_id(ept_id=3, paraop_id=6)
map_id(ept_id=26, paraop_id=3)
map_id(ept_id=29, paraop_id=0)
map_id(4, 7)
map_id(5, 4)
map_id(6, 5)
map_id(8, 7)
map_id(9, 4)
map_id(2, 5)
map_id(7, 5)
map_id(13, 5)
id_map.style.hide(axis="index")

ept_id,ept_name,paraop_id,paraop_name
1,Inflectional Changes,6.0,Substitution - Morphological
2,Modal Verb Changes,5.0,Substitution - Contextual Synonym
3,Derivational Changes,6.0,Substitution - Morphological
4,Spelling changes,7.0,Substitution - Spelling and Format
5,Same Polarity Substitution (habitual),4.0,Substitution - Synonym
6,Same Polarity Substitution (contextual),5.0,Substitution - Contextual Synonym
7,Same Polarity Substitution (named ent.),5.0,Substitution - Contextual Synonym
8,Change of format,7.0,Substitution - Spelling and Format
9,Opposite polarity substitution (habitual),4.0,Substitution - Synonym
10,Opposite polarity substitution (contextual),,


TODO: Figure out a way to hide index of map_id throughout whole notebook. For some reason this seems harder than it needs to be...

Helper function to convert an ETPC ID to a Paraop ID

In [11]:
# Helper function to get a Paraop id from an ETPC id
def ept_to_paraop(ept_id):
    return id_map.loc[id_map['ept_id'] == ept_id, 'paraop_id'].iloc[0]

ept_to_paraop(3)

6

# Reannotation

## Creating positives dataframe

In [12]:
positives = etpc.loc[etpc['mrpc_label'] == 1]
positives.rename(columns={'sentence1_segment_location': 'sentence1_scope_etpc', 
                          'sentence2_segment_location': 'sentence2_scope_etpc'}, inplace=True)
positives.drop(columns=['sentence1_segment_location_indices', 'sentence2_segment_location_indices'],inplace=True)
positives['idx'] = positives.index.to_series()
positives

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives.rename(columns={'sentence1_segment_location': 'sentence1_scope_etpc',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives.drop(columns=['sentence1_segment_location_indices', 'sentence2_segment_location_indices'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['idx'] = positives.index.to_series()


Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_scope_etpc,sentence2_scope_etpc,sentence1_segment_text,sentence2_segment_text
0,0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]","[26, 26, 26, 26, 0, 5, 0, 6, 25, 25, 25, 25, 2...","[6, 5, 5, 0, 25, 0, 0, 0, 0, 0, 26, 26, 26, 26...","[whom, called, Amrozi accused his brother, `` ...","[to him, Referring, Amrozi accused his brother..."
2,2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]","[6, 0, 0, 0, 0, 0, 0, 0, 26, 26, 26, 0, 0, 0, ...","[26, 26, 26, 26, 6, 6, 6, 6, 25, 25, 25, 25, 2...","[They, cargo, on June 10, , he added, had publ...","[the ship 's owners, explosives, On June 10 ,,..."
4,4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...",0,1,"[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 6, 5, 29, 30, 0, 29, 25, 30, 29, 0, 6, 6, ...","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3...","[The stock, rose, to close at, Friday, Friday,...","[PG & E Corp. shares, jumped, to, on Friday, o..."
5,5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...",1,1,"[Synthetic/analytic substitution, Addition/Del...","[11, 25, 29]","[25, 11, 11, 11, 11, 11, 11, 11, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 11, 11...","[in the first quarter of the year, Revenue dro...","[the first quarter of the year, With the scand..."
7,7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 5, 25, 25, 29]","[25, 4, 25, 25, 25, 25, 5, 25, 25, 25]","[25, 4, 4, 25, 25, 25, 25, 25, 5, 25, 25, 25]","[DVD-CCA, state, then, The appealed to the Sup...","[DVD CCA, U.S., that decision, The appealed to..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5792,5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...",1,1,"[Addition/Deletion, Identity, Punctuation chan...","[25, 29, 21, 6, 11, 14, 26, 25]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[29, 29, 29, 29, 29, 29, 29, 26, 26, 26, 26, 2...","[authorities said, Gehring waived extradition ...",[Gehring waived extradition Monday during a he...
5793,5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...",1,1,"[Same Polarity Substitution (contextual), Chan...","[6, 26, 25, 29, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[Silver, Silver, `` I am advised that certain ...","[the Silver statement, the Silver statement, ,..."
5795,5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...",0,1,"[Same Polarity Substitution (habitual), Same P...","[5, 5, 18, 29, 30]","[29, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[29, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[deal, be completed, The deal , approved by bo...","[acquisition, close, The acquisition has been ..."
5799,5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[26, 26, 0, 7, 7, 6, 1, 25, 25, 4, 25, 25, 25,...","[25, 25, 25, 25, 7, 0, 6, 1, 0, 0, 4, 25, 0, 2...","[Corp, power station’s, US, owners, Last week,...","[Corp., Drax, American, owner, last week, The ..."


## Why we cannot use the ETPC from Wahle et al.

Here's a fundamental part of the ETPC that I hadn't realized until now: each token in a sentence can have *more than one* paraphrase type. Here's an example--note how, in sentence 2, token 5 appears in the scopes both of inflectional and derivational changes.

In [13]:
ric = textual_paraphrases.loc[(textual_paraphrases['pair_id'] == 4205+1) & (textual_paraphrases['type_id'].isin([3,1]))]
ric[:2]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
15963,4206,1,Inflectional Changes,yes,[3],"[3, 5]",completed,had inspected,,,,
15964,4206,3,Derivational Changes,yes,[4],[5],inspections,inspected,,,,


It seems that this issue also wasn't noticed by Wahle et al: some paraphrase scopes consist of only a single number repeated for the entirety of the list:

In [14]:
positives[positives['sentence1_scope_etpc'].apply(lambda x: (len(np.unique(x)) == 1))][:10]

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_scope_etpc,sentence2_scope_etpc,sentence1_segment_text,sentence2_segment_text
14,14,He told The Sun newspaper that Mr. Hussein's d...,"""Saddam's daughters had British schools and ho...","[He, told, The, Sun, newspaper, that, Mr., Hus...","[``, Saddam, 's, daughters, had, British, scho...",1,1,"[Same Polarity Substitution (named ent.), Same...","[7, 6, 7, 26, 25, 29, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[0, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26...","[Hussein, The Sun newspaper, Mr. Hussein, Mr. ...","[Saddam, The Sun, Saddam, Saddam 's daughters ..."
22,22,But tropical storm warnings and watches were p...,Tropical storm warnings were in place Thursday...,"[But, tropical, storm, warnings, and, watches,...","[Tropical, storm, warnings, were, in, place, T...",0,1,"[Addition/Deletion, Addition/Deletion, Identit...","[25, 25, 29, 30, 4, 6, 11, 17]","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[But, tropical storm warnings watches , the so...","[Jamaica and, storm warnings watches , the sou..."
35,35,Trading in Loral was halted yesterday; the sha...,The New York Stock Exchange suspended trading ...,"[Trading, in, Loral, was, halted, yesterday, ;...","[The, New, York, Stock, Exchange, suspended, t...",0,1,"[Same Polarity Substitution (habitual), Diathe...","[5, 14, 18, 29, 30, 21]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[halted, Trading in Loral was halted, Trading ...","[suspended, The New York Stock Exchange suspen..."
40,40,Last year the court upheld Cleveland's school ...,"Last year, the court ruled 5-4 in an Ohio case...","[Last, year, the, court, upheld, Cleveland, 's...","[Last, year, ,, the, court, ruled, 5-4, in, an...",1,1,"[Same Polarity Substitution (contextual), Infl...","[6, 1, 25, 25, 29, 28, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[29, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29, 25, 29...","[provide, choice, Last year that vouchers are ...","[provide with, choices, government, among a ra..."
63,63,Contrary to what PeopleSoft management would h...,Ellison said that contrary to the contentions ...,"[Contrary, to, what, PeopleSoft, management, w...","[Ellison, said, that, contrary, to, the, conte...",1,1,"[Addition/Deletion, Identity, Semantic based, ...","[25, 29, 28, 21]","[28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 2...","[25, 25, 0, 29, 29, 28, 28, 28, 28, 28, 29, 29...","[Contrary to , Oracle intends to fully support...","[Ellison said, contrary to , Oracle intends to..."
72,72,Also demonstrating box-office strength _ and g...,Also demonstrating box-office strength -- and ...,"[Also, demonstrating, box-office, strength, _,...","[Also, demonstrating, box-office, strength, --...",1,1,"[Spelling changes, Spelling changes, Identity,...","[4, 4, 29, 21, 21]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[O'Neill 's, Day 's, Also demonstrating box-of...","[ONeills, Days, Also demonstrating box-office ..."
86,86,Sales - a figure watched closely as a baromete...,It also disclosed that sales -- a figure close...,"[Sales, -, a, figure, watched, closely, as, a,...","[It, also, disclosed, that, sales, --, a, figu...",1,1,"[Same Polarity Substitution (habitual), Synthe...","[5, 11, 26, 25, 25, 25, 25, 29, 28, 21]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[25, 25, 25, 0, 25, 0, 25, 25, 26, 25, 25, 25,...","[rose, many industry experts, closely, 5 perce...","[were higher, industry experts, closely, by an..."
111,111,The suite comes complete with a word processor...,"The suite includes a word processor, spreadshe...","[The, suite, comes, complete, with, a, word, p...","[The, suite, includes, a, word, processor, ,, ...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 5, 6, 5, 11, 18, 25, 25, 29, 21, 21]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[comes complete with, software, utilizing, an,...","[includes, application, built around, the, XML..."
124,124,"Powell fired back: ""He's accusing the presiden...","If so, Powell said, he's calling the president...","[Powell, fired, back, :, ``, He, 's, accusing,...","[If, so, ,, Powell, said, ,, he, 's, calling, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 25, 25, 25, 29, 21]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[accusing, he, , he said, Powell fired back :,...","[calling, Powell, , Powell said ,, , too, If s..."
126,126,The memo on protecting sales of Windows and ot...,"The memo specifically mentioned Linux, a still...","[The, memo, on, protecting, sales, of, Windows...","[The, memo, specifically, mentioned, Linux, ,,...",1,1,"[Addition/Deletion, Addition/Deletion, Identity]","[25, 25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...",[on protecting sales of Windows and other desk...,"[specifically, The memo mentioned Linux , a st..."


The issue also exists in part in the original ETPC: some paraphrase types have scopes annotated as pretty much the entire sentence. This seems especially prevalent among 'Punctuation changes'.

TODO: rewrite this, show examples 

While this is certainly an issue for the original ETPC, it's at least partly offset there since their annotation scheme has separate scopes for each paraphrase type. So even if the annotated scope of some given type isn't very informative, the entire sentence isn't lost: you'd still have other paraphrase types, which are most likely annotated correctly. But Wahle's dataset (and consequently his training pipeline) doesn't account for this. Whatever process Wahle et al. used for generating that dataset on Huggingface seems to have an especially hard time with sentences in the original ETPC as exemplified above, but the issue happens throughout *all* their dataset.

## Getting paraphrases from the original ETPC

Let's first clean up the dataset

In [15]:
positives.drop(columns=['idx', 'etpc_label', 'mrpc_label', 
                                       'sentence1_scope_etpc', 
                                       'sentence2_scope_etpc', 
                                       'sentence1_segment_text', 
                                       'sentence2_segment_text'])

Unnamed: 0,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,ept_names,ept_ids
0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...","[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]"
2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...","[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]"
4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...","[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]"
5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...","[Synthetic/analytic substitution, Addition/Del...","[11, 25, 29]"
7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...","[Spelling changes, Same Polarity Substitution ...","[4, 5, 25, 25, 29]"
...,...,...,...,...,...,...
5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...","[Addition/Deletion, Identity, Punctuation chan...","[25, 29, 21, 6, 11, 14, 26, 25]"
5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...","[Same Polarity Substitution (contextual), Chan...","[6, 26, 25, 29, 21]"
5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...","[Same Polarity Substitution (habitual), Same P...","[5, 5, 18, 29, 30]"
5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...","[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]"


We'll need a column to house the new scopes. Let's initialize that column with empty arrays for each token in the sentence. That way, we can easily tell which tokens haven't been annotated yet.

In [16]:
#TODO: get rid of SettingWithCopyWarning
positives['sentence1_scope'] = positives['sentence1_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))
positives['sentence2_scope'] = positives['sentence2_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))
positives['sentence1_scope'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['sentence1_scope'] = positives['sentence1_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['sentence2_scope'] = positives['sentence2_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))


array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', ''], dtype='<U64')

### Populating types

Helper function to populate type

In [17]:
# TODO: Convert to df apply (rather than series apply on idx)
# TODO: Figure out if 64 char limit will be an issue

def populate_type(idx, ept_id, lookup_df=textual_paraphrases, manual = None, subs = True):
    """Given a paraphrase pair (idx) and an EPT paraphrase type (ept_id), convert the EPT type to Paraop, look up the 
    scopes for both sentences in the pair, and fill in the scopes with the Paraop type. Returns a pair of arrays with
    the newly annotated scopes.
    
    The 'manual' argument controls whether we manually specify what the Paraop ID will be, or whether we automatically 
    get the Paraop ID from id_map.
    """
    
    paraop_id = manual if manual else ept_to_paraop(ept_id) 

    # Copy array to avoid messing up the originals
    array1 = np.copy(positives['sentence1_scope'][idx])
    array2 = np.copy(positives['sentence2_scope'][idx])
    
    # Create a subset of the lookup array containing only the paraphrase types
    # we are interested in (ept_id)
    subset = lookup_df[(lookup_df['pair_id'] == idx+1) & (lookup_df['type_id'] == int(ept_id))]
    subset.reset_index(drop=True, inplace=True)
    instances = len(subset['type_id'].values) # Count how many discrete instances of that type are there in this pair

    def fill(sentence_n, instance, array, scope, subs):
        """Helper function for filling in ids"""

        # Do nothing if scope is None
        if scope.tolist() is None:
            return
        
        # Identify which indices in the array have not been filled yet
        empty = np.where(array == '')[0]
        nonempty = np.where(array != '')[0]

        # Fill in empty entries
        if len(scope) > 0:
            empty_intersect = np.intersect1d(scope, empty)
            array[empty_intersect] = f'{paraop_id}_{instance}'

        if subs:    
            # Append to non-empty entries
            nonempty_intersect = np.intersect1d(scope, nonempty)
            if len(nonempty_intersect) > 0:
                # TODO: Log this in a better way (save to a file instead of just printing)
                print(f'Double check type overwriting: row {idx}, sentence {sentence_n}')
                print(f'Common indices: {list(nonempty_intersect)}', end=' | ')
                print(f'Pre-existing types: {array[nonempty_intersect]}')
                array[nonempty_intersect] = np.char.add(array[nonempty_intersect], f' & {paraop_id}_{instance}')

    # Filling in
    for i in range(instances):
        # Get scopes from lookup df
        s1_scope = np.array(subset.loc[subset['type_id'] == ept_id, 's1_scope'].iloc[i])
        s2_scope = np.array(subset.loc[subset['type_id'] == ept_id, 's2_scope'].iloc[i])
        
        fill(1, i, array1, s1_scope, subs)
        fill(2, i, array2, s2_scope, subs)
    
    return array1, array2

Here's a demo of how the outputs to that function look like:

In [18]:
populate_type(0, 26)

(array(['3_0', '3_0', '3_0', '3_0', '', '', '', '', '', '', '', '', '', '',
        '', '', '', '', ''], dtype='<U64'),
 array(['', '', '', '', '', '', '', '', '', '', '3_0', '3_0', '3_0', '3_0',
        '', '', '', '', '', ''], dtype='<U64'))

`populate_type` returns new arrays, it doesn't modify the original df. Use the function below to actually modify the df

In [19]:
def substitute(ept_id, lookup_df=textual_paraphrases, manual = None, subs = True):
    series = positives['idx'].apply(populate_type, ept_id=ept_id, lookup_df=lookup_df, manual = manual, subs = subs)
    cols = pd.DataFrame(series.tolist(), columns=['sentence1', 'sentence2'])
    positives.loc[:, 'sentence1_scope'] = cols['sentence1'].values
    positives.loc[:, 'sentence2_scope'] = cols['sentence2'].values

## Performing the reannotation

Helper functions:

In [20]:
def print_sents(idx: int):
    """Prints both sentences in a sentence pair, given the pair's id"""
    idx -= 1
    print(positives.loc[positives['idx'] == idx, 'sentence1'].iloc[0])
    print(positives.loc[positives['idx'] == idx, 'sentence2'].iloc[0])

In [21]:
def duplicate_df(df: pd.DataFrame):
    """Returns a deep copy of a dataframe"""
    return pd.DataFrame(columns = df.columns, data = copy.deepcopy(df.values))

In [22]:
def split_add_sub(df: pd.DataFrame):
    """Splits a dataframe into two dataframes: one containing types to be annotated as Addition/Deletion, and another
    containing types to be annotated as Substitution."""
    add_del = df[(df['s1_scope'].isnull()) | (df['s2_scope'].isnull())]
    subs = df[~((df['s1_scope'].isnull()) | (df['s2_scope'].isnull()))]
    return add_del, subs

### Change of order

In [23]:
substitute(26)

Double check type overwriting: row 196, sentence 1
Common indices: [2] | Pre-existing types: ['3_0']
Double check type overwriting: row 196, sentence 2
Common indices: [9] | Pre-existing types: ['3_0']
Double check type overwriting: row 411, sentence 1
Common indices: [0, 1, 2, 3, 4] | Pre-existing types: ['3_0' '3_0' '3_0' '3_0' '3_0']
Double check type overwriting: row 411, sentence 2
Common indices: [12, 13, 14, 15] | Pre-existing types: ['3_0' '3_0' '3_0' '3_0']
Double check type overwriting: row 1014, sentence 1
Common indices: [5] | Pre-existing types: ['3_0']
Double check type overwriting: row 1014, sentence 2
Common indices: [10, 11] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 1543, sentence 1
Common indices: [19] | Pre-existing types: ['3_0']
Double check type overwriting: row 1543, sentence 2
Common indices: [5] | Pre-existing types: ['3_0']
Double check type overwriting: row 1864, sentence 1
Common indices: [5, 6] | Pre-existing types: ['3_0' '3_0'

### Same Polarity Substitution (Habitual)

In [24]:
substitute(5)

Double check type overwriting: row 75, sentence 1
Common indices: [5, 6] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 75, sentence 2
Common indices: [0] | Pre-existing types: ['3_0']
Double check type overwriting: row 152, sentence 1
Common indices: [13, 14] | Pre-existing types: ['4_0' '4_0']
Double check type overwriting: row 152, sentence 2
Common indices: [10] | Pre-existing types: ['4_0']
Double check type overwriting: row 172, sentence 1
Common indices: [22] | Pre-existing types: ['3_0']
Double check type overwriting: row 172, sentence 2
Common indices: [17] | Pre-existing types: ['3_0']
Double check type overwriting: row 226, sentence 1
Common indices: [0, 1, 2, 3] | Pre-existing types: ['3_0' '3_0' '3_0' '3_0']
Double check type overwriting: row 226, sentence 2
Common indices: [2] | Pre-existing types: ['3_0']
Double check type overwriting: row 310, sentence 1
Common indices: [9] | Pre-existing types: ['3_0']
Double check type overwriting: row 310, sen

Double check type overwriting: row 4086, sentence 1
Common indices: [15] | Pre-existing types: ['3_0']
Double check type overwriting: row 4086, sentence 2
Common indices: [7] | Pre-existing types: ['3_0']
Double check type overwriting: row 4288, sentence 1
Common indices: [5] | Pre-existing types: ['3_0']
Double check type overwriting: row 4288, sentence 2
Common indices: [8, 9, 10, 11] | Pre-existing types: ['3_0' '3_0' '3_0' '3_0']
Double check type overwriting: row 4318, sentence 1
Common indices: [10] | Pre-existing types: ['3_0']
Double check type overwriting: row 4318, sentence 2
Common indices: [0] | Pre-existing types: ['3_0']
Double check type overwriting: row 4353, sentence 1
Common indices: [16, 17] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 4353, sentence 2
Common indices: [4, 5] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 4558, sentence 1
Common indices: [19] | Pre-existing types: ['3_1']
Double check type overwriting:

### Same Polarity Substitution (Contextual)

In [25]:
substitute(6)

Double check type overwriting: row 56, sentence 1
Common indices: [4] | Pre-existing types: ['3_0']
Double check type overwriting: row 56, sentence 2
Common indices: [0, 1] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 112, sentence 1
Common indices: [22] | Pre-existing types: ['3_1']
Double check type overwriting: row 112, sentence 2
Common indices: [7, 8] | Pre-existing types: ['3_1' '3_1']
Double check type overwriting: row 124, sentence 1
Common indices: [16] | Pre-existing types: ['3_0']
Double check type overwriting: row 124, sentence 2
Common indices: [3] | Pre-existing types: ['3_0']
Double check type overwriting: row 191, sentence 1
Common indices: [19, 20, 21] | Pre-existing types: ['3_0' '3_0' '3_0']
Double check type overwriting: row 191, sentence 2
Common indices: [14] | Pre-existing types: ['3_0']
Double check type overwriting: row 235, sentence 1
Common indices: [3] | Pre-existing types: ['3_0']
Double check type overwriting: row 235, sentence 2


### Derivational Changes

In [26]:
substitute(3)

Double check type overwriting: row 254, sentence 1
Common indices: [23] | Pre-existing types: ['4_0']
Double check type overwriting: row 254, sentence 2
Common indices: [21] | Pre-existing types: ['4_0']
Double check type overwriting: row 433, sentence 1
Common indices: [17] | Pre-existing types: ['4_0']
Double check type overwriting: row 433, sentence 2
Common indices: [17] | Pre-existing types: ['4_0']
Double check type overwriting: row 449, sentence 1
Common indices: [5] | Pre-existing types: ['4_1']
Double check type overwriting: row 449, sentence 2
Common indices: [22] | Pre-existing types: ['4_1']
Double check type overwriting: row 480, sentence 1
Common indices: [4] | Pre-existing types: ['3_0']
Double check type overwriting: row 480, sentence 2
Common indices: [16] | Pre-existing types: ['3_0']
Double check type overwriting: row 608, sentence 1
Common indices: [5] | Pre-existing types: ['5_1']
Double check type overwriting: row 608, sentence 2
Common indices: [5] | Pre-existing

### Inflectional Changes

In [27]:
substitute(1)

Double check type overwriting: row 47, sentence 1
Common indices: [3] | Pre-existing types: ['4_0']
Double check type overwriting: row 47, sentence 2
Common indices: [3] | Pre-existing types: ['4_0']
Double check type overwriting: row 76, sentence 1
Common indices: [2] | Pre-existing types: ['3_0']
Double check type overwriting: row 76, sentence 2
Common indices: [20] | Pre-existing types: ['3_0']
Double check type overwriting: row 120, sentence 1
Common indices: [13] | Pre-existing types: ['5_0']
Double check type overwriting: row 120, sentence 2
Common indices: [11] | Pre-existing types: ['5_0']
Double check type overwriting: row 164, sentence 1
Common indices: [7] | Pre-existing types: ['4_0']
Double check type overwriting: row 164, sentence 2
Common indices: [11] | Pre-existing types: ['4_0']
Double check type overwriting: row 194, sentence 1
Common indices: [15] | Pre-existing types: ['3_0']
Double check type overwriting: row 194, sentence 2
Common indices: [19] | Pre-existing typ

### Spelling Changes

In [28]:
substitute(4)

Double check type overwriting: row 155, sentence 1
Common indices: [5] | Pre-existing types: ['4_0']
Double check type overwriting: row 155, sentence 2
Common indices: [8] | Pre-existing types: ['4_0']
Double check type overwriting: row 449, sentence 1
Common indices: [9, 10] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 449, sentence 2
Common indices: [4] | Pre-existing types: ['3_0']
Double check type overwriting: row 458, sentence 1
Common indices: [15] | Pre-existing types: ['3_0']
Double check type overwriting: row 458, sentence 2
Common indices: [1] | Pre-existing types: ['3_0']
Double check type overwriting: row 780, sentence 1
Common indices: [25] | Pre-existing types: ['3_1']
Double check type overwriting: row 780, sentence 2
Common indices: [11] | Pre-existing types: ['3_1']
Double check type overwriting: row 882, sentence 1
Common indices: [24] | Pre-existing types: ['3_0']
Double check type overwriting: row 882, sentence 2
Common indices: [20] | Pre

### Change of format

In [29]:
substitute(8)

Double check type overwriting: row 418, sentence 1
Common indices: [9] | Pre-existing types: ['5_0']
Double check type overwriting: row 418, sentence 2
Common indices: [12] | Pre-existing types: ['5_0']
Double check type overwriting: row 508, sentence 1
Common indices: [4] | Pre-existing types: ['3_0']
Double check type overwriting: row 508, sentence 2
Common indices: [9] | Pre-existing types: ['3_0']
Double check type overwriting: row 508, sentence 1
Common indices: [5] | Pre-existing types: ['3_0']
Double check type overwriting: row 508, sentence 2
Common indices: [10] | Pre-existing types: ['3_0']
Double check type overwriting: row 586, sentence 1
Common indices: [13] | Pre-existing types: ['3_0']
Double check type overwriting: row 586, sentence 2
Common indices: [6] | Pre-existing types: ['3_0']
Double check type overwriting: row 1322, sentence 1
Common indices: [25] | Pre-existing types: ['7_0']
Double check type overwriting: row 1322, sentence 2
Common indices: [23] | Pre-existin

### Opposite Polarity Substitution (Habitual)

In [30]:
substitute(9)

### Modal Verb Changes (TODO)

In [31]:
# TODO: Check overlapped words between (e.g.) derivational & inflectional changes
# The way this works right now, you'd have something like ['6_0 & 6_0'] for those
# Make sure this doesn't happen. Probably do a function that does a pass on the
# array of strings later and removes any duplicates

In [32]:
def trim_duplicates(s1_scope, s2_scope, s1_text, s2_text):
    s1_newtext = s1_text.split()
    s2_newtext = s2_text.split()
    
    in1 = np.where(np.in1d(s1_newtext, s2_newtext))[0]
    in2 = np.where(np.in1d(s2_newtext, s1_newtext))[0]

    s1_newscope = np.delete(s1_scope, in1)
    s2_newscope = np.delete(s2_scope, in2)
    s1_newtext = ' '.join(np.delete(s1_newtext, in1))
    s2_newtext = ' '.join(np.delete(s2_newtext, in2))

    return s1_newscope, s2_newscope, s1_newtext, s2_newtext

In [33]:
ric = textual_paraphrases.loc[textual_paraphrases['type_id'].isin([2])]
ric

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
120,40,2,Modal Verb Changes,yes,"[7, 8, 9]","[4, 5]",intend to seek,will seek,,,,
135,45,2,Modal Verb Changes,yes,"[4, 5, 6, 7]","[3, 4]",is expected to decline,will decline,,,,
197,57,2,Modal Verb Changes,yes,"[5, 6, 7]","[2, 3, 4, 5]",would shut down,plans to shut down,,,,
381,108,2,Modal Verb Changes,yes,"[6, 7]","[6, 7, 8]",were dispatched,will be sent,,,,
393,110,2,Modal Verb Changes,yes,"[11, 12]","[10, 11]",may issue,might issue,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
21449,5623,2,Modal Verb Changes,yes,"[8, 9]","[10, 11, 12]",will address,would participate in,,,,
21593,5665,2,Modal Verb Changes,yes,"[3, 4, 5]",[4],could have been,was,,,,
21679,5691,2,Modal Verb Changes,yes,"[4, 5]",[10],could bring,bringing,,,,
21762,5712,2,Modal Verb Changes,yes,"[3, 4]",[2],would give,gives,,,,


### Punctuation Changes

Let's look closely at punctuation changes. This is one of the types that annotates key elements, and those are what we 
want to use, so we'll use those as the scopes.  

In [34]:
punctuation = duplicate_df(textual_paraphrases)
punctuation = punctuation[punctuation['type_id'] == 21]
punctuation.drop(columns=['s1_scope', 's2_scope'], inplace=True)
punctuation.rename(columns={'key_s1': 's1_scope', 'key_s2': 's2_scope'}, inplace=True)
punctuation

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
20,5,21,Punctuation changes,yes,"The stock rose $ 2.11 , or about 11 percent , ...",PG & E Corp. shares jumped $ 1.63 or 8 percent...,"[5, 10]",,", ,",
33,10,21,Punctuation changes,yes,the foodservice pie business does n't fit the ...,`` The foodservice pie business does not fit o...,,[0],,``
46,15,21,Punctuation changes,yes,He told The Sun newspaper that Mr. Hussein 's ...,`` Saddam 's daughters had British schools and...,,"[0, 23]",,`` ''
87,29,21,Punctuation changes,yes,I wanted to bring the most beautiful people in...,`` I wanted to bring the most beautiful people...,,"[0, 15]",,`` ''
108,36,21,Punctuation changes,yes,Trading in Loral was halted yesterday ; the sh...,The New York Stock Exchange suspended trading ...,[6],[10],;,","
...,...,...,...,...,...,...,...,...,...,...
21962,5773,21,Punctuation changes,yes,`` I would rather be talking about positive nu...,But I would rather be talking about high stand...,[0],[14],``,''\n
21978,5780,21,Punctuation changes,yes,`` the man who has the blood of innocent peopl...,a man who has the blood of innocent people on ...,[4],,``,
21987,5782,21,Punctuation changes,yes,The Ministry of Defence said that `` an invest...,The Ministry of Defence said yesterday : “We c...,"[6, 27]",,`` '',
22033,5793,21,Punctuation changes,yes,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,[10],,",",


Some punctuation changes are substitutions (the ones with two keys), and some are additions/deletions (the ones where there's only one key, and the other key is `None`).

We'll need to treat them separately:

In [35]:
punct_adddel, punct_subs = split_add_sub(punctuation)

In [36]:
substitute(21, punct_adddel, 8)

Double check type overwriting: row 480, sentence 2
Common indices: [17] | Pre-existing types: ['3_0']
Double check type overwriting: row 896, sentence 2
Common indices: [7] | Pre-existing types: ['3_0']
Double check type overwriting: row 1115, sentence 2
Common indices: [5] | Pre-existing types: ['3_0']
Double check type overwriting: row 1346, sentence 2
Common indices: [1, 3] | Pre-existing types: ['3_1 & 5_0' '3_1 & 5_0']
Double check type overwriting: row 1398, sentence 2
Common indices: [14] | Pre-existing types: ['3_0']


Double check type overwriting: row 1533, sentence 1
Common indices: [18] | Pre-existing types: ['3_0']
Double check type overwriting: row 1586, sentence 2
Common indices: [27] | Pre-existing types: ['3_0']
Double check type overwriting: row 1586, sentence 2
Common indices: [19] | Pre-existing types: ['3_0']
Double check type overwriting: row 1593, sentence 1
Common indices: [3] | Pre-existing types: ['3_0']
Double check type overwriting: row 1667, sentence 2
Common indices: [3] | Pre-existing types: ['3_0']
Double check type overwriting: row 2154, sentence 1
Common indices: [10] | Pre-existing types: ['3_0']
Double check type overwriting: row 2417, sentence 2
Common indices: [0, 22] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 2453, sentence 2
Common indices: [17] | Pre-existing types: ['3_0']
Double check type overwriting: row 2889, sentence 1
Common indices: [8, 12] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 2986, sentence 1
Commo

In [37]:
substitute(21, punct_subs, 7)

Double check type overwriting: row 401, sentence 1
Common indices: [0, 12] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 401, sentence 2
Common indices: [13] | Pre-existing types: ['3_0']
Double check type overwriting: row 449, sentence 1
Common indices: [14, 17] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 449, sentence 2
Common indices: [5, 16] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 573, sentence 1
Common indices: [0] | Pre-existing types: ['3_0']
Double check type overwriting: row 573, sentence 2
Common indices: [5] | Pre-existing types: ['3_0']
Double check type overwriting: row 656, sentence 1
Common indices: [25] | Pre-existing types: ['3_0']
Double check type overwriting: row 656, sentence 2
Common indices: [20] | Pre-existing types: ['3_0']
Double check type overwriting: row 790, sentence 1
Common indices: [3] | Pre-existing types: ['3_0']
Double check type overwriting: row 790, sentence 2
Common

### Named Entity Substitution

In [38]:
substitute(7)

Double check type overwriting: row 14, sentence 1
Common indices: [7] | Pre-existing types: ['3_0']
Double check type overwriting: row 14, sentence 2
Common indices: [1] | Pre-existing types: ['3_0']
Double check type overwriting: row 14, sentence 1
Common indices: [6, 7] | Pre-existing types: ['3_0' '3_0 & 5_0']
Double check type overwriting: row 14, sentence 2
Common indices: [1] | Pre-existing types: ['3_0 & 5_0']
Double check type overwriting: row 172, sentence 1
Common indices: [23, 24] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 172, sentence 2
Common indices: [18, 19, 20, 21] | Pre-existing types: ['3_0' '3_0' '3_0' '3_0']
Double check type overwriting: row 272, sentence 1
Common indices: [19] | Pre-existing types: ['3_0']
Double check type overwriting: row 272, sentence 2
Common indices: [18, 19] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 272, sentence 1
Common indices: [18, 19] | Pre-existing types: ['3_0' '3_0 & 5_0']
Dou

### Synthetic/Analytic Substitution

In [39]:
def differ_by_one(s1, s2):
    s1_list = s1.lower().split()
    s2_list = s2.lower().split()
    return abs(len(s1_list) - len(s2_list)) == 1


In [40]:
textual_paraphrases[textual_paraphrases['type_id'] == 11]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
13,5,11,Synthetic/analytic substitution,yes,[13],"[20, 21]",Friday,on Friday,,,,
21,6,11,Synthetic/analytic substitution,yes,"[1, 2, 3, 4, 5, 6, 7]","[10, 11, 12, 13, 14, 15]",in the first quarter of the year,the first quarter of the year,,,,
63,23,11,Synthetic/analytic substitution,yes,"[12, 13, 14, 15, 16, 17]","[14, 15, 16, 17]",western portions of the Dominican Republic,the western Dominican Republic,,,,
73,27,11,Synthetic/analytic substitution,yes,[8],"[8, 9, 10]",struck,managed to strike,,,,
88,32,11,Synthetic/analytic substitution,yes,"[3, 4, 5, 6, 7, 8, 9]","[7, 8, 9, 10, 11, 12, 13, 14]",the United States ' 12th-largest trading partner,the 12th-largest trading partner of the United...,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
22020,5791,11,Synthetic/analytic substitution,yes,"[0, 1]","[7, 8, 9]",Remaining shares,the remaining shares,,,,
22021,5791,11,Synthetic/analytic substitution,yes,"[6, 7, 8]","[3, 4, 5]",QVC 's management,QVC management team,,,,
22022,5791,11,Synthetic/analytic substitution,yes,"[6, 7, 8]","[2, 3, 4, 5]",QVC 's management,the QVC management team,,,,
22035,5793,11,Synthetic/analytic substitution,yes,"[21, 22]",[19],on Tuesday,Tuesday,,,,


In [41]:
new = pd.DataFrame(columns = textual_paraphrases.columns, data = copy.deepcopy(textual_paraphrases.values))
new = new[new['type_id'] == 11]
new['s1_text'] = new['s1_text'].str.lower() 
new['s2_text'] = new['s2_text'].str.lower() 
new = new[new.apply(lambda x: differ_by_one(x.s1_text, x.s2_text), axis=1)]
new

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
13,5,11,Synthetic/analytic substitution,yes,[13],"[20, 21]",friday,on friday,,,,
21,6,11,Synthetic/analytic substitution,yes,"[1, 2, 3, 4, 5, 6, 7]","[10, 11, 12, 13, 14, 15]",in the first quarter of the year,the first quarter of the year,,,,
88,32,11,Synthetic/analytic substitution,yes,"[3, 4, 5, 6, 7, 8, 9]","[7, 8, 9, 10, 11, 12, 13, 14]",the united states ' 12th-largest trading partner,the 12th-largest trading partner of the united...,,,,
146,48,11,Synthetic/analytic substitution,yes,"[8, 9]",[12],its earnings,earnings,,,,
156,50,11,Synthetic/analytic substitution,yes,"[10, 11, 12]","[10, 11, 12, 13]",significant economic growth,a significant economic growth,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
22011,5788,11,Synthetic/analytic substitution,yes,"[0, 1]",[4],pwc itself,pwc,,,,
22020,5791,11,Synthetic/analytic substitution,yes,"[0, 1]","[7, 8, 9]",remaining shares,the remaining shares,,,,
22022,5791,11,Synthetic/analytic substitution,yes,"[6, 7, 8]","[2, 3, 4, 5]",qvc 's management,the qvc management team,,,,
22035,5793,11,Synthetic/analytic substitution,yes,"[21, 22]",[19],on tuesday,tuesday,,,,


In [42]:
new['s1_scope'], new['s2_scope'], new['s1_text'], new['s2_text'] = new.apply(lambda x: trim_duplicates(x.s1_scope, x.s2_scope, x.s1_text, x.s2_text), axis=1, result_type='expand').transpose().values
new

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
13,5,11,Synthetic/analytic substitution,yes,[],[20],,on,,,,
21,6,11,Synthetic/analytic substitution,yes,[1],[],in,,,,,
88,32,11,Synthetic/analytic substitution,yes,[6],[11],',of,,,,
146,48,11,Synthetic/analytic substitution,yes,[8],[],its,,,,,
156,50,11,Synthetic/analytic substitution,yes,[],[10],,a,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
22011,5788,11,Synthetic/analytic substitution,yes,[1],[],itself,,,,,
22020,5791,11,Synthetic/analytic substitution,yes,[],[7],,the,,,,
22022,5791,11,Synthetic/analytic substitution,yes,[7],"[2, 5]",'s,the team,,,,
22035,5793,11,Synthetic/analytic substitution,yes,[21],[],on,,,,,


In [43]:
new['s1_text'] = new['s1_text'].apply(lambda x: None if x == '' else x)
new['s2_text'] = new['s2_text'].apply(lambda x: None if x == '' else x)
new['s1_scope'] = new['s1_scope'].apply(lambda x: None if list(x) == [] else x)
new['s2_scope'] = new['s2_scope'].apply(lambda x: None if list(x) == [] else x)

In [44]:
new

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
13,5,11,Synthetic/analytic substitution,yes,,[20],,on,,,,
21,6,11,Synthetic/analytic substitution,yes,[1],,in,,,,,
88,32,11,Synthetic/analytic substitution,yes,[6],[11],',of,,,,
146,48,11,Synthetic/analytic substitution,yes,[8],,its,,,,,
156,50,11,Synthetic/analytic substitution,yes,,[10],,a,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
22011,5788,11,Synthetic/analytic substitution,yes,[1],,itself,,,,,
22020,5791,11,Synthetic/analytic substitution,yes,,[7],,the,,,,
22022,5791,11,Synthetic/analytic substitution,yes,[7],"[2, 5]",'s,the team,,,,
22035,5793,11,Synthetic/analytic substitution,yes,[21],,on,,,,,


In [45]:
new_adddel = new[(new['s1_scope'].isnull()) | (new['s2_scope'].isnull())]
new_subs = new[~((new['s1_scope'].isnull()) | (new['s2_scope'].isnull()))]

In [46]:
new_adddel[:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
13,5,11,Synthetic/analytic substitution,yes,,[20],,on,,,,
21,6,11,Synthetic/analytic substitution,yes,[1],,in,,,,,
146,48,11,Synthetic/analytic substitution,yes,[8],,its,,,,,
156,50,11,Synthetic/analytic substitution,yes,,[10],,a,,,,
253,74,11,Synthetic/analytic substitution,yes,[9],,all,,,,,
307,87,11,Synthetic/analytic substitution,yes,[20],,many,,,,,
551,150,11,Synthetic/analytic substitution,yes,,[13],,the,,,,
650,172,11,Synthetic/analytic substitution,yes,[14],,more,,,,,
656,173,11,Synthetic/analytic substitution,yes,[18],,'s,,,,,
662,174,11,Synthetic/analytic substitution,yes,[24],,its,,,,,


In [47]:
substitute(11, new_adddel, 1, False)

In [48]:
new_subs[:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
88,32,11,Synthetic/analytic substitution,yes,[6],[11],',of,,,,
321,89,11,Synthetic/analytic substitution,yes,"[7, 8]","[9, 11]",nation 's,in us,,,,
649,172,11,Synthetic/analytic substitution,yes,"[5, 7]",[6],the cost,prices,,,,
686,180,11,Synthetic/analytic substitution,yes,[12],"[13, 14]",mexican,of mexico,,,,
860,222,11,Synthetic/analytic substitution,yes,[12],[8],federal,us,,,,
1023,265,11,Synthetic/analytic substitution,yes,[19],[9],in,',,,,
1090,279,11,Synthetic/analytic substitution,yes,[8],"[3, 5]",makers,the manufacturers,,,,
1210,309,11,Synthetic/analytic substitution,yes,"[20, 21]",[21],the pool,pools,,,,
1562,410,11,Synthetic/analytic substitution,yes,"[9, 10]",[8],indicating creditworthiness,credit,,,,
1567,411,11,Synthetic/analytic substitution,yes,[4],"[4, 6]",comment,an response,,,,


In [49]:
substitute(11, new_subs, 5, False)

In [50]:
aye = pd.DataFrame(columns = textual_paraphrases.columns, data = copy.deepcopy(textual_paraphrases.values))
aye = aye[aye['type_id'] == 11]
aye.drop(new.index.tolist(), inplace=True)
aye[:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
63,23,11,Synthetic/analytic substitution,yes,"[12, 13, 14, 15, 16, 17]","[14, 15, 16, 17]",western portions of the Dominican Republic,the western Dominican Republic,,,,
73,27,11,Synthetic/analytic substitution,yes,[8],"[8, 9, 10]",struck,managed to strike,,,,
89,32,11,Synthetic/analytic substitution,yes,"[12, 13]","[17, 18]",two-way trade,trade volume,,,,
143,46,11,Synthetic/analytic substitution,yes,"[6, 7, 8, 9, 10, 11]","[6, 7, 8, 9, 10, 11, 12, 13, 14]",State Department official John S. Wolf,"John S. Wolf , an assistant secretary of state",,,,
182,54,11,Synthetic/analytic substitution,yes,"[12, 13, 14, 15, 16, 17, 18]","[10, 11, 12, 13]",morning trading on the Nasdaq Stock Market,Nasdaq Stock Market trading,,,,
201,59,11,Synthetic/analytic substitution,yes,"[2, 3, 4, 5, 6, 7]","[2, 3, 4]",the questions asked by the audience,the audience questions,,,,
304,86,11,Synthetic/analytic substitution,yes,"[11, 12, 13]","[8, 9, 10, 11, 12, 13, 14]",Longhorn 's release,release dates of Microsoft 's new products,,,,
331,94,11,Synthetic/analytic substitution,yes,[12],"[11, 12, 14]",resisted,put up resistance,,,,
392,110,11,Synthetic/analytic substitution,yes,"[4, 5, 6, 7]","[4, 5, 6, 7]",Russia 's foreign ministry,the Russian Foreign Ministry,,,,
405,112,11,Synthetic/analytic substitution,yes,[24],[25],XML-based,XML,,,,


In [51]:
substitute(11, aye, 5, False)

In [52]:
positives.loc[45]['sentence1']

'The group will be headed by State Department official John S. Wolf, who has served in Australia, Vietnam, Greece and Pakistan.'

In [53]:
positives.loc[45]['sentence2']

'The group will be headed by John S. Wolf, an assistant secretary of state who has served in Australia, Vietnam, Greece and Pakistan.'

In [54]:
positives.loc[45]['sentence1_scope']

array(['', '', '', '', '', '', '5_0', '5_0', '5_0', '5_0', '5_0', '5_0',
       '', '', '', '', '', '', '', '', '', '', '', '', ''], dtype='<U64')

### Converse Substitution

In [55]:
textual_paraphrases[textual_paraphrases['type_id'] == 13][:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
1139,292,13,Converse substitution,yes,"[10, 11]","[14, 15]",was taken,had brought,,,,
1354,347,13,Converse substitution,yes,[4],[1],was,leaves,,,,
2370,622,13,Converse substitution,yes,"[3, 4]",[4],be back,shelve,,,,
3021,798,13,Converse substitution,yes,"[10, 11]","[14, 15]",breaks down,interacts with,,,,
3389,902,13,Converse substitution,yes,"[2, 6]","[3, 4, 8]",includes in,would get of,,,,
3502,932,13,Converse substitution,yes,"[4, 5]",[3],were given,received,,,,
3503,932,13,Converse substitution,yes,[14],"[16, 17]",took,were given,,,,
4609,1244,13,Converse substitution,yes,"[2, 3]","[6, 7]",was subpoenaed,was cooperating,,,,
6113,1663,13,Converse substitution,yes,"[2, 3, 4]","[8, 10, 11]",were hurt by,put pressure on,,,,
7418,2001,13,Converse substitution,yes,[8],[9],swept,gave,,,,


In [56]:
substitute(13)

Double check type overwriting: row 2009, sentence 1
Common indices: [6, 7, 8] | Pre-existing types: ['6_0' '6_0' '6_0']
Double check type overwriting: row 2009, sentence 2
Common indices: [6, 7] | Pre-existing types: ['6_0' '6_0']
Double check type overwriting: row 3180, sentence 1
Common indices: [10, 13] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 3180, sentence 2
Common indices: [19] | Pre-existing types: ['3_0']
Double check type overwriting: row 4887, sentence 1
Common indices: [13] | Pre-existing types: ['6_0']
Double check type overwriting: row 4887, sentence 2
Common indices: [17, 18] | Pre-existing types: ['6_0' '6_0']
Double check type overwriting: row 5466, sentence 1
Common indices: [5] | Pre-existing types: ['6_0']
Double check type overwriting: row 5466, sentence 2
Common indices: [2] | Pre-existing types: ['6_0']


### Coordination Changes

This is similar to punctuation changes in two ways:
- It uses `keys` for annotation
- It can either be Addition/Deletion (in this case, function word) or Substitution
  - But which kind of substitution? Synonym?

In [57]:
coord = pd.DataFrame(columns = textual_paraphrases.columns, data = copy.deepcopy(textual_paraphrases.values))
coord = coord[coord['type_id'] == 17]
coord.drop(columns=['s1_scope', 's2_scope'], inplace=True)
coord.rename(columns={'key_s1': 's1_scope', 'key_s2': 's2_scope'}, inplace=True)

coord[30:40]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
11748,3120,17,Coordination changes,yes,traffic has disappeared from once bustling str...,traffic has disappeared from once-bustling str...,,[10],,and
12294,3262,17,Coordination changes,yes,"a 4.5-inch LCD screen , Memory Stick expansion...",a 4.5 in back-lit LCD screen and memory expans...,,[9],,and
13874,3675,17,Coordination changes,yes,"20 years , 63 days",20 years and 63 days,[7],[15],",",and
14171,3752,17,Coordination changes,yes,"State Sen. Vi Simpson , former state and natio...",former state and national Democratic Chairman ...,,[12],,and
16307,4301,17,Coordination changes,yes,Pacific Northwest has more than 800 employees ...,"It has 800 employees , compared with Wells Far...",[8],,and,
16346,4312,17,Coordination changes,yes,The victims were last seen ; their bodies were...,The family was last seen and their bodies were...,,[7],,and
16937,4474,17,Coordination changes,yes,"`` I have lots of bad dreams , I have flashbac...","`` I have lots of bad dreams , flashbacks and ...",,[9],,and
17198,4533,17,Coordination changes,yes,"some administrative material , some from a fai...",administrative paper work and some about a fai...,,[10],,and
17211,4537,17,Coordination changes,yes,a June opening record,a monster opening and a June record,,[17],,and
17247,4548,17,Coordination changes,yes,is still being held at the prison and is now i...,was held in isolation at the same prison,[8],,and,


In [58]:
print_sents(1316)

Five foreign embassies, including the Singapore embassy, in Bangkok were among the targets, it said.
Five foreign embassies in Bangkok, including the Singapore embassy, were among those targeted.


In [59]:
coord_adddel = coord[(coord['s1_scope'].isnull()) | (coord['s2_scope'].isnull())]
coord_subs = coord[~((coord['s1_scope'].isnull()) | (coord['s2_scope'].isnull()))]

In [60]:
substitute(17, coord_adddel, 1)

Double check type overwriting: row 4977, sentence 2
Common indices: [6] | Pre-existing types: ['3_1']
Double check type overwriting: row 5321, sentence 2
Common indices: [18] | Pre-existing types: ['5_1']


TODO: Deal with the below

In [61]:
coord_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
64,23,17,Coordination changes,yes,But tropical storm warnings and watches were p...,Tropical storm warnings were in place Thursday...,[4],[11],and,and
2925,769,17,Coordination changes,yes,"He was sent to Larned State Hospital , where h...",He ordered him sent to the Larned State Securi...,[8],[10],where,for
4868,1317,17,Coordination changes,yes,hammering consumer spending and leaving shops ...,"hammering consumer spending , with shops , res...",[23],[14],and,","
9107,2442,17,Coordination changes,yes,Ms Pike also said it was not unusual for hospi...,But Ms Pike said it was not unusual for hospit...,[15],[14],but,and
9663,2604,17,Coordination changes,yes,Jews and the US,"Jews , Americans",[3],[1],and,","
10342,2789,17,Coordination changes,yes,"In addition to O'Connor , Rehnquist 's majorit...","Justices Sandra Day O'Connor , David H. Souter...","[0, 1, 2, 20]",[19],In addition to and,and
13874,3675,17,Coordination changes,yes,"20 years , 63 days",20 years and 63 days,[7],[15],",",and
20398,5343,17,Coordination changes,yes,"vegetables , fruits",fruits and vegetables,[9],[14],",",and
20399,5343,17,Coordination changes,yes,"nuts , cereals",nuts and cereals,[15],[18],",",and
21161,5545,17,Coordination changes,yes,benefiting from a little luck Thursday to eras...,to erase a set point and beat unseeded Nadia P...,[23],[20],and,and


In [62]:
print_sents(769)

He was sent to Larned State Hospital, where he was evaluated and treated.
He ordered him sent to the Larned State Security Hospital for continued evaluation and treatment.


Let's isolate the punctuation and add it first

In [63]:
coord_punct = duplicate_df(coord.loc[[4868, 9663, 13874, 20398, 20399],:])
coord_punct

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
0,1317,17,Coordination changes,yes,hammering consumer spending and leaving shops ...,"hammering consumer spending , with shops , res...",[23],[14],and,","
1,2604,17,Coordination changes,yes,Jews and the US,"Jews , Americans",[3],[1],and,","
2,3675,17,Coordination changes,yes,"20 years , 63 days",20 years and 63 days,[7],[15],",",and
3,5343,17,Coordination changes,yes,"vegetables , fruits",fruits and vegetables,[9],[14],",",and
4,5343,17,Coordination changes,yes,"nuts , cereals",nuts and cereals,[15],[18],",",and


In [64]:
coord_subs.loc[[13874, 20398, 20399], 's1_scope'] = None
coord_subs.loc[[4868, 9663], 's2_scope'] = None
coord_subs

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coord_subs.loc[[13874, 20398, 20399], 's1_scope'] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coord_subs.loc[[4868, 9663], 's2_scope'] = None


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
64,23,17,Coordination changes,yes,But tropical storm warnings and watches were p...,Tropical storm warnings were in place Thursday...,[4],[11],and,and
2925,769,17,Coordination changes,yes,"He was sent to Larned State Hospital , where h...",He ordered him sent to the Larned State Securi...,[8],[10],where,for
4868,1317,17,Coordination changes,yes,hammering consumer spending and leaving shops ...,"hammering consumer spending , with shops , res...",[23],,and,","
9107,2442,17,Coordination changes,yes,Ms Pike also said it was not unusual for hospi...,But Ms Pike said it was not unusual for hospit...,[15],[14],but,and
9663,2604,17,Coordination changes,yes,Jews and the US,"Jews , Americans",[3],,and,","
10342,2789,17,Coordination changes,yes,"In addition to O'Connor , Rehnquist 's majorit...","Justices Sandra Day O'Connor , David H. Souter...","[0, 1, 2, 20]",[19],In addition to and,and
13874,3675,17,Coordination changes,yes,"20 years , 63 days",20 years and 63 days,,[15],",",and
20398,5343,17,Coordination changes,yes,"vegetables , fruits",fruits and vegetables,,[14],",",and
20399,5343,17,Coordination changes,yes,"nuts , cereals",nuts and cereals,,[18],",",and
21161,5545,17,Coordination changes,yes,benefiting from a little luck Thursday to eras...,to erase a set point and beat unseeded Nadia P...,[23],[20],and,and


In [65]:
coord_punct.loc[[0,1], 's1_scope'] = None
coord_punct.loc[[2,3,4], 's2_scope'] = None
coord_punct

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
0,1317,17,Coordination changes,yes,hammering consumer spending and leaving shops ...,"hammering consumer spending , with shops , res...",,[14],and,","
1,2604,17,Coordination changes,yes,Jews and the US,"Jews , Americans",,[1],and,","
2,3675,17,Coordination changes,yes,"20 years , 63 days",20 years and 63 days,[7],,",",and
3,5343,17,Coordination changes,yes,"vegetables , fruits",fruits and vegetables,[9],,",",and
4,5343,17,Coordination changes,yes,"nuts , cereals",nuts and cereals,[15],,",",and


In [66]:
substitute(17, coord_punct, 8)

Double check type overwriting: row 3674, sentence 1
Common indices: [7] | Pre-existing types: ['3_0']


In [67]:
substitute(17, coord_subs, 1)

Double check type overwriting: row 3674, sentence 2
Common indices: [15] | Pre-existing types: ['3_0']


### Subordination and Nesting changes

In [68]:
subord = pd.DataFrame(columns = textual_paraphrases.columns, data = copy.deepcopy(textual_paraphrases.values))
subord = subord[subord['type_id'] == 18]
subord.drop(columns=['s1_scope', 's2_scope'], inplace=True)
subord.rename(columns={'key_s1': 's1_scope', 'key_s2': 's2_scope'}, inplace=True)

subord[:10]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
48,20,18,Subordination and nesting changes,yes,Sheena Young of Child,"Sheena Young , a spokesman for Child ,","[2, 3]","[3, 4, 5, 6]",of Child,a spokesman for Child
97,33,18,Subordination and nesting changes,yes,The AFL-CIO is waiting until October to decide,The AFL-CIO announced Wednesday that it will d...,,"[2, 4]",,announced that
105,36,18,Subordination and nesting changes,yes,Trading in Loral was halted yesterday ; the sh...,The New York Stock Exchange suspended trading ...,"[7, 8, 9, 10, 11, 12, 14]",[11],the shares closed on Monday at 3.01,which
131,43,18,Subordination and nesting changes,yes,", who faces charges of conspiracy lying to a g...",on charges of conspiracy and lying to a grand ...,[2],"[7, 8, 9]",who,on charges of
191,56,18,Subordination and nesting changes,yes,and allows developers,", which allows developers",[5],[12],and,which
227,69,18,Subordination and nesting changes,yes,a point system the U.S. Supreme Court found un...,the way it previously admitted undergraduates,"[16, 17, 18, 19, 20, 21]","[21, 22, 23, 24]",the U.S. Supreme Court found unconstitutional,it previously admitted undergraduates
406,112,18,Subordination and nesting changes,yes,The suite comes complete with a word processor...,"The suite includes a word processor , spreadsh...",[17],,while,
489,134,18,Subordination and nesting changes,yes,at the same time of the anthrax attacks,at the same time that real anthrax attacks,"[19, 20, 21, 22]",[19],of the anthrax attacks,that
497,135,18,Subordination and nesting changes,yes,"$ 200 billion annually , which Gephardt would ...",$ 200 billion annually and be paid for,[11],,which,
632,169,18,Subordination and nesting changes,yes,The Saudi newspaper Okaz reported Monday that ...,The newspaper Okaz reported that the six suspe...,"[8, 9, 10, 11, 12]","[11, 12]",who escaped Saturday 's raid,the raid


In [69]:
subord_adddel = subord[(subord['s1_scope'].isnull()) | (subord['s2_scope'].isnull())]
subord_subs = subord[~((subord['s1_scope'].isnull()) | (subord['s2_scope'].isnull()))]

In [70]:
subord_adddel[:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
97,33,18,Subordination and nesting changes,yes,The AFL-CIO is waiting until October to decide,The AFL-CIO announced Wednesday that it will d...,,"[2, 4]",,announced that
406,112,18,Subordination and nesting changes,yes,The suite comes complete with a word processor...,"The suite includes a word processor , spreadsh...",[17],,while,
497,135,18,Subordination and nesting changes,yes,"$ 200 billion annually , which Gephardt would ...",$ 200 billion annually and be paid for,[11],,which,
967,252,18,Subordination and nesting changes,yes,A hearing on the matter was held Thursday morn...,A hearing Thursday morning before Judge Elizab...,"[14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 2...",,", marking one of the early steps in deciding t...",
1507,394,18,Subordination and nesting changes,yes,a patent verdict that went against the company,a patent verdict against it,[14],,that,
1524,399,18,Subordination and nesting changes,yes,"in August 2000 , when Rambus accused Infineon","At that time , Rambus accused Infineon",[8],,when,
1698,443,18,Subordination and nesting changes,yes,"Kyi , a U.N. envoy says , as Japan adds to gro...",JAPAN added to growing international pressure ...,[14],,by,
1781,465,18,Subordination and nesting changes,yes,"Meanwhile , rival contender , General Electric...",Other contenders included General Electric 's ...,,[9],,which
1982,525,18,Subordination and nesting changes,yes,500 clergy sex abuse lawsuits,500 sex abuse lawsuits involving priests,,"[20, 21]",,involving priests
2079,549,18,Subordination and nesting changes,yes,"In the 2002 study , the margin of error ranged...",It has a margin of error of plus or minus thre...,[0],,In,


Isolate single words, since those are almost certainly function words   

In [71]:
def singleword(word):
    if not word:
        return False
    else:
        return len(word.split()) == 1

In [72]:
subord_adddel_funct = subord_adddel[subord_adddel.apply(lambda x: singleword(x.k1_text) or singleword(x.k2_text), axis=1)]

In [73]:
subord_adddel_funct[:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
406,112,18,Subordination and nesting changes,yes,The suite comes complete with a word processor...,"The suite includes a word processor , spreadsh...",[17],,while,
497,135,18,Subordination and nesting changes,yes,"$ 200 billion annually , which Gephardt would ...",$ 200 billion annually and be paid for,[11],,which,
1507,394,18,Subordination and nesting changes,yes,a patent verdict that went against the company,a patent verdict against it,[14],,that,
1524,399,18,Subordination and nesting changes,yes,"in August 2000 , when Rambus accused Infineon","At that time , Rambus accused Infineon",[8],,when,
1698,443,18,Subordination and nesting changes,yes,"Kyi , a U.N. envoy says , as Japan adds to gro...",JAPAN added to growing international pressure ...,[14],,by,
1781,465,18,Subordination and nesting changes,yes,"Meanwhile , rival contender , General Electric...",Other contenders included General Electric 's ...,,[9],,which
2079,549,18,Subordination and nesting changes,yes,"In the 2002 study , the margin of error ranged...",It has a margin of error of plus or minus thre...,[0],,In,
2106,554,18,Subordination and nesting changes,yes,"sales were flat , while the gross margin fell","sales were flat , with gross margin down",[12],,while,
2137,563,18,Subordination and nesting changes,yes,code which is from its Unix property,Unix intellectual property owned by SCO,[11],,which,
2334,612,18,Subordination and nesting changes,yes,A rebel who was captured,A captured rebel,[2],,who,


In [74]:
substitute(18, subord_adddel_funct, 1)

Double check type overwriting: row 790, sentence 1
Common indices: [8] | Pre-existing types: ['3_0']


Double check type overwriting: row 2879, sentence 2
Common indices: [10] | Pre-existing types: ['3_0']
Double check type overwriting: row 3152, sentence 1
Common indices: [26] | Pre-existing types: ['3_0']
Double check type overwriting: row 3658, sentence 1
Common indices: [18] | Pre-existing types: ['3_0']
Double check type overwriting: row 4132, sentence 1
Common indices: [18] | Pre-existing types: ['5_1']
Double check type overwriting: row 4844, sentence 2
Common indices: [3] | Pre-existing types: ['3_0']


In [75]:
subord_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
48,20,18,Subordination and nesting changes,yes,Sheena Young of Child,"Sheena Young , a spokesman for Child ,","[2, 3]","[3, 4, 5, 6]",of Child,a spokesman for Child
105,36,18,Subordination and nesting changes,yes,Trading in Loral was halted yesterday ; the sh...,The New York Stock Exchange suspended trading ...,"[7, 8, 9, 10, 11, 12, 14]",[11],the shares closed on Monday at 3.01,which
131,43,18,Subordination and nesting changes,yes,", who faces charges of conspiracy lying to a g...",on charges of conspiracy and lying to a grand ...,[2],"[7, 8, 9]",who,on charges of
191,56,18,Subordination and nesting changes,yes,and allows developers,", which allows developers",[5],[12],and,which
227,69,18,Subordination and nesting changes,yes,a point system the U.S. Supreme Court found un...,the way it previously admitted undergraduates,"[16, 17, 18, 19, 20, 21]","[21, 22, 23, 24]",the U.S. Supreme Court found unconstitutional,it previously admitted undergraduates
...,...,...,...,...,...,...,...,...,...,...
21573,5660,18,Subordination and nesting changes,yes,the number one priority for David Jones,the number one issue David Jones would tackle,"[17, 18, 19]","[18, 19, 20, 21]",for David Jones,David Jones would tackle
21746,5710,18,Subordination and nesting changes,yes,Women who eat potatoes and other tuberous vege...,their mothers eating potatoes and other tubero...,[1],"[17, 18, 19, 20, 21, 22, 23, 24]",who,eating potatoes and other tuberous vegetables ...
21753,5711,18,Subordination and nesting changes,yes,State Senate Majority Leader Joseph Bruno,"Joseph L. Bruno , the State Senate majority le...","[7, 8]","[8, 9, 10, 11, 12]",Joseph Bruno,the State Senate majority leader
21767,5713,18,Subordination and nesting changes,yes,one subtype that represents a minority of cases,one subtype representing a minority of cases,[21],"[22, 23, 24, 25, 26]",that,representing a minority of cases


In [76]:
subord_adddel.drop(subord_adddel_funct.index.tolist(), inplace=True)
subord_adddel

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subord_adddel.drop(subord_adddel_funct.index.tolist(), inplace=True)


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
97,33,18,Subordination and nesting changes,yes,The AFL-CIO is waiting until October to decide,The AFL-CIO announced Wednesday that it will d...,,"[2, 4]",,announced that
967,252,18,Subordination and nesting changes,yes,A hearing on the matter was held Thursday morn...,A hearing Thursday morning before Judge Elizab...,"[14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 2...",,", marking one of the early steps in deciding t...",
1982,525,18,Subordination and nesting changes,yes,500 clergy sex abuse lawsuits,500 sex abuse lawsuits involving priests,,"[20, 21]",,involving priests
2812,744,18,Subordination and nesting changes,yes,The cleanup cost about $ 130 million,The $ 130 million cleanup,"[12, 13]",,cost about,
3000,792,18,Subordination and nesting changes,yes,Myanmar 's pro-democracy leader Aung San Suu K...,Burma pro-democracy leader Aung San Suu Kyi wi...,"[14, 15, 16, 17, 18, 19, 20, 21, 22, 23]",,following her release from a hospital where sh...,
5435,1475,18,Subordination and nesting changes,yes,The companies announced plans to collaborate,The two groups said they would collaborate,,"[4, 5, 6]",,they would collaborate
5604,1507,18,Subordination and nesting changes,yes,a Rhodes scholar at Oxford,an Rhodes Scholar he met while at Oxford,,"[16, 17]",,he met
6212,1695,18,Subordination and nesting changes,yes,the case of a nine-year-old girl who turned up...,the abduction of a 9-year-old who was found sa...,"[21, 22, 23, 24, 25, 26, 27]",,after being violently abducted from her home,
6698,1818,18,Subordination and nesting changes,yes,63 percent of home broadband users connected v...,63 percent of home broadband users had cable m...,,"[15, 16, 19]",,compared with who
7624,2060,18,Subordination and nesting changes,yes,Doctors have advised that the boy get chemothe...,Daren and Barbara Jensen refused to heed docto...,,"[5, 6, 7, 8, 9, 10, 11]",,to heed doctors ' recommendation of chemotherapy


In [77]:
substitute(18, subord_adddel, 2, False)

### Direct/Indirect Style Alternations (TODO)

In [78]:
textual_paraphrases[textual_paraphrases['type_id'] == 22][:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
521,141,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","`` This deal makes sense for both companies , ...","Brian Halla , CEO of NatSemi , claimed the dea...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]",,"`` This deal makes sense for both companies , ''",
745,197,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11]","[2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 15]",`` I just got carried away and started making ...,he got carried away and just `` started making...,"[0, 11]",,`` '',
872,225,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",Mr Abbas said : `` Every day without an agreem...,"His Palestinian counterpart , Mahmoud Abbas , ...",,[8],,that
2076,547,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",The draft of the report was forthright : `` Cl...,The original report had concluded that ''clima...,,[5],,that
2676,701,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",He added that those `` are not solely American...,`` These are not solely American principles no...,[2],,that,
3362,895,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",We need a certifiable pay as you go budget by ...,Texas lawmakers must close a $ 185.9 million b...,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",,We need a certifiable pay as you go budget by ...,
3988,1068,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 20, 21,...","[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...",`` There is no conscious policy of the United ...,there is no conscious policy by the United Sta...,"[0, 25]",,`` '',
4899,1323,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",The bank also said its offer was subject to th...,The offer is also subject to Goldman signing a...,"[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...",,its offer was subject to the agreement of Drax...,
4929,1331,22,Direct/indirect style alternations,yes,"[1, 2, 3, 4, 5, 6]","[0, 1, 2]",He said they were in distress,We 're asphyxiating,"[3, 4, 5, 6]",,they were in distress,
5003,1347,22,Direct/indirect style alternations,yes,"[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11]",the two men were `` defined by dedication and ...,`` [ They ] were defined by dedication and cou...,,"[0, 11]",,`` ''


In [79]:
print_sents(1346)

The $19.50-a-share bid, comes two days after PeopleSoft revised its bid for smaller rival J.D. Edwards & Co. JDEC.O to include cash as well as stock.
Oracle's $19.50-a-share bid comes two days after PeopleSoft added cash to its original all-share deal with smaller rival J.D. Edwards & Co. JDEC.O .


The annotation for this type seem to be all over the place. Nevertheless, an easy case to deal with seems to be 
additions of 'that':

In [80]:
direct = pd.DataFrame(columns = textual_paraphrases.columns, data = copy.deepcopy(textual_paraphrases.values))
direct = direct[direct['type_id'] == 22]
direct.drop(columns=['s1_scope', 's2_scope'], inplace=True)
direct.rename(columns={'key_s1': 's1_scope', 'key_s2': 's2_scope'}, inplace=True)

In [81]:
direct[:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
521,141,22,Direct/indirect style alternations,yes,"`` This deal makes sense for both companies , ...","Brian Halla , CEO of NatSemi , claimed the dea...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]",,"`` This deal makes sense for both companies , ''",
745,197,22,Direct/indirect style alternations,yes,`` I just got carried away and started making ...,he got carried away and just `` started making...,"[0, 11]",,`` '',
872,225,22,Direct/indirect style alternations,yes,Mr Abbas said : `` Every day without an agreem...,"His Palestinian counterpart , Mahmoud Abbas , ...",,[8],,that
2076,547,22,Direct/indirect style alternations,yes,The draft of the report was forthright : `` Cl...,The original report had concluded that ''clima...,,[5],,that
2676,701,22,Direct/indirect style alternations,yes,He added that those `` are not solely American...,`` These are not solely American principles no...,[2],,that,
3362,895,22,Direct/indirect style alternations,yes,We need a certifiable pay as you go budget by ...,Texas lawmakers must close a $ 185.9 million b...,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",,We need a certifiable pay as you go budget by ...,
3988,1068,22,Direct/indirect style alternations,yes,`` There is no conscious policy of the United ...,there is no conscious policy by the United Sta...,"[0, 25]",,`` '',
4899,1323,22,Direct/indirect style alternations,yes,The bank also said its offer was subject to th...,The offer is also subject to Goldman signing a...,"[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...",,its offer was subject to the agreement of Drax...,
4929,1331,22,Direct/indirect style alternations,yes,He said they were in distress,We 're asphyxiating,"[3, 4, 5, 6]",,they were in distress,
5003,1347,22,Direct/indirect style alternations,yes,the two men were `` defined by dedication and ...,`` [ They ] were defined by dedication and cou...,,"[0, 11]",,`` ''


In [82]:
print_sents(2058)

"The NAFTA ruling confirms that Canadian producers dump lumber in to the U.S. market," Rusty Wood, chairman of the coalition, said in a release.
"The NAFTA ruling confirms that Canadian producers dump lumber into the U.S. market," said Rusty Wood, chairman of the Coalition for Fair Lumber Imports.


In [83]:
direct_that = direct[(direct['k1_text'] == 'that') | (direct['k2_text'] == 'that')][:30]
direct_that

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
872,225,22,Direct/indirect style alternations,yes,Mr Abbas said : `` Every day without an agreem...,"His Palestinian counterpart , Mahmoud Abbas , ...",,[8],,that
2076,547,22,Direct/indirect style alternations,yes,The draft of the report was forthright : `` Cl...,The original report had concluded that ''clima...,,[5],,that
2676,701,22,Direct/indirect style alternations,yes,He added that those `` are not solely American...,`` These are not solely American principles no...,[2],,that,
5082,1373,22,Direct/indirect style alternations,yes,"O'Donnell wrote in her autobiography , `` Find...","In her autobiography , `` Find Me , '' O'Donne...",[11],,that,
5259,1422,22,Direct/indirect style alternations,yes,`` The discovery that the MAP bug is present i...,The researchers say that the fact the MAP bug ...,,[3],,that
6791,1838,22,Direct/indirect style alternations,yes,Neither military action nor large-scale briber...,"Indeed , Wolfowitz admitted Saturday that neit...",,[5],,that
7000,1892,22,Direct/indirect style alternations,yes,"`` If I was diagnosed today with CJD , I would...",He added that if he were diagnosed with vCJD `...,,[2],,that
7403,1996,22,Direct/indirect style alternations,yes,Gibson said last month in a press statement th...,Gibson said in a June statement that he and hi...,,[6],,that
8210,2211,22,Direct/indirect style alternations,yes,"In terms of a free trade area , we 've got a l...","As for a free trade area , the official stress...",,[10],,that
8816,2369,22,Direct/indirect style alternations,yes,The study found that only about one-third of p...,Only about one-third of parents of sexually ex...,[3],,that,


In [84]:
substitute(22, direct_that, 1)

In [85]:
direct_quotemarks = direct[(direct['k1_text'] == "`` ''") | (direct['k2_text'] == "`` ''")][:10]
# 'k2_text' in the row below has other elements of the sentence, and we don't want to mess with those
direct_quotemarks.loc[direct_quotemarks['pair_id'] == 2059, 'k2_text'] = None  
direct_quotemarks 

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
745,197,22,Direct/indirect style alternations,yes,`` I just got carried away and started making ...,he got carried away and just `` started making...,"[0, 11]",,`` '',
3988,1068,22,Direct/indirect style alternations,yes,`` There is no conscious policy of the United ...,there is no conscious policy by the United Sta...,"[0, 25]",,`` '',
5003,1347,22,Direct/indirect style alternations,yes,the two men were `` defined by dedication and ...,`` [ They ] were defined by dedication and cou...,,"[0, 11]",,`` ''
7622,2059,22,Direct/indirect style alternations,yes,`` No data exists to indicate that the situati...,"However , FAA spokeswoman Kathleen Bergen said...","[0, 17]","[7, 8, 9, 10, 11, 12, 13, 14, 15]",`` '',
16877,4454,22,Direct/indirect style alternations,yes,its contention that KBR had `` delivered fuel ...,`` We believe KBR delivered fuel to Iraq at th...,,"[0, 21]",,`` ''
17540,4612,22,Direct/indirect style alternations,yes,it believed `` the long-term prospects for the...,`` We believe the long-term prospects for the ...,,"[0, 16]",,`` ''


In [86]:
substitute(22, direct_quotemarks, 8)

Double check type overwriting: row 196, sentence 1
Common indices: [0, 11] | Pre-existing types: ['3_1' '3_1']
Double check type overwriting: row 1067, sentence 1
Common indices: [0, 25] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 1346, sentence 2
Common indices: [0, 11] | Pre-existing types: ['3_1' '3_1']
Double check type overwriting: row 2058, sentence 2
Common indices: [9, 15] | Pre-existing types: ['6_0' '5_0 & 6_1']
Double check type overwriting: row 4453, sentence 2
Common indices: [0, 21] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 4611, sentence 2
Common indices: [0, 16] | Pre-existing types: ['3_0' '3_0']


In [87]:
print_sents(2058)

"The NAFTA ruling confirms that Canadian producers dump lumber in to the U.S. market," Rusty Wood, chairman of the coalition, said in a release.
"The NAFTA ruling confirms that Canadian producers dump lumber into the U.S. market," said Rusty Wood, chairman of the Coalition for Fair Lumber Imports.


In [88]:
direct_that.index.tolist()

[872,
 2076,
 2676,
 5082,
 5259,
 6791,
 7000,
 7403,
 8210,
 8816,
 9848,
 13045,
 13164,
 14395,
 14774,
 14911,
 16950,
 17053,
 17788,
 18117,
 18955,
 19905,
 20192,
 20985,
 21929]

In [89]:
indices = direct_that.index.tolist() + direct_quotemarks.index.tolist()
indices
direct_remaining = direct.drop(index=indices)
direct_remaining

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
521,141,22,Direct/indirect style alternations,yes,"`` This deal makes sense for both companies , ...","Brian Halla , CEO of NatSemi , claimed the dea...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]",,"`` This deal makes sense for both companies , ''",
3362,895,22,Direct/indirect style alternations,yes,We need a certifiable pay as you go budget by ...,Texas lawmakers must close a $ 185.9 million b...,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",,We need a certifiable pay as you go budget by ...,
4899,1323,22,Direct/indirect style alternations,yes,The bank also said its offer was subject to th...,The offer is also subject to Goldman signing a...,"[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...",,its offer was subject to the agreement of Drax...,
4929,1331,22,Direct/indirect style alternations,yes,He said they were in distress,We 're asphyxiating,"[3, 4, 5, 6]",,they were in distress,
5663,1521,22,Direct/indirect style alternations,yes,The vast majority of trades will be priced at ...,Eurex said `` the vast majority '' of trades o...,,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...",,`` the vast majority '' of trades on Eurex US ...
6286,1710,22,Direct/indirect style alternations,yes,"`` It '' s absurd , '' Funny Cide 's trainer B...","Meanwhile , Funny Cide 's trainer , Barclay Ta...",,"[11, 12, 13, 14, 16]",,the allegations `` ridiculous ''\n
7561,2048,22,Direct/indirect style alternations,yes,`` We will work with the board to ensure a smo...,He said federal regulators would work with the...,,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]",,federal regulators would work with the corpora...
7632,2061,22,Direct/indirect style alternations,yes,IAAF council member Jose Maria Odriozola said ...,`` I have proposed to the [ IAAF ] council tha...,"[7, 8, 9, 10, 11, 12, 13]",,Drummond should be excluded from the champions...,
9039,2422,22,Direct/indirect style alternations,yes,"`` Frank Quattrone is innocent , '' Keker said...",Quattrone lawyer John W. Keker said his client...,"[0, 1, 2, 3, 4, 5, 6]","[6, 7, 8, 9]","`` Frank Quattrone is innocent , ''",his client is innocent
9790,2634,22,Direct/indirect style alternations,yes,But he confessed : `` There 's total fear to s...,But he said there was a `` total fear to start...,,"[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...",,there was a `` total fear to start with becaus...


### Sentence Modality Changes

Nothing needed here; this type has zero occurences in the ETPC

### Syntax/Discourse Structure Changes (TODO)

In [90]:
textual_paraphrases[textual_paraphrases['type_id'] == 24][:10]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
36,14,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",But he added group performance would improve i...,De Sole said in the results statement that gro...,,[7],,that
148,48,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",The company has said it plans to restate its e...,The company had announced in January that it w...,,[6],,that
217,67,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","The downtime , to take place in May and June ,...",The downtime is expected to take 60 million to...,"[14, 16]","[5, 13, 14]",cut by,take out of
226,69,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",The University of Michigan released a new unde...,The University of Michigan released today a ne...,"[11, 12]","[10, 11, 12, 13, 14, 15, 16]",", dropping",after the U.S. Supreme Court struck down
265,76,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","Thomas and Tauzin say , as do many doctors , t...","Like many doctors , Mr. Thomas and Mr. Tauzin ...",[10],,that,
283,80,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",The delegates said raising and distributing fu...,Bin Laden’s men pointed out that raising and d...,,[5],,that
342,100,24,Syntax/discourse structure changes,yes,"[24, 25, 26, 27, 28, 29]","[21, 22, 23, 24, 25, 26]",after it admitted falsifying inspection reports,after admitting it falsified inspection reports,"[25, 26, 27]","[22, 23, 24]",it admitted falsifying,admitting it falsified
532,143,24,Syntax/discourse structure changes,yes,"[11, 12, 13, 14, 15, 16, 17, 18, 19]","[10, 11, 12, 13, 14, 15, 16]","Swartz repaid it in full , with interest ,",that Swartz fully repaid it with interest,,[10],,that
562,153,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 10, 11]","[0, 1, 2, 3, 4]","In two new schemes , target families",Two new schemes target families,[0],,In,
567,155,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1...",Miss Novikova said while there is no standard ...,Ms. Novikova said that there was no standard w...,,[9],,that


In [91]:
syn_disc = duplicate_df(textual_paraphrases)
syn_disc = syn_disc[syn_disc['type_id'] == 24]
syn_disc.drop(columns=['s1_scope', 's2_scope'], inplace=True)
syn_disc.rename(columns={'key_s1': 's1_scope', 'key_s2': 's2_scope'}, inplace=True)

syn_disc_add, syn_disc_sub = split_add_sub(syn_disc)

In [92]:
syn_disc_add

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
36,14,24,Syntax/discourse structure changes,yes,But he added group performance would improve i...,De Sole said in the results statement that gro...,,[7],,that
148,48,24,Syntax/discourse structure changes,yes,The company has said it plans to restate its e...,The company had announced in January that it w...,,[6],,that
265,76,24,Syntax/discourse structure changes,yes,"Thomas and Tauzin say , as do many doctors , t...","Like many doctors , Mr. Thomas and Mr. Tauzin ...",[10],,that,
283,80,24,Syntax/discourse structure changes,yes,The delegates said raising and distributing fu...,Bin Laden’s men pointed out that raising and d...,,[5],,that
532,143,24,Syntax/discourse structure changes,yes,"Swartz repaid it in full , with interest ,",that Swartz fully repaid it with interest,,[10],,that
...,...,...,...,...,...,...,...,...,...,...
21229,5565,24,Syntax/discourse structure changes,yes,State Education Commissioner Kent King said We...,Missouri Education Commissioner Kent King said...,[7],,that,
21490,5637,24,Syntax/discourse structure changes,yes,"Colin Powell , the Secretary of State , said c...",Secretary of State Colin Powell said yesterday...,,[7],,that
21752,5711,24,Syntax/discourse structure changes,yes,"ALBANY , N.Y. State Senate Majority Leader Jos...","LBANY , Aug. 8 Joseph L. Bruno , the State Sen...",,[16],,that
21793,5722,24,Syntax/discourse structure changes,yes,Bush declared that the British government `` h...,"Bush said , `` The British government has lear...",[13],,that,


Of those, let's subset only additions/deletions of `'that'`, since we know those map nicely to addition/deletion 
of function words

In [93]:
syn_disc_add_that = syn_disc_add[(syn_disc_add['k1_text'] == 'that') | (syn_disc_add['k2_text'] == 'that')]
syn_disc_add_that

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
36,14,24,Syntax/discourse structure changes,yes,But he added group performance would improve i...,De Sole said in the results statement that gro...,,[7],,that
148,48,24,Syntax/discourse structure changes,yes,The company has said it plans to restate its e...,The company had announced in January that it w...,,[6],,that
265,76,24,Syntax/discourse structure changes,yes,"Thomas and Tauzin say , as do many doctors , t...","Like many doctors , Mr. Thomas and Mr. Tauzin ...",[10],,that,
283,80,24,Syntax/discourse structure changes,yes,The delegates said raising and distributing fu...,Bin Laden’s men pointed out that raising and d...,,[5],,that
532,143,24,Syntax/discourse structure changes,yes,"Swartz repaid it in full , with interest ,",that Swartz fully repaid it with interest,,[10],,that
...,...,...,...,...,...,...,...,...,...,...
21229,5565,24,Syntax/discourse structure changes,yes,State Education Commissioner Kent King said We...,Missouri Education Commissioner Kent King said...,[7],,that,
21490,5637,24,Syntax/discourse structure changes,yes,"Colin Powell , the Secretary of State , said c...",Secretary of State Colin Powell said yesterday...,,[7],,that
21752,5711,24,Syntax/discourse structure changes,yes,"ALBANY , N.Y. State Senate Majority Leader Jos...","LBANY , Aug. 8 Joseph L. Bruno , the State Sen...",,[16],,that
21793,5722,24,Syntax/discourse structure changes,yes,Bush declared that the British government `` h...,"Bush said , `` The British government has lear...",[13],,that,


In [94]:
substitute(24, syn_disc_add_that, 1)

Double check type overwriting: row 2334, sentence 1
Common indices: [4] | Pre-existing types: ['3_0']
Double check type overwriting: row 2368, sentence 2
Common indices: [10] | Pre-existing types: ['3_0']
Double check type overwriting: row 4792, sentence 2
Common indices: [12] | Pre-existing types: ['3_0']


Let's see what remains

In [95]:
syn_disc_add.drop(syn_disc_add_that.index.tolist(), inplace=True)
syn_disc_add

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syn_disc_add.drop(syn_disc_add_that.index.tolist(), inplace=True)


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
562,153,24,Syntax/discourse structure changes,yes,"In two new schemes , target families",Two new schemes target families,[0],,In,
613,164,24,Syntax/discourse structure changes,yes,She claimed all the babies were born full-term...,What she told our investigators was that all t...,,"[1, 6, 7]",,What was that
1204,308,24,Syntax/discourse structure changes,yes,"The new effort , Taxpayers Against the Recall ...","Called `` Taxpayers Against the Recall , '' it...",,"[0, 1, 7, 8]",,Called `` '' it
1731,450,24,Syntax/discourse structure changes,yes,the standards body warns,according to the W3C 's notice,,"[17, 18]",,according to
1821,474,24,Syntax/discourse structure changes,yes,Sendmail said,according to Sendmail,,"[10, 11]",,according to
2239,592,24,Syntax/discourse structure changes,yes,it 's a technique that 's been successful in p...,the technique has successfully predicted,"[3, 4, 7]",,it 's that,
2517,656,24,Syntax/discourse structure changes,yes,according to the report,the report noted,"[13, 14]",,according to,
2735,724,24,Syntax/discourse structure changes,yes,"For the third time in the past four years ,",It was the third time in four years that,,"[0, 1, 8]",,It was that
3037,802,24,Syntax/discourse structure changes,yes,The numbers highlight a conundrum :,"As stark as the numbers themselves , is the co...",,"[0, 2, 7]",,As as is
3160,843,24,Syntax/discourse structure changes,yes,US pressure had provoked,it was U.S. pressure which had provoked,,"[12, 13, 16]",,it was which


In [96]:
substitute(24, syn_disc_add, 2)

Double check type overwriting: row 307, sentence 2
Common indices: [8] | Pre-existing types: ['5_0']
Double check type overwriting: row 3975, sentence 1
Common indices: [9, 10] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 4499, sentence 1
Common indices: [17] | Pre-existing types: ['3_0']


In [151]:
syn_disc_sub[150:]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
21025,5513,24,Syntax/discourse structure changes,yes,a minibus detonated a bomb in a Baghdad traffi...,a bomb explosion in a Baghdad traffic tunnel k...,"[5, 6, 8, 15, 19]","[3, 4, 11, 15]",detonated a bomb killing wounding,a bomb killed wounded
21237,5566,24,Syntax/discourse structure changes,yes,", which has agreed to handle his surrender",about arranging his surrender,"[12, 13, 14, 15, 16]","[11, 12]",which has agreed to handle,about arranging
21348,5593,24,Syntax/discourse structure changes,yes,The company posted a profit of $ 54.3 million ...,"That was up from the year-ago quarter , when t...",[16],"[0, 1, 2, 3, 8]",in,That was up from when
21353,5600,24,Syntax/discourse structure changes,yes,"Among those waiting a turn was Jodie Singer , ...","Jodie Singer , a sixth-grader from Washington ...","[0, 1, 2, 3, 4, 5]","[10, 11, 12, 13]",Among those waiting a turn was,anxiously awaited her turn
21416,5614,24,Syntax/discourse structure changes,yes,California lost $ 937 million to corporate tax...,California 's lost tax revenue was mostly due ...,[8],"[5, 6, 7, 8]",to,was mostly due to
21515,5645,24,Syntax/discourse structure changes,yes,Another shooting linked to the spree occurred ...,The latest shooting linked to the spree was a ...,[6],"[7, 8, 11]",occurred,was a shooting
21527,5648,24,Syntax/discourse structure changes,yes,Congress is the best forum for weighing,Congress is the best forum to address,[8],[8],for,to
21535,5651,24,Syntax/discourse structure changes,yes,And because it is so far out in international ...,It is so far out in international water that t...,"[1, 10]",[8],"because ,",that
21543,5652,24,Syntax/discourse structure changes,yes,a set of guidelines to help public administrat...,guidelines to member governments on how to mig...,"[10, 13, 14, 23, 24]","[10, 11]",help decide whether or not,on how
21611,5672,24,Syntax/discourse structure changes,yes,Only Intel Corp. 's 0.3 percent yield was lower .,Only Intel Corp. has a lower dividend yield .\n,"[7, 8]","[3, 4, 5]",was lower,has a lower


In [98]:
print_sents(2540)

Wal-Mart, the nation's largest private employer, has expanded its antidiscrimination policy to protect gay and lesbian employees, company officials said Tuesday.
Wal-Mart Stores Inc., the nation's largest private employer, will now include gays and lesbians in its anti-discrimination policy, company officials said Wednesday.


In [99]:
positives.loc[438,:]

idx                                                                     438
sentence1                 The letter stated that a premature stillborn b...
sentence2                 According to the writer of the letter, the inf...
sentence1_tokenized       [The, letter, stated, that, a, premature, stil...
sentence2_tokenized       [According, to, the, writer, of, the, letter, ...
etpc_label                                                                1
mrpc_label                                                                1
ept_names                 [Same Polarity Substitution (contextual), Synt...
ept_ids                                             [6, 24, 25, 25, 29, 21]
sentence1_scope_etpc      [24, 24, 24, 24, 6, 6, 6, 6, 25, 25, 25, 25, 2...
sentence2_scope_etpc      [24, 24, 24, 24, 24, 24, 24, 24, 6, 6, 0, 0, 0...
sentence1_segment_text    [a premature stillborn baby, The letter stated...
sentence2_segment_text    [the infant, According to the writer of the le...
sentence1_sc

### Semantic Based

In [100]:
textual_paraphrases[textual_paraphrases['type_id'] == 28]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
102,35,28,Semantic based,yes,"[0, 1, 2, 3, 4, 5]",[10],The largest gains were seen in,increased,,,,
129,41,28,Semantic based,yes,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","[3, 4, 5, 6, 7, 8, 9, 10]",the court upheld Cleveland 's school voucher p...,the court ruled 5-4 in an Ohio case,,,,
166,51,28,Semantic based,yes,"[13, 14, 15, 16]","[18, 19, 20, 21]",they were legally employed,they have legal status,,,,
214,64,28,Semantic based,yes,"[2, 3, 4, 5, 6, 7, 8]","[5, 6, 7, 8, 9]",what PeopleSoft management would have you believe,the contentions of PeopleSoft management,,,,
223,68,28,Semantic based,yes,"[4, 5, 6, 7, 8, 9, 10, 11, 12, 13]","[0, 1, 2, 3, 4]",Troy is expected to be sentenced to life in pr...,Troy faces life in prison,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
21906,5756,28,Semantic based,yes,"[20, 21]","[22, 25, 26, 27, 28, 29, 30, 31]",work for,be & apos ; s chief operating officer,,,,
21935,5767,28,Semantic based,yes,"[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 2...","[13, 14, 15, 16, 17, 18, 19, 20, 21, 22]","43,000 jobs in Santa Clara County and 18,000 j...","nearly 62,000 jobs in the Santa Clara County-S...",,,,
21953,5772,28,Semantic based,yes,[0],"[2, 3, 4]",After,into the study,,,,
21980,5781,28,Semantic based,yes,"[8, 9, 10, 11, 12, 13, 14, 15, 16]","[8, 9, 10, 11]",13 of the state 's 16 fatalities were reported,13 people were killed,,,,


In [101]:
print_sents(226)

We remain hopeful that the city will agree to work with us and engage in good-faith discussions on this issue."
Alhart said the governor "remains hopeful that the city will continue to work with us and engage in good-faith discussions."


### Ellipsis

In [102]:
ellip = duplicate_df(textual_paraphrases[textual_paraphrases['type_id'] == 16])
ellip['k1_text'] = ellip['k1_text'].str.lower()
ellip['k2_text'] = ellip['k2_text'].str.lower()
ellip.drop(columns=['s1_scope', 's2_scope'], inplace=True)
ellip.rename(columns={'key_s1': 's1_scope', 'key_s2': 's2_scope'}, inplace=True)
ellip

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
0,150,16,Ellipsis,yes,would take would require,would take require,"[3, 8]",[3],would would,would
1,273,16,Ellipsis,yes,short-lived or long-lived,short- or long-lived,"[13, 15]",[14],short-lived long-lived,long-lived
2,336,16,Ellipsis,yes,"The company will launch 800 hot spots , or `` ...",The service will launch later this summer with...,"[0, 1]","[0, 1, 12]",the company,the service sprint
3,470,16,Ellipsis,yes,with aggravated harassment and criminal posses...,with aggravated harassment in the phone call c...,[21],"[10, 19]",with,with with
4,594,16,Ellipsis,yes,"After three months , Atkins dieters had lost a...","Three months into the study , the Atkins group...","[12, 16]",[16],pounds pounds,pounds
...,...,...,...,...,...,...,...,...,...,...
61,5384,16,Ellipsis,yes,12-by-18-inch,12-inch-by-18-inch,[9],[11],12-by-18-inch,12-inch-by-18-inch
62,5744,16,Ellipsis,yes,to establish relationships and to make sure,to establish relationships and make sure,"[6, 10]",[6],to to,to
63,5749,16,Ellipsis,yes,not because of who she is but what she did,not because of who she is but because of what ...,"[6, 7]","[7, 8, 13, 14]",because of,because of because of
64,5772,16,Ellipsis,yes,14.7 pounds 5.8 pounds,15 pounds five,"[12, 16]",[16],pounds pounds,pounds


In [103]:
def same(string1, string2):
    return set(string1.split()) == set(string2.split())

In [104]:
samie = ellip[ellip.apply(lambda x: same(x.k1_text, x.k2_text), axis=1)]
ellip.drop(samie.index.tolist(), inplace=True)
samie

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
0,150,16,Ellipsis,yes,would take would require,would take require,"[3, 8]",[3],would would,would
3,470,16,Ellipsis,yes,with aggravated harassment and criminal posses...,with aggravated harassment in the phone call c...,[21],"[10, 19]",with,with with
4,594,16,Ellipsis,yes,"After three months , Atkins dieters had lost a...","Three months into the study , the Atkins group...","[12, 16]",[16],pounds pounds,pounds
5,627,16,Ellipsis,yes,We believe and will defend,We believe and we will defend,[0],"[1, 8]",we,we we
7,768,16,Ellipsis,yes,"Dell has about 32 percent of the U.S. market ,...",Dell has 32 percent of the PC market in the Un...,[1],"[1, 15]",has,has has
9,1207,16,Ellipsis,yes,was lying watching,was lying was watching,[10],"[11, 24]",was,was was
10,1357,16,Ellipsis,yes,"Of personal vehicles , 57 percent are cars or ...","Of all personal vehicles , 57 percent are cars...",[6],"[7, 15, 24]",are,are are are
17,1936,16,Ellipsis,yes,other producers ' server software can work wit...,other producers ' server software can connect ...,[16],"[17, 29]",can,can can
18,2232,16,Ellipsis,yes,About 10 percent of high school and 16 percent...,16 percent of elementary and middle school stu...,[11],"[10, 17]",students,students students
19,2240,16,Ellipsis,yes,will keep the Interwoven name and be headquart...,will be named Interwoven and will be headquart...,[3],"[3, 8]",will,will will


In [105]:
preps = ('at', 'from', 'in', 'the', 'to')
samie_preps = samie[(samie['k1_text'].isin(preps)) | (samie['k2_text'].isin(preps))]
samie_preps

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
20,2339,16,Ellipsis,yes,from multiple screen names and other POP and I...,"from multiple AOL screen names , as well as fr...",[6],"[5, 14]",from,from from
25,2945,16,Ellipsis,yes,at 160 on June 16 and at 120 on June 23,at 160 on June 16 and 120 at June 23,"[16, 22]",[4],at at,at
41,3873,16,Ellipsis,yes,Testing of the swimsuit at a state police lab ...,Testing at a Massachusetts State Police lab an...,"[4, 10]",[1],at at,at
45,4220,16,Ellipsis,yes,to resign or negotiate,to resign or to negotiate,[9],"[6, 10]",to,to to
52,4613,16,Ellipsis,yes,in Washington and in New York City,in Washington and New York,"[11, 18]",[9],in in,in
57,5106,16,Ellipsis,yes,the pledges and the minute of silence,the pledges and moment of silence,"[20, 23]",[21],the the,the
62,5744,16,Ellipsis,yes,to establish relationships and to make sure,to establish relationships and make sure,"[6, 10]",[6],to to,to


In [106]:
print_sents(4220)

The MDC called the strike to force Mr Mugabe to either resign or negotiate a settlement of the Zimbabwe crisis.
The MDC called the week-long protest to urge Mugabe either to resign or to negotiate a settlement of the crisis gripping the country.


Let's delete the first preposition

In [107]:
samie_preps['s1_scope'].apply(lambda x: x.pop(0))
samie_preps['s2_scope'].apply(lambda x: x.pop(0))

20     5
25     4
41     1
45     6
52     9
57    21
62     6
Name: s2_scope, dtype: int64

In [108]:
samie_preps

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
20,2339,16,Ellipsis,yes,from multiple screen names and other POP and I...,"from multiple AOL screen names , as well as fr...",[],[14],from,from from
25,2945,16,Ellipsis,yes,at 160 on June 16 and at 120 on June 23,at 160 on June 16 and 120 at June 23,[22],[],at at,at
41,3873,16,Ellipsis,yes,Testing of the swimsuit at a state police lab ...,Testing at a Massachusetts State Police lab an...,[10],[],at at,at
45,4220,16,Ellipsis,yes,to resign or negotiate,to resign or to negotiate,[],[10],to,to to
52,4613,16,Ellipsis,yes,in Washington and in New York City,in Washington and New York,[18],[],in in,in
57,5106,16,Ellipsis,yes,the pledges and the minute of silence,the pledges and moment of silence,[23],[],the the,the
62,5744,16,Ellipsis,yes,to establish relationships and to make sure,to establish relationships and make sure,[10],[],to to,to


In [109]:
samie_preps.loc[[20,45], 's1_scope'] = None

In [110]:
samie_preps.loc[[25,41,52,57,62], 's2_scope'] = None

In [111]:
samie_preps

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
20,2339,16,Ellipsis,yes,from multiple screen names and other POP and I...,"from multiple AOL screen names , as well as fr...",,[14],from,from from
25,2945,16,Ellipsis,yes,at 160 on June 16 and at 120 on June 23,at 160 on June 16 and 120 at June 23,[22],,at at,at
41,3873,16,Ellipsis,yes,Testing of the swimsuit at a state police lab ...,Testing at a Massachusetts State Police lab an...,[10],,at at,at
45,4220,16,Ellipsis,yes,to resign or negotiate,to resign or to negotiate,,[10],to,to to
52,4613,16,Ellipsis,yes,in Washington and in New York City,in Washington and New York,[18],,in in,in
57,5106,16,Ellipsis,yes,the pledges and the minute of silence,the pledges and moment of silence,[23],,the the,the
62,5744,16,Ellipsis,yes,to establish relationships and to make sure,to establish relationships and make sure,[10],,to to,to


In [112]:
substitute(16, samie_preps, 1)

In [113]:
samie.drop(samie_preps.index.tolist(), inplace=True)
samie

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samie.drop(samie_preps.index.tolist(), inplace=True)


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
0,150,16,Ellipsis,yes,would take would require,would take require,"[3, 8]",[3],would would,would
3,470,16,Ellipsis,yes,with aggravated harassment and criminal posses...,with aggravated harassment in the phone call c...,[21],"[10, 19]",with,with with
4,594,16,Ellipsis,yes,"After three months , Atkins dieters had lost a...","Three months into the study , the Atkins group...","[12, 16]",[16],pounds pounds,pounds
5,627,16,Ellipsis,yes,We believe and will defend,We believe and we will defend,[0],"[1, 8]",we,we we
7,768,16,Ellipsis,yes,"Dell has about 32 percent of the U.S. market ,...",Dell has 32 percent of the PC market in the Un...,[1],"[1, 15]",has,has has
9,1207,16,Ellipsis,yes,was lying watching,was lying was watching,[10],"[11, 24]",was,was was
10,1357,16,Ellipsis,yes,"Of personal vehicles , 57 percent are cars or ...","Of all personal vehicles , 57 percent are cars...",[6],"[7, 15, 24]",are,are are are
17,1936,16,Ellipsis,yes,other producers ' server software can work wit...,other producers ' server software can connect ...,[16],"[17, 29]",can,can can
18,2232,16,Ellipsis,yes,About 10 percent of high school and 16 percent...,16 percent of elementary and middle school stu...,[11],"[10, 17]",students,students students
19,2240,16,Ellipsis,yes,will keep the Interwoven name and be headquart...,will be named Interwoven and will be headquart...,[3],"[3, 8]",will,will will


In [114]:
print_sents(2995)

But I would rather be talking about high standards than low standards."
"I would rather be talking about positive numbers rather than negative.


In [115]:
samie['s1_scope'].apply(lambda x: x.pop(0))
samie['s2_scope'].apply(lambda x: x.pop(0))

0      3
3     10
4     16
5      1
7      1
9     11
10     7
17    17
18    10
19     3
22     1
23     3
24     9
26     9
28     8
29    17
30     3
37    15
42    11
43     2
44     6
46    16
47     5
48     4
49     5
50     1
53     7
58     5
59    10
60     9
63     7
64    16
Name: s2_scope, dtype: int64

In [116]:
samie

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
0,150,16,Ellipsis,yes,would take would require,would take require,[8],[],would would,would
3,470,16,Ellipsis,yes,with aggravated harassment and criminal posses...,with aggravated harassment in the phone call c...,[],[19],with,with with
4,594,16,Ellipsis,yes,"After three months , Atkins dieters had lost a...","Three months into the study , the Atkins group...",[16],[],pounds pounds,pounds
5,627,16,Ellipsis,yes,We believe and will defend,We believe and we will defend,[],[8],we,we we
7,768,16,Ellipsis,yes,"Dell has about 32 percent of the U.S. market ,...",Dell has 32 percent of the PC market in the Un...,[],[15],has,has has
9,1207,16,Ellipsis,yes,was lying watching,was lying was watching,[],[24],was,was was
10,1357,16,Ellipsis,yes,"Of personal vehicles , 57 percent are cars or ...","Of all personal vehicles , 57 percent are cars...",[],"[15, 24]",are,are are are
17,1936,16,Ellipsis,yes,other producers ' server software can work wit...,other producers ' server software can connect ...,[],[29],can,can can
18,2232,16,Ellipsis,yes,About 10 percent of high school and 16 percent...,16 percent of elementary and middle school stu...,[],[17],students,students students
19,2240,16,Ellipsis,yes,will keep the Interwoven name and be headquart...,will be named Interwoven and will be headquart...,[],[8],will,will will


In [117]:
samie['s1_scope'] = samie['s1_scope'].apply(lambda x: None if not x else x)
samie['s2_scope'] = samie['s2_scope'].apply(lambda x: None if not x else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samie['s1_scope'] = samie['s1_scope'].apply(lambda x: None if not x else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samie['s2_scope'] = samie['s2_scope'].apply(lambda x: None if not x else x)


In [118]:
samie

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
0,150,16,Ellipsis,yes,would take would require,would take require,[8],,would would,would
3,470,16,Ellipsis,yes,with aggravated harassment and criminal posses...,with aggravated harassment in the phone call c...,,[19],with,with with
4,594,16,Ellipsis,yes,"After three months , Atkins dieters had lost a...","Three months into the study , the Atkins group...",[16],,pounds pounds,pounds
5,627,16,Ellipsis,yes,We believe and will defend,We believe and we will defend,,[8],we,we we
7,768,16,Ellipsis,yes,"Dell has about 32 percent of the U.S. market ,...",Dell has 32 percent of the PC market in the Un...,,[15],has,has has
9,1207,16,Ellipsis,yes,was lying watching,was lying was watching,,[24],was,was was
10,1357,16,Ellipsis,yes,"Of personal vehicles , 57 percent are cars or ...","Of all personal vehicles , 57 percent are cars...",,"[15, 24]",are,are are are
17,1936,16,Ellipsis,yes,other producers ' server software can work wit...,other producers ' server software can connect ...,,[29],can,can can
18,2232,16,Ellipsis,yes,About 10 percent of high school and 16 percent...,16 percent of elementary and middle school stu...,,[17],students,students students
19,2240,16,Ellipsis,yes,will keep the Interwoven name and be headquart...,will be named Interwoven and will be headquart...,,[8],will,will will


In [119]:
samie_none = samie[(samie['s1_scope'].isnull()) | (samie['s2_scope'].isnull())]

Correcting annotation mistake (present in original ETPC)

In [120]:
samie_none.loc[26, 'key_s2'] = [9]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samie_none.loc[26, 'key_s2'] = [9]


In [121]:
samie_none

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text,key_s2
0,150,16,Ellipsis,yes,would take would require,would take require,[8],,would would,would,
3,470,16,Ellipsis,yes,with aggravated harassment and criminal posses...,with aggravated harassment in the phone call c...,,[19],with,with with,
4,594,16,Ellipsis,yes,"After three months , Atkins dieters had lost a...","Three months into the study , the Atkins group...",[16],,pounds pounds,pounds,
5,627,16,Ellipsis,yes,We believe and will defend,We believe and we will defend,,[8],we,we we,
7,768,16,Ellipsis,yes,"Dell has about 32 percent of the U.S. market ,...",Dell has 32 percent of the PC market in the Un...,,[15],has,has has,
9,1207,16,Ellipsis,yes,was lying watching,was lying was watching,,[24],was,was was,
10,1357,16,Ellipsis,yes,"Of personal vehicles , 57 percent are cars or ...","Of all personal vehicles , 57 percent are cars...",,"[15, 24]",are,are are are,
17,1936,16,Ellipsis,yes,other producers ' server software can work wit...,other producers ' server software can connect ...,,[29],can,can can,
18,2232,16,Ellipsis,yes,About 10 percent of high school and 16 percent...,16 percent of elementary and middle school stu...,,[17],students,students students,
19,2240,16,Ellipsis,yes,will keep the Interwoven name and be headquart...,will be named Interwoven and will be headquart...,,[8],will,will will,


In [122]:
substitute(16, samie_none, 2)

Double check type overwriting: row 3361, sentence 1
Common indices: [19] | Pre-existing types: ['3_0']
Double check type overwriting: row 4282, sentence 2
Common indices: [13] | Pre-existing types: ['5_0']


In [123]:
samie.drop(samie_none.index.tolist(), inplace=True)
samie

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samie.drop(samie_none.index.tolist(), inplace=True)


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
22,2643,16,Ellipsis,yes,"`` She was crying and scared , '","`` She was crying and she was really scared , ''",[2],"[2, 5, 6]",she was,she was she was
23,2772,16,Ellipsis,yes,sales of grocery and other consumer packaged p...,sales of grocery and other consumer packaged p...,[3],"[4, 15, 16]",sales of,sales of sales of
28,3338,16,Ellipsis,yes,a profit of 30 cents a share and $ 1.31,a profit of 30 cents a share and $ 1.31 a shar...,[11],"[9, 17, 18]",a share,a share a share
43,4039,16,Ellipsis,yes,She was the only woman in her unit and a membe...,She was the only woman employed as a warehouse...,"[3, 4]","[3, 4, 15, 16, 17]",the only woman,the only woman the only woman
47,4256,16,Ellipsis,yes,the fourth most common in men and the eighth m...,the fourth most common in men and the eighth i...,"[9, 18, 19]",[6],most common most common,most common
50,4474,16,Ellipsis,yes,"`` I have lots of bad dreams , I have flashbac...","`` I have lots of bad dreams , flashbacks and ...","[2, 8, 9, 12, 13]",[2],i have i have i have,i have
59,5127,16,Ellipsis,yes,will be consolidated and will be based,will be consolidated and based,"[9, 16, 17]",[11],will be will be,will be
63,5749,16,Ellipsis,yes,not because of who she is but what she did,not because of who she is but because of what ...,[7],"[8, 13, 14]",because of,because of because of


In [124]:
samie.loc[[23,28,43,63], 's1_scope'] = None
samie.loc[[47,50,59], 's2_scope'] = None
samie

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
22,2643,16,Ellipsis,yes,"`` She was crying and scared , '","`` She was crying and she was really scared , ''",[2],"[2, 5, 6]",she was,she was she was
23,2772,16,Ellipsis,yes,sales of grocery and other consumer packaged p...,sales of grocery and other consumer packaged p...,,"[4, 15, 16]",sales of,sales of sales of
28,3338,16,Ellipsis,yes,a profit of 30 cents a share and $ 1.31,a profit of 30 cents a share and $ 1.31 a shar...,,"[9, 17, 18]",a share,a share a share
43,4039,16,Ellipsis,yes,She was the only woman in her unit and a membe...,She was the only woman employed as a warehouse...,,"[3, 4, 15, 16, 17]",the only woman,the only woman the only woman
47,4256,16,Ellipsis,yes,the fourth most common in men and the eighth m...,the fourth most common in men and the eighth i...,"[9, 18, 19]",,most common most common,most common
50,4474,16,Ellipsis,yes,"`` I have lots of bad dreams , I have flashbac...","`` I have lots of bad dreams , flashbacks and ...","[2, 8, 9, 12, 13]",,i have i have i have,i have
59,5127,16,Ellipsis,yes,will be consolidated and will be based,will be consolidated and based,"[9, 16, 17]",,will be will be,will be
63,5749,16,Ellipsis,yes,not because of who she is but what she did,not because of who she is but because of what ...,,"[8, 13, 14]",because of,because of because of


In [125]:
samie.loc[[47,50,59], 's1_scope'].apply(lambda x: x.pop(0))
samie.loc[[23,28,63], 's2_scope'].apply(lambda x: x.pop(0))
samie

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
22,2643,16,Ellipsis,yes,"`` She was crying and scared , '","`` She was crying and she was really scared , ''",[2],"[2, 5, 6]",she was,she was she was
23,2772,16,Ellipsis,yes,sales of grocery and other consumer packaged p...,sales of grocery and other consumer packaged p...,,"[15, 16]",sales of,sales of sales of
28,3338,16,Ellipsis,yes,a profit of 30 cents a share and $ 1.31,a profit of 30 cents a share and $ 1.31 a shar...,,"[17, 18]",a share,a share a share
43,4039,16,Ellipsis,yes,She was the only woman in her unit and a membe...,She was the only woman employed as a warehouse...,,"[3, 4, 15, 16, 17]",the only woman,the only woman the only woman
47,4256,16,Ellipsis,yes,the fourth most common in men and the eighth m...,the fourth most common in men and the eighth i...,"[18, 19]",,most common most common,most common
50,4474,16,Ellipsis,yes,"`` I have lots of bad dreams , I have flashbac...","`` I have lots of bad dreams , flashbacks and ...","[8, 9, 12, 13]",,i have i have i have,i have
59,5127,16,Ellipsis,yes,will be consolidated and will be based,will be consolidated and based,"[16, 17]",,will be will be,will be
63,5749,16,Ellipsis,yes,not because of who she is but what she did,not because of who she is but because of what ...,,"[13, 14]",because of,because of because of


In [126]:
samie.at[50, 's1_scope'] = [8, 9, 12, 13]
samie.at[43, 's2_scope'] = [15,16,17]
samie

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
22,2643,16,Ellipsis,yes,"`` She was crying and scared , '","`` She was crying and she was really scared , ''",[2],"[2, 5, 6]",she was,she was she was
23,2772,16,Ellipsis,yes,sales of grocery and other consumer packaged p...,sales of grocery and other consumer packaged p...,,"[15, 16]",sales of,sales of sales of
28,3338,16,Ellipsis,yes,a profit of 30 cents a share and $ 1.31,a profit of 30 cents a share and $ 1.31 a shar...,,"[17, 18]",a share,a share a share
43,4039,16,Ellipsis,yes,She was the only woman in her unit and a membe...,She was the only woman employed as a warehouse...,,"[15, 16, 17]",the only woman,the only woman the only woman
47,4256,16,Ellipsis,yes,the fourth most common in men and the eighth m...,the fourth most common in men and the eighth i...,"[18, 19]",,most common most common,most common
50,4474,16,Ellipsis,yes,"`` I have lots of bad dreams , I have flashbac...","`` I have lots of bad dreams , flashbacks and ...","[8, 9, 12, 13]",,i have i have i have,i have
59,5127,16,Ellipsis,yes,will be consolidated and will be based,will be consolidated and based,"[16, 17]",,will be will be,will be
63,5749,16,Ellipsis,yes,not because of who she is but what she did,not because of who she is but because of what ...,,"[13, 14]",because of,because of because of


In [127]:
substitute(16, samie, 2)

In [128]:
ellip

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
1,273,16,Ellipsis,yes,short-lived or long-lived,short- or long-lived,"[13, 15]",[14],short-lived long-lived,long-lived
2,336,16,Ellipsis,yes,"The company will launch 800 hot spots , or `` ...",The service will launch later this summer with...,"[0, 1]","[0, 1, 12]",the company,the service sprint
6,697,16,Ellipsis,yes,People who are high in positive emotions sleep...,"Happy people sleep better , have better diets ...","[1, 2, 3, 4, 5, 6, 7, 11, 16, 20]","[1, 2]",people who are high in positive emotions they ...,happy people
8,785,16,Ellipsis,yes,"Jacob has pushed consolidation for years , but...",Jacob has pushed consolidation for years but said,"[0, 8]",[0],jacob he,jacob
11,1458,16,Ellipsis,yes,the Fed will acknowledge risks are tilted towa...,the central bank will say risks are tilted tow...,"[10, 11, 18, 19]","[10, 11]",risks are they are,risks are
12,1500,16,Ellipsis,yes,immune systems suppressed by medications or by...,suppressed immune systems due to illness or me...,"[10, 13]","[10, 11]",by by,due to
13,1588,16,Ellipsis,yes,"He said it was a mistake , and he reimbursed t...",The governor said the use of the credit card w...,"[0, 8]","[0, 1]",he he,the governor
14,1598,16,Ellipsis,yes,some of the passengers,no passenger but some,[6],[4],passengers,passenger
15,1635,16,Ellipsis,yes,"11M , 22M , 33M , 44M and 55Mbit/sec","11 , 22 , 33 , 44 and 55Mbit/s","[18, 20, 22, 24]",[25],22m 33m 44m 55mbit/sec,55mbit/s
16,1830,16,Ellipsis,yes,someone strangled her and she may have been se...,Park appeared to have been strangled and may h...,"[4, 6]",[0],her she,park


In [129]:
ellip['s1_scope'], ellip['s2_scope'], ellip['k1_text'], ellip['k2_text'] = ellip.apply(lambda x: trim_duplicates(x.s1_scope, x.s2_scope, x.k1_text, x.k2_text), axis=1, result_type='expand').transpose().values
ellip

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
1,273,16,Ellipsis,yes,short-lived or long-lived,short- or long-lived,[13],[],short-lived,
2,336,16,Ellipsis,yes,"The company will launch 800 hot spots , or `` ...",The service will launch later this summer with...,[1],"[1, 12]",company,service sprint
6,697,16,Ellipsis,yes,People who are high in positive emotions sleep...,"Happy people sleep better , have better diets ...","[2, 3, 4, 5, 6, 7, 11, 16, 20]",[1],who are high in positive emotions they they they,happy
8,785,16,Ellipsis,yes,"Jacob has pushed consolidation for years , but...",Jacob has pushed consolidation for years but said,[8],[],he,
11,1458,16,Ellipsis,yes,the Fed will acknowledge risks are tilted towa...,the central bank will say risks are tilted tow...,[18],[],they,
12,1500,16,Ellipsis,yes,immune systems suppressed by medications or by...,suppressed immune systems due to illness or me...,"[10, 13]","[10, 11]",by by,due to
13,1588,16,Ellipsis,yes,"He said it was a mistake , and he reimbursed t...",The governor said the use of the credit card w...,"[0, 8]","[0, 1]",he he,the governor
14,1598,16,Ellipsis,yes,some of the passengers,no passenger but some,[6],[4],passengers,passenger
15,1635,16,Ellipsis,yes,"11M , 22M , 33M , 44M and 55Mbit/sec","11 , 22 , 33 , 44 and 55Mbit/s","[18, 20, 22, 24]",[25],22m 33m 44m 55mbit/sec,55mbit/s
16,1830,16,Ellipsis,yes,someone strangled her and she may have been se...,Park appeared to have been strangled and may h...,"[4, 6]",[0],her she,park


In [130]:
ellip['k1_text'] = ellip['k1_text'].apply(lambda x: None if x == '' else x)
ellip['k2_text'] = ellip['k2_text'].apply(lambda x: None if x == '' else x)
ellip['s1_scope'] = ellip['s1_scope'].apply(lambda x: None if list(x) == [] else x)
ellip['s2_scope'] = ellip['s2_scope'].apply(lambda x: None if list(x) == [] else x)
ellip

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
1,273,16,Ellipsis,yes,short-lived or long-lived,short- or long-lived,[13],,short-lived,
2,336,16,Ellipsis,yes,"The company will launch 800 hot spots , or `` ...",The service will launch later this summer with...,[1],"[1, 12]",company,service sprint
6,697,16,Ellipsis,yes,People who are high in positive emotions sleep...,"Happy people sleep better , have better diets ...","[2, 3, 4, 5, 6, 7, 11, 16, 20]",[1],who are high in positive emotions they they they,happy
8,785,16,Ellipsis,yes,"Jacob has pushed consolidation for years , but...",Jacob has pushed consolidation for years but said,[8],,he,
11,1458,16,Ellipsis,yes,the Fed will acknowledge risks are tilted towa...,the central bank will say risks are tilted tow...,[18],,they,
12,1500,16,Ellipsis,yes,immune systems suppressed by medications or by...,suppressed immune systems due to illness or me...,"[10, 13]","[10, 11]",by by,due to
13,1588,16,Ellipsis,yes,"He said it was a mistake , and he reimbursed t...",The governor said the use of the credit card w...,"[0, 8]","[0, 1]",he he,the governor
14,1598,16,Ellipsis,yes,some of the passengers,no passenger but some,[6],[4],passengers,passenger
15,1635,16,Ellipsis,yes,"11M , 22M , 33M , 44M and 55Mbit/sec","11 , 22 , 33 , 44 and 55Mbit/s","[18, 20, 22, 24]",[25],22m 33m 44m 55mbit/sec,55mbit/s
16,1830,16,Ellipsis,yes,someone strangled her and she may have been se...,Park appeared to have been strangled and may h...,"[4, 6]",[0],her she,park


In [131]:
ellip.at[2, 's1_scope'], ellip.at[2, 's2_scope'] = None, [12]
ellip.at[6, 's1_scope'], ellip.at[6, 's2_scope'] = [11,16,20], None
ellip.at[12, 's1_scope'], ellip.at[12, 's2_scope'] = [13], None
ellip.at[13, 's1_scope'], ellip.at[13, 's2_scope'] = [8], None
ellip.at[15, 's1_scope'], ellip.at[15, 's2_scope'] = [18,20,22,24], None
ellip.at[16, 's1_scope'], ellip.at[16, 's2_scope'] = [6], None
ellip.at[21, 's1_scope'], ellip.at[21, 's2_scope'] = None, [13]
ellip.at[27, 's1_scope'], ellip.at[27, 's2_scope'] = [11], None
ellip.at[32, 's1_scope'], ellip.at[32, 's2_scope'] = [10], None
ellip.at[39, 's1_scope'], ellip.at[39, 's2_scope'] = None, [11]
ellip.at[55, 's1_scope'], ellip.at[55, 's2_scope'] = [17], None
ellip.at[56, 's1_scope'], ellip.at[56, 's2_scope'] = None, [19]
ellip.at[65, 's1_scope'], ellip.at[65, 's2_scope'] = None, [12]
ellip

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
1,273,16,Ellipsis,yes,short-lived or long-lived,short- or long-lived,[13],,short-lived,
2,336,16,Ellipsis,yes,"The company will launch 800 hot spots , or `` ...",The service will launch later this summer with...,,[12],company,service sprint
6,697,16,Ellipsis,yes,People who are high in positive emotions sleep...,"Happy people sleep better , have better diets ...","[11, 16, 20]",,who are high in positive emotions they they they,happy
8,785,16,Ellipsis,yes,"Jacob has pushed consolidation for years , but...",Jacob has pushed consolidation for years but said,[8],,he,
11,1458,16,Ellipsis,yes,the Fed will acknowledge risks are tilted towa...,the central bank will say risks are tilted tow...,[18],,they,
12,1500,16,Ellipsis,yes,immune systems suppressed by medications or by...,suppressed immune systems due to illness or me...,[13],,by by,due to
13,1588,16,Ellipsis,yes,"He said it was a mistake , and he reimbursed t...",The governor said the use of the credit card w...,[8],,he he,the governor
14,1598,16,Ellipsis,yes,some of the passengers,no passenger but some,[6],[4],passengers,passenger
15,1635,16,Ellipsis,yes,"11M , 22M , 33M , 44M and 55Mbit/sec","11 , 22 , 33 , 44 and 55Mbit/s","[18, 20, 22, 24]",,22m 33m 44m 55mbit/sec,55mbit/s
16,1830,16,Ellipsis,yes,someone strangled her and she may have been se...,Park appeared to have been strangled and may h...,[6],,her she,park


In [132]:
ellip_none = ellip[(ellip['s1_scope'].isnull()) | (ellip['s2_scope'].isnull())]
ellip_none

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
1,273,16,Ellipsis,yes,short-lived or long-lived,short- or long-lived,[13],,short-lived,
2,336,16,Ellipsis,yes,"The company will launch 800 hot spots , or `` ...",The service will launch later this summer with...,,[12],company,service sprint
6,697,16,Ellipsis,yes,People who are high in positive emotions sleep...,"Happy people sleep better , have better diets ...","[11, 16, 20]",,who are high in positive emotions they they they,happy
8,785,16,Ellipsis,yes,"Jacob has pushed consolidation for years , but...",Jacob has pushed consolidation for years but said,[8],,he,
11,1458,16,Ellipsis,yes,the Fed will acknowledge risks are tilted towa...,the central bank will say risks are tilted tow...,[18],,they,
12,1500,16,Ellipsis,yes,immune systems suppressed by medications or by...,suppressed immune systems due to illness or me...,[13],,by by,due to
13,1588,16,Ellipsis,yes,"He said it was a mistake , and he reimbursed t...",The governor said the use of the credit card w...,[8],,he he,the governor
15,1635,16,Ellipsis,yes,"11M , 22M , 33M , 44M and 55Mbit/sec","11 , 22 , 33 , 44 and 55Mbit/s","[18, 20, 22, 24]",,22m 33m 44m 55mbit/sec,55mbit/s
16,1830,16,Ellipsis,yes,someone strangled her and she may have been se...,Park appeared to have been strangled and may h...,[6],,her she,park
21,2555,16,Ellipsis,yes,Mauresmo has the confidence of having beaten S...,She has the confidence of having beaten her fo...,,[13],mauresmo,she she


In [133]:
substitute(16, ellip_none, 2, False)

### Addition/Deletion

In [153]:
substitute(25, textual_paraphrases[textual_paraphrases['type_id'] == 25], 2)

Double check type overwriting: row 37, sentence 2
Common indices: [12] | Pre-existing types: ['3_0']
Double check type overwriting: row 37, sentence 2
Common indices: [14, 15] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 53, sentence 1
Common indices: [12] | Pre-existing types: ['5_0']
Double check type overwriting: row 76, sentence 2
Common indices: [22, 23, 24] | Pre-existing types: ['3_0' '3_0' '3_0']
Double check type overwriting: row 124, sentence 2
Common indices: [2] | Pre-existing types: ['3_0']
Double check type overwriting: row 172, sentence 2
Common indices: [16] | Pre-existing types: ['3_0']
Double check type overwriting: row 174, sentence 2
Common indices: [0] | Pre-existing types: ['5_1']
Double check type overwriting: row 203, sentence 2
Common indices: [2] | Pre-existing types: ['7_0']
Double check type overwriting: row 213, sentence 2
Common indices: [2, 3, 4] | Pre-existing types: ['5_0' '5_0' '5_0']
Double check type overwriting: row 217, se

## Diagnosing

Run these cells to make sure everything looks OK after reannotating

In [136]:
positives.loc[positives['idx'] == 401, 'sentence1_scope'].iloc[0]

array(['3_0 & 7_0', '3_0', '3_0', '3_0', '3_0', '3_0', '3_0', '3_0',
       '3_0', '3_0', '3_0', '3_0', '3_0 & 7_0', '3_1', '', '', '', '', '',
       '', '', '', '', '', ''], dtype='<U64')

In [137]:
subset = textual_paraphrases[(textual_paraphrases['pair_id'] == 401+1) & (textual_paraphrases['type_id'] == int(21))]
subset

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
1534,402,21,Punctuation changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",`` The Princess ' marriage was not set up by f...,"Vasile Ionescu , of the Roma Centre for public...","[0, 12]",[13],`` '',``


In [138]:
# For diagnosing
textual_paraphrases.loc[(textual_paraphrases['pair_id'] == 449+1) & (textual_paraphrases['type_id'].isin([1,5,3,26]))]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
1728,450,5,Same Polarity Substitution (habitual),yes,"[20, 21]","[13, 14]",Web sites,Web pages,,,,
1729,450,5,Same Polarity Substitution (habitual),yes,[5],[22],warns,notice,,,,
1730,450,3,Derivational Changes,yes,[5],[22],warns,notice,,,,
1732,450,26,Change of order,yes,"[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...",changes to Internet Explorer may affect a `` l...,changes to IE `` may affect a large number of ...,,,,


In [139]:
print_sents(113)
print_sents(97)

Downstream at Mount Vernon, the Skagit River was expected to crest at 36 feet -- 8 feet above flood stage -- tonight, Burke said.
The Skagit was expected to crest during the night at 38 feet at Mount Vernon, 10 feet above flood stage, the National Weather Service said.
Shares of Hartford rose $2.88 to $46.50 in New York Stock Exchange composite trading.
Shares of Hartford were up $2.28, or 5.2 percent, to $45.90 in midday trading.


In [140]:
textual_paraphrases[(textual_paraphrases['pair_id'] == 113)]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
412,113,7,Same Polarity Substitution (named ent.),yes,"[6, 7]",[1],Skagit River,Skagit,,,,
413,113,7,Same Polarity Substitution (named ent.),yes,[16],[16],8,10,,,,
414,113,6,Same Polarity Substitution (contextual),yes,[22],"[7, 8]",tonight,the night,,,,
415,113,7,Same Polarity Substitution (named ent.),yes,[24],"[22, 23, 24, 25]",Burke,the National Weather Service,,,,
416,113,26,Change of order,yes,"[1, 2, 3, 4]","[12, 13, 14]","at Mount Vernon ,",at Mount Vernon,,,,
417,113,26,Change of order,yes,[22],"[6, 7, 8]",tonight,during the night,,,,
418,113,25,Addition/Deletion,yes,[0],,Downstream,,,,,
419,113,29,Identity,yes,"[5, 8, 9, 10, 11, 12, 14, 17, 18, 19, 20, 23, ...","[0, 2, 3, 4, 5, 9, 11, 17, 18, 19, 20, 21, 26,...",the was expected to crest at feet feet above f...,The was expected to crest at feet feet above f...,,,,
420,113,30,Non-paraphrase,yes,[13],[10],36,38,,,,
421,113,21,Punctuation changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","Downstream at Mount Vernon , the Skagit River ...",The Skagit was expected to crest during the ni...,"[15, 21]",[15],-- --,","


In [141]:
textual_paraphrases[(textual_paraphrases['type_id'] == 7)][:50]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
40,15,7,Same Polarity Substitution (named ent.),yes,[7],[1],Hussein,Saddam,,,,
42,15,7,Same Polarity Substitution (named ent.),yes,"[6, 7]",[1],Mr. Hussein,Saddam,,,,
115,39,7,Same Polarity Substitution (named ent.),yes,[6],[5],JCP,JCP.N,,,,
116,39,7,Same Polarity Substitution (named ent.),yes,[12],[9],WAG,WAG.N,,,,
134,45,7,Same Polarity Substitution (named ent.),yes,[1],[1],US,Americas,,,,
152,49,7,Same Polarity Substitution (named ent.),yes,[11],"[12, 13, 14, 15, 16, 17, 18]",PEP.N,nyse : PEP - news - people,,,,
206,60,7,Same Polarity Substitution (named ent.),yes,[7],"[7, 8, 9]",770,at least 767,,,,
263,76,7,Same Polarity Substitution (named ent.),yes,[0],"[4, 5]",Thomas,Mr. Thomas,,,,
264,76,7,Same Polarity Substitution (named ent.),yes,[2],"[7, 8]",Tauzin,Mr. Tauzin,,,,
296,83,7,Same Polarity Substitution (named ent.),yes,[21],"[17, 18]",3km,two miles,,,,


In [142]:
positives['sentence1_scope'][0]

array(['3_0', '3_0', '3_0', '3_0', '', '4_0', '', '5_0', '', '', '', '',
       '', '', '', '', '', '', ''], dtype='<U64')

Sentence modality changes have zero ocurrences among paraphrases

Flagged rows:

2432, 5074, 12186


# The garbage pail

In [143]:
auxiliaries = ['are', 'am', 'be', 'been', 'being', 'had', 'has', 'have', 'having', 'is', 'was', 'were']

Code that may or may not be useful will remain here for a while

Change of Order > Identity

Game plan:

Same Polarity Substitution > Derivational Changes > Inflectional Changes > ...Modal Verb Changes? > Change of Order (modified)

## Filtering

Helper methods for filtering the ETPC dataframe based on paraphrase types

In [144]:
def filter_contains(df, search_ids):
  """Returns an ETPC dataframe with rows where paraphrase_types_ids contains
  the search_ids. Use this to search for paraphrase pairs containing specific
  ids"""
  return df[df['ept_ids'].apply(lambda x: np.isin(search_ids, x))]

def filter_equals(df, search_ids):
  """Returns an ETPC dataframe with rows where paraphrase_types_ids EXACTLY 
  MATCHES the search_ids."""
  return df[df['ept_ids'].apply(lambda x: np.array_equal(x, search_ids))]

In [145]:
filter_contains(etpc, '3')

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_segment_location,sentence2_segment_location,sentence1_segment_location_indices,sentence2_segment_location_indices,sentence1_segment_text,sentence2_segment_text
142,143_142,Tyco later said the loan had not been forgiven...,"Tyco has said the loan was not forgiven, but t...","[Tyco, later, said, the, loan, had, not, been,...","[Tyco, has, said, the, loan, was, not, forgive...",1,1,"[Same Polarity Substitution (contextual), Infl...","[6, 1, 1, 3, 24, 25, 29, 21]","[0, 25, 1, 29, 29, 1, 29, 1, 1, 29, 6, 29, 29,...","[25, 1, 1, 25, 25, 1, 25, 1, 25, 6, 24, 29, 29...","[[10], [2], [5, 7, 8], [15], [11, 12, 13, 14, ...","[[9], [1, 2], [5, 7], [12], [10, 11, 12, 13, 1...","[and, said, had been forgiven, full, Swartz re...","[but, has said, was forgiven, fully, that Swar..."
149,150_149,She estimated it would take three months and w...,She said it would take an estimated three mont...,"[She, estimated, it, would, take, three, month...","[She, said, it, would, take, an, estimated, th...",1,1,"[Synthetic/analytic substitution, Derivational...","[11, 3, 16, 25, 25, 29]","[25, 3, 25, 16, 25, 25, 25, 25, 16, 25, 11, 25...","[0, 0, 0, 16, 16, 0, 3, 0, 0, 25, 25, 0, 16, 1...","[[10], [1], [3, 4, 8, 9], [0, 2, 4, 5, 6, 7, 9...","[[13, 14], [6], [3, 4, 12], [9, 10]]","[cancellation, estimated, would take would req...","[the cancellation, estimated, would take requi..."
238,239_238,Saddam loyalists have been blamed for sabotagi...,Hussein loyalists have been blamed for sabotag...,"[Saddam, loyalists, have, been, blamed, for, s...","[Hussein, loyalists, have, been, blamed, for, ...",1,1,"[Same Polarity Substitution (named ent.), Deri...","[7, 3, 8, 25, 29, 21]","[7, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 0,...","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[[0], [16], [18], [12, 13, 14], [1, 2, 3, 4, 5...","[[0], [12], [13], [1, 2, 3, 4, 5, 6, 7, 8, 9, ...","[Saddam, attacks, U.S., as well as, loyalists ...","[Hussein, attacking, US, loyalists have been b..."
254,255_254,"""It's amazing to be part of an industry that r...","""It's amazing to be part of an industry that r...","[``, It, 's, amazing, to, be, part, of, an, in...","[``, It, 's, amazing, to, be, part, of, an, in...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 5, 3, 29]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[[24], [22, 23], [23], [0, 1, 2, 3, 4, 5, 6, 7...","[[22], [20, 21, 27, 28], [21], [0, 1, 2, 3, 4,...","[of, recent graduate, graduate, `` It 's amazi...","[from, only graduated last May, graduated, `` ..."
286,287_286,The search was concentrated in northeast Penns...,The search was concentrated in northeastern Pe...,"[The, search, was, concentrated, in, northeast...","[The, search, was, concentrated, in, northeast...",1,1,"[Derivational Changes, Addition/Deletion, Iden...","[3, 25, 29, 28]","[29, 29, 29, 29, 29, 3, 29, 29, 29, 29, 29, 29...","[25, 25, 25, 25, 25, 3, 25, 25, 25, 25, 25, 25...","[[5], [23, 24], [0, 1, 2, 3, 4, 6, 7, 8, 9, 10...","[[5], [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, ...","[northeast, by now, The search was concentrate...","[northeastern, The search was concentrated in ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5621,5622_5621,Palm Wednesday announced plans to acquire Hand...,Palm said on Wednesday it plans to buy Handspr...,"[Palm, Wednesday, announced, plans, to, acquir...","[Palm, said, on, Wednesday, it, plans, to, buy...",0,1,"[Same Polarity Substitution (habitual), Same P...","[5, 5, 6, 11, 3, 26, 29, 30]","[29, 26, 5, 3, 29, 5, 29, 29, 29, 29, 6, 29, 2...","[29, 5, 26, 26, 0, 3, 29, 5, 29, 29, 29, 29, 6...","[[2], [5], [10], [1], [3], [1], [0, 4, 6, 7, 8...","[[1], [7], [12], [2, 3], [5], [2, 3], [0, 6, 8...","[announced, acquire, started, Wednesday, plans...","[said, buy, created, on Wednesday, plans, on W..."
5702,5703_5702,Some opposition leaders said they would reserv...,Some opposition leaders called for withdrawing...,"[Some, opposition, leaders, said, they, would,...","[Some, opposition, leaders, called, for, withd...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 3, 26, 25, 29]","[26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 2...","[6, 6, 6, 25, 25, 3, 25, 0, 0, 26, 26, 26, 26,...","[[0, 1, 2], [14], [19], [0, 1, 2, 3, 4, 5, 6, ...","[[9], [0, 1, 2], [5], [9, 10, 11, 12, 13, 14, ...","[Some opposition leaders, others, withdrawal, ...","[others, Some opposition leaders, withdrawing,..."
5709,5710_5709,Women who eat potatoes and other tuberous vege...,Australian researchers believe they have found...,"[Women, who, eat, potatoes, and, other, tubero...","[Australian, researchers, believe, they, have,...",1,1,"[Same Polarity Substitution (named ent.), Same...","[7, 5, 3, 24, 18, 26, 25, 29]","[26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 2...","[7, 0, 0, 25, 25, 25, 26, 26, 26, 26, 26, 26, ...","[[23], [0], [15], [0, 1, 2, 3, 4, 5, 6, 7, 8, ...","[[0], [16], [7], [6, 7, 8, 9, 10, 11, 12, 13, ...","[Melbourne, Women, triggering, Women who eat p...","[Australian, mothers, trigger, a trigger of ty..."
5712,5713_5712,There is only one drug on the market for macul...,There is only one drug on the market for macul...,"[There, is, only, one, drug, on, the, market, ...","[There, is, only, one, drug, on, the, market, ...",1,1,"[Derivational Changes, Subordination and nesti...","[3, 18, 25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[17], [19, 20, 21, 22, 23, 24, 25, 26], [18]]","[[18], [20, 21, 22, 23, 24, 25, 26], [0, 1, 2,...","[treat, one subtype that represents a minority...","[treatment, one subtype representing a minorit..."


In [146]:
filter_equals(etpc, ['25', '29'])

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_segment_location,sentence2_segment_location,sentence1_segment_location_indices,sentence2_segment_location_indices,sentence1_segment_text,sentence2_segment_text
167,168_167,U.S. law enforcement officials are sneering at...,U.S. law enforcement officials are sneering at...,"[U.S., law, enforcement, officials, are, sneer...","[U.S., law, enforcement, officials, are, sneer...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[[15, 16, 17, 18, 19, 20, 21, 22, 23]]",[U.S. law enforcement officials are sneering a...,[-- including a police conspiracy to discredit...
645,646_645,I called the number and the lady told me she w...,I called the number and the lady told me she w...,"[I, called, the, number, and, the, lady, told,...","[I, called, the, number, and, the, lady, told,...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[18, 20, 21, 22, 23, 24]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[, Sherry Studabaker told BBC television, I ca...",[I called the number and the lady told me she ...
1017,1018_1017,He said the problem needs to be corrected befo...,He said the prob lem needs to be corrected bef...,"[He, said, the, problem, needs, to, be, correc...","[He, said, the, prob, lem, needs, to, be, corr...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[13, 14, 15]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[is cleared to, He said the problem needs to b...",[He said the prob lem needs to be corrected be...
2046,2047_2046,Other recommendations included a special couns...,Other recommendations included the creation of...,"[Other, recommendations, included, a, special,...","[Other, recommendations, included, the, creati...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, 25, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...",[a special counsel on oceans in the White Hous...,[Other recommendations included the creation o...
2063,2064_2063,"""For me, the Lewinsky imbroglio seemed like ju...","""For me, the Lewinsky imbroglio seemed like ju...","[``, For, me, ,, the, Lewinsky, imbroglio, see...","[``, For, me, ,, the, Lewinsky, imbroglio, see...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[17, 19, 20, 21, 22, 23]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[, according to extracts leaked yesterday, `` ...","[`` For me , the Lewinsky imbroglio seemed lik..."
2180,2181_2180,"And in the Muslim world, Osama bin Laden is be...","And in the Muslim world, Osama bin Laden, the ...","[And, in, the, Muslim, world, ,, Osama, bin, L...","[And, in, the, Muslim, world, ,, Osama, bin, L...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[[9, 10, 11, 12, 13, 14, 15, 16, 17, 18]]","[And in the Muslim world , Osama bin Laden is ...","[, the missing leader of the al-Qaida terroris..."
2229,2230_2229,This is a process and there will be other oppo...,This is a process and there will be other oppo...,"[This, is, a, process, and, there, will, be, o...","[This, is, a, process, and, there, will, be, o...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[21, 22, 23]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[he told reporters, This is a process and ther...",[This is a process and there will be other opp...
2282,2283_2282,"""Right from the beginning, we didn't want to s...","But Mr. Crosby told The Associated Press: ""Rig...","[``, Right, from, the, beginning, ,, we, did, ...","[But, Mr., Crosby, told, The, Associated, Pres...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[[0, 1, 2, 3, 4, 5, 6, 7]]","[`` Right from the beginning , we did n't want...","[But Mr. Crosby told The Associated Press :, `..."
2703,2704_2703,It's almost as if they (Russians) hit an x-mar...,It's almost as if they (Russians) hit an x-mar...,"[It, 's, almost, as, if, they, (, Russians, ),...","[It, 's, almost, as, if, they, (, Russians, ),...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[15, 17, 18, 19, 20, 21]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[, NASA spokesman Robert Navias said, It 's al...",[It 's almost as if they ( Russians ) hit an x...
2786,2787_2786,"""This puts telemarketers on notice that we wil...","""This puts telemarketers on notice that we wil...","[``, This, puts, telemarketers, on, notice, th...","[``, This, puts, telemarketers, on, notice, th...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[[25, 27, 28, 29, 30, 31]]",[`` This puts telemarketers on notice that we ...,"[, FCC chairman Michael Powell said, `` This p..."


Named Entity Numbers

Some annotations for 'Named Entity Substitution' are actually number substitutions, for instance:

In [147]:
textual_paraphrases[(textual_paraphrases['pair_id'] == 113) & ((textual_paraphrases['type_id'] == 7))]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
412,113,7,Same Polarity Substitution (named ent.),yes,"[6, 7]",[1],Skagit River,Skagit,,,,
413,113,7,Same Polarity Substitution (named ent.),yes,[16],[16],8,10,,,,
415,113,7,Same Polarity Substitution (named ent.),yes,[24],"[22, 23, 24, 25]",Burke,the National Weather Service,,,,


Let's first identify all the paraphrase pairs containing a number:

In [148]:
def hasdigit(s1, s2):
    return any(char.isdigit() for char in s1) or any(char.isdigit() for char in s2)

named_ent_numbers = pd.DataFrame(columns = textual_paraphrases.columns, data = copy.deepcopy(textual_paraphrases.values))
named_ent_numbers = named_ent_numbers[named_ent_numbers['type_id'] == 7]
named_ent_numbers = named_ent_numbers[named_ent_numbers.apply(lambda x: hasdigit(x.s1_text, x.s2_text), axis=1)]
named_ent_numbers

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
206,60,7,Same Polarity Substitution (named ent.),yes,[7],"[7, 8, 9]",770,at least 767,,,,
296,83,7,Same Polarity Substitution (named ent.),yes,[21],"[17, 18]",3km,two miles,,,,
335,97,7,Same Polarity Substitution (named ent.),yes,"[7, 8]","[13, 14]",$ 46.50,$ 45.90,,,,
336,97,7,Same Polarity Substitution (named ent.),yes,"[4, 5]","[5, 6]",$ 2.88,$ 2.28,,,,
396,111,7,Same Polarity Substitution (named ent.),yes,[19],"[18, 19, 20]",Kazemi,the 54-year-old photojournalist,,,,
413,113,7,Same Polarity Substitution (named ent.),yes,[16],[16],8,10,,,,
855,220,7,Same Polarity Substitution (named ent.),yes,[13],"[13, 14, 15]",R300-million,$ 43.63 million,,,,
999,261,7,Same Polarity Substitution (named ent.),yes,"[6, 8]",[7],some 65,64.7,,,,
1180,302,7,Same Polarity Substitution (named ent.),yes,[18],[16],114.3,114,,,,
1318,336,7,Same Polarity Substitution (named ent.),yes,"[22, 23, 24]","[14, 15]","more than 2,100","2,100 locations",,,,
