# Imports

In [1]:
import copy
import pandas as pd
import numpy as np

# Reading the ETPC

This is the ETPC dataset compiled by Wahle and posted on HuggingFace

In [2]:
# Unpickle etpc_raw
etpc = pd.read_pickle('datasets/etpc_raw.pkl')

These are the XML files from the ETPC github repo.

The first one contains all pairs marked as paraphrases by the MRPC:

In [3]:
textual_paraphrases = pd.read_xml('datasets/etpc/textual_paraphrases.xml')
# Convert scopes from strings to lists of ints
textual_paraphrases['s1_scope'] = textual_paraphrases['s1_scope'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)
textual_paraphrases['s2_scope'] = textual_paraphrases['s2_scope'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)

The second one contains the text and pair ids for *all* sentence pairs (paraphrases or not). It doesn't contain any data on whether they're paraphrases or not, or what EPT types are in them.

In [4]:
pairs = pd.read_xml('datasets/etpc/text_pairs.xml')
pairs.drop(columns=['negation'], inplace=True)
pairs.set_index('pair_id', inplace=True)

# Cleanup

## Cleaning up Columns

In [5]:
etpc.rename(columns={'paraphrase_type_ids': 'ept_ids', 'paraphrase_types': 'ept_names'}, inplace=True)
etpc.drop(columns={'negation'}, axis=1, inplace=True)
etpc

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_segment_location,sentence2_segment_location,sentence1_segment_location_indices,sentence2_segment_location_indices,sentence1_segment_text,sentence2_segment_text
0,1_0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]","[26, 26, 26, 26, 0, 5, 0, 6, 25, 25, 25, 25, 2...","[6, 5, 5, 0, 25, 0, 0, 0, 0, 0, 26, 26, 26, 26...","[[5], [7], [0, 1, 2, 3], [8, 9, 10, 11, 12, 13...","[[1, 2], [0], [10, 11, 12, 13], [4]]","[whom, called, Amrozi accused his brother, `` ...","[to him, Referring, Amrozi accused his brother..."
1,2_1,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...,"[Yucaipa, owned, Dominick, 's, before, selling...","[Yucaipa, bought, Dominick, 's, in, 1995, for,...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],[],[],[]
2,3_2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]","[6, 0, 0, 0, 0, 0, 0, 0, 26, 26, 26, 0, 0, 0, ...","[26, 26, 26, 26, 6, 6, 6, 6, 25, 25, 25, 25, 2...","[[0], [14], [8, 9, 10], [17, 18, 19]]","[[4, 5, 6, 7], [18], [0, 1, 2, 3], [8, 9, 10, ...","[They, cargo, on June 10, , he added, had publ...","[the ship 's owners, explosives, On June 10 ,,..."
3,4_3,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ...","[Around, 0335, GMT, ,, Tab, shares, were, up, ...","[Tab, shares, jumped, 20, cents, ,, or, 4.6, %...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],[],[],[]
4,5_4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...",0,1,"[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 6, 5, 29, 30, 0, 29, 25, 30, 29, 0, 6, 6, ...","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3...","[[0, 1], [2], [11, 12, 14], [13], [13], [7], [...","[[0, 1, 2, 3, 4], [5], [11], [20, 21], [20, 21...","[The stock, rose, to close at, Friday, Friday,...","[PG & E Corp. shares, jumped, to, on Friday, o..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5796,5797_5796,"After Hughes refused to rehire Hernandez, he c...",Hernandez filed an Equal Employment Opportunit...,"[After, Hughes, refused, to, rehire, Hernandez...","[Hernandez, filed, an, Equal, Employment, Oppo...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[],[],[],[]
5797,5798_5797,There are 103 Democrats in the Assembly and 47...,Democrats dominate the Assembly while Republic...,"[There, are, 103, Democrats, in, the, Assembly...","[Democrats, dominate, the, Assembly, while, Re...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[],[],[],[]
5798,5799_5798,Bethany Hamilton remained in stable condition ...,"Bethany, who remained in stable condition afte...","[Bethany, Hamilton, remained, in, stable, cond...","[Bethany, ,, who, remained, in, stable, condit...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],[],[],[]
5799,5800_5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[26, 26, 0, 7, 7, 6, 1, 25, 25, 4, 25, 25, 25,...","[25, 25, 25, 25, 7, 0, 6, 1, 0, 0, 4, 25, 0, 2...","[[9], [3, 4], [5], [6], [0, 1], [24], [7, 8, 1...","[[10], [4], [6], [7], [13, 14], [0, 1, 2, 3], ...","[Corp, power station’s, US, owners, Last week,...","[Corp., Drax, American, owner, last week, The ..."


# Remapping paraphrase IDs


First, make a list with paraphrase types and IDs from the ETPC:

In [6]:
id_map = pd.read_xml('https://raw.githubusercontent.com/venelink/ETPC/master/Corpus/paraphrase_types.xml')
# Rename columns for clarity
id_map.rename(columns={'type_id': 'ept_id', 'type_name': 'ept_name'}, inplace=True)
# Drop unused data
id_map = id_map[['ept_id', 'ept_name']] # No use for type_category column
id_map.drop(id_map.tail(2).index,inplace=True) # Types don't appear in ETPC
id_map.style.hide(axis="index")
id_map

Unnamed: 0,ept_id,ept_name
0,1,Inflectional Changes
1,2,Modal Verb Changes
2,3,Derivational Changes
3,4,Spelling changes
4,5,Same Polarity Substitution (habitual)
5,6,Same Polarity Substitution (contextual)
6,7,Same Polarity Substitution (named ent.)
7,8,Change of format
8,9,Opposite polarity substitution (habitual)
9,10,Opposite polarity substitution (contextual)


Now, make a list with paraphrase names and IDs for ParaOp types

In [7]:
data = [[0, 'No change'],
        [1, 'Addition/Deletion - Function Word'],
        [2, 'Addition/Deletion - Content Word'],
        [3, 'Change of Order'],
        [4, 'Substitution - Synonym'],
        [5, 'Substitution - Contextual Synonym'],
        [6, 'Substitution - Morphological'],
        [7, 'Substitution - Spelling and Format']
       ]
paraop_map = pd.DataFrame(data, columns = ['paraop_id', 'paraop_name'])
paraop_map.set_index('paraop_id', inplace=True)
paraop_map

Unnamed: 0_level_0,paraop_name
paraop_id,Unnamed: 1_level_1
0,No change
1,Addition/Deletion - Function Word
2,Addition/Deletion - Content Word
3,Change of Order
4,Substitution - Synonym
5,Substitution - Contextual Synonym
6,Substitution - Morphological
7,Substitution - Spelling and Format


## Mapping

We'll use the dataframe below for mapping. Each row will contain the name and ID of a paraphrase type in the ETPC, and the name and ID of the correspondent ParaOp type.

In [8]:
id_map['paraop_id'] = ''
id_map['paraop_name'] = ''
id_map

Unnamed: 0,ept_id,ept_name,paraop_id,paraop_name
0,1,Inflectional Changes,,
1,2,Modal Verb Changes,,
2,3,Derivational Changes,,
3,4,Spelling changes,,
4,5,Same Polarity Substitution (habitual),,
5,6,Same Polarity Substitution (contextual),,
6,7,Same Polarity Substitution (named ent.),,
7,8,Change of format,,
8,9,Opposite polarity substitution (habitual),,
9,10,Opposite polarity substitution (contextual),,


Here's where we do the mapping:

In [9]:
# Helper function to map an ETPC id to a Paraop id
def map_id(ept_id, paraop_id):
    """Given an EPT id and a Paraop id, look up the name of the Paraop id and 
    fill in the rows of id_map with paraop_id and the name."""
    id_map.loc[id_map['ept_id'] == ept_id, 'paraop_id'] = paraop_id
    id_map.loc[id_map['ept_id'] == ept_id, 'paraop_name'] = paraop_map.loc[paraop_id, 'paraop_name']

In [10]:
map_id(ept_id=1, paraop_id=6)
map_id(ept_id=3, paraop_id=6)
map_id(ept_id=26, paraop_id=3)
map_id(ept_id=29, paraop_id=0)
map_id(4, 7)
map_id(5, 4)
map_id(6, 5)
map_id(8, 7)
map_id(9, 4)
id_map

Unnamed: 0,ept_id,ept_name,paraop_id,paraop_name
0,1,Inflectional Changes,6.0,Substitution - Morphological
1,2,Modal Verb Changes,,
2,3,Derivational Changes,6.0,Substitution - Morphological
3,4,Spelling changes,7.0,Substitution - Spelling and Format
4,5,Same Polarity Substitution (habitual),4.0,Substitution - Synonym
5,6,Same Polarity Substitution (contextual),5.0,Substitution - Contextual Synonym
6,7,Same Polarity Substitution (named ent.),,
7,8,Change of format,7.0,Substitution - Spelling and Format
8,9,Opposite polarity substitution (habitual),4.0,Substitution - Synonym
9,10,Opposite polarity substitution (contextual),,


TODO: Figure out a way to hide index of map_id throughout whole notebook. For some reason this seems harder than it needs to be...

Helper function to convert an ETPC ID to a Paraop ID

In [11]:
# Helper function to get a Paraop id from an ETPC id
def ept_to_paraop(ept_id):
    return id_map.loc[id_map['ept_id'] == ept_id, 'paraop_id'].iloc[0]

ept_to_paraop(3)

6

# Reannotation

## Creating positives dataframe

In [12]:
positives = etpc.loc[etpc['mrpc_label'] == 1]
positives.rename(columns={'sentence1_segment_location': 'sentence1_scope_etpc', 
                          'sentence2_segment_location': 'sentence2_scope_etpc'}, inplace=True)
positives.drop(columns=['sentence1_segment_location_indices', 'sentence2_segment_location_indices'],inplace=True)
positives['idx'] = positives.index.to_series()
positives

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives.rename(columns={'sentence1_segment_location': 'sentence1_scope_etpc',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives.drop(columns=['sentence1_segment_location_indices', 'sentence2_segment_location_indices'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['idx'] = positives.index.to_series()


Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_scope_etpc,sentence2_scope_etpc,sentence1_segment_text,sentence2_segment_text
0,0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]","[26, 26, 26, 26, 0, 5, 0, 6, 25, 25, 25, 25, 2...","[6, 5, 5, 0, 25, 0, 0, 0, 0, 0, 26, 26, 26, 26...","[whom, called, Amrozi accused his brother, `` ...","[to him, Referring, Amrozi accused his brother..."
2,2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]","[6, 0, 0, 0, 0, 0, 0, 0, 26, 26, 26, 0, 0, 0, ...","[26, 26, 26, 26, 6, 6, 6, 6, 25, 25, 25, 25, 2...","[They, cargo, on June 10, , he added, had publ...","[the ship 's owners, explosives, On June 10 ,,..."
4,4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...",0,1,"[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 6, 5, 29, 30, 0, 29, 25, 30, 29, 0, 6, 6, ...","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3...","[The stock, rose, to close at, Friday, Friday,...","[PG & E Corp. shares, jumped, to, on Friday, o..."
5,5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...",1,1,"[Synthetic/analytic substitution, Addition/Del...","[11, 25, 29]","[25, 11, 11, 11, 11, 11, 11, 11, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 11, 11...","[in the first quarter of the year, Revenue dro...","[the first quarter of the year, With the scand..."
7,7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 5, 25, 25, 29]","[25, 4, 25, 25, 25, 25, 5, 25, 25, 25]","[25, 4, 4, 25, 25, 25, 25, 25, 5, 25, 25, 25]","[DVD-CCA, state, then, The appealed to the Sup...","[DVD CCA, U.S., that decision, The appealed to..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5792,5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...",1,1,"[Addition/Deletion, Identity, Punctuation chan...","[25, 29, 21, 6, 11, 14, 26, 25]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[29, 29, 29, 29, 29, 29, 29, 26, 26, 26, 26, 2...","[authorities said, Gehring waived extradition ...",[Gehring waived extradition Monday during a he...
5793,5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...",1,1,"[Same Polarity Substitution (contextual), Chan...","[6, 26, 25, 29, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[Silver, Silver, `` I am advised that certain ...","[the Silver statement, the Silver statement, ,..."
5795,5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...",0,1,"[Same Polarity Substitution (habitual), Same P...","[5, 5, 18, 29, 30]","[29, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[29, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[deal, be completed, The deal , approved by bo...","[acquisition, close, The acquisition has been ..."
5799,5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[26, 26, 0, 7, 7, 6, 1, 25, 25, 4, 25, 25, 25,...","[25, 25, 25, 25, 7, 0, 6, 1, 0, 0, 4, 25, 0, 2...","[Corp, power station’s, US, owners, Last week,...","[Corp., Drax, American, owner, last week, The ..."


## Why we cannot use the ETPC from Wahle et al.

Here's a fundamental part of the ETPC that I hadn't realized until now: each token in a sentence can have *more than one* paraphrase type. Here's an example--note how, in sentence 2, token 5 appears in the scopes both of inflectional and derivational changes.

In [13]:
ric = textual_paraphrases.loc[(textual_paraphrases['pair_id'] == 4205+1) & (textual_paraphrases['type_id'].isin([3,1]))]
ric[:2]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
15963,4206,1,Inflectional Changes,yes,[3],"[3, 5]",completed,had inspected,,,,
15964,4206,3,Derivational Changes,yes,[4],[5],inspections,inspected,,,,


It seems that this issue also wasn't noticed by Wahle et al: some paraphrase scopes consist of only a single number repeated for the entirety of the list:

In [14]:
positives[positives['sentence1_scope_etpc'].apply(lambda x: (len(np.unique(x)) == 1))][:10]

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_scope_etpc,sentence2_scope_etpc,sentence1_segment_text,sentence2_segment_text
14,14,He told The Sun newspaper that Mr. Hussein's d...,"""Saddam's daughters had British schools and ho...","[He, told, The, Sun, newspaper, that, Mr., Hus...","[``, Saddam, 's, daughters, had, British, scho...",1,1,"[Same Polarity Substitution (named ent.), Same...","[7, 6, 7, 26, 25, 29, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[0, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26...","[Hussein, The Sun newspaper, Mr. Hussein, Mr. ...","[Saddam, The Sun, Saddam, Saddam 's daughters ..."
22,22,But tropical storm warnings and watches were p...,Tropical storm warnings were in place Thursday...,"[But, tropical, storm, warnings, and, watches,...","[Tropical, storm, warnings, were, in, place, T...",0,1,"[Addition/Deletion, Addition/Deletion, Identit...","[25, 25, 29, 30, 4, 6, 11, 17]","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[But, tropical storm warnings watches , the so...","[Jamaica and, storm warnings watches , the sou..."
35,35,Trading in Loral was halted yesterday; the sha...,The New York Stock Exchange suspended trading ...,"[Trading, in, Loral, was, halted, yesterday, ;...","[The, New, York, Stock, Exchange, suspended, t...",0,1,"[Same Polarity Substitution (habitual), Diathe...","[5, 14, 18, 29, 30, 21]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[halted, Trading in Loral was halted, Trading ...","[suspended, The New York Stock Exchange suspen..."
40,40,Last year the court upheld Cleveland's school ...,"Last year, the court ruled 5-4 in an Ohio case...","[Last, year, the, court, upheld, Cleveland, 's...","[Last, year, ,, the, court, ruled, 5-4, in, an...",1,1,"[Same Polarity Substitution (contextual), Infl...","[6, 1, 25, 25, 29, 28, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[29, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29, 25, 29...","[provide, choice, Last year that vouchers are ...","[provide with, choices, government, among a ra..."
63,63,Contrary to what PeopleSoft management would h...,Ellison said that contrary to the contentions ...,"[Contrary, to, what, PeopleSoft, management, w...","[Ellison, said, that, contrary, to, the, conte...",1,1,"[Addition/Deletion, Identity, Semantic based, ...","[25, 29, 28, 21]","[28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 2...","[25, 25, 0, 29, 29, 28, 28, 28, 28, 28, 29, 29...","[Contrary to , Oracle intends to fully support...","[Ellison said, contrary to , Oracle intends to..."
72,72,Also demonstrating box-office strength _ and g...,Also demonstrating box-office strength -- and ...,"[Also, demonstrating, box-office, strength, _,...","[Also, demonstrating, box-office, strength, --...",1,1,"[Spelling changes, Spelling changes, Identity,...","[4, 4, 29, 21, 21]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[O'Neill 's, Day 's, Also demonstrating box-of...","[ONeills, Days, Also demonstrating box-office ..."
86,86,Sales - a figure watched closely as a baromete...,It also disclosed that sales -- a figure close...,"[Sales, -, a, figure, watched, closely, as, a,...","[It, also, disclosed, that, sales, --, a, figu...",1,1,"[Same Polarity Substitution (habitual), Synthe...","[5, 11, 26, 25, 25, 25, 25, 29, 28, 21]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[25, 25, 25, 0, 25, 0, 25, 25, 26, 25, 25, 25,...","[rose, many industry experts, closely, 5 perce...","[were higher, industry experts, closely, by an..."
111,111,The suite comes complete with a word processor...,"The suite includes a word processor, spreadshe...","[The, suite, comes, complete, with, a, word, p...","[The, suite, includes, a, word, processor, ,, ...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 5, 6, 5, 11, 18, 25, 25, 29, 21, 21]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[comes complete with, software, utilizing, an,...","[includes, application, built around, the, XML..."
124,124,"Powell fired back: ""He's accusing the presiden...","If so, Powell said, he's calling the president...","[Powell, fired, back, :, ``, He, 's, accusing,...","[If, so, ,, Powell, said, ,, he, 's, calling, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 25, 25, 25, 29, 21]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[accusing, he, , he said, Powell fired back :,...","[calling, Powell, , Powell said ,, , too, If s..."
126,126,The memo on protecting sales of Windows and ot...,"The memo specifically mentioned Linux, a still...","[The, memo, on, protecting, sales, of, Windows...","[The, memo, specifically, mentioned, Linux, ,,...",1,1,"[Addition/Deletion, Addition/Deletion, Identity]","[25, 25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...",[on protecting sales of Windows and other desk...,"[specifically, The memo mentioned Linux , a st..."


The issue also exists in part in the original ETPC: some paraphrase types have scopes annotated as pretty much the entire sentence. This seems especially prevalent among 'Punctuation changes'.

TODO: rewrite this, show examples 

While this is certainly an issue for the original ETPC, it's at least partly offset there since their annotation scheme has separate scopes for each paraphrase type. So even if the annotated scope of some given type isn't very informative, the entire sentence isn't lost: you'd still have other paraphrase types, which are most likely annotated correctly. But Wahle's dataset (and consequently his training pipeline) doesn't account for this. Whatever process Wahle et al. used for generating that dataset on Huggingface seems to have an especially hard time with sentences in the original ETPC as exemplified above, but the issue happens throughout *all* their dataset.

## Getting paraphrases from the original ETPC

Let's first clean up the dataset

In [15]:
positives.drop(columns=['idx', 'etpc_label', 'mrpc_label', 
                                       'sentence1_scope_etpc', 
                                       'sentence2_scope_etpc', 
                                       'sentence1_segment_text', 
                                       'sentence2_segment_text'])

Unnamed: 0,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,ept_names,ept_ids
0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...","[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]"
2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...","[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]"
4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...","[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]"
5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...","[Synthetic/analytic substitution, Addition/Del...","[11, 25, 29]"
7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...","[Spelling changes, Same Polarity Substitution ...","[4, 5, 25, 25, 29]"
...,...,...,...,...,...,...
5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...","[Addition/Deletion, Identity, Punctuation chan...","[25, 29, 21, 6, 11, 14, 26, 25]"
5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...","[Same Polarity Substitution (contextual), Chan...","[6, 26, 25, 29, 21]"
5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...","[Same Polarity Substitution (habitual), Same P...","[5, 5, 18, 29, 30]"
5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...","[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]"


We'll need a column to house the new scopes. Let's initialize that column with empty arrays for each token in the sentence. That way, we can easily tell which tokens haven't been annotated yet.

In [16]:
#TODO: get rid of SettingWithCopyWarning
positives['sentence1_scope'] = positives['sentence1_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))
positives['sentence2_scope'] = positives['sentence2_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))
positives['sentence1_scope'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['sentence1_scope'] = positives['sentence1_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['sentence2_scope'] = positives['sentence2_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))


array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', ''], dtype='<U64')

### Populating types

Helper function to populate type

In [17]:
# TODO: Convert to df apply (rather than series apply on idx)
# TODO: Figure out if 64 char limit will be an issue

def populate_type(idx, ept_id, lookup_df=textual_paraphrases):
    """Given a paraphrase pair (idx) and an EPT paraphrase type (ept_id), convert the EPT type to Paraop, look up the 
    scopes for both sentences in the pair, and fill in the scopes with the Paraop type. Returns a pair of arrays with
    the newly annotated scopes."""
    paraop_id = ept_to_paraop(ept_id)

    # Copy array to avoid messing up the originals
    array1 = np.copy(positives['sentence1_scope'][idx])
    array2 = np.copy(positives['sentence2_scope'][idx])
    
    # Create a subset of the lookup array containing only the paraphrase types
    # we are interested in (ept_id)
    subset = lookup_df[(lookup_df['pair_id'] == idx+1) & (lookup_df['type_id'] == int(ept_id))]
    subset.reset_index(drop=True, inplace=True)
    instances = len(subset['type_id'].values) # Count how many discrete instances of that type are there in this pair

    def fill(sentence_n, instance, array, scope):
        """Helper function for filling in ids"""
        # Identify which indices in the array have not been filled yet
        empty = np.where(array == '')[0]
        nonempty = np.where(array != '')[0]

        # Fill in empty entries
        if len(scope) > 0:
            empty_intersect = np.intersect1d(scope, empty)
            array[empty_intersect] = f'{paraop_id}_{instance}'
            
        # Append to non-empty entries
        nonempty_intersect = np.intersect1d(scope, nonempty)
        if len(nonempty_intersect) > 0:
            # TODO: Log this in a better way (save to a file instead of just printing)
            print(f'Double check type overwriting: row {idx}, sentence {sentence_n}')
            print(f'Common indices: {list(nonempty_intersect)}')
            array[nonempty_intersect] = np.char.add(array[nonempty_intersect], f' & {paraop_id}_{instance}')

    # Filling in
    for i in range(instances):
        # Get scopes from lookup df
        s1_scope = np.array(subset.loc[subset['type_id'] == ept_id, 's1_scope'].iloc[i])
        s2_scope = np.array(subset.loc[subset['type_id'] == ept_id, 's2_scope'].iloc[i])
        
        fill(1, i, array1, s1_scope)
        fill(2, i, array2, s2_scope)
    
    return array1, array2

Here's a demo of how the outputs to that function look like:

In [18]:
populate_type(0, 26)

(array(['3_0', '3_0', '3_0', '3_0', '', '', '', '', '', '', '', '', '', '',
        '', '', '', '', ''], dtype='<U64'),
 array(['', '', '', '', '', '', '', '', '', '', '3_0', '3_0', '3_0', '3_0',
        '', '', '', '', '', ''], dtype='<U64'))

In [19]:
testie = populate_type(0, 26)[0]
#testie
mask = testie == '3_0'
np.where(mask)

(array([0, 1, 2, 3]),)

`populate_type` returns new arrays, it doesn't modify the original df. Use the function below to actually modify the df

In [20]:
def substitute(ept_id, lookup_df=textual_paraphrases):
    series = positives['idx'].apply(populate_type, ept_id=ept_id, lookup_df=lookup_df)
    cols = pd.DataFrame(series.tolist(), columns=['sentence1', 'sentence2'])
    positives.loc[:, 'sentence1_scope'] = cols['sentence1'].values
    positives.loc[:, 'sentence2_scope'] = cols['sentence2'].values

### Performing the reannotation

#### Change of order

In [21]:
substitute(26)

Double check type overwriting: row 196, sentence 1
Common indices: [2]
Double check type overwriting: row 196, sentence 2
Common indices: [9]
Double check type overwriting: row 411, sentence 1
Common indices: [0, 1, 2, 3, 4]
Double check type overwriting: row 411, sentence 2
Common indices: [12, 13, 14, 15]
Double check type overwriting: row 1014, sentence 1
Common indices: [5]
Double check type overwriting: row 1014, sentence 2
Common indices: [10, 11]
Double check type overwriting: row 1543, sentence 1
Common indices: [19]
Double check type overwriting: row 1543, sentence 2
Common indices: [5]
Double check type overwriting: row 1864, sentence 1
Common indices: [5, 6]
Double check type overwriting: row 1864, sentence 2
Common indices: [11, 12]
Double check type overwriting: row 2309, sentence 1
Common indices: [0, 1, 2, 3, 4, 5]
Double check type overwriting: row 2309, sentence 2
Common indices: [10, 11, 12, 13, 14]
Double check type overwriting: row 2336, sentence 1
Common indices: [

In [23]:
positives.loc[positives['idx'] == 196, 'sentence1_scope'].iloc[0]

array(['3_1', '3_1', '3_0 & 3_1', '3_1', '3_1', '3_1', '3_1', '3_1',
       '3_1', '3_1', '3_1', '3_1', '', '', ''], dtype='<U64')

In [24]:
subset = textual_paraphrases[(textual_paraphrases['pair_id'] == 196+1) & (textual_paraphrases['type_id'] == int(26))]
subset

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
741,197,26,Change of order,yes,[2],[9],just,just,,,,
742,197,26,Change of order,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]","[2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 15]",`` I just got carried away and started making ...,he got carried away and just `` started making...,,,,


#### Same Polarity Substitution (Habitual)

In [None]:
substitute(5)

#### Same Polarity Substitution (Contextual)

In [None]:
substitute(6)

#### Derivational Changes

In [None]:
substitute(3)

#### Inflectional Changes

In [None]:
substitute(1)

#### Spelling Changes

In [None]:
substitute(4)

#### Change of format

In [None]:
substitute(8)

#### Opposite Polarity Substitution (Habitual)

In [None]:
substitute(9)

#### Modal Verb Changes

In [None]:
# TODO: Check overlapped words between (e.g.) derivational & inflectional changes
# The way this works right now, you'd have something like ['6_0 & 6_0'] for those
# Make sure this doesn't happen. Probably do a function that does a pass on the
# array of strings later and removes any duplicates

In [None]:
def trim_duplicates(s1_scope, s2_scope, s1_text, s2_text):
    s1_newtext = s1_text.split()
    s2_newtext = s2_text.split()
    
    in1 = np.where(np.in1d(s1_newtext, s2_newtext))[0]
    in2 = np.where(np.in1d(s2_newtext, s1_newtext))[0]

    s1_newscope = np.delete(s1_scope, in1)
    s2_newscope = np.delete(s2_scope, in2)
    s1_newtext = ' '.join(np.delete(s1_newtext, in1))
    s2_newtext = ' '.join(np.delete(s2_newtext, in2))

    return s1_newscope, s2_newscope, s1_newtext, s2_newtext

In [None]:
ric = textual_paraphrases.loc[textual_paraphrases['type_id'].isin([2])]
ric

In [None]:
trimmed = pd.DataFrame(columns = ric.columns, data = copy.deepcopy(ric.values))
trimmed[['s1_scope', 's2_scope', 's1_text', 's2_text']] = trimmed.apply(lambda x: trim_duplicates(x.s1_scope, x.s2_scope, x.s1_text, x.s2_text), axis=1, result_type='expand')
#output = pd.DataFrame(series.tolist(), columns=['s1_scope', 's2_scope', 's1_text', 's2_text'])
#output[:30]
trimmed

In [None]:
substitute(2, trimmed)

### Diagnosing

Run these cells to make sure everything looks OK after reannotating

In [None]:
positives.loc[positives['idx'] == 2953, 'sentence2_scope'].iloc[0]

In [None]:
subset = textual_paraphrases[(textual_paraphrases['pair_id'] == 449+1) & (textual_paraphrases['type_id'] == int(5))]
subset

In [None]:
# For diagnosing
textual_paraphrases.loc[(textual_paraphrases['pair_id'] == 449+1) & (textual_paraphrases['type_id'].isin([1,5,3,26]))]

In [None]:
#TODO: Modal verb needs trimming.
#TODO: So does some other one that I forget rn
#TODO: Numbers on named entity substitution
#TODO: Figure out what's the matter with punctuation changes -- it's the key! Maybe I can use those as indices...

In [None]:
textual_paraphrases[(textual_paraphrases['type_id'] == 21)][:50]

In [None]:
positives['sentence1_scope'][0]

Flagged rows:

2432, 5074, 12186


Sentence modality changes have zero ocurrences among paraphrases

# The garbage pail

In [None]:
auxiliaries = ['are', 'am', 'be', 'been', 'being', 'had', 'has', 'have', 'having', 'is', 'was', 'were']

Code that may or may not be useful will remain here for a while

Change of Order > Identity

Game plan:

Same Polarity Substitution > Derivational Changes > Inflectional Changes > ...Modal Verb Changes? > Change of Order (modified)

## Filtering

Helper methods for filtering the ETPC dataframe based on paraphrase types

In [None]:
def filter_contains(df, search_ids):
  """Returns an ETPC dataframe with rows where paraphrase_types_ids contains
  the search_ids. Use this to search for paraphrase pairs containing specific
  ids"""
  return df[df['ept_ids'].apply(lambda x: np.isin(search_ids, x))]

def filter_equals(df, search_ids):
  """Returns an ETPC dataframe with rows where paraphrase_types_ids EXACTLY 
  MATCHES the search_ids."""
  return df[df['ept_ids'].apply(lambda x: np.array_equal(x, search_ids))]

In [None]:
filter_contains(etpc, '3')

In [None]:
filter_equals(etpc, ['25', '29'])