# Imports

In [1]:
import copy
import pandas as pd
import numpy as np

# Reading the ETPC

This is the ETPC dataset compiled by Wahle and posted on HuggingFace

In [2]:
# Unpickle etpc_raw
etpc = pd.read_pickle('datasets/etpc_raw.pkl')

These are the XML files from the ETPC github repo.

The first one contains all pairs marked as paraphrases by the MRPC:

In [3]:
textual_paraphrases = pd.read_xml('datasets/etpc/textual_paraphrases.xml')
# Convert scopes from strings to lists of ints
textual_paraphrases['s1_scope'] = textual_paraphrases['s1_scope'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)
textual_paraphrases['s2_scope'] = textual_paraphrases['s2_scope'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)

The second one contains the text and pair ids for *all* sentence pairs (paraphrases or not). It doesn't contain any data on whether they're paraphrases or not, or what EPT types are in them.

In [4]:
pairs = pd.read_xml('datasets/etpc/text_pairs.xml')
pairs.drop(columns=['negation'], inplace=True)
pairs.set_index('pair_id', inplace=True)

# Cleanup

## Cleaning up Columns

In [5]:
etpc.rename(columns={'paraphrase_type_ids': 'ept_ids', 'paraphrase_types': 'ept_names'}, inplace=True)
etpc.drop(columns={'negation'}, axis=1, inplace=True)
etpc

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_segment_location,sentence2_segment_location,sentence1_segment_location_indices,sentence2_segment_location_indices,sentence1_segment_text,sentence2_segment_text
0,1_0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]","[26, 26, 26, 26, 0, 5, 0, 6, 25, 25, 25, 25, 2...","[6, 5, 5, 0, 25, 0, 0, 0, 0, 0, 26, 26, 26, 26...","[[5], [7], [0, 1, 2, 3], [8, 9, 10, 11, 12, 13...","[[1, 2], [0], [10, 11, 12, 13], [4]]","[whom, called, Amrozi accused his brother, `` ...","[to him, Referring, Amrozi accused his brother..."
1,2_1,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...,"[Yucaipa, owned, Dominick, 's, before, selling...","[Yucaipa, bought, Dominick, 's, in, 1995, for,...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],[],[],[]
2,3_2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]","[6, 0, 0, 0, 0, 0, 0, 0, 26, 26, 26, 0, 0, 0, ...","[26, 26, 26, 26, 6, 6, 6, 6, 25, 25, 25, 25, 2...","[[0], [14], [8, 9, 10], [17, 18, 19]]","[[4, 5, 6, 7], [18], [0, 1, 2, 3], [8, 9, 10, ...","[They, cargo, on June 10, , he added, had publ...","[the ship 's owners, explosives, On June 10 ,,..."
3,4_3,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ...","[Around, 0335, GMT, ,, Tab, shares, were, up, ...","[Tab, shares, jumped, 20, cents, ,, or, 4.6, %...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],[],[],[]
4,5_4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...",0,1,"[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 6, 5, 29, 30, 0, 29, 25, 30, 29, 0, 6, 6, ...","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3...","[[0, 1], [2], [11, 12, 14], [13], [13], [7], [...","[[0, 1, 2, 3, 4], [5], [11], [20, 21], [20, 21...","[The stock, rose, to close at, Friday, Friday,...","[PG & E Corp. shares, jumped, to, on Friday, o..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5796,5797_5796,"After Hughes refused to rehire Hernandez, he c...",Hernandez filed an Equal Employment Opportunit...,"[After, Hughes, refused, to, rehire, Hernandez...","[Hernandez, filed, an, Equal, Employment, Oppo...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[],[],[],[]
5797,5798_5797,There are 103 Democrats in the Assembly and 47...,Democrats dominate the Assembly while Republic...,"[There, are, 103, Democrats, in, the, Assembly...","[Democrats, dominate, the, Assembly, while, Re...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[],[],[],[]
5798,5799_5798,Bethany Hamilton remained in stable condition ...,"Bethany, who remained in stable condition afte...","[Bethany, Hamilton, remained, in, stable, cond...","[Bethany, ,, who, remained, in, stable, condit...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],[],[],[]
5799,5800_5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[26, 26, 0, 7, 7, 6, 1, 25, 25, 4, 25, 25, 25,...","[25, 25, 25, 25, 7, 0, 6, 1, 0, 0, 4, 25, 0, 2...","[[9], [3, 4], [5], [6], [0, 1], [24], [7, 8, 1...","[[10], [4], [6], [7], [13, 14], [0, 1, 2, 3], ...","[Corp, power station’s, US, owners, Last week,...","[Corp., Drax, American, owner, last week, The ..."


## Remapping paraphrase IDs


First, make a list with paraphrase types and IDs from the ETPC:

In [6]:
id_map = pd.read_xml('https://raw.githubusercontent.com/venelink/ETPC/master/Corpus/paraphrase_types.xml')
# Rename columns for clarity
id_map.rename(columns={'type_id': 'ept_id', 'type_name': 'ept_name'}, inplace=True)
# Drop unused data
id_map = id_map[['ept_id', 'ept_name']] # No use for type_category column
id_map.drop(id_map.tail(2).index,inplace=True) # Types don't appear in ETPC
id_map.style.hide(axis="index")
id_map

Unnamed: 0,ept_id,ept_name
0,1,Inflectional Changes
1,2,Modal Verb Changes
2,3,Derivational Changes
3,4,Spelling changes
4,5,Same Polarity Substitution (habitual)
5,6,Same Polarity Substitution (contextual)
6,7,Same Polarity Substitution (named ent.)
7,8,Change of format
8,9,Opposite polarity substitution (habitual)
9,10,Opposite polarity substitution (contextual)


Now, make a list with paraphrase names and IDs for ParaOp types

In [7]:
data = [[0, 'No change'],
        [1, 'Addition/Deletion - Function Word'],
        [2, 'Addition/Deletion - Content Word'],
        [3, 'Change of Order'],
        [4, 'Substitution - Synonym'],
        [5, 'Substitution - Contextual Synonym'],
        [6, 'Substitution - Morphological'],
        [7, 'Substitution - Spelling and Format']
       ]
paraop_map = pd.DataFrame(data, columns = ['paraop_id', 'paraop_name'])
paraop_map.set_index('paraop_id', inplace=True)
paraop_map

Unnamed: 0_level_0,paraop_name
paraop_id,Unnamed: 1_level_1
0,No change
1,Addition/Deletion - Function Word
2,Addition/Deletion - Content Word
3,Change of Order
4,Substitution - Synonym
5,Substitution - Contextual Synonym
6,Substitution - Morphological
7,Substitution - Spelling and Format


### Mapping

We'll use the dataframe below for mapping. Each row will contain the name and ID of a paraphrase type in the ETPC, and the name and ID of the correspondent ParaOp type.

In [8]:
id_map['paraop_id'] = ''
id_map['paraop_name'] = ''
id_map

Unnamed: 0,ept_id,ept_name,paraop_id,paraop_name
0,1,Inflectional Changes,,
1,2,Modal Verb Changes,,
2,3,Derivational Changes,,
3,4,Spelling changes,,
4,5,Same Polarity Substitution (habitual),,
5,6,Same Polarity Substitution (contextual),,
6,7,Same Polarity Substitution (named ent.),,
7,8,Change of format,,
8,9,Opposite polarity substitution (habitual),,
9,10,Opposite polarity substitution (contextual),,


Here's where we do the mapping:

In [9]:
# Helper function to map an ETPC id to a Paraop id
def map_id(ept_id, paraop_id):
    """Given an EPT id and a Paraop id, look up the name of the Paraop id and 
    fill in the rows of id_map with paraop_id and the name."""
    id_map.loc[id_map['ept_id'] == ept_id, 'paraop_id'] = paraop_id
    id_map.loc[id_map['ept_id'] == ept_id, 'paraop_name'] = paraop_map.loc[paraop_id, 'paraop_name']

In [10]:
map_id(ept_id=1, paraop_id=6)
map_id(ept_id=3, paraop_id=6)
map_id(ept_id=26, paraop_id=3)
map_id(ept_id=29, paraop_id=0)
map_id(4, 7)
map_id(5, 4)
map_id(6, 5)
id_map

Unnamed: 0,ept_id,ept_name,paraop_id,paraop_name
0,1,Inflectional Changes,6.0,Substitution - Morphological
1,2,Modal Verb Changes,,
2,3,Derivational Changes,6.0,Substitution - Morphological
3,4,Spelling changes,7.0,Substitution - Spelling and Format
4,5,Same Polarity Substitution (habitual),4.0,Substitution - Synonym
5,6,Same Polarity Substitution (contextual),5.0,Substitution - Contextual Synonym
6,7,Same Polarity Substitution (named ent.),,
7,8,Change of format,,
8,9,Opposite polarity substitution (habitual),,
9,10,Opposite polarity substitution (contextual),,


TODO: Figure out a way to hide index of map_id throughout whole notebook. For some reason this seems harder than it needs to be...

# Filtering

Helper methods for filtering the ETPC dataframe based on paraphrase types

In [11]:
# Helper function to get a Paraop id from an ETPC id
def ept_to_paraop(ept_id):
    return id_map.loc[id_map['ept_id'] == ept_id, 'paraop_id'].iloc[0]

ept_to_paraop(3)

6

In [12]:
def filter_contains(df, search_ids):
  """Returns an ETPC dataframe with rows where paraphrase_types_ids contains
  the search_ids. Use this to search for paraphrase pairs containing specific
  ids"""
  return df[df['ept_ids'].apply(lambda x: np.isin(search_ids, x))]

def filter_equals(df, search_ids):
  """Returns an ETPC dataframe with rows where paraphrase_types_ids EXACTLY 
  MATCHES the search_ids."""
  return df[df['ept_ids'].apply(lambda x: np.array_equal(x, search_ids))]

In [13]:
filter_contains(etpc, '3')

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_segment_location,sentence2_segment_location,sentence1_segment_location_indices,sentence2_segment_location_indices,sentence1_segment_text,sentence2_segment_text
142,143_142,Tyco later said the loan had not been forgiven...,"Tyco has said the loan was not forgiven, but t...","[Tyco, later, said, the, loan, had, not, been,...","[Tyco, has, said, the, loan, was, not, forgive...",1,1,"[Same Polarity Substitution (contextual), Infl...","[6, 1, 1, 3, 24, 25, 29, 21]","[0, 25, 1, 29, 29, 1, 29, 1, 1, 29, 6, 29, 29,...","[25, 1, 1, 25, 25, 1, 25, 1, 25, 6, 24, 29, 29...","[[10], [2], [5, 7, 8], [15], [11, 12, 13, 14, ...","[[9], [1, 2], [5, 7], [12], [10, 11, 12, 13, 1...","[and, said, had been forgiven, full, Swartz re...","[but, has said, was forgiven, fully, that Swar..."
149,150_149,She estimated it would take three months and w...,She said it would take an estimated three mont...,"[She, estimated, it, would, take, three, month...","[She, said, it, would, take, an, estimated, th...",1,1,"[Synthetic/analytic substitution, Derivational...","[11, 3, 16, 25, 25, 29]","[25, 3, 25, 16, 25, 25, 25, 25, 16, 25, 11, 25...","[0, 0, 0, 16, 16, 0, 3, 0, 0, 25, 25, 0, 16, 1...","[[10], [1], [3, 4, 8, 9], [0, 2, 4, 5, 6, 7, 9...","[[13, 14], [6], [3, 4, 12], [9, 10]]","[cancellation, estimated, would take would req...","[the cancellation, estimated, would take requi..."
238,239_238,Saddam loyalists have been blamed for sabotagi...,Hussein loyalists have been blamed for sabotag...,"[Saddam, loyalists, have, been, blamed, for, s...","[Hussein, loyalists, have, been, blamed, for, ...",1,1,"[Same Polarity Substitution (named ent.), Deri...","[7, 3, 8, 25, 29, 21]","[7, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 0,...","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[[0], [16], [18], [12, 13, 14], [1, 2, 3, 4, 5...","[[0], [12], [13], [1, 2, 3, 4, 5, 6, 7, 8, 9, ...","[Saddam, attacks, U.S., as well as, loyalists ...","[Hussein, attacking, US, loyalists have been b..."
254,255_254,"""It's amazing to be part of an industry that r...","""It's amazing to be part of an industry that r...","[``, It, 's, amazing, to, be, part, of, an, in...","[``, It, 's, amazing, to, be, part, of, an, in...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 5, 3, 29]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[[24], [22, 23], [23], [0, 1, 2, 3, 4, 5, 6, 7...","[[22], [20, 21, 27, 28], [21], [0, 1, 2, 3, 4,...","[of, recent graduate, graduate, `` It 's amazi...","[from, only graduated last May, graduated, `` ..."
286,287_286,The search was concentrated in northeast Penns...,The search was concentrated in northeastern Pe...,"[The, search, was, concentrated, in, northeast...","[The, search, was, concentrated, in, northeast...",1,1,"[Derivational Changes, Addition/Deletion, Iden...","[3, 25, 29, 28]","[29, 29, 29, 29, 29, 3, 29, 29, 29, 29, 29, 29...","[25, 25, 25, 25, 25, 3, 25, 25, 25, 25, 25, 25...","[[5], [23, 24], [0, 1, 2, 3, 4, 6, 7, 8, 9, 10...","[[5], [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, ...","[northeast, by now, The search was concentrate...","[northeastern, The search was concentrated in ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5621,5622_5621,Palm Wednesday announced plans to acquire Hand...,Palm said on Wednesday it plans to buy Handspr...,"[Palm, Wednesday, announced, plans, to, acquir...","[Palm, said, on, Wednesday, it, plans, to, buy...",0,1,"[Same Polarity Substitution (habitual), Same P...","[5, 5, 6, 11, 3, 26, 29, 30]","[29, 26, 5, 3, 29, 5, 29, 29, 29, 29, 6, 29, 2...","[29, 5, 26, 26, 0, 3, 29, 5, 29, 29, 29, 29, 6...","[[2], [5], [10], [1], [3], [1], [0, 4, 6, 7, 8...","[[1], [7], [12], [2, 3], [5], [2, 3], [0, 6, 8...","[announced, acquire, started, Wednesday, plans...","[said, buy, created, on Wednesday, plans, on W..."
5702,5703_5702,Some opposition leaders said they would reserv...,Some opposition leaders called for withdrawing...,"[Some, opposition, leaders, said, they, would,...","[Some, opposition, leaders, called, for, withd...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 3, 26, 25, 29]","[26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 2...","[6, 6, 6, 25, 25, 3, 25, 0, 0, 26, 26, 26, 26,...","[[0, 1, 2], [14], [19], [0, 1, 2, 3, 4, 5, 6, ...","[[9], [0, 1, 2], [5], [9, 10, 11, 12, 13, 14, ...","[Some opposition leaders, others, withdrawal, ...","[others, Some opposition leaders, withdrawing,..."
5709,5710_5709,Women who eat potatoes and other tuberous vege...,Australian researchers believe they have found...,"[Women, who, eat, potatoes, and, other, tubero...","[Australian, researchers, believe, they, have,...",1,1,"[Same Polarity Substitution (named ent.), Same...","[7, 5, 3, 24, 18, 26, 25, 29]","[26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 2...","[7, 0, 0, 25, 25, 25, 26, 26, 26, 26, 26, 26, ...","[[23], [0], [15], [0, 1, 2, 3, 4, 5, 6, 7, 8, ...","[[0], [16], [7], [6, 7, 8, 9, 10, 11, 12, 13, ...","[Melbourne, Women, triggering, Women who eat p...","[Australian, mothers, trigger, a trigger of ty..."
5712,5713_5712,There is only one drug on the market for macul...,There is only one drug on the market for macul...,"[There, is, only, one, drug, on, the, market, ...","[There, is, only, one, drug, on, the, market, ...",1,1,"[Derivational Changes, Subordination and nesti...","[3, 18, 25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[17], [19, 20, 21, 22, 23, 24, 25, 26], [18]]","[[18], [20, 21, 22, 23, 24, 25, 26], [0, 1, 2,...","[treat, one subtype that represents a minority...","[treatment, one subtype representing a minorit..."


In [14]:
filter_equals(etpc, ['25', '29'])

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_segment_location,sentence2_segment_location,sentence1_segment_location_indices,sentence2_segment_location_indices,sentence1_segment_text,sentence2_segment_text
167,168_167,U.S. law enforcement officials are sneering at...,U.S. law enforcement officials are sneering at...,"[U.S., law, enforcement, officials, are, sneer...","[U.S., law, enforcement, officials, are, sneer...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[[15, 16, 17, 18, 19, 20, 21, 22, 23]]",[U.S. law enforcement officials are sneering a...,[-- including a police conspiracy to discredit...
645,646_645,I called the number and the lady told me she w...,I called the number and the lady told me she w...,"[I, called, the, number, and, the, lady, told,...","[I, called, the, number, and, the, lady, told,...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[18, 20, 21, 22, 23, 24]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[, Sherry Studabaker told BBC television, I ca...",[I called the number and the lady told me she ...
1017,1018_1017,He said the problem needs to be corrected befo...,He said the prob lem needs to be corrected bef...,"[He, said, the, problem, needs, to, be, correc...","[He, said, the, prob, lem, needs, to, be, corr...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[13, 14, 15]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[is cleared to, He said the problem needs to b...",[He said the prob lem needs to be corrected be...
2046,2047_2046,Other recommendations included a special couns...,Other recommendations included the creation of...,"[Other, recommendations, included, a, special,...","[Other, recommendations, included, the, creati...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, 25, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...",[a special counsel on oceans in the White Hous...,[Other recommendations included the creation o...
2063,2064_2063,"""For me, the Lewinsky imbroglio seemed like ju...","""For me, the Lewinsky imbroglio seemed like ju...","[``, For, me, ,, the, Lewinsky, imbroglio, see...","[``, For, me, ,, the, Lewinsky, imbroglio, see...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[17, 19, 20, 21, 22, 23]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[, according to extracts leaked yesterday, `` ...","[`` For me , the Lewinsky imbroglio seemed lik..."
2180,2181_2180,"And in the Muslim world, Osama bin Laden is be...","And in the Muslim world, Osama bin Laden, the ...","[And, in, the, Muslim, world, ,, Osama, bin, L...","[And, in, the, Muslim, world, ,, Osama, bin, L...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[[9, 10, 11, 12, 13, 14, 15, 16, 17, 18]]","[And in the Muslim world , Osama bin Laden is ...","[, the missing leader of the al-Qaida terroris..."
2229,2230_2229,This is a process and there will be other oppo...,This is a process and there will be other oppo...,"[This, is, a, process, and, there, will, be, o...","[This, is, a, process, and, there, will, be, o...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[21, 22, 23]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[he told reporters, This is a process and ther...",[This is a process and there will be other opp...
2282,2283_2282,"""Right from the beginning, we didn't want to s...","But Mr. Crosby told The Associated Press: ""Rig...","[``, Right, from, the, beginning, ,, we, did, ...","[But, Mr., Crosby, told, The, Associated, Pres...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[[0, 1, 2, 3, 4, 5, 6, 7]]","[`` Right from the beginning , we did n't want...","[But Mr. Crosby told The Associated Press :, `..."
2703,2704_2703,It's almost as if they (Russians) hit an x-mar...,It's almost as if they (Russians) hit an x-mar...,"[It, 's, almost, as, if, they, (, Russians, ),...","[It, 's, almost, as, if, they, (, Russians, ),...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[15, 17, 18, 19, 20, 21]]","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[, NASA spokesman Robert Navias said, It 's al...",[It 's almost as if they ( Russians ) hit an x...
2786,2787_2786,"""This puts telemarketers on notice that we wil...","""This puts telemarketers on notice that we wil...","[``, This, puts, telemarketers, on, notice, th...","[``, This, puts, telemarketers, on, notice, th...",1,1,"[Addition/Deletion, Identity]","[25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","[[25, 27, 28, 29, 30, 31]]",[`` This puts telemarketers on notice that we ...,"[, FCC chairman Michael Powell said, `` This p..."


# Reannotation, continued

## Creating positives dataframe

In [15]:
positives = etpc.loc[etpc['mrpc_label'] == 1]
positives.rename(columns={'sentence1_segment_location': 'sentence1_scope_etpc', 'sentence2_segment_location': 'sentence2_scope_etpc'}, inplace=True)
positives.drop(columns=['sentence1_segment_location_indices', 'sentence2_segment_location_indices'],inplace=True)
positives['idx'] = positives.index.to_series()
positives

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives.rename(columns={'sentence1_segment_location': 'sentence1_scope_etpc', 'sentence2_segment_location': 'sentence2_scope_etpc'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives.drop(columns=['sentence1_segment_location_indices', 'sentence2_segment_location_indices'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['idx'] = positives.index.to_series()


Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_scope_etpc,sentence2_scope_etpc,sentence1_segment_text,sentence2_segment_text
0,0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]","[26, 26, 26, 26, 0, 5, 0, 6, 25, 25, 25, 25, 2...","[6, 5, 5, 0, 25, 0, 0, 0, 0, 0, 26, 26, 26, 26...","[whom, called, Amrozi accused his brother, `` ...","[to him, Referring, Amrozi accused his brother..."
2,2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]","[6, 0, 0, 0, 0, 0, 0, 0, 26, 26, 26, 0, 0, 0, ...","[26, 26, 26, 26, 6, 6, 6, 6, 25, 25, 25, 25, 2...","[They, cargo, on June 10, , he added, had publ...","[the ship 's owners, explosives, On June 10 ,,..."
4,4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...",0,1,"[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 6, 5, 29, 30, 0, 29, 25, 30, 29, 0, 6, 6, ...","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3...","[The stock, rose, to close at, Friday, Friday,...","[PG & E Corp. shares, jumped, to, on Friday, o..."
5,5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...",1,1,"[Synthetic/analytic substitution, Addition/Del...","[11, 25, 29]","[25, 11, 11, 11, 11, 11, 11, 11, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 11, 11...","[in the first quarter of the year, Revenue dro...","[the first quarter of the year, With the scand..."
7,7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 5, 25, 25, 29]","[25, 4, 25, 25, 25, 25, 5, 25, 25, 25]","[25, 4, 4, 25, 25, 25, 25, 25, 5, 25, 25, 25]","[DVD-CCA, state, then, The appealed to the Sup...","[DVD CCA, U.S., that decision, The appealed to..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5792,5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...",1,1,"[Addition/Deletion, Identity, Punctuation chan...","[25, 29, 21, 6, 11, 14, 26, 25]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[29, 29, 29, 29, 29, 29, 29, 26, 26, 26, 26, 2...","[authorities said, Gehring waived extradition ...",[Gehring waived extradition Monday during a he...
5793,5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...",1,1,"[Same Polarity Substitution (contextual), Chan...","[6, 26, 25, 29, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[Silver, Silver, `` I am advised that certain ...","[the Silver statement, the Silver statement, ,..."
5795,5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...",0,1,"[Same Polarity Substitution (habitual), Same P...","[5, 5, 18, 29, 30]","[29, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[29, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[deal, be completed, The deal , approved by bo...","[acquisition, close, The acquisition has been ..."
5799,5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[26, 26, 0, 7, 7, 6, 1, 25, 25, 4, 25, 25, 25,...","[25, 25, 25, 25, 7, 0, 6, 1, 0, 0, 4, 25, 0, 2...","[Corp, power station’s, US, owners, Last week,...","[Corp., Drax, American, owner, last week, The ..."


In [16]:
#id_map = id_map.style.hide(axis="index")
#id_map

In [17]:
test = filter_contains(positives, '4')

## Diagnosing

In [18]:
def diagnose(row, typee):
    print('Sentences:')
    print(test['sentence1'][row])
    print(test['sentence2'][row])
    #print(test['sentence1_tokenized'][row])
    #print(test['sentence2_tokenized'][row])
    print()
    print(f'Words where type {typee} is found:')
    sent1 = test['sentence1_tokenized'][row]
    sent2 = test['sentence2_tokenized'][row]
    print(sent1[test['sentence1_scope_etpc'][row] == typee])
    print(sent2[test['sentence2_scope_etpc'][row] == typee])
    print()
    print('Scopes:')
    print(test['sentence1_scope_etpc'][row])
    print(test['sentence2_scope_etpc'][row])

In [19]:
diagnose(72, 4)

Sentences:
Also demonstrating box-office strength _ and getting seven Tony nominations _ was a potent revival of Eugene O'Neill's family drama, "Long Day's Journey Into Night."
Also demonstrating box-office strength -- and getting seven Tony nominations -- was a potent revival of Eugene ONeills family drama, Long Days Journey Into Night."

Words where type 4 is found:
[]
[]

Scopes:
[21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21
 21 21 21 21 21 21 21]
[21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21
 21 21 21 21]


## Why we cannot use the ETPC from Wahle et al.

Here's a fundamental part of the ETPC that I hadn't realized until now: each token in a sentence can have *more than one* paraphrase type. Here's an example--note how, in sentence 2, token 5 appears in the scopes both of inflectional and derivational changes.

In [20]:
ric = textual_paraphrases.loc[(textual_paraphrases['pair_id'] == 4205+1) & (textual_paraphrases['type_id'].isin([3,1]))]
ric[:2]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
15963,4206,1,Inflectional Changes,yes,[3],"[3, 5]",completed,had inspected,,,,
15964,4206,3,Derivational Changes,yes,[4],[5],inspections,inspected,,,,


It seems that this issue also wasn't noticed by Wahle et al: some paraphrase scopes consist of only a single number repeated for the entirety of the list:

In [21]:
positives[positives['sentence1_scope_etpc'].apply(lambda x: (len(np.unique(x)) == 1))][:5]

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_scope_etpc,sentence2_scope_etpc,sentence1_segment_text,sentence2_segment_text
14,14,He told The Sun newspaper that Mr. Hussein's d...,"""Saddam's daughters had British schools and ho...","[He, told, The, Sun, newspaper, that, Mr., Hus...","[``, Saddam, 's, daughters, had, British, scho...",1,1,"[Same Polarity Substitution (named ent.), Same...","[7, 6, 7, 26, 25, 29, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[0, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26...","[Hussein, The Sun newspaper, Mr. Hussein, Mr. ...","[Saddam, The Sun, Saddam, Saddam 's daughters ..."
22,22,But tropical storm warnings and watches were p...,Tropical storm warnings were in place Thursday...,"[But, tropical, storm, warnings, and, watches,...","[Tropical, storm, warnings, were, in, place, T...",0,1,"[Addition/Deletion, Addition/Deletion, Identit...","[25, 25, 29, 30, 4, 6, 11, 17]","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[But, tropical storm warnings watches , the so...","[Jamaica and, storm warnings watches , the sou..."
35,35,Trading in Loral was halted yesterday; the sha...,The New York Stock Exchange suspended trading ...,"[Trading, in, Loral, was, halted, yesterday, ;...","[The, New, York, Stock, Exchange, suspended, t...",0,1,"[Same Polarity Substitution (habitual), Diathe...","[5, 14, 18, 29, 30, 21]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[halted, Trading in Loral was halted, Trading ...","[suspended, The New York Stock Exchange suspen..."
40,40,Last year the court upheld Cleveland's school ...,"Last year, the court ruled 5-4 in an Ohio case...","[Last, year, the, court, upheld, Cleveland, 's...","[Last, year, ,, the, court, ruled, 5-4, in, an...",1,1,"[Same Polarity Substitution (contextual), Infl...","[6, 1, 25, 25, 29, 28, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[29, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29, 25, 29...","[provide, choice, Last year that vouchers are ...","[provide with, choices, government, among a ra..."
63,63,Contrary to what PeopleSoft management would h...,Ellison said that contrary to the contentions ...,"[Contrary, to, what, PeopleSoft, management, w...","[Ellison, said, that, contrary, to, the, conte...",1,1,"[Addition/Deletion, Identity, Semantic based, ...","[25, 29, 28, 21]","[28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 2...","[25, 25, 0, 29, 29, 28, 28, 28, 28, 28, 29, 29...","[Contrary to , Oracle intends to fully support...","[Ellison said, contrary to , Oracle intends to..."


The issue also exists in part in the original ETPC: some paraphrase types have scopes annotated as pretty much the entire sentence. This seems especially prevalent among 'Punctuation changes'.

TODO: rewrite this, show examples 

While this is certainly an issue for the original ETPC, it's at least partly offset there since their annotation scheme has separate scopes for each paraphrase type. So even if the annotated scope of some given type isn't very informative, the entire sentence isn't lost: you'd still have other paraphrase types, which are most likely annotated correctly. But Wahle's dataset (and consequently his training pipeline) doesn't account for this. Whatever process Wahle et al. used for generating that dataset on Huggingface seems to have an especially hard time with sentences in the original ETPC as exemplified above, but the issue happens throughout *all* their dataset.

## Getting paraphrases from the original ETPC

Let's first clean up the dataset

In [22]:
positives.drop(columns=['idx', 'etpc_label', 'mrpc_label', 
                                       'sentence1_scope_etpc', 
                                       'sentence2_scope_etpc', 
                                       'sentence1_segment_text', 
                                       'sentence2_segment_text'])

Unnamed: 0,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,ept_names,ept_ids
0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...","[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]"
2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...","[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]"
4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...","[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]"
5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...","[Synthetic/analytic substitution, Addition/Del...","[11, 25, 29]"
7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...","[Spelling changes, Same Polarity Substitution ...","[4, 5, 25, 25, 29]"
...,...,...,...,...,...,...
5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...","[Addition/Deletion, Identity, Punctuation chan...","[25, 29, 21, 6, 11, 14, 26, 25]"
5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...","[Same Polarity Substitution (contextual), Chan...","[6, 26, 25, 29, 21]"
5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...","[Same Polarity Substitution (habitual), Same P...","[5, 5, 18, 29, 30]"
5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...","[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]"


We'll need a column to house the new scopes. Let's initialize that column with empty strings for each token in the sentence. That way, we can easily tell which tokens haven't been annotated yet.

In [23]:
#TODO: get rid of SettingWithCopyWarning
positives['sentence1_scope'] = positives['sentence1_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U10'))
positives['sentence2_scope'] = positives['sentence2_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U10'))
positives['sentence1_scope'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['sentence1_scope'] = positives['sentence1_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U10'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['sentence2_scope'] = positives['sentence2_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U10'))


array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', ''], dtype='<U10')

## Populating types

Helper function to populate type

In [24]:
# TODO: Do we need a special case where array available is empty (no available 
# spots)? Unclear since we haven't run into this yet, but let's watch out 
# TODO: Docstring
def populate_type(idx, ept_id):
    array1 = np.copy(positives['sentence1_scope'][idx])
    array2 = np.copy(positives['sentence2_scope'][idx])
    array1_available = np.where(array1 == '')[0]
    array2_available = np.where(array2 == '')[0]
    array1_unavailable = np.where(array1 != '')[0]
    array2_unavailable = np.where(array2 != '')[0]
    
    paraop_id = ept_to_paraop(ept_id)

    subset = textual_paraphrases[(textual_paraphrases['pair_id'] == idx+1) & 
                                 (textual_paraphrases['type_id'] == int(ept_id))]
    subset.reset_index(drop=True, inplace=True)
    
    count = len(subset['type_id'].values)
    
    for i in range(count):
        s1_scope = np.array(subset.loc[subset['type_id'] == ept_id, 's1_scope'].iloc[0])
        s2_scope = np.array(subset.loc[subset['type_id'] == ept_id, 's2_scope'].iloc[0])
        if len(s1_scope) > 0:
            allowed_indices1 = np.intersect1d(s1_scope, array1_available)
            array1[allowed_indices1] = f'{paraop_id}_{i}'
            
            # For debugging
            unallowed_indices1 = np.intersect1d(s1_scope, array1_unavailable)
            if len(unallowed_indices1) > 0:
                print(f'Oops! Just tried to overwrite a type. Sentence 1, row {idx}')
                print(f'Common indices: {list(unallowed_indices1)}')
        
        if len(s2_scope) > 0:
            allowed_indices2 = np.intersect1d(s2_scope, array2_available)
            array2[allowed_indices2] = f'{paraop_id}_{i}'
            
            # For debugging
            unallowed_indices2 = np.intersect1d(s2_scope, array2_unavailable)
            if len(unallowed_indices2) > 0:
                print(f'Oops! Just tried to overwrite a type. Sentence 2, row {idx}')
                print(f'Common indices: {list(unallowed_indices2)}')
    
    return array1, array2

In [25]:
populate_type(0, 26)

(array(['3_0', '3_0', '3_0', '3_0', '', '', '', '', '', '', '', '', '', '',
        '', '', '', '', ''], dtype='<U10'),
 array(['', '', '', '', '', '', '', '', '', '', '3_0', '3_0', '3_0', '3_0',
        '', '', '', '', '', ''], dtype='<U10'))

In [26]:
positives['idx'].apply(populate_type, ept_id=23)

0       ([, , , , , , , , , , , , , , , , , , ], [, , ...
2       ([, , , , , , , , , , , , , , , , , , , , ], [...
4       ([, , , , , , , , , , , , , , , , , , , , , , ...
5       ([, , , , , , , , , , , , , , , , , , ], [, , ...
7        ([, , , , , , , , , ], [, , , , , , , , , , , ])
                              ...                        
5792    ([, , , , , , , , , , , , , , , , , , , , , , ...
5793    ([, , , , , , , , , , , , , , , , , , , , ], [...
5795    ([, , , , , , , , , , , , , , , , , , , , , , ...
5799    ([, , , , , , , , , , , , , , , , , , , , , , ...
5800    ([, , , , , , , , , , , , , , , , , , , , , , ...
Name: idx, Length: 3900, dtype: object

`populate_type` only returns new arrays, so it doesn't modify the original df. Use the function below to modify the df

In [45]:
def substitute(ept_id):
    series = positives['idx'].apply(populate_type, ept_id=ept_id)
    cols = pd.DataFrame(series.tolist(), columns=['sentence1', 'sentence2'])
    positives.loc[:, 'sentence1_scope'] = cols['sentence1'].values
    positives.loc[:, 'sentence2_scope'] = cols['sentence2'].values

### Performing the reannotation

The order matters!

In [40]:
# Same Polarity Substitution
substitute(6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['sentence1_scope'] = cols['sentence1'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['sentence2_scope'] = cols['sentence2'].values


In [46]:
# Derivational Changes
substitute(3)

Oops! Just tried to overwrite a type. Sentence 1, row 2112
Common indices: [5]
Oops! Just tried to overwrite a type. Sentence 2, row 2112
Common indices: [6]
Oops! Just tried to overwrite a type. Sentence 1, row 2346
Common indices: [2]
Oops! Just tried to overwrite a type. Sentence 2, row 2346
Common indices: [4]
Oops! Just tried to overwrite a type. Sentence 1, row 2394
Common indices: [11]
Oops! Just tried to overwrite a type. Sentence 2, row 2394
Common indices: [3]
Oops! Just tried to overwrite a type. Sentence 1, row 2973
Common indices: [14]
Oops! Just tried to overwrite a type. Sentence 2, row 2973
Common indices: [9]
Oops! Just tried to overwrite a type. Sentence 1, row 4562
Common indices: [14]
Oops! Just tried to overwrite a type. Sentence 2, row 4562
Common indices: [6]
Oops! Just tried to overwrite a type. Sentence 1, row 4901
Common indices: [23]
Oops! Just tried to overwrite a type. Sentence 2, row 4901
Common indices: [15]


In [30]:
# Inflectional Changes
#substitute(1)

In [31]:
# Identity
#substitute(29)

In [32]:
# Change of order
#substitute(26)

In [33]:
positives

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_scope_etpc,sentence2_scope_etpc,sentence1_segment_text,sentence2_segment_text,sentence1_scope,sentence2_scope
0,0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]","[26, 26, 26, 26, 0, 5, 0, 6, 25, 25, 25, 25, 2...","[6, 5, 5, 0, 25, 0, 0, 0, 0, 0, 26, 26, 26, 26...","[whom, called, Amrozi accused his brother, `` ...","[to him, Referring, Amrozi accused his brother...","[, , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , ]"
2,2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]","[6, 0, 0, 0, 0, 0, 0, 0, 26, 26, 26, 0, 0, 0, ...","[26, 26, 26, 26, 6, 6, 6, 6, 25, 25, 25, 25, 2...","[They, cargo, on June 10, , he added, had publ...","[the ship 's owners, explosives, On June 10 ,,...","[, , , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , ]"
4,4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...",0,1,"[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 6, 5, 29, 30, 0, 29, 25, 30, 29, 0, 6, 6, ...","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3...","[The stock, rose, to close at, Friday, Friday,...","[PG & E Corp. shares, jumped, to, on Friday, o...","[, , , , , , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , , ]"
5,5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...",1,1,"[Synthetic/analytic substitution, Addition/Del...","[11, 25, 29]","[25, 11, 11, 11, 11, 11, 11, 11, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 11, 11...","[in the first quarter of the year, Revenue dro...","[the first quarter of the year, With the scand...","[, , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , , ,..."
7,7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 5, 25, 25, 29]","[25, 4, 25, 25, 25, 25, 5, 25, 25, 25]","[25, 4, 4, 25, 25, 25, 25, 25, 5, 25, 25, 25]","[DVD-CCA, state, then, The appealed to the Sup...","[DVD CCA, U.S., that decision, The appealed to...","[, , , , , , , , , ]","[, , , , , , , , , , , ]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5792,5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...",1,1,"[Addition/Deletion, Identity, Punctuation chan...","[25, 29, 21, 6, 11, 14, 26, 25]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[29, 29, 29, 29, 29, 29, 29, 26, 26, 26, 26, 2...","[authorities said, Gehring waived extradition ...",[Gehring waived extradition Monday during a he...,"[, , , , , , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , , , ]"
5793,5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...",1,1,"[Same Polarity Substitution (contextual), Chan...","[6, 26, 25, 29, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[Silver, Silver, `` I am advised that certain ...","[the Silver statement, the Silver statement, ,...","[, , , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , , ,..."
5795,5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...",0,1,"[Same Polarity Substitution (habitual), Same P...","[5, 5, 18, 29, 30]","[29, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[29, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[deal, be completed, The deal , approved by bo...","[acquisition, close, The acquisition has been ...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , , ]"
5799,5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[26, 26, 0, 7, 7, 6, 1, 25, 25, 4, 25, 25, 25,...","[25, 25, 25, 25, 7, 0, 6, 1, 0, 0, 4, 25, 0, 2...","[Corp, power station’s, US, owners, Last week,...","[Corp., Drax, American, owner, last week, The ...","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."


In [41]:
ric = textual_paraphrases.loc[textual_paraphrases['type_id'].isin([3])]
ric[:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
531,143,3,Derivational Changes,yes,[15],[12],full,fully,,,,
552,150,3,Derivational Changes,yes,[1],[6],estimated,estimated,,,,
918,239,3,Derivational Changes,yes,[16],[12],attacks,attacking,,,,
979,255,3,Derivational Changes,yes,[23],[21],graduate,graduated,,,,
1123,287,3,Derivational Changes,yes,[5],[5],northeast,northeastern,,,,
1332,338,3,Derivational Changes,yes,[12],[4],visits,visit,,,,
1382,357,3,Derivational Changes,yes,[5],[4],addition,adding,,,,
1638,429,3,Derivational Changes,yes,[9],[8],comment,comment,,,,
1657,434,3,Derivational Changes,yes,[7],[9],sentenced,sentence,,,,
1658,434,3,Derivational Changes,yes,[17],[17],showing,expressed,,,,


In [35]:
morethan = textual_paraphrases.loc[textual_paraphrases['type_id'].isin([3])]
morethan[(morethan['s1_text'].str.contains(' ')) | (morethan['s2_text'].str.contains(' '))]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
4145,1117,3,Derivational Changes,yes,"[5, 6]",[4],was centered,center,,,,
15041,3972,3,Derivational Changes,yes,"[13, 14]",[16],the U.S.,American,,,,
18766,4922,3,Derivational Changes,yes,"[11, 12]",[5],are suspected,suspected,,,,


In [36]:
morethan = textual_paraphrases.loc[textual_paraphrases['type_id'].isin([1])]
morethan[(morethan['s1_text'].str.contains(' ')) | (morethan['s2_text'].str.contains(' '))][:50]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
147,48,1,Inflectional Changes,yes,"[2, 3]","[2, 3]",has said,had announced,,,,
163,51,1,Inflectional Changes,yes,[1],"[1, 3]",said,has said,,,,
176,53,1,Inflectional Changes,yes,"[4, 5]","[5, 6, 7]",was downplayed,has been downplayed,,,,
190,56,1,Inflectional Changes,yes,[1],"[5, 7]",integrates,were integrated,,,,
322,89,1,Inflectional Changes,yes,[20],"[21, 22]",died,has died,,,,
326,90,1,Inflectional Changes,yes,"[2, 3]","[2, 3]",are declaring,have declared,,,,
397,111,1,Inflectional Changes,yes,[12],"[11, 12]",called,had called,,,,
469,128,1,Inflectional Changes,yes,[10],"[2, 3]",told,had told,,,,
501,136,1,Inflectional Changes,yes,"[1, 2]",[1],had been,were,,,,
529,143,1,Inflectional Changes,yes,[2],"[1, 2]",said,has said,,,,


Flagged rows:

2432, 5074, 12186


In [37]:
auxiliaries = ['are', 'am', 'be', 'been', 'being', 'had', 'has', 'have', 'having', 'is', 'was', 'were']

### Trimming duplicates

TODO: move this section up since this will have to be done before we actually reannotate

TODO: for dealing with change of order, we will need a function that does the opposite of trim_duplicates (keep_duplicates)

In [38]:
def trim_duplicates(s1_scope, s2_scope, s1_text, s2_text):
    s1_newtext = s1_text.split()
    s2_newtext = s2_text.split()
    
    in1 = np.where(np.in1d(s1_newtext, s2_newtext))[0]
    in2 = np.where(np.in1d(s2_newtext, s1_newtext))[0]

    s1_newscope = np.delete(s1_scope, in1)
    s2_newscope = np.delete(s2_scope, in2)
    s1_newtext = ' '.join(np.delete(s1_newtext, in1))
    s2_newtext = ' '.join(np.delete(s2_newtext, in2))

    return s1_newscope, s2_newscope, s1_newtext, s2_newtext

In [None]:
test = pd.DataFrame(columns = ric.columns, data = copy.deepcopy(ric.values))
series = test.apply(lambda x: trim_duplicates(x.s1_scope, x.s2_scope, x.s1_text, x.s2_text), axis=1)
output = pd.DataFrame(series.tolist(), columns=['s1_scope', 's2_scope', 's1_text', 's2_text'])
output[:5]

Unnamed: 0,s1_scope,s2_scope,s1_text,s2_text
0,[22],[21],choice,choices
1,"[2, 3]","[2, 3]",has said,had announced
2,[],[1],,has
3,[15],[9],album,albums
4,[4],"[5, 6]",was,has been


In [None]:
test[:5]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
0,41,1,Inflectional Changes,yes,[22],[21],choice,choices,,,,
1,48,1,Inflectional Changes,yes,"[2, 3]","[2, 3]",has said,had announced,,,,
2,51,1,Inflectional Changes,yes,[1],"[1, 3]",said,has said,,,,
3,52,1,Inflectional Changes,yes,[15],[9],album,albums,,,,
4,53,1,Inflectional Changes,yes,"[4, 5]","[5, 6, 7]",was downplayed,has been downplayed,,,,


# The garbage pail

Code that may or may not be useful will remain here for a while

Change of Order > Identity

Game plan:

Same Polarity Substitution > Derivational Changes > Inflectional Changes > ...Modal Verb Changes? > Change of Order (modified)