# Imports

In [1]:
import copy
import pandas as pd
import numpy as np

# Reading the ETPC

This is the ETPC dataset compiled by Wahle and posted on HuggingFace

In [2]:
# Unpickle etpc_raw
etpc = pd.read_pickle('datasets/etpc_raw.pkl')

These are the XML files from the ETPC github repo.

The first one contains all pairs marked as paraphrases by the MRPC:

In [3]:
textual_paraphrases = pd.read_xml('datasets/etpc/textual_paraphrases.xml')
# Convert scopes from strings to lists of ints
textual_paraphrases['s1_scope'] = textual_paraphrases['s1_scope'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)
textual_paraphrases['s2_scope'] = textual_paraphrases['s2_scope'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)
textual_paraphrases['key_s1'] = textual_paraphrases['key_s1'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)
textual_paraphrases['key_s2'] = textual_paraphrases['key_s2'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)

The second one contains the text and pair ids for *all* sentence pairs (paraphrases or not). It doesn't contain any data on whether they're paraphrases or not, or what EPT types are in them.

In [4]:
pairs = pd.read_xml('datasets/etpc/text_pairs.xml')
pairs.drop(columns=['negation'], inplace=True)
pairs.set_index('pair_id', inplace=True)

# Cleanup

## Cleaning up Columns

In [5]:
etpc.rename(columns={'paraphrase_type_ids': 'ept_ids', 'paraphrase_types': 'ept_names'}, inplace=True)
etpc.drop(columns={'negation'}, axis=1, inplace=True)
etpc

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_segment_location,sentence2_segment_location,sentence1_segment_location_indices,sentence2_segment_location_indices,sentence1_segment_text,sentence2_segment_text
0,1_0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]","[26, 26, 26, 26, 0, 5, 0, 6, 25, 25, 25, 25, 2...","[6, 5, 5, 0, 25, 0, 0, 0, 0, 0, 26, 26, 26, 26...","[[5], [7], [0, 1, 2, 3], [8, 9, 10, 11, 12, 13...","[[1, 2], [0], [10, 11, 12, 13], [4]]","[whom, called, Amrozi accused his brother, `` ...","[to him, Referring, Amrozi accused his brother..."
1,2_1,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...,"[Yucaipa, owned, Dominick, 's, before, selling...","[Yucaipa, bought, Dominick, 's, in, 1995, for,...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],[],[],[]
2,3_2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]","[6, 0, 0, 0, 0, 0, 0, 0, 26, 26, 26, 0, 0, 0, ...","[26, 26, 26, 26, 6, 6, 6, 6, 25, 25, 25, 25, 2...","[[0], [14], [8, 9, 10], [17, 18, 19]]","[[4, 5, 6, 7], [18], [0, 1, 2, 3], [8, 9, 10, ...","[They, cargo, on June 10, , he added, had publ...","[the ship 's owners, explosives, On June 10 ,,..."
3,4_3,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ...","[Around, 0335, GMT, ,, Tab, shares, were, up, ...","[Tab, shares, jumped, 20, cents, ,, or, 4.6, %...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],[],[],[]
4,5_4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...",0,1,"[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 6, 5, 29, 30, 0, 29, 25, 30, 29, 0, 6, 6, ...","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3...","[[0, 1], [2], [11, 12, 14], [13], [13], [7], [...","[[0, 1, 2, 3, 4], [5], [11], [20, 21], [20, 21...","[The stock, rose, to close at, Friday, Friday,...","[PG & E Corp. shares, jumped, to, on Friday, o..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5796,5797_5796,"After Hughes refused to rehire Hernandez, he c...",Hernandez filed an Equal Employment Opportunit...,"[After, Hughes, refused, to, rehire, Hernandez...","[Hernandez, filed, an, Equal, Employment, Oppo...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[],[],[],[]
5797,5798_5797,There are 103 Democrats in the Assembly and 47...,Democrats dominate the Assembly while Republic...,"[There, are, 103, Democrats, in, the, Assembly...","[Democrats, dominate, the, Assembly, while, Re...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[],[],[],[]
5798,5799_5798,Bethany Hamilton remained in stable condition ...,"Bethany, who remained in stable condition afte...","[Bethany, Hamilton, remained, in, stable, cond...","[Bethany, ,, who, remained, in, stable, condit...",0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],[],[],[]
5799,5800_5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[26, 26, 0, 7, 7, 6, 1, 25, 25, 4, 25, 25, 25,...","[25, 25, 25, 25, 7, 0, 6, 1, 0, 0, 4, 25, 0, 2...","[[9], [3, 4], [5], [6], [0, 1], [24], [7, 8, 1...","[[10], [4], [6], [7], [13, 14], [0, 1, 2, 3], ...","[Corp, power station’s, US, owners, Last week,...","[Corp., Drax, American, owner, last week, The ..."


# Remapping paraphrase IDs


First, make a list with paraphrase types and IDs from the ETPC:

In [6]:
id_map = pd.read_xml('https://raw.githubusercontent.com/venelink/ETPC/master/Corpus/paraphrase_types.xml')
# Rename columns for clarity
id_map.rename(columns={'type_id': 'ept_id', 'type_name': 'ept_name'}, inplace=True)
# Drop unused data
id_map = id_map[['ept_id', 'ept_name']] # No use for type_category column
id_map.drop(id_map.tail(2).index,inplace=True) # Types don't appear in ETPC
id_map.style.hide(axis="index")
id_map

Unnamed: 0,ept_id,ept_name
0,1,Inflectional Changes
1,2,Modal Verb Changes
2,3,Derivational Changes
3,4,Spelling changes
4,5,Same Polarity Substitution (habitual)
5,6,Same Polarity Substitution (contextual)
6,7,Same Polarity Substitution (named ent.)
7,8,Change of format
8,9,Opposite polarity substitution (habitual)
9,10,Opposite polarity substitution (contextual)


Now, make a list with paraphrase names and IDs for ParaOp types

In [7]:
data = [[0, 'No change'],
        [1, 'Addition/Deletion - Function Word'],
        [2, 'Addition/Deletion - Content Word'],
        [3, 'Change of Order'],
        [4, 'Substitution - Synonym'],
        [5, 'Substitution - Contextual Synonym'],
        [6, 'Substitution - Morphological'],
        [7, 'Substitution - Spelling and Format'],
        [8, 'Addition/Deletion - Punctuation']
       ]
paraop_map = pd.DataFrame(data, columns = ['paraop_id', 'paraop_name'])
paraop_map.set_index('paraop_id', inplace=True)
paraop_map

Unnamed: 0_level_0,paraop_name
paraop_id,Unnamed: 1_level_1
0,No change
1,Addition/Deletion - Function Word
2,Addition/Deletion - Content Word
3,Change of Order
4,Substitution - Synonym
5,Substitution - Contextual Synonym
6,Substitution - Morphological
7,Substitution - Spelling and Format
8,Addition/Deletion - Punctuation


## Mapping

We'll use the dataframe below for mapping. Each row will contain the name and ID of a paraphrase type in the ETPC, and the name and ID of the correspondent ParaOp type.

In [8]:
id_map['paraop_id'] = ''
id_map['paraop_name'] = ''
id_map

Unnamed: 0,ept_id,ept_name,paraop_id,paraop_name
0,1,Inflectional Changes,,
1,2,Modal Verb Changes,,
2,3,Derivational Changes,,
3,4,Spelling changes,,
4,5,Same Polarity Substitution (habitual),,
5,6,Same Polarity Substitution (contextual),,
6,7,Same Polarity Substitution (named ent.),,
7,8,Change of format,,
8,9,Opposite polarity substitution (habitual),,
9,10,Opposite polarity substitution (contextual),,


Here's where we do the mapping:

In [9]:
# Helper function to map an ETPC id to a Paraop id
def map_id(ept_id, paraop_id):
    """Given an EPT id and a Paraop id, look up the name of the Paraop id and 
    fill in the rows of id_map with paraop_id and the name."""
    id_map.loc[id_map['ept_id'] == ept_id, 'paraop_id'] = paraop_id
    id_map.loc[id_map['ept_id'] == ept_id, 'paraop_name'] = paraop_map.loc[paraop_id, 'paraop_name']

In [10]:
map_id(ept_id=1, paraop_id=6)
map_id(ept_id=3, paraop_id=6)
map_id(ept_id=26, paraop_id=3)
map_id(ept_id=29, paraop_id=0)
map_id(4, 7)
map_id(5, 4)
map_id(6, 5)
map_id(8, 7)
map_id(9, 4)
map_id(2, 5)
map_id(7, 5)
map_id(13, 5)
id_map.style.hide(axis="index")

ept_id,ept_name,paraop_id,paraop_name
1,Inflectional Changes,6.0,Substitution - Morphological
2,Modal Verb Changes,5.0,Substitution - Contextual Synonym
3,Derivational Changes,6.0,Substitution - Morphological
4,Spelling changes,7.0,Substitution - Spelling and Format
5,Same Polarity Substitution (habitual),4.0,Substitution - Synonym
6,Same Polarity Substitution (contextual),5.0,Substitution - Contextual Synonym
7,Same Polarity Substitution (named ent.),5.0,Substitution - Contextual Synonym
8,Change of format,7.0,Substitution - Spelling and Format
9,Opposite polarity substitution (habitual),4.0,Substitution - Synonym
10,Opposite polarity substitution (contextual),,


TODO: Figure out a way to hide index of map_id throughout whole notebook. For some reason this seems harder than it needs to be...

Helper function to convert an ETPC ID to a Paraop ID

In [11]:
# Helper function to get a Paraop id from an ETPC id
def ept_to_paraop(ept_id):
    return id_map.loc[id_map['ept_id'] == ept_id, 'paraop_id'].iloc[0]

ept_to_paraop(3)

6

# Reannotation

## Creating positives dataframe

In [12]:
positives = etpc.loc[etpc['mrpc_label'] == 1]
positives.rename(columns={'sentence1_segment_location': 'sentence1_scope_etpc', 
                          'sentence2_segment_location': 'sentence2_scope_etpc'}, inplace=True)
positives.drop(columns=['sentence1_segment_location_indices', 'sentence2_segment_location_indices'],inplace=True)
positives['idx'] = positives.index.to_series()
positives

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives.rename(columns={'sentence1_segment_location': 'sentence1_scope_etpc',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives.drop(columns=['sentence1_segment_location_indices', 'sentence2_segment_location_indices'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['idx'] = positives.index.to_series()


Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_scope_etpc,sentence2_scope_etpc,sentence1_segment_text,sentence2_segment_text
0,0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]","[26, 26, 26, 26, 0, 5, 0, 6, 25, 25, 25, 25, 2...","[6, 5, 5, 0, 25, 0, 0, 0, 0, 0, 26, 26, 26, 26...","[whom, called, Amrozi accused his brother, `` ...","[to him, Referring, Amrozi accused his brother..."
2,2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]","[6, 0, 0, 0, 0, 0, 0, 0, 26, 26, 26, 0, 0, 0, ...","[26, 26, 26, 26, 6, 6, 6, 6, 25, 25, 25, 25, 2...","[They, cargo, on June 10, , he added, had publ...","[the ship 's owners, explosives, On June 10 ,,..."
4,4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...",0,1,"[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 6, 5, 29, 30, 0, 29, 25, 30, 29, 0, 6, 6, ...","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3...","[The stock, rose, to close at, Friday, Friday,...","[PG & E Corp. shares, jumped, to, on Friday, o..."
5,5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...",1,1,"[Synthetic/analytic substitution, Addition/Del...","[11, 25, 29]","[25, 11, 11, 11, 11, 11, 11, 11, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 11, 11...","[in the first quarter of the year, Revenue dro...","[the first quarter of the year, With the scand..."
7,7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 5, 25, 25, 29]","[25, 4, 25, 25, 25, 25, 5, 25, 25, 25]","[25, 4, 4, 25, 25, 25, 25, 25, 5, 25, 25, 25]","[DVD-CCA, state, then, The appealed to the Sup...","[DVD CCA, U.S., that decision, The appealed to..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5792,5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...",1,1,"[Addition/Deletion, Identity, Punctuation chan...","[25, 29, 21, 6, 11, 14, 26, 25]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[29, 29, 29, 29, 29, 29, 29, 26, 26, 26, 26, 2...","[authorities said, Gehring waived extradition ...",[Gehring waived extradition Monday during a he...
5793,5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...",1,1,"[Same Polarity Substitution (contextual), Chan...","[6, 26, 25, 29, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[Silver, Silver, `` I am advised that certain ...","[the Silver statement, the Silver statement, ,..."
5795,5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...",0,1,"[Same Polarity Substitution (habitual), Same P...","[5, 5, 18, 29, 30]","[29, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[29, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[deal, be completed, The deal , approved by bo...","[acquisition, close, The acquisition has been ..."
5799,5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[26, 26, 0, 7, 7, 6, 1, 25, 25, 4, 25, 25, 25,...","[25, 25, 25, 25, 7, 0, 6, 1, 0, 0, 4, 25, 0, 2...","[Corp, power station’s, US, owners, Last week,...","[Corp., Drax, American, owner, last week, The ..."


## Why we cannot use the ETPC from Wahle et al.

Here's a fundamental part of the ETPC that I hadn't realized until now: each token in a sentence can have *more than one* paraphrase type. Here's an example--note how, in sentence 2, token 5 appears in the scopes both of inflectional and derivational changes.

In [13]:
ric = textual_paraphrases.loc[(textual_paraphrases['pair_id'] == 4205+1) & (textual_paraphrases['type_id'].isin([3,1]))]
ric[:2]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
15963,4206,1,Inflectional Changes,yes,[3],"[3, 5]",completed,had inspected,,,,
15964,4206,3,Derivational Changes,yes,[4],[5],inspections,inspected,,,,


It seems that this issue also wasn't noticed by Wahle et al: some paraphrase scopes consist of only a single number repeated for the entirety of the list:

In [14]:
positives[positives['sentence1_scope_etpc'].apply(lambda x: (len(np.unique(x)) == 1))][:10]

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_scope_etpc,sentence2_scope_etpc,sentence1_segment_text,sentence2_segment_text
14,14,He told The Sun newspaper that Mr. Hussein's d...,"""Saddam's daughters had British schools and ho...","[He, told, The, Sun, newspaper, that, Mr., Hus...","[``, Saddam, 's, daughters, had, British, scho...",1,1,"[Same Polarity Substitution (named ent.), Same...","[7, 6, 7, 26, 25, 29, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[0, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26...","[Hussein, The Sun newspaper, Mr. Hussein, Mr. ...","[Saddam, The Sun, Saddam, Saddam 's daughters ..."
22,22,But tropical storm warnings and watches were p...,Tropical storm warnings were in place Thursday...,"[But, tropical, storm, warnings, and, watches,...","[Tropical, storm, warnings, were, in, place, T...",0,1,"[Addition/Deletion, Addition/Deletion, Identit...","[25, 25, 29, 30, 4, 6, 11, 17]","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[But, tropical storm warnings watches , the so...","[Jamaica and, storm warnings watches , the sou..."
35,35,Trading in Loral was halted yesterday; the sha...,The New York Stock Exchange suspended trading ...,"[Trading, in, Loral, was, halted, yesterday, ;...","[The, New, York, Stock, Exchange, suspended, t...",0,1,"[Same Polarity Substitution (habitual), Diathe...","[5, 14, 18, 29, 30, 21]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[halted, Trading in Loral was halted, Trading ...","[suspended, The New York Stock Exchange suspen..."
40,40,Last year the court upheld Cleveland's school ...,"Last year, the court ruled 5-4 in an Ohio case...","[Last, year, the, court, upheld, Cleveland, 's...","[Last, year, ,, the, court, ruled, 5-4, in, an...",1,1,"[Same Polarity Substitution (contextual), Infl...","[6, 1, 25, 25, 29, 28, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[29, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29, 25, 29...","[provide, choice, Last year that vouchers are ...","[provide with, choices, government, among a ra..."
63,63,Contrary to what PeopleSoft management would h...,Ellison said that contrary to the contentions ...,"[Contrary, to, what, PeopleSoft, management, w...","[Ellison, said, that, contrary, to, the, conte...",1,1,"[Addition/Deletion, Identity, Semantic based, ...","[25, 29, 28, 21]","[28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 2...","[25, 25, 0, 29, 29, 28, 28, 28, 28, 28, 29, 29...","[Contrary to , Oracle intends to fully support...","[Ellison said, contrary to , Oracle intends to..."
72,72,Also demonstrating box-office strength _ and g...,Also demonstrating box-office strength -- and ...,"[Also, demonstrating, box-office, strength, _,...","[Also, demonstrating, box-office, strength, --...",1,1,"[Spelling changes, Spelling changes, Identity,...","[4, 4, 29, 21, 21]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[O'Neill 's, Day 's, Also demonstrating box-of...","[ONeills, Days, Also demonstrating box-office ..."
86,86,Sales - a figure watched closely as a baromete...,It also disclosed that sales -- a figure close...,"[Sales, -, a, figure, watched, closely, as, a,...","[It, also, disclosed, that, sales, --, a, figu...",1,1,"[Same Polarity Substitution (habitual), Synthe...","[5, 11, 26, 25, 25, 25, 25, 29, 28, 21]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[25, 25, 25, 0, 25, 0, 25, 25, 26, 25, 25, 25,...","[rose, many industry experts, closely, 5 perce...","[were higher, industry experts, closely, by an..."
111,111,The suite comes complete with a word processor...,"The suite includes a word processor, spreadshe...","[The, suite, comes, complete, with, a, word, p...","[The, suite, includes, a, word, processor, ,, ...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 5, 6, 5, 11, 18, 25, 25, 29, 21, 21]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[comes complete with, software, utilizing, an,...","[includes, application, built around, the, XML..."
124,124,"Powell fired back: ""He's accusing the presiden...","If so, Powell said, he's calling the president...","[Powell, fired, back, :, ``, He, 's, accusing,...","[If, so, ,, Powell, said, ,, he, 's, calling, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 25, 25, 25, 29, 21]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[accusing, he, , he said, Powell fired back :,...","[calling, Powell, , Powell said ,, , too, If s..."
126,126,The memo on protecting sales of Windows and ot...,"The memo specifically mentioned Linux, a still...","[The, memo, on, protecting, sales, of, Windows...","[The, memo, specifically, mentioned, Linux, ,,...",1,1,"[Addition/Deletion, Addition/Deletion, Identity]","[25, 25, 29]","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 2...",[on protecting sales of Windows and other desk...,"[specifically, The memo mentioned Linux , a st..."


The issue also exists in part in the original ETPC: some paraphrase types have scopes annotated as pretty much the entire sentence. This seems especially prevalent among 'Punctuation changes'.

TODO: rewrite this, show examples 

While this is certainly an issue for the original ETPC, it's at least partly offset there since their annotation scheme has separate scopes for each paraphrase type. So even if the annotated scope of some given type isn't very informative, the entire sentence isn't lost: you'd still have other paraphrase types, which are most likely annotated correctly. But Wahle's dataset (and consequently his training pipeline) doesn't account for this. Whatever process Wahle et al. used for generating that dataset on Huggingface seems to have an especially hard time with sentences in the original ETPC as exemplified above, but the issue happens throughout *all* their dataset.

## Getting paraphrases from the original ETPC

Let's first clean up the dataset

In [15]:
positives.drop(columns=['idx', 'etpc_label', 'mrpc_label', 
                                       'sentence1_scope_etpc', 
                                       'sentence2_scope_etpc', 
                                       'sentence1_segment_text', 
                                       'sentence2_segment_text'])

Unnamed: 0,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,ept_names,ept_ids
0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...","[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]"
2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...","[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]"
4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...","[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]"
5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...","[Synthetic/analytic substitution, Addition/Del...","[11, 25, 29]"
7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...","[Spelling changes, Same Polarity Substitution ...","[4, 5, 25, 25, 29]"
...,...,...,...,...,...,...
5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...","[Addition/Deletion, Identity, Punctuation chan...","[25, 29, 21, 6, 11, 14, 26, 25]"
5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...","[Same Polarity Substitution (contextual), Chan...","[6, 26, 25, 29, 21]"
5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...","[Same Polarity Substitution (habitual), Same P...","[5, 5, 18, 29, 30]"
5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...","[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]"


We'll need a column to house the new scopes. Let's initialize that column with empty arrays for each token in the sentence. That way, we can easily tell which tokens haven't been annotated yet.

In [16]:
#TODO: get rid of SettingWithCopyWarning
positives['sentence1_scope'] = positives['sentence1_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))
positives['sentence2_scope'] = positives['sentence2_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))
positives['sentence1_scope'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['sentence1_scope'] = positives['sentence1_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['sentence2_scope'] = positives['sentence2_tokenized'].apply(lambda x: np.array(['' for _ in x]).astype('U64'))


array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', ''], dtype='<U64')

### Populating types

In [17]:
def get_max_i(arr, search_label) -> int:
    """Returns the highest instance number (i.e. the number after '_') for a given Paraop label in a given array.
    Returns -1 if the search label is not present in the array."""
    max_i = -1
    # Subset elements in array containing label
    mask = np.where(np.char.find(arr, f'{search_label}_') >= 0)
    subset = arr[mask]
    # Look for the maximum 
    for item in subset:
        labels = item.split(' & ')
        for label in labels:
            if label[0] == str(search_label):
                max_i = max(max_i, int(label[-1]))
    return max_i

Helper function to populate type

In [18]:
# TODO: Convert to df apply (rather than series apply on idx)
# TODO: Figure out if 64 char limit will be an issue

def populate_type(idx, ept_id, lookup_df=textual_paraphrases, manual = None, overwrite = True):
    """Given a paraphrase pair (idx) and an EPT paraphrase type (ept_id), convert the EPT type to Paraop, look up the 
    scopes for both sentences in the pair, and fill in the scopes with the Paraop type. Returns a pair of arrays with
    the newly annotated scopes.
    
    The 'manual' argument controls whether we manually specify what the Paraop ID will be, or whether we automatically 
    get the Paraop ID from id_map.
    """
    
    paraop_id = manual if manual else ept_to_paraop(ept_id) 
    is_subs = paraop_id not in [0,1,2,8]

    # Copy array to avoid messing up the originals
    array1 = np.copy(positives['sentence1_scope'][idx])
    array2 = np.copy(positives['sentence2_scope'][idx])
    
    # Create a subset of the lookup array containing only the paraphrase types
    # we are interested in (ept_id)
    subset = lookup_df[(lookup_df['pair_id'] == idx+1) & (lookup_df['type_id'] == int(ept_id))]
    subset.reset_index(drop=True, inplace=True)
    instances = len(subset['type_id'].values) # Count how many discrete instances of that type are there in this pair

    
    offset = (get_max_i(array1, paraop_id) + 1) if is_subs else 0


    def fill(sentence_n, instance, array, scope):
        """Helper function for filling in ids"""

        # Do nothing if scope is None
        if scope.tolist() is None:
            return
        
        # Identify which indices in the array have not been filled yet
        empty = np.where(array == '')[0]
        nonempty = np.where(array != '')[0]

        # Fill in empty entries
        if len(scope) > 0:
            empty_intersect = np.intersect1d(scope, empty)
            array[empty_intersect] = f'{paraop_id}_{instance+offset}'

        if overwrite:    
            # Append to non-empty entries
            nonempty_intersect = np.intersect1d(scope, nonempty)
            if len(nonempty_intersect) > 0:
                # TODO: Log this in a better way (save to a file instead of just printing)
                print(f'Double check type overwriting: row {idx}, sentence {sentence_n}')
                print(f'Common indices: {list(nonempty_intersect)}', end=' | ')
                print(f'Pre-existing types: {array[nonempty_intersect]}')
                array[nonempty_intersect] = np.char.add(array[nonempty_intersect], f' & {paraop_id}_{instance+offset}')

    # Filling in
    for i in range(instances):
        # Get scopes from lookup df
        s1_scope = np.array(subset.loc[subset['type_id'] == ept_id, 's1_scope'].iloc[i])
        s2_scope = np.array(subset.loc[subset['type_id'] == ept_id, 's2_scope'].iloc[i])
        
        fill(1, i, array1, s1_scope)
        fill(2, i, array2, s2_scope)
    
    return array1, array2

Here's a demo of how the outputs to that function look like:

In [19]:
populate_type(0, 26)

(array(['3_0', '3_0', '3_0', '3_0', '', '', '', '', '', '', '', '', '', '',
        '', '', '', '', ''], dtype='<U64'),
 array(['', '', '', '', '', '', '', '', '', '', '3_0', '3_0', '3_0', '3_0',
        '', '', '', '', '', ''], dtype='<U64'))

`populate_type` returns new arrays, it doesn't modify the original df. Use the function below to actually modify the df

In [20]:
def substitute(ept_id, lookup_df=textual_paraphrases, manual = None, overwrite = True):
    series = positives['idx'].apply(populate_type, ept_id=ept_id, lookup_df=lookup_df, manual = manual, overwrite = overwrite)
    cols = pd.DataFrame(series.tolist(), columns=['sentence1', 'sentence2'])
    positives.loc[:, 'sentence1_scope'] = cols['sentence1'].values
    positives.loc[:, 'sentence2_scope'] = cols['sentence2'].values

## Performing the reannotation

Helper functions:

In [21]:
def print_sents(idx: int):
    """Prints both sentences in a sentence pair, given the pair's id"""
    idx -= 1
    print(positives.loc[positives['idx'] == idx, 'sentence1'].iloc[0])
    print(positives.loc[positives['idx'] == idx, 'sentence2'].iloc[0])

In [22]:
def duplicate_df(df: pd.DataFrame):
    """Returns a deep copy of a dataframe"""
    return pd.DataFrame(columns = df.columns, data = copy.deepcopy(df.values))

In [23]:
def split_add_sub(df: pd.DataFrame):
    """Splits a dataframe into two dataframes: one containing types to be annotated as Addition/Deletion, and another
    containing types to be annotated as Substitution."""
    add_del = df[(df['s1_scope'].isnull()) | (df['s2_scope'].isnull())]
    subs = df[~((df['s1_scope'].isnull()) | (df['s2_scope'].isnull()))]
    return add_del, subs

### Change of order

In [24]:
substitute(26)

Double check type overwriting: row 196, sentence 1
Common indices: [2] | Pre-existing types: ['3_0']
Double check type overwriting: row 196, sentence 2
Common indices: [9] | Pre-existing types: ['3_0']
Double check type overwriting: row 411, sentence 1
Common indices: [0, 1, 2, 3, 4] | Pre-existing types: ['3_0' '3_0' '3_0' '3_0' '3_0']
Double check type overwriting: row 411, sentence 2
Common indices: [12, 13, 14, 15] | Pre-existing types: ['3_0' '3_0' '3_0' '3_0']
Double check type overwriting: row 1014, sentence 1
Common indices: [5] | Pre-existing types: ['3_0']
Double check type overwriting: row 1014, sentence 2
Common indices: [10, 11] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 1543, sentence 1
Common indices: [19] | Pre-existing types: ['3_0']
Double check type overwriting: row 1543, sentence 2
Common indices: [5] | Pre-existing types: ['3_0']
Double check type overwriting: row 1864, sentence 1
Common indices: [5, 6] | Pre-existing types: ['3_0' '3_0'

### Same Polarity Substitution (Habitual)

In [25]:
substitute(5)

Double check type overwriting: row 75, sentence 1
Common indices: [5, 6] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 75, sentence 2
Common indices: [0] | Pre-existing types: ['3_0']
Double check type overwriting: row 152, sentence 1
Common indices: [13, 14] | Pre-existing types: ['4_0' '4_0']
Double check type overwriting: row 152, sentence 2
Common indices: [10] | Pre-existing types: ['4_0']
Double check type overwriting: row 172, sentence 1
Common indices: [22] | Pre-existing types: ['3_0']
Double check type overwriting: row 172, sentence 2
Common indices: [17] | Pre-existing types: ['3_0']
Double check type overwriting: row 226, sentence 1
Common indices: [0, 1, 2, 3] | Pre-existing types: ['3_0' '3_0' '3_0' '3_0']
Double check type overwriting: row 226, sentence 2
Common indices: [2] | Pre-existing types: ['3_0']
Double check type overwriting: row 310, sentence 1
Common indices: [9] | Pre-existing types: ['3_0']
Double check type overwriting: row 310, sen

### Same Polarity Substitution (Contextual)

#### Fixing Contextual Synonyms

In [26]:
contextual = textual_paraphrases[textual_paraphrases['type_id'] == 6]

In [27]:
pronouns = ['I', 'you', 'he', 'she', 'it', 'we', 'you', 'they',
            'me', 'you', 'him', 'her', 'it', 'us', 'you', 'them',
            'mine', 'yours', 'his', 'hers', 'its', 'ours', 'yours', 'theirs']

In [28]:
contextual['s1_text'] = contextual['s1_text'].str.lower()
contextual['s2_text'] = contextual['s2_text'].str.lower()
contextual.drop(columns=['key_s1', 'key_s2', 'k1_text', 'k2_text'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual['s1_text'] = contextual['s1_text'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual['s2_text'] = contextual['s2_text'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(columns=['key_s1', 'key_s2', 'k1_text', 'k2_text'], inplace=True)


In [29]:
# List of function words
fwords = ['a', 'an', 'the',                                                     # Articles
          'and', 'but', 'for', 'or',                                            # Conjunctions
          'that', 'this', 'those', 'these',                                     # Demonstratives
          'at', 'by', 'from', 'in', 'into', 'of', 'on', 'out', 'to', 'with']    # Prepositions

##### Things we know for sure are straight synonyms

In [30]:
fword_subs = contextual[(contextual['s1_text'].isin(fwords)) & (contextual['s2_text'].isin(fwords))]
fword_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
167,52,6,Same Polarity Substitution (contextual),yes,[10],[8],or,and
181,54,6,Same Polarity Substitution (contextual),yes,[8],[5],at,to
528,143,6,Same Polarity Substitution (contextual),yes,[10],[9],and,but
574,156,6,Same Polarity Substitution (contextual),yes,[4],[5],of,from
724,191,6,Same Polarity Substitution (contextual),yes,[8],[7],at,to
...,...,...,...,...,...,...,...,...
21731,5705,6,Same Polarity Substitution (contextual),yes,[9],[9],to,in
21736,5709,6,Same Polarity Substitution (contextual),yes,[10],[9],at,in
21785,5721,6,Same Polarity Substitution (contextual),yes,[22],[22],of,from
21914,5759,6,Same Polarity Substitution (contextual),yes,[6],[6],on,in


In [31]:
contextual.drop(fword_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(fword_subs.index.tolist(), inplace=True)


In [32]:
dicendi = ['say', 'says', 'said', 'told', 'tell', 'tells', 'speak', 'spoke']
dicendi_subs = contextual[(contextual['s1_text'].isin(dicendi)) | (contextual['s2_text'].isin(dicendi))]
dicendi_subs[:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
35,14,6,Same Polarity Substitution (contextual),yes,[2],[2],added,said
47,20,6,Same Polarity Substitution (contextual),yes,[11],[14],hoped,said
591,159,6,Same Polarity Substitution (contextual),yes,[17],[31],added,said
791,205,6,Same Polarity Substitution (contextual),yes,[23],[21],finds,says
938,244,6,Same Polarity Substitution (contextual),yes,[7],[7],said,conceded
1190,305,6,Same Polarity Substitution (contextual),yes,[1],[13],said,wrote
1670,437,6,Same Polarity Substitution (contextual),yes,"[7, 8]",[7],rule out,say
2221,588,6,Same Polarity Substitution (contextual),yes,"[13, 14]",[2],according to,said
2237,592,6,Same Polarity Substitution (contextual),yes,[2],[2],says,insists
2416,631,6,Same Polarity Substitution (contextual),yes,[6],[1],said,conceded


In [33]:
contextual.drop(dicendi_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(dicendi_subs.index.tolist(), inplace=True)


In [34]:
if_subs = contextual[(contextual['s1_text'].isin(['if', 'should', 'whether'])) | (contextual['s2_text'].isin(['if', 'should', 'whether']))]
if_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
734,195,6,Same Polarity Substitution (contextual),yes,[19],[14],should,if
6480,1759,6,Same Polarity Substitution (contextual),yes,[10],[10],if,should
8670,2333,6,Same Polarity Substitution (contextual),yes,[19],[19],though,if
11167,2985,6,Same Polarity Substitution (contextual),yes,"[0, 1]",[0],in case,if
13763,3650,6,Same Polarity Substitution (contextual),yes,[12],[15],should,if
18059,4737,6,Same Polarity Substitution (contextual),yes,[15],[11],if,should


In [35]:
contextual.drop(if_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(if_subs.index.tolist(), inplace=True)


In [36]:
about_subs = contextual[(contextual['s1_text'].isin(['about', 'some'])) | (contextual['s2_text'].isin(['about', 'some']))]
about_subs.drop([6875, 9206, 3191, 9803], inplace=True) # Dropping indices that don't correspond to synonym sub
about_subs

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  about_subs.drop([6875, 9206, 3191, 9803], inplace=True) # Dropping indices that don't correspond to synonym sub


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
854,220,6,Same Polarity Substitution (contextual),yes,[21],[23],about,some
1175,300,6,Same Polarity Substitution (contextual),yes,[14],[15],towards,about
1893,497,6,Same Polarity Substitution (contextual),yes,[4],[4],about,of
2082,550,6,Same Polarity Substitution (contextual),yes,[12],[5],about,over
2553,665,6,Same Polarity Substitution (contextual),yes,[21],[22],about,of
3753,1009,6,Same Polarity Substitution (contextual),yes,[20],[21],to,about
4668,1262,6,Same Polarity Substitution (contextual),yes,[2],[2],about,over
5923,1605,6,Same Polarity Substitution (contextual),yes,[8],"[10, 11]",about,a mere
6454,1755,6,Same Polarity Substitution (contextual),yes,[0],"[0, 1]",about,an estimated
7316,1979,6,Same Polarity Substitution (contextual),yes,[3],[3],nearly,about


In [37]:
contextual.drop(about_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(about_subs.index.tolist(), inplace=True)


In [38]:
print_sents(5)

The stock rose $2.11, or about 11 percent, to close Friday at $21.51 on the New York Stock Exchange.
PG&E Corp. shares jumped $1.63 or 8 percent to $21.03 on the New York Stock Exchange on Friday.


In [39]:
print_sents(2611)

Committee approval, expected today, would set the stage for debate on the Senate floor beginning Monday.
That would clear the way for debate in the full Senate beginning on Monday.


In [40]:
which_subs = contextual[(contextual['s1_text'].isin(['which', 'that'])) | (contextual['s2_text'].isin(['which', 'that']))]
which_subs.drop([4786, 1542, 1774, 2294, 4254, 9702, 10801, 11694, 13285, 18098, 20424, 20534, 21992, 14576, 15918], inplace=True)
which_subs

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  which_subs.drop([4786, 1542, 1774, 2294, 4254, 9702, 10801, 11694, 13285, 18098, 20424, 20534, 21992, 14576, 15918], inplace=True)


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
72,27,6,Same Polarity Substitution (contextual),yes,[16],[18],that,which
3343,890,6,Same Polarity Substitution (contextual),yes,[2],[0],it,that
4852,1313,6,Same Polarity Substitution (contextual),yes,[14],[13],which,that
5667,1525,6,Same Polarity Substitution (contextual),yes,[2],[7],who,that
5739,1553,6,Same Polarity Substitution (contextual),yes,[17],[20],which,that
5900,1599,6,Same Polarity Substitution (contextual),yes,[5],[2],that,who
7379,1992,6,Same Polarity Substitution (contextual),yes,[12],[11],that,it
7446,2011,6,Same Polarity Substitution (contextual),yes,[17],[15],which,that
8726,2340,6,Same Polarity Substitution (contextual),yes,[12],[18],which,that
9280,2497,6,Same Polarity Substitution (contextual),yes,[17],[19],that,which


In [41]:
contextual.drop(which_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(which_subs.index.tolist(), inplace=True)


In [42]:
to_be_subs = contextual[(contextual['s1_text'].isin(['is', 'are', 'will'])) | (contextual['s2_text'].isin(['is', 'are', 'will']))]
to_be_subs.drop([10314], inplace=True)
to_be_subs

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_be_subs.drop([10314], inplace=True)


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
7993,2157,6,Same Polarity Substitution (contextual),yes,[7],[1],mark,is
8470,2284,6,Same Polarity Substitution (contextual),yes,"[2, 3, 4]",[4],is slated to,will
9682,2608,6,Same Polarity Substitution (contextual),yes,[23],[21],get,are
11002,2946,6,Same Polarity Substitution (contextual),yes,[2],"[2, 3, 4]",will,is going to
11169,2985,6,Same Polarity Substitution (contextual),yes,[17],[16],is,was
11763,3122,6,Same Polarity Substitution (contextual),yes,[7],[6],are,include
11785,3128,6,Same Polarity Substitution (contextual),yes,[4],[1],became,is
11908,3160,6,Same Polarity Substitution (contextual),yes,[4],[4],approached,is
12952,3433,6,Same Polarity Substitution (contextual),yes,[1],[1],placed,is
13213,3498,6,Same Polarity Substitution (contextual),yes,[2],"[2, 3]",will,is expected


In [43]:
contextual.drop(to_be_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(to_be_subs.index.tolist(), inplace=True)


In [44]:
country_state_subs = contextual[(contextual['s1_text'].isin(['usa', 'us', 'nation', 'country', 'state'])) | (contextual['s2_text'].isin(['usa', 'us', 'nation', 'country', 'state']))]
country_state_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
320,89,6,Same Polarity Substitution (contextual),yes,[7],[11],nation,us
1578,413,6,Same Polarity Substitution (contextual),yes,[11],[11],nation,u.s.
1631,428,6,Same Polarity Substitution (contextual),yes,[4],"[6, 7]",state,west virginia
5115,1382,6,Same Polarity Substitution (contextual),yes,[10],[14],us,american
6232,1700,6,Same Polarity Substitution (contextual),yes,[3],[3],us,american
12183,3228,6,Same Polarity Substitution (contextual),yes,[7],[7],us,american
13475,3570,6,Same Polarity Substitution (contextual),yes,[12],[14],nations,state
14446,3826,6,Same Polarity Substitution (contextual),yes,[8],[1],state,texas
14642,3863,6,Same Polarity Substitution (contextual),yes,[0],[1],state,statewide
15636,4121,6,Same Polarity Substitution (contextual),yes,[14],[11],u.s.,nation


In [45]:
print_sents(2625)

Wild rock legend Ozzy Osbourne was in intensive care today as he continued his “steady” recovery from a quad bike accident.
Wild rock legend Ozzy Osbourne could be kept in intensive care for “several days” following his quad bike accident, his doctor said tonight.


In [46]:
before_after_subs = contextual[(contextual['s1_text'].isin(['before', 'after', 'following', 'prior to', 'since'])) | (contextual['s2_text'].isin(['before', 'after', 'following', 'prior to', 'since']))]
before_after_subs.drop([2368, 5414, 5812, 9744, 10370], inplace=True)
before_after_subs

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  before_after_subs.drop([2368, 5414, 5812, 9744, 10370], inplace=True)


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
536,145,6,Same Polarity Substitution (contextual),yes,[0],[0],while,after
2929,774,6,Same Polarity Substitution (contextual),yes,[12],[12],after,when
3201,855,6,Same Polarity Substitution (contextual),yes,[11],[7],as,after
4006,1074,6,Same Polarity Substitution (contextual),yes,[18],[14],after,following
4263,1149,6,Same Polarity Substitution (contextual),yes,[9],[8],following,after
4748,1284,6,Same Polarity Substitution (contextual),yes,"[8, 9, 10, 11]",[8],around the time of,after
4944,1334,6,Same Polarity Substitution (contextual),yes,[18],[18],in,following
5017,1350,6,Same Polarity Substitution (contextual),yes,[9],"[9, 10]",before,in advance
5651,1519,6,Same Polarity Substitution (contextual),yes,[17],[14],since,before
5797,1578,6,Same Polarity Substitution (contextual),yes,[8],[7],when,after


In [47]:
contextual.drop(before_after_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(before_after_subs.index.tolist(), inplace=True)


In [48]:
with_subs = contextual[(contextual['s1_text'].isin(['with'])) | (contextual['s2_text'].isin(['with']))]
with_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
2025,534,6,Same Polarity Substitution (contextual),yes,[3],[3],with,between
2212,587,6,Same Polarity Substitution (contextual),yes,[8],[4],carrying,with
3262,867,6,Same Polarity Substitution (contextual),yes,[6],"[5, 6, 7, 8]",with,serving as chair of
3545,950,6,Same Polarity Substitution (contextual),yes,[8],[14],with,running
5460,1479,6,Same Polarity Substitution (contextual),yes,[14],"[18, 19]",with,thanks to
10873,2915,6,Same Polarity Substitution (contextual),yes,"[18, 19, 20, 21]",[13],to the likes of,with
13624,3612,6,Same Polarity Substitution (contextual),yes,[18],[14],with,plus
19131,5022,6,Same Polarity Substitution (contextual),yes,"[14, 15]",[12],that have,with
20084,5270,6,Same Polarity Substitution (contextual),yes,[19],[16],regarding,with


In [49]:
as_subs = contextual[(contextual['s1_text'].isin(['as', 'like', 'such as', 'including'])) | (contextual['s2_text'].isin(['as', 'like', 'such as', 'including']))]
as_subs.drop(1104, inplace=True)
as_subs

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  as_subs.drop(1104, inplace=True)


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
1326,337,6,Same Polarity Substitution (contextual),yes,[16],[16],like,as
2368,622,6,Same Polarity Substitution (contextual),yes,[7],[9],after,as
5977,1621,6,Same Polarity Substitution (contextual),yes,[17],[19],including,even
6312,1720,6,Same Polarity Substitution (contextual),yes,"[5, 6]",[5],such as,including
7577,2052,6,Same Polarity Substitution (contextual),yes,[7],[4],when,as
7705,2085,6,Same Polarity Substitution (contextual),yes,[13],[9],as,of
10578,2849,6,Same Polarity Substitution (contextual),yes,"[15, 16]",[14],such as,like
10958,2938,6,Same Polarity Substitution (contextual),yes,[8],"[7, 8]",like,such as
11153,2981,6,Same Polarity Substitution (contextual),yes,[6],"[6, 7]",like,such as
11454,3051,6,Same Polarity Substitution (contextual),yes,[15],[16],even,including


In [50]:
contextual.drop(as_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(as_subs.index.tolist(), inplace=True)


In [51]:
# Resume from here
each_every_subs = contextual[(contextual['s1_text'].isin(['each', 'every', 'all', 'any', 'some'])) | (contextual['s2_text'].isin(['each', 'every', 'all', 'any', 'some']))]
each_every_subs.drop([11358, 21534], inplace=True)
each_every_subs

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  each_every_subs.drop([11358, 21534], inplace=True)


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
1929,511,6,Same Polarity Substitution (contextual),yes,[9],[11],its,all
3191,853,6,Same Polarity Substitution (contextual),yes,"[0, 1, 2, 3]",[0],some of the computers,some
3724,1001,6,Same Polarity Substitution (contextual),yes,[12],[15],each,every
6848,1858,6,Same Polarity Substitution (contextual),yes,[1],[1],any,a
6875,1868,6,Same Polarity Substitution (contextual),yes,"[2, 3, 4, 5, 6, 7, 8]",[2],resistance forces hostile to the u.s. presence,some
9206,2472,6,Same Polarity Substitution (contextual),yes,[0],[0],many,some
20986,5495,6,Same Polarity Substitution (contextual),yes,[3],[3],every,any
21214,5560,6,Same Polarity Substitution (contextual),yes,"[7, 8, 9, 10, 11]",[14],any of the sick children,any


In [52]:
contextual.drop(each_every_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(each_every_subs.index.tolist(), inplace=True)


In [53]:
have_subs = contextual[(contextual['s1_text'].isin(['have', 'had', 'was', 'were'])) | (contextual['s2_text'].isin(['have', 'had', 'was', 'were']))]
have_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
205,60,6,Same Polarity Substitution (contextual),yes,[6],[6],approached,was
298,83,6,Same Polarity Substitution (contextual),yes,[7],[6],was,occurred
1079,277,6,Same Polarity Substitution (contextual),yes,[10],"[15, 16]",was,looked like
1391,358,6,Same Polarity Substitution (contextual),yes,[9],"[7, 8, 9, 10]",were,are scheduled to be
2439,635,6,Same Polarity Substitution (contextual),yes,"[3, 4, 5]",[8],came in at,was
2738,726,6,Same Polarity Substitution (contextual),yes,[3],[3],closed,were
2789,740,6,Same Polarity Substitution (contextual),yes,[8],[3],had,eat
3452,920,6,Same Polarity Substitution (contextual),yes,"[11, 12]",[11],suffering from,had
3632,975,6,Same Polarity Substitution (contextual),yes,[5],[5],remained,was
3731,1003,6,Same Polarity Substitution (contextual),yes,[14],[12],possessed,had


In [54]:
contextual.drop(have_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(have_subs.index.tolist(), inplace=True)


In [55]:
a_an_subs = contextual[(contextual['s1_text'].isin(['a', 'an'])) | (contextual['s2_text'].isin(['a', 'an']))]
a_an_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
1436,377,6,Same Polarity Substitution (contextual),yes,[3],[0],a,another
2494,650,6,Same Polarity Substitution (contextual),yes,[6],[8],a,its
2959,784,6,Same Polarity Substitution (contextual),yes,[8],[8],a,per
3350,893,6,Same Polarity Substitution (contextual),yes,[21],[14],a,per
4081,1091,6,Same Polarity Substitution (contextual),yes,[9],[9],a,one-quarter
4561,1235,6,Same Polarity Substitution (contextual),yes,[13],[15],another,an
4598,1243,6,Same Polarity Substitution (contextual),yes,[22],[25],per,a
4618,1248,6,Same Polarity Substitution (contextual),yes,[3],[4],their,a
4979,1343,6,Same Polarity Substitution (contextual),yes,[7],[8],one,a
5007,1348,6,Same Polarity Substitution (contextual),yes,[11],[13],its,an


In [56]:
contextual.drop(a_an_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(a_an_subs.index.tolist(), inplace=True)


In [57]:
for_subs = contextual[(contextual['s1_text'].isin(['for'])) | (contextual['s2_text'].isin(['for']))]
for_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
1420,369,6,Same Polarity Substitution (contextual),yes,[12],"[8, 9]",for,set by
3691,989,6,Same Polarity Substitution (contextual),yes,[20],[19],representing,for
4452,1206,6,Same Polarity Substitution (contextual),yes,[12],[13],seeking,for
5148,1396,6,Same Polarity Substitution (contextual),yes,"[13, 14, 15, 16, 17]",[12],to take a seat in,for
6921,1877,6,Same Polarity Substitution (contextual),yes,[17],[19],for,seeking
11612,3089,6,Same Polarity Substitution (contextual),yes,"[16, 17, 18]",[15],in connection with,for
13080,3465,6,Same Polarity Substitution (contextual),yes,[23],[24],over,for
13577,3599,6,Same Polarity Substitution (contextual),yes,"[14, 15]",[12],because of,for
15204,4018,6,Same Polarity Substitution (contextual),yes,[6],[7],for,behind
15356,4057,6,Same Polarity Substitution (contextual),yes,[10],"[7, 8]",for,due to


In [58]:
contextual.drop(for_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(for_subs.index.tolist(), inplace=True)


In [59]:
over_under_subs = contextual[(contextual['s1_text'].isin(['over', 'under'])) | (contextual['s2_text'].isin(['over', 'under']))]
over_under_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
3891,1045,6,Same Polarity Substitution (contextual),yes,[7],[8],over,in
3900,1046,6,Same Polarity Substitution (contextual),yes,[11],"[8, 9]",over,more than
3969,1061,6,Same Polarity Substitution (contextual),yes,[5],[5],over,in
4778,1292,6,Same Polarity Substitution (contextual),yes,"[2, 3]",[8],more than,over
6724,1827,6,Same Polarity Substitution (contextual),yes,"[12, 13]",[9],based on,under
7154,1933,6,Same Polarity Substitution (contextual),yes,[3],[6],over,above
7279,1965,6,Same Polarity Substitution (contextual),yes,[15],"[13, 16, 17]",under,if is implemented
10498,2828,6,Same Polarity Substitution (contextual),yes,[11],[9],over,in
10832,2902,6,Same Polarity Substitution (contextual),yes,[9],[11],over,during
12841,3405,6,Same Polarity Substitution (contextual),yes,[7],[7],under,by


In [60]:
contextual.drop(over_under_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(over_under_subs.index.tolist(), inplace=True)


In [61]:
from_subs = contextual[(contextual['s1_text'].isin(['from', 'by'])) | (contextual['s2_text'].isin(['from', 'by']))]
from_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
1251,318,6,Same Polarity Substitution (contextual),yes,[7],"[8, 9]",from,due to
7648,2068,6,Same Polarity Substitution (contextual),yes,"[14, 15]",[14],compared with,from
8191,2207,6,Same Polarity Substitution (contextual),yes,"[10, 11, 12]",[13],as early as,by
9120,2447,6,Same Polarity Substitution (contextual),yes,[7],[10],from,through
11756,3121,6,Same Polarity Substitution (contextual),yes,[20],[14],when,by
12679,3363,6,Same Polarity Substitution (contextual),yes,"[10, 11, 12]",[8],as much as,by
15800,4157,6,Same Polarity Substitution (contextual),yes,"[8, 9]",[8],starting at,from
20340,5331,6,Same Polarity Substitution (contextual),yes,[6],"[8, 9]",from,generated by
20759,5433,6,Same Polarity Substitution (contextual),yes,[17],[12],by,through


In [62]:
contextual.drop(from_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(from_subs.index.tolist(), inplace=True)


In [63]:
in_on_subs = contextual[(contextual['s1_text'].isin(['in', 'on', 'at', 'during'])) | (contextual['s2_text'].isin(['in', 'on', 'at', 'during']))]
in_on_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
1306,333,6,Same Polarity Substitution (contextual),yes,[5],[0],on,throughout
2947,782,6,Same Polarity Substitution (contextual),yes,[12],[15],involving,in
3543,950,6,Same Polarity Substitution (contextual),yes,[3],[10],on,against
3898,1046,6,Same Polarity Substitution (contextual),yes,[15],[13],in,during
4114,1109,6,Same Polarity Substitution (contextual),yes,[4],[4],in,inside
4703,1272,6,Same Polarity Substitution (contextual),yes,[4],[4],inside,in
5279,1427,6,Same Polarity Substitution (contextual),yes,[4],[4],in,across
6764,1834,6,Same Polarity Substitution (contextual),yes,[21],[11],on,during
7172,1938,6,Same Polarity Substitution (contextual),yes,[12],[8],in,through
7569,2051,6,Same Polarity Substitution (contextual),yes,[9],[6],in,inside


In [64]:
contextual.drop(in_on_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(in_on_subs.index.tolist(), inplace=True)


In [65]:
when_subs = contextual[(contextual['s1_text'].isin(['when'])) | (contextual['s2_text'].isin(['when']))]
when_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
16526,4363,6,Same Polarity Substitution (contextual),yes,[12],[13],where,when


In [66]:
contextual.drop(when_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(when_subs.index.tolist(), inplace=True)


In [67]:
get_subs = contextual[(contextual['s1_text'].isin(['get', 'got', 'receive'])) | (contextual['s2_text'].isin(['get', 'got', 'receive']))]
get_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
1487,391,6,Same Polarity Substitution (contextual),yes,[15],[8],win,get
1840,481,6,Same Polarity Substitution (contextual),yes,[2],[2],receive,pocket
3723,1001,6,Same Polarity Substitution (contextual),yes,[7],[10],get,receive
3759,1012,6,Same Polarity Substitution (contextual),yes,[5],[2],receive,get
4280,1154,6,Same Polarity Substitution (contextual),yes,[16],[16],get,see
4968,1339,6,Same Polarity Substitution (contextual),yes,[4],"[4, 7]",get,get out
6943,1880,6,Same Polarity Substitution (contextual),yes,[7],[11],receive,get
7530,2040,6,Same Polarity Substitution (contextual),yes,[2],[7],get,earn
11644,3094,6,Same Polarity Substitution (contextual),yes,[4],[4],get,collect
12594,3343,6,Same Polarity Substitution (contextual),yes,[3],[6],get,received


In [68]:
contextual.drop(get_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(get_subs.index.tolist(), inplace=True)


In [69]:
dfs = [fword_subs, dicendi_subs,  if_subs, about_subs, which_subs, to_be_subs, before_after_subs, as_subs,
    each_every_subs, have_subs, a_an_subs, for_subs, over_under_subs, from_subs, in_on_subs, when_subs, get_subs]

In [70]:
merged_straight_syn = pd.concat(dfs)

In [71]:
merged_straight_syn

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
167,52,6,Same Polarity Substitution (contextual),yes,[10],[8],or,and
181,54,6,Same Polarity Substitution (contextual),yes,[8],[5],at,to
528,143,6,Same Polarity Substitution (contextual),yes,[10],[9],and,but
574,156,6,Same Polarity Substitution (contextual),yes,[4],[5],of,from
724,191,6,Same Polarity Substitution (contextual),yes,[8],[7],at,to
...,...,...,...,...,...,...,...,...
11644,3094,6,Same Polarity Substitution (contextual),yes,[4],[4],get,collect
12594,3343,6,Same Polarity Substitution (contextual),yes,[3],[6],get,received
15594,4107,6,Same Polarity Substitution (contextual),yes,[2],[7],get,earn
17777,4667,6,Same Polarity Substitution (contextual),yes,[3],[2],has,receive


In [72]:
substitute(6, merged_straight_syn, 4)

Double check type overwriting: row 276, sentence 1
Common indices: [10] | Pre-existing types: ['3_0']
Double check type overwriting: row 276, sentence 2
Common indices: [15, 16] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 332, sentence 1
Common indices: [5] | Pre-existing types: ['3_0']
Double check type overwriting: row 332, sentence 2
Common indices: [0] | Pre-existing types: ['3_0']
Double check type overwriting: row 1067, sentence 1
Common indices: [6] | Pre-existing types: ['3_0']
Double check type overwriting: row 1067, sentence 2
Common indices: [8] | Pre-existing types: ['3_0']
Double check type overwriting: row 1888, sentence 1
Common indices: [9] | Pre-existing types: ['3_0']
Double check type overwriting: row 1888, sentence 2
Common indices: [9] | Pre-existing types: ['3_0']
Double check type overwriting: row 1937, sentence 1
Common indices: [12] | Pre-existing types: ['3_0']
Double check type overwriting: row 1937, sentence 2
Common indices: [8] |

##### Things we know for sure are additions/deletions

In [73]:
def trim_duplicates(s1_scope, s2_scope, s1_text, s2_text):
    s1_newtext = s1_text.split()
    s2_newtext = s2_text.split()
    
    in1 = np.where(np.in1d(s1_newtext, s2_newtext))[0]
    in2 = np.where(np.in1d(s2_newtext, s1_newtext))[0]

    s1_newscope = np.delete(s1_scope, in1)
    s2_newscope = np.delete(s2_scope, in2)
    s1_newtext = ' '.join(np.delete(s1_newtext, in1))
    s2_newtext = ' '.join(np.delete(s2_newtext, in2))

    return s1_newscope, s2_newscope, s1_newtext, s2_newtext

In [74]:
contextual['s1_scope'], contextual['s2_scope'], contextual['s1_text'], contextual['s2_text'] = contextual.apply(lambda x: trim_duplicates(x.s1_scope, x.s2_scope, x.s1_text, x.s2_text), axis=1, result_type='expand').transpose().values

contextual['s1_text'] = contextual['s1_text'].apply(lambda x: None if x == '' else x)
contextual['s2_text'] = contextual['s2_text'].apply(lambda x: None if x == '' else x)
contextual['s1_scope'] = contextual['s1_scope'].apply(lambda x: None if list(x) == [] else x)
contextual['s2_scope'] = contextual['s2_scope'].apply(lambda x: None if list(x) == [] else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual['s1_scope'], contextual['s2_scope'], contextual['s1_text'], contextual['s2_text'] = contextual.apply(lambda x: trim_duplicates(x.s1_scope, x.s2_scope, x.s1_text, x.s2_text), axis=1, result_type='expand').transpose().values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual['s1_text'] = contextual['s1_text'].apply(lambda x: None if x == '' else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc

In [75]:
print_sents(15)

He told The Sun newspaper that Mr. Hussein's daughters had British schools and hospitals in mind when they decided to ask for asylum.
"Saddam's daughters had British schools and hospitals in mind when they decided to ask for asylum -- especially the schools," he told The Sun.


In [76]:
contextual_none = contextual[(contextual['s1_scope'].isnull()) | (contextual['s2_scope'].isnull())]
contextual_none

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
12,5,6,Same Polarity Substitution (contextual),yes,"[12, 14]",,close at,
41,15,6,Same Polarity Substitution (contextual),yes,[4],,newspaper,
52,22,6,Same Polarity Substitution (contextual),yes,,"[4, 5]",,navigation tool
124,41,6,Same Polarity Substitution (contextual),yes,,[20],,with
295,83,6,Same Polarity Substitution (contextual),yes,[16],,school,
...,...,...,...,...,...,...,...,...
21930,5766,6,Same Polarity Substitution (contextual),yes,[19],,michelle,
21965,5776,6,Same Polarity Substitution (contextual),yes,,"[26, 27]",,of allegiance
21993,5785,6,Same Polarity Substitution (contextual),yes,[1],,kept,
22034,5793,6,Same Polarity Substitution (contextual),yes,[17],,back,


In [77]:
contextual.drop(contextual_none.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(contextual_none.index.tolist(), inplace=True)


In [78]:
substitute(6, contextual_none, 2)

Double check type overwriting: row 651, sentence 2
Common indices: [2, 3, 4] | Pre-existing types: ['3_0' '3_0' '3_0']
Double check type overwriting: row 1067, sentence 2
Common indices: [15, 16] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 1142, sentence 2
Common indices: [0, 1] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 1142, sentence 1
Common indices: [7, 8] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 1161, sentence 1
Common indices: [21] | Pre-existing types: ['3_0']
Double check type overwriting: row 1182, sentence 1
Common indices: [22] | Pre-existing types: ['3_0']
Double check type overwriting: row 1412, sentence 1
Common indices: [9] | Pre-existing types: ['3_0']
Double check type overwriting: row 1540, sentence 1
Common indices: [14] | Pre-existing types: ['3_0']
Double check type overwriting: row 2057, sentence 2
Common indices: [24, 25, 26, 27] | Pre-existing types: ['3_0' '3_0' '3_0' '3_0']
Do

##### Things we know for sure are contextual synonyms

In [79]:
alnum_space = lambda x: not all(i.isalpha() or i==' ' for i in x)
alnum_space('the sun')

False

In [80]:
anydigit = lambda x: any(i.isdigit() for i in x)

In [81]:
has_number = contextual[(contextual['s1_text'].apply(anydigit)) | contextual['s2_text'].apply(anydigit)]
has_number[:50]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
162,51,6,Same Polarity Substitution (contextual),yes,[8],"[12, 13, 14, 15]",million-plus,more than 1 million
499,136,6,Same Polarity Substitution (contextual),yes,"[5, 6, 7]",[4],the 35 year-old,heather
728,192,6,Same Polarity Substitution (contextual),yes,"[19, 20, 21]",[14],five years earlier,2002
1523,399,6,Same Polarity Substitution (contextual),yes,"[4, 5, 6]","[0, 1, 2]",in august 2000,at that time
1727,450,6,Same Polarity Substitution (contextual),yes,"[2, 3, 4]",[20],the standards body,w3c
2077,549,6,Same Polarity Substitution (contextual),yes,"[1, 2, 3]",[0],the 2002 study,it
2356,619,6,Same Polarity Substitution (contextual),yes,"[16, 17, 18, 19]","[14, 15]",more than 40 years,for decades
2719,720,6,Same Polarity Substitution (contextual),yes,[1],"[1, 2, 3, 4]",purchase,$ 22 million deal
2872,759,6,Same Polarity Substitution (contextual),yes,"[5, 6, 7, 8, 9]",[7],more than 10 per cent,sharply
2916,768,6,Same Polarity Substitution (contextual),yes,"[11, 12]","[16, 17, 18, 19]",much lower,only a 10 percent


In [82]:
contextual.drop(has_number.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(has_number.index.tolist(), inplace=True)


In [83]:
has_symbol = contextual[(contextual['s1_text'].apply(alnum_space)) | contextual['s2_text'].apply(alnum_space)]
has_symbol.drop([1860, 1930, 2194, 2516, 3228, 3532, 3668, 4047, 4276, 4445], inplace=True) # These are definitely synonym substitution
has_symbol.drop([3091], inplace=True) # These are not context synonym, but are also not straight syn
has_symbol

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  has_symbol.drop([1860, 1930, 2194, 2516, 3228, 3532, 3668, 4047, 4276, 4445], inplace=True) # These are definitely synonym substitution
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  has_symbol.drop([3091], inplace=True) # These are not context synonym, but are also not straight syn


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
5,3,6,Same Polarity Substitution (contextual),yes,[0],"[4, 5, 6, 7]",they,the ship 's owners
10,5,6,Same Polarity Substitution (contextual),yes,"[0, 1]","[0, 1, 2, 3, 4]",the stock,pg & e corp. shares
30,10,6,Same Polarity Substitution (contextual),yes,"[9, 10, 11]",[8],the company 's,our
159,51,6,Same Polarity Substitution (contextual),yes,[0],[0],wal-mart,it
240,72,6,Same Polarity Substitution (contextual),yes,"[21, 22]",[21],columbia 's,its
...,...,...,...,...,...,...,...,...
21817,5732,6,Same Polarity Substitution (contextual),yes,"[13, 14, 16, 17, 18]",[14],some of victims ' supporters,crowd
21829,5738,6,Same Polarity Substitution (contextual),yes,[0],"[0, 1, 2, 3, 4]",brendsel,the company 's chief executive
21949,5772,6,Same Polarity Substitution (contextual),yes,[19],[23],conventional,low-fat
21998,5786,6,Same Polarity Substitution (contextual),yes,"[6, 7, 8]","[6, 7, 8, 9, 10, 14, 15, 21]",all four states,"north carolina , virginia , maryland , pennsyl..."


In [84]:
contextual.drop(has_symbol.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(has_symbol.index.tolist(), inplace=True)


In [85]:
print_sents(419)

Money from Iraqi oil sales will go into that fund which will be controlled by the United States and Britain and used to rebuild the country.
Money from oil sales will now be deposited in a new Development Fund for Iraq, controlled by the United States and Britain and used to rebuild the country.


In [86]:
contextual

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
1,1,6,Same Polarity Substitution (contextual),yes,[7],[0],called,referring
6,3,6,Same Polarity Substitution (contextual),yes,[14],[18],cargo,explosives
34,14,6,Same Polarity Substitution (contextual),yes,[1],"[0, 1]",he,de sole
51,22,6,Same Polarity Substitution (contextual),yes,[1],[2],new,redesigned
62,23,6,Same Polarity Substitution (contextual),yes,[7],"[4, 5]",posted,in place
...,...,...,...,...,...,...,...,...
21956,5773,6,Same Polarity Substitution (contextual),yes,[8],[8],numbers,standards
21981,5782,6,Same Polarity Substitution (contextual),yes,"[14, 15, 16, 17, 18]",[18],that have been made against,surrounding
21992,5785,6,Same Polarity Substitution (contextual),yes,[0],[4],that,winikoff
22004,5787,6,Same Polarity Substitution (contextual),yes,"[21, 22]",[4],the votes,precincts


In [87]:
company_subs = contextual[(contextual['s1_text'].isin(['the company', 'company', 'firm', 'the firm'])) | (contextual['s2_text'].isin(['the company', 'company', 'firm', 'the firm']))]
company_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
661,174,6,Same Polarity Substitution (contextual),yes,"[7, 8]",[7],beverage maker,company
717,189,6,Same Polarity Substitution (contextual),yes,[9],"[14, 15]",cfsb,the company
1063,274,6,Same Polarity Substitution (contextual),yes,"[10, 11, 12, 13]","[10, 11]",oklahoma gas and electric,the company
1160,298,6,Same Polarity Substitution (contextual),yes,"[4, 5]",[0],the company,it
1316,336,6,Same Polarity Substitution (contextual),yes,[1],[1],company,service
1317,336,6,Same Polarity Substitution (contextual),yes,"[0, 1]",[12],the company,sprint
1374,355,6,Same Polarity Substitution (contextual),yes,[0],"[5, 6]",intel,the company
1511,394,6,Same Polarity Substitution (contextual),yes,"[17, 18]",[19],the company,it
1512,394,6,Same Polarity Substitution (contextual),yes,[20],"[21, 22]",microsoft,the company
2949,782,6,Same Polarity Substitution (contextual),yes,"[19, 20]","[22, 23]",westar business,the company


In [88]:
contextual.drop(company_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(company_subs.index.tolist(), inplace=True)


In [89]:
pronoun_subs = contextual[(contextual['s1_text'].isin(pronouns)) | (contextual['s2_text'].isin(pronouns))]
pronoun_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text
34,14,6,Same Polarity Substitution (contextual),yes,[1],"[0, 1]",he,de sole
65,24,6,Same Polarity Substitution (contextual),yes,[7],[0],him,zuccarini
84,29,6,Same Polarity Substitution (contextual),yes,[14],[16],he,tunick
243,72,6,Same Polarity Substitution (contextual),yes,[0],"[0, 1]",it,the document
320,89,6,Same Polarity Substitution (contextual),yes,[7],[11],nation,us
...,...,...,...,...,...,...,...,...
21782,5720,6,Same Polarity Substitution (contextual),yes,"[0, 1, 2]",[0],the new department,they
21839,5743,6,Same Polarity Substitution (contextual),yes,[9],[3],it,villagers
21864,5749,6,Same Polarity Substitution (contextual),yes,"[17, 18]",[20],mr comey,he
21936,5769,6,Same Polarity Substitution (contextual),yes,"[0, 1, 2, 3, 4, 5]",[3],the stunning art robbery on sunday,it


In [90]:
contextual.drop(pronoun_subs.index.tolist(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contextual.drop(pronoun_subs.index.tolist(), inplace=True)


In [91]:
merged_context_syn = pd.concat([has_number, has_symbol, company_subs, pronoun_subs])

In [92]:
substitute(6, merged_context_syn)

Double check type overwriting: row 124, sentence 1
Common indices: [16] | Pre-existing types: ['3_0']
Double check type overwriting: row 124, sentence 2
Common indices: [3] | Pre-existing types: ['3_0']
Double check type overwriting: row 191, sentence 1
Common indices: [19, 20, 21] | Pre-existing types: ['3_0' '3_0' '3_0']
Double check type overwriting: row 191, sentence 2
Common indices: [14] | Pre-existing types: ['3_0']
Double check type overwriting: row 335, sentence 1
Common indices: [1] | Pre-existing types: ['5_0']
Double check type overwriting: row 354, sentence 1
Common indices: [0] | Pre-existing types: ['3_0']
Double check type overwriting: row 354, sentence 2
Common indices: [5, 6] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 542, sentence 1
Common indices: [3, 4] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 542, sentence 2
Common indices: [8] | Pre-existing types: ['3_0']
Double check type overwriting: row 642, sentence 1

##### CSV Annotation for the rest

In [93]:
contextual.to_csv('remaining_contextual.csv')

After annotating on CSV file...

In [94]:
context_indices = pd.read_csv('contextual_only.csv')
straight_indices = pd.read_csv('straight_synonym_only.csv')

In [95]:
context_indices.set_index('idx', inplace=True)
straight_indices.set_index('idx', inplace=True)

In [96]:
remaining_context = contextual.loc[context_indices.index.to_list(),:]
remaining_straight = contextual.loc[straight_indices.index.to_list(),:]


In [97]:
substitute(6, remaining_context)

Double check type overwriting: row 1432, sentence 1
Common indices: [12] | Pre-existing types: ['3_0']
Double check type overwriting: row 1432, sentence 2
Common indices: [17, 18] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 1920, sentence 1
Common indices: [15] | Pre-existing types: ['3_0']
Double check type overwriting: row 1920, sentence 2
Common indices: [13] | Pre-existing types: ['3_0']
Double check type overwriting: row 2421, sentence 1
Common indices: [1] | Pre-existing types: ['2_0']
Double check type overwriting: row 2609, sentence 1
Common indices: [4] | Pre-existing types: ['3_0']
Double check type overwriting: row 2609, sentence 2
Common indices: [1] | Pre-existing types: ['3_0']
Double check type overwriting: row 2658, sentence 1
Common indices: [14, 20] | Pre-existing types: ['3_0 & 2_0' '2_1']
Double check type overwriting: row 3198, sentence 1
Common indices: [0, 1] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 3198, s

In [98]:
substitute(6, remaining_straight, 4)

Double check type overwriting: row 56, sentence 1
Common indices: [4] | Pre-existing types: ['3_0']
Double check type overwriting: row 56, sentence 2
Common indices: [0, 1] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 112, sentence 1
Common indices: [22] | Pre-existing types: ['3_1']
Double check type overwriting: row 112, sentence 2
Common indices: [7, 8] | Pre-existing types: ['3_1' '3_1']
Double check type overwriting: row 235, sentence 1
Common indices: [3] | Pre-existing types: ['3_0']
Double check type overwriting: row 235, sentence 2
Common indices: [8, 9, 10, 11] | Pre-existing types: ['3_0' '3_0' '3_0' '3_0']
Double check type overwriting: row 473, sentence 1
Common indices: [3] | Pre-existing types: ['3_0']
Double check type overwriting: row 473, sentence 2
Common indices: [1] | Pre-existing types: ['3_0']
Double check type overwriting: row 473, sentence 1
Common indices: [7, 8] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 4

### Derivational Changes

In [99]:
substitute(3)

Double check type overwriting: row 254, sentence 1
Common indices: [23] | Pre-existing types: ['4_0']
Double check type overwriting: row 254, sentence 2
Common indices: [21] | Pre-existing types: ['4_0']
Double check type overwriting: row 433, sentence 1
Common indices: [17] | Pre-existing types: ['4_0']
Double check type overwriting: row 433, sentence 2
Common indices: [17] | Pre-existing types: ['4_0']
Double check type overwriting: row 449, sentence 1
Common indices: [5] | Pre-existing types: ['4_1']
Double check type overwriting: row 449, sentence 2
Common indices: [22] | Pre-existing types: ['4_1']
Double check type overwriting: row 480, sentence 1
Common indices: [4] | Pre-existing types: ['3_0']
Double check type overwriting: row 480, sentence 2
Common indices: [16] | Pre-existing types: ['3_0']
Double check type overwriting: row 608, sentence 1
Common indices: [5] | Pre-existing types: ['4_2']
Double check type overwriting: row 608, sentence 2
Common indices: [5] | Pre-existing

Double check type overwriting: row 2226, sentence 1
Common indices: [13] | Pre-existing types: ['4_2']
Double check type overwriting: row 2226, sentence 2
Common indices: [19] | Pre-existing types: ['4_2']
Double check type overwriting: row 2346, sentence 1
Common indices: [2] | Pre-existing types: ['4_0']
Double check type overwriting: row 2346, sentence 2
Common indices: [4] | Pre-existing types: ['4_0']
Double check type overwriting: row 2394, sentence 1
Common indices: [11] | Pre-existing types: ['5_0']
Double check type overwriting: row 2394, sentence 2
Common indices: [3] | Pre-existing types: ['5_0']
Double check type overwriting: row 2625, sentence 1
Common indices: [1] | Pre-existing types: ['3_0']
Double check type overwriting: row 2625, sentence 2
Common indices: [22] | Pre-existing types: ['3_0']
Double check type overwriting: row 2973, sentence 1
Common indices: [14] | Pre-existing types: ['4_0']
Double check type overwriting: row 2973, sentence 2
Common indices: [9] | Pre

### Inflectional Changes

In [100]:
substitute(1)

Double check type overwriting: row 47, sentence 1
Common indices: [3] | Pre-existing types: ['4_0']
Double check type overwriting: row 47, sentence 2
Common indices: [3] | Pre-existing types: ['4_0']
Double check type overwriting: row 76, sentence 1
Common indices: [2] | Pre-existing types: ['3_0']
Double check type overwriting: row 76, sentence 2
Common indices: [20] | Pre-existing types: ['3_0']
Double check type overwriting: row 120, sentence 1
Common indices: [13] | Pre-existing types: ['4_0']
Double check type overwriting: row 120, sentence 2
Common indices: [11] | Pre-existing types: ['4_0']
Double check type overwriting: row 164, sentence 1
Common indices: [7] | Pre-existing types: ['4_0']
Double check type overwriting: row 164, sentence 2
Common indices: [11] | Pre-existing types: ['4_0']
Double check type overwriting: row 194, sentence 1
Common indices: [15] | Pre-existing types: ['3_0']
Double check type overwriting: row 194, sentence 2
Common indices: [19] | Pre-existing typ

### Spelling Changes

In [101]:
substitute(4)

Double check type overwriting: row 155, sentence 1
Common indices: [5] | Pre-existing types: ['4_0']
Double check type overwriting: row 155, sentence 2
Common indices: [8] | Pre-existing types: ['4_0']
Double check type overwriting: row 449, sentence 1
Common indices: [9, 10] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 449, sentence 2
Common indices: [4] | Pre-existing types: ['3_0']
Double check type overwriting: row 458, sentence 1
Common indices: [15] | Pre-existing types: ['3_0']
Double check type overwriting: row 458, sentence 2
Common indices: [1] | Pre-existing types: ['3_0']
Double check type overwriting: row 780, sentence 1
Common indices: [25] | Pre-existing types: ['3_1']
Double check type overwriting: row 780, sentence 2
Common indices: [11] | Pre-existing types: ['3_1']
Double check type overwriting: row 882, sentence 1
Common indices: [24] | Pre-existing types: ['3_0']
Double check type overwriting: row 882, sentence 2
Common indices: [20] | Pre

### Change of format

In [102]:
substitute(8)

Double check type overwriting: row 508, sentence 1
Common indices: [4] | Pre-existing types: ['3_0']
Double check type overwriting: row 508, sentence 2
Common indices: [9] | Pre-existing types: ['3_0']
Double check type overwriting: row 508, sentence 1
Common indices: [5] | Pre-existing types: ['3_0']
Double check type overwriting: row 508, sentence 2
Common indices: [10] | Pre-existing types: ['3_0']
Double check type overwriting: row 586, sentence 1
Common indices: [13] | Pre-existing types: ['3_0']
Double check type overwriting: row 586, sentence 2
Common indices: [6] | Pre-existing types: ['3_0']
Double check type overwriting: row 1322, sentence 1
Common indices: [25] | Pre-existing types: ['7_0']
Double check type overwriting: row 1322, sentence 2
Common indices: [23] | Pre-existing types: ['7_0']
Double check type overwriting: row 1974, sentence 1
Common indices: [16] | Pre-existing types: ['5_0']
Double check type overwriting: row 1974, sentence 2
Common indices: [16] | Pre-exis

### Opposite Polarity Substitution (Habitual)

In [103]:
substitute(9)

### Modal Verb Changes (TODO)

In [104]:
# TODO: Check overlapped words between (e.g.) derivational & inflectional changes
# The way this works right now, you'd have something like ['6_0 & 6_0'] for those
# Make sure this doesn't happen. Probably do a function that does a pass on the
# array of strings later and removes any duplicates

In [105]:
def trim_duplicates(s1_scope, s2_scope, s1_text, s2_text):
    s1_newtext = s1_text.split()
    s2_newtext = s2_text.split()
    
    in1 = np.where(np.in1d(s1_newtext, s2_newtext))[0]
    in2 = np.where(np.in1d(s2_newtext, s1_newtext))[0]

    s1_newscope = np.delete(s1_scope, in1)
    s2_newscope = np.delete(s2_scope, in2)
    s1_newtext = ' '.join(np.delete(s1_newtext, in1))
    s2_newtext = ' '.join(np.delete(s2_newtext, in2))

    return s1_newscope, s2_newscope, s1_newtext, s2_newtext

In [106]:
ric = textual_paraphrases.loc[textual_paraphrases['type_id'].isin([2])]
ric

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
120,40,2,Modal Verb Changes,yes,"[7, 8, 9]","[4, 5]",intend to seek,will seek,,,,
135,45,2,Modal Verb Changes,yes,"[4, 5, 6, 7]","[3, 4]",is expected to decline,will decline,,,,
197,57,2,Modal Verb Changes,yes,"[5, 6, 7]","[2, 3, 4, 5]",would shut down,plans to shut down,,,,
381,108,2,Modal Verb Changes,yes,"[6, 7]","[6, 7, 8]",were dispatched,will be sent,,,,
393,110,2,Modal Verb Changes,yes,"[11, 12]","[10, 11]",may issue,might issue,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
21449,5623,2,Modal Verb Changes,yes,"[8, 9]","[10, 11, 12]",will address,would participate in,,,,
21593,5665,2,Modal Verb Changes,yes,"[3, 4, 5]",[4],could have been,was,,,,
21679,5691,2,Modal Verb Changes,yes,"[4, 5]",[10],could bring,bringing,,,,
21762,5712,2,Modal Verb Changes,yes,"[3, 4]",[2],would give,gives,,,,


### Punctuation Changes

Let's look closely at punctuation changes. This is one of the types that annotates key elements, and those are what we 
want to use, so we'll use those as the scopes.  

In [107]:
punctuation = duplicate_df(textual_paraphrases)
punctuation = punctuation[punctuation['type_id'] == 21]
punctuation.drop(columns=['s1_scope', 's2_scope'], inplace=True)
punctuation.rename(columns={'key_s1': 's1_scope', 'key_s2': 's2_scope'}, inplace=True)
punctuation

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
20,5,21,Punctuation changes,yes,"The stock rose $ 2.11 , or about 11 percent , ...",PG & E Corp. shares jumped $ 1.63 or 8 percent...,"[5, 10]",,", ,",
33,10,21,Punctuation changes,yes,the foodservice pie business does n't fit the ...,`` The foodservice pie business does not fit o...,,[0],,``
46,15,21,Punctuation changes,yes,He told The Sun newspaper that Mr. Hussein 's ...,`` Saddam 's daughters had British schools and...,,"[0, 23]",,`` ''
87,29,21,Punctuation changes,yes,I wanted to bring the most beautiful people in...,`` I wanted to bring the most beautiful people...,,"[0, 15]",,`` ''
108,36,21,Punctuation changes,yes,Trading in Loral was halted yesterday ; the sh...,The New York Stock Exchange suspended trading ...,[6],[10],;,","
...,...,...,...,...,...,...,...,...,...,...
21962,5773,21,Punctuation changes,yes,`` I would rather be talking about positive nu...,But I would rather be talking about high stand...,[0],[14],``,''\n
21978,5780,21,Punctuation changes,yes,`` the man who has the blood of innocent peopl...,a man who has the blood of innocent people on ...,[4],,``,
21987,5782,21,Punctuation changes,yes,The Ministry of Defence said that `` an invest...,The Ministry of Defence said yesterday : “We c...,"[6, 27]",,`` '',
22033,5793,21,Punctuation changes,yes,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,[10],,",",


Some punctuation changes are substitutions (the ones with two keys), and some are additions/deletions (the ones where there's only one key, and the other key is `None`).

We'll need to treat them separately:

In [108]:
punct_adddel, punct_subs = split_add_sub(punctuation)

In [109]:
substitute(21, punct_adddel, 8)

Double check type overwriting: row 480, sentence 2
Common indices: [17] | Pre-existing types: ['3_0']
Double check type overwriting: row 896, sentence 2
Common indices: [7] | Pre-existing types: ['3_0']
Double check type overwriting: row 1115, sentence 2
Common indices: [5] | Pre-existing types: ['3_0']
Double check type overwriting: row 1346, sentence 2
Common indices: [1, 3] | Pre-existing types: ['3_1 & 5_0' '3_1 & 5_0']
Double check type overwriting: row 1398, sentence 2
Common indices: [14] | Pre-existing types: ['3_0']
Double check type overwriting: row 1533, sentence 1
Common indices: [18] | Pre-existing types: ['3_0']
Double check type overwriting: row 1586, sentence 2
Common indices: [27] | Pre-existing types: ['3_0']
Double check type overwriting: row 1586, sentence 2
Common indices: [19] | Pre-existing types: ['3_0']
Double check type overwriting: row 1593, sentence 1
Common indices: [3] | Pre-existing types: ['3_0']
Double check type overwriting: row 1667, sentence 2
Common

Double check type overwriting: row 2889, sentence 1
Common indices: [8, 12] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 2986, sentence 1
Common indices: [6] | Pre-existing types: ['3_0']
Double check type overwriting: row 3191, sentence 2
Common indices: [14] | Pre-existing types: ['3_0']
Double check type overwriting: row 3192, sentence 1
Common indices: [16] | Pre-existing types: ['3_0']
Double check type overwriting: row 3411, sentence 1
Common indices: [18] | Pre-existing types: ['3_0']
Double check type overwriting: row 3569, sentence 1
Common indices: [5, 6] | Pre-existing types: ['5_0' '5_0']
Double check type overwriting: row 4607, sentence 1
Common indices: [3, 11] | Pre-existing types: ['3_1 & 6_0' '3_1']
Double check type overwriting: row 4643, sentence 1
Common indices: [17] | Pre-existing types: ['3_1']
Double check type overwriting: row 5001, sentence 1
Common indices: [13] | Pre-existing types: ['3_0']
Double check type overwriting: row 5753, s

In [110]:
substitute(21, punct_subs, 7)

Double check type overwriting: row 401, sentence 1
Common indices: [0, 12] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 401, sentence 2
Common indices: [13] | Pre-existing types: ['3_0']
Double check type overwriting: row 449, sentence 1
Common indices: [14, 17] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 449, sentence 2
Common indices: [5, 16] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 573, sentence 1
Common indices: [0] | Pre-existing types: ['3_0']
Double check type overwriting: row 573, sentence 2
Common indices: [5] | Pre-existing types: ['3_0']
Double check type overwriting: row 656, sentence 1
Common indices: [25] | Pre-existing types: ['3_0']
Double check type overwriting: row 656, sentence 2
Common indices: [20] | Pre-existing types: ['3_0']
Double check type overwriting: row 790, sentence 1
Common indices: [3] | Pre-existing types: ['3_0']
Double check type overwriting: row 790, sentence 2
Common

### Named Entity Substitution

In [111]:
substitute(7)

Double check type overwriting: row 14, sentence 1
Common indices: [7] | Pre-existing types: ['3_0']
Double check type overwriting: row 14, sentence 2
Common indices: [1] | Pre-existing types: ['3_0']
Double check type overwriting: row 14, sentence 1
Common indices: [6, 7] | Pre-existing types: ['3_0' '3_0 & 5_0']
Double check type overwriting: row 14, sentence 2
Common indices: [1] | Pre-existing types: ['3_0 & 5_0']
Double check type overwriting: row 172, sentence 1
Common indices: [23, 24] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 172, sentence 2
Common indices: [18, 19, 20, 21] | Pre-existing types: ['3_0' '3_0' '3_0' '3_0']
Double check type overwriting: row 272, sentence 1
Common indices: [19] | Pre-existing types: ['3_0']
Double check type overwriting: row 272, sentence 2
Common indices: [18, 19] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 272, sentence 1
Common indices: [18, 19] | Pre-existing types: ['3_0' '3_0 & 5_0']
Dou

### Synthetic/Analytic Substitution

In [112]:
def differ_by_one(s1, s2):
    s1_list = s1.lower().split()
    s2_list = s2.lower().split()
    return abs(len(s1_list) - len(s2_list)) == 1


In [113]:
textual_paraphrases[textual_paraphrases['type_id'] == 11]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
13,5,11,Synthetic/analytic substitution,yes,[13],"[20, 21]",Friday,on Friday,,,,
21,6,11,Synthetic/analytic substitution,yes,"[1, 2, 3, 4, 5, 6, 7]","[10, 11, 12, 13, 14, 15]",in the first quarter of the year,the first quarter of the year,,,,
63,23,11,Synthetic/analytic substitution,yes,"[12, 13, 14, 15, 16, 17]","[14, 15, 16, 17]",western portions of the Dominican Republic,the western Dominican Republic,,,,
73,27,11,Synthetic/analytic substitution,yes,[8],"[8, 9, 10]",struck,managed to strike,,,,
88,32,11,Synthetic/analytic substitution,yes,"[3, 4, 5, 6, 7, 8, 9]","[7, 8, 9, 10, 11, 12, 13, 14]",the United States ' 12th-largest trading partner,the 12th-largest trading partner of the United...,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
22020,5791,11,Synthetic/analytic substitution,yes,"[0, 1]","[7, 8, 9]",Remaining shares,the remaining shares,,,,
22021,5791,11,Synthetic/analytic substitution,yes,"[6, 7, 8]","[3, 4, 5]",QVC 's management,QVC management team,,,,
22022,5791,11,Synthetic/analytic substitution,yes,"[6, 7, 8]","[2, 3, 4, 5]",QVC 's management,the QVC management team,,,,
22035,5793,11,Synthetic/analytic substitution,yes,"[21, 22]",[19],on Tuesday,Tuesday,,,,


In [114]:
new = pd.DataFrame(columns = textual_paraphrases.columns, data = copy.deepcopy(textual_paraphrases.values))
new = new[new['type_id'] == 11]
new['s1_text'] = new['s1_text'].str.lower() 
new['s2_text'] = new['s2_text'].str.lower() 
new = new[new.apply(lambda x: differ_by_one(x.s1_text, x.s2_text), axis=1)]
new

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
13,5,11,Synthetic/analytic substitution,yes,[13],"[20, 21]",friday,on friday,,,,
21,6,11,Synthetic/analytic substitution,yes,"[1, 2, 3, 4, 5, 6, 7]","[10, 11, 12, 13, 14, 15]",in the first quarter of the year,the first quarter of the year,,,,
88,32,11,Synthetic/analytic substitution,yes,"[3, 4, 5, 6, 7, 8, 9]","[7, 8, 9, 10, 11, 12, 13, 14]",the united states ' 12th-largest trading partner,the 12th-largest trading partner of the united...,,,,
146,48,11,Synthetic/analytic substitution,yes,"[8, 9]",[12],its earnings,earnings,,,,
156,50,11,Synthetic/analytic substitution,yes,"[10, 11, 12]","[10, 11, 12, 13]",significant economic growth,a significant economic growth,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
22011,5788,11,Synthetic/analytic substitution,yes,"[0, 1]",[4],pwc itself,pwc,,,,
22020,5791,11,Synthetic/analytic substitution,yes,"[0, 1]","[7, 8, 9]",remaining shares,the remaining shares,,,,
22022,5791,11,Synthetic/analytic substitution,yes,"[6, 7, 8]","[2, 3, 4, 5]",qvc 's management,the qvc management team,,,,
22035,5793,11,Synthetic/analytic substitution,yes,"[21, 22]",[19],on tuesday,tuesday,,,,


In [115]:
new['s1_scope'], new['s2_scope'], new['s1_text'], new['s2_text'] = new.apply(lambda x: trim_duplicates(x.s1_scope, x.s2_scope, x.s1_text, x.s2_text), axis=1, result_type='expand').transpose().values
new

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
13,5,11,Synthetic/analytic substitution,yes,[],[20],,on,,,,
21,6,11,Synthetic/analytic substitution,yes,[1],[],in,,,,,
88,32,11,Synthetic/analytic substitution,yes,[6],[11],',of,,,,
146,48,11,Synthetic/analytic substitution,yes,[8],[],its,,,,,
156,50,11,Synthetic/analytic substitution,yes,[],[10],,a,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
22011,5788,11,Synthetic/analytic substitution,yes,[1],[],itself,,,,,
22020,5791,11,Synthetic/analytic substitution,yes,[],[7],,the,,,,
22022,5791,11,Synthetic/analytic substitution,yes,[7],"[2, 5]",'s,the team,,,,
22035,5793,11,Synthetic/analytic substitution,yes,[21],[],on,,,,,


In [116]:
new['s1_text'] = new['s1_text'].apply(lambda x: None if x == '' else x)
new['s2_text'] = new['s2_text'].apply(lambda x: None if x == '' else x)
new['s1_scope'] = new['s1_scope'].apply(lambda x: None if list(x) == [] else x)
new['s2_scope'] = new['s2_scope'].apply(lambda x: None if list(x) == [] else x)

In [117]:
new

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
13,5,11,Synthetic/analytic substitution,yes,,[20],,on,,,,
21,6,11,Synthetic/analytic substitution,yes,[1],,in,,,,,
88,32,11,Synthetic/analytic substitution,yes,[6],[11],',of,,,,
146,48,11,Synthetic/analytic substitution,yes,[8],,its,,,,,
156,50,11,Synthetic/analytic substitution,yes,,[10],,a,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
22011,5788,11,Synthetic/analytic substitution,yes,[1],,itself,,,,,
22020,5791,11,Synthetic/analytic substitution,yes,,[7],,the,,,,
22022,5791,11,Synthetic/analytic substitution,yes,[7],"[2, 5]",'s,the team,,,,
22035,5793,11,Synthetic/analytic substitution,yes,[21],,on,,,,,


In [118]:
new_adddel = new[(new['s1_scope'].isnull()) | (new['s2_scope'].isnull())]
new_subs = new[~((new['s1_scope'].isnull()) | (new['s2_scope'].isnull()))]

In [119]:
new_adddel[:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
13,5,11,Synthetic/analytic substitution,yes,,[20],,on,,,,
21,6,11,Synthetic/analytic substitution,yes,[1],,in,,,,,
146,48,11,Synthetic/analytic substitution,yes,[8],,its,,,,,
156,50,11,Synthetic/analytic substitution,yes,,[10],,a,,,,
253,74,11,Synthetic/analytic substitution,yes,[9],,all,,,,,
307,87,11,Synthetic/analytic substitution,yes,[20],,many,,,,,
551,150,11,Synthetic/analytic substitution,yes,,[13],,the,,,,
650,172,11,Synthetic/analytic substitution,yes,[14],,more,,,,,
656,173,11,Synthetic/analytic substitution,yes,[18],,'s,,,,,
662,174,11,Synthetic/analytic substitution,yes,[24],,its,,,,,


In [120]:
substitute(11, new_adddel, 1, False)

In [121]:
new_subs[:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
88,32,11,Synthetic/analytic substitution,yes,[6],[11],',of,,,,
321,89,11,Synthetic/analytic substitution,yes,"[7, 8]","[9, 11]",nation 's,in us,,,,
649,172,11,Synthetic/analytic substitution,yes,"[5, 7]",[6],the cost,prices,,,,
686,180,11,Synthetic/analytic substitution,yes,[12],"[13, 14]",mexican,of mexico,,,,
860,222,11,Synthetic/analytic substitution,yes,[12],[8],federal,us,,,,
1023,265,11,Synthetic/analytic substitution,yes,[19],[9],in,',,,,
1090,279,11,Synthetic/analytic substitution,yes,[8],"[3, 5]",makers,the manufacturers,,,,
1210,309,11,Synthetic/analytic substitution,yes,"[20, 21]",[21],the pool,pools,,,,
1562,410,11,Synthetic/analytic substitution,yes,"[9, 10]",[8],indicating creditworthiness,credit,,,,
1567,411,11,Synthetic/analytic substitution,yes,[4],"[4, 6]",comment,an response,,,,


In [122]:
substitute(11, new_subs, 5, False)

In [123]:
aye = pd.DataFrame(columns = textual_paraphrases.columns, data = copy.deepcopy(textual_paraphrases.values))
aye = aye[aye['type_id'] == 11]
aye.drop(new.index.tolist(), inplace=True)
aye[:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
63,23,11,Synthetic/analytic substitution,yes,"[12, 13, 14, 15, 16, 17]","[14, 15, 16, 17]",western portions of the Dominican Republic,the western Dominican Republic,,,,
73,27,11,Synthetic/analytic substitution,yes,[8],"[8, 9, 10]",struck,managed to strike,,,,
89,32,11,Synthetic/analytic substitution,yes,"[12, 13]","[17, 18]",two-way trade,trade volume,,,,
143,46,11,Synthetic/analytic substitution,yes,"[6, 7, 8, 9, 10, 11]","[6, 7, 8, 9, 10, 11, 12, 13, 14]",State Department official John S. Wolf,"John S. Wolf , an assistant secretary of state",,,,
182,54,11,Synthetic/analytic substitution,yes,"[12, 13, 14, 15, 16, 17, 18]","[10, 11, 12, 13]",morning trading on the Nasdaq Stock Market,Nasdaq Stock Market trading,,,,
201,59,11,Synthetic/analytic substitution,yes,"[2, 3, 4, 5, 6, 7]","[2, 3, 4]",the questions asked by the audience,the audience questions,,,,
304,86,11,Synthetic/analytic substitution,yes,"[11, 12, 13]","[8, 9, 10, 11, 12, 13, 14]",Longhorn 's release,release dates of Microsoft 's new products,,,,
331,94,11,Synthetic/analytic substitution,yes,[12],"[11, 12, 14]",resisted,put up resistance,,,,
392,110,11,Synthetic/analytic substitution,yes,"[4, 5, 6, 7]","[4, 5, 6, 7]",Russia 's foreign ministry,the Russian Foreign Ministry,,,,
405,112,11,Synthetic/analytic substitution,yes,[24],[25],XML-based,XML,,,,


In [124]:
substitute(11, aye, 5, False)

In [125]:
positives.loc[45]['sentence1']

'The group will be headed by State Department official John S. Wolf, who has served in Australia, Vietnam, Greece and Pakistan.'

In [126]:
positives.loc[45]['sentence2']

'The group will be headed by John S. Wolf, an assistant secretary of state who has served in Australia, Vietnam, Greece and Pakistan.'

In [127]:
positives.loc[45]['sentence1_scope']

array(['', '', '', '', '', '', '5_0', '4_0', '4_0', '5_0', '5_0', '5_0',
       '', '', '', '', '', '', '', '', '', '', '', '', ''], dtype='<U64')

### Converse Substitution

In [128]:
textual_paraphrases[textual_paraphrases['type_id'] == 13][:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
1139,292,13,Converse substitution,yes,"[10, 11]","[14, 15]",was taken,had brought,,,,
1354,347,13,Converse substitution,yes,[4],[1],was,leaves,,,,
2370,622,13,Converse substitution,yes,"[3, 4]",[4],be back,shelve,,,,
3021,798,13,Converse substitution,yes,"[10, 11]","[14, 15]",breaks down,interacts with,,,,
3389,902,13,Converse substitution,yes,"[2, 6]","[3, 4, 8]",includes in,would get of,,,,
3502,932,13,Converse substitution,yes,"[4, 5]",[3],were given,received,,,,
3503,932,13,Converse substitution,yes,[14],"[16, 17]",took,were given,,,,
4609,1244,13,Converse substitution,yes,"[2, 3]","[6, 7]",was subpoenaed,was cooperating,,,,
6113,1663,13,Converse substitution,yes,"[2, 3, 4]","[8, 10, 11]",were hurt by,put pressure on,,,,
7418,2001,13,Converse substitution,yes,[8],[9],swept,gave,,,,


In [129]:
substitute(13)

Double check type overwriting: row 2009, sentence 1
Common indices: [6, 7, 8] | Pre-existing types: ['6_0' '6_0' '6_0']
Double check type overwriting: row 2009, sentence 2
Common indices: [6, 7] | Pre-existing types: ['6_0' '6_0']
Double check type overwriting: row 3180, sentence 1
Common indices: [10, 13] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 3180, sentence 2
Common indices: [19] | Pre-existing types: ['3_0']
Double check type overwriting: row 4887, sentence 1
Common indices: [13] | Pre-existing types: ['6_0']
Double check type overwriting: row 4887, sentence 2
Common indices: [17, 18] | Pre-existing types: ['6_0' '6_0']
Double check type overwriting: row 5466, sentence 1
Common indices: [5] | Pre-existing types: ['6_0']
Double check type overwriting: row 5466, sentence 2
Common indices: [2] | Pre-existing types: ['6_0']


### Coordination Changes

This is similar to punctuation changes in two ways:
- It uses `keys` for annotation
- It can either be Addition/Deletion (in this case, function word) or Substitution
  - But which kind of substitution? Synonym?

In [130]:
coord = pd.DataFrame(columns = textual_paraphrases.columns, data = copy.deepcopy(textual_paraphrases.values))
coord = coord[coord['type_id'] == 17]
coord.drop(columns=['s1_scope', 's2_scope'], inplace=True)
coord.rename(columns={'key_s1': 's1_scope', 'key_s2': 's2_scope'}, inplace=True)

coord[30:40]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
11748,3120,17,Coordination changes,yes,traffic has disappeared from once bustling str...,traffic has disappeared from once-bustling str...,,[10],,and
12294,3262,17,Coordination changes,yes,"a 4.5-inch LCD screen , Memory Stick expansion...",a 4.5 in back-lit LCD screen and memory expans...,,[9],,and
13874,3675,17,Coordination changes,yes,"20 years , 63 days",20 years and 63 days,[7],[15],",",and
14171,3752,17,Coordination changes,yes,"State Sen. Vi Simpson , former state and natio...",former state and national Democratic Chairman ...,,[12],,and
16307,4301,17,Coordination changes,yes,Pacific Northwest has more than 800 employees ...,"It has 800 employees , compared with Wells Far...",[8],,and,
16346,4312,17,Coordination changes,yes,The victims were last seen ; their bodies were...,The family was last seen and their bodies were...,,[7],,and
16937,4474,17,Coordination changes,yes,"`` I have lots of bad dreams , I have flashbac...","`` I have lots of bad dreams , flashbacks and ...",,[9],,and
17198,4533,17,Coordination changes,yes,"some administrative material , some from a fai...",administrative paper work and some about a fai...,,[10],,and
17211,4537,17,Coordination changes,yes,a June opening record,a monster opening and a June record,,[17],,and
17247,4548,17,Coordination changes,yes,is still being held at the prison and is now i...,was held in isolation at the same prison,[8],,and,


In [131]:
print_sents(1316)

Five foreign embassies, including the Singapore embassy, in Bangkok were among the targets, it said.
Five foreign embassies in Bangkok, including the Singapore embassy, were among those targeted.


In [132]:
coord_adddel = coord[(coord['s1_scope'].isnull()) | (coord['s2_scope'].isnull())]
coord_subs = coord[~((coord['s1_scope'].isnull()) | (coord['s2_scope'].isnull()))]

In [133]:
substitute(17, coord_adddel, 1)

Double check type overwriting: row 4977, sentence 2
Common indices: [6] | Pre-existing types: ['3_1']
Double check type overwriting: row 5321, sentence 2
Common indices: [18] | Pre-existing types: ['5_1']


TODO: Deal with the below

In [134]:
coord_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
64,23,17,Coordination changes,yes,But tropical storm warnings and watches were p...,Tropical storm warnings were in place Thursday...,[4],[11],and,and
2925,769,17,Coordination changes,yes,"He was sent to Larned State Hospital , where h...",He ordered him sent to the Larned State Securi...,[8],[10],where,for
4868,1317,17,Coordination changes,yes,hammering consumer spending and leaving shops ...,"hammering consumer spending , with shops , res...",[23],[14],and,","
9107,2442,17,Coordination changes,yes,Ms Pike also said it was not unusual for hospi...,But Ms Pike said it was not unusual for hospit...,[15],[14],but,and
9663,2604,17,Coordination changes,yes,Jews and the US,"Jews , Americans",[3],[1],and,","
10342,2789,17,Coordination changes,yes,"In addition to O'Connor , Rehnquist 's majorit...","Justices Sandra Day O'Connor , David H. Souter...","[0, 1, 2, 20]",[19],In addition to and,and
13874,3675,17,Coordination changes,yes,"20 years , 63 days",20 years and 63 days,[7],[15],",",and
20398,5343,17,Coordination changes,yes,"vegetables , fruits",fruits and vegetables,[9],[14],",",and
20399,5343,17,Coordination changes,yes,"nuts , cereals",nuts and cereals,[15],[18],",",and
21161,5545,17,Coordination changes,yes,benefiting from a little luck Thursday to eras...,to erase a set point and beat unseeded Nadia P...,[23],[20],and,and


In [135]:
print_sents(769)

He was sent to Larned State Hospital, where he was evaluated and treated.
He ordered him sent to the Larned State Security Hospital for continued evaluation and treatment.


Let's isolate the punctuation and add it first

In [136]:
coord_punct = duplicate_df(coord.loc[[4868, 9663, 13874, 20398, 20399],:])
coord_punct

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
0,1317,17,Coordination changes,yes,hammering consumer spending and leaving shops ...,"hammering consumer spending , with shops , res...",[23],[14],and,","
1,2604,17,Coordination changes,yes,Jews and the US,"Jews , Americans",[3],[1],and,","
2,3675,17,Coordination changes,yes,"20 years , 63 days",20 years and 63 days,[7],[15],",",and
3,5343,17,Coordination changes,yes,"vegetables , fruits",fruits and vegetables,[9],[14],",",and
4,5343,17,Coordination changes,yes,"nuts , cereals",nuts and cereals,[15],[18],",",and


In [137]:
coord_subs.loc[[13874, 20398, 20399], 's1_scope'] = None
coord_subs.loc[[4868, 9663], 's2_scope'] = None
coord_subs

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coord_subs.loc[[13874, 20398, 20399], 's1_scope'] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coord_subs.loc[[4868, 9663], 's2_scope'] = None


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
64,23,17,Coordination changes,yes,But tropical storm warnings and watches were p...,Tropical storm warnings were in place Thursday...,[4],[11],and,and
2925,769,17,Coordination changes,yes,"He was sent to Larned State Hospital , where h...",He ordered him sent to the Larned State Securi...,[8],[10],where,for
4868,1317,17,Coordination changes,yes,hammering consumer spending and leaving shops ...,"hammering consumer spending , with shops , res...",[23],,and,","
9107,2442,17,Coordination changes,yes,Ms Pike also said it was not unusual for hospi...,But Ms Pike said it was not unusual for hospit...,[15],[14],but,and
9663,2604,17,Coordination changes,yes,Jews and the US,"Jews , Americans",[3],,and,","
10342,2789,17,Coordination changes,yes,"In addition to O'Connor , Rehnquist 's majorit...","Justices Sandra Day O'Connor , David H. Souter...","[0, 1, 2, 20]",[19],In addition to and,and
13874,3675,17,Coordination changes,yes,"20 years , 63 days",20 years and 63 days,,[15],",",and
20398,5343,17,Coordination changes,yes,"vegetables , fruits",fruits and vegetables,,[14],",",and
20399,5343,17,Coordination changes,yes,"nuts , cereals",nuts and cereals,,[18],",",and
21161,5545,17,Coordination changes,yes,benefiting from a little luck Thursday to eras...,to erase a set point and beat unseeded Nadia P...,[23],[20],and,and


In [138]:
coord_punct.loc[[0,1], 's1_scope'] = None
coord_punct.loc[[2,3,4], 's2_scope'] = None
coord_punct

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
0,1317,17,Coordination changes,yes,hammering consumer spending and leaving shops ...,"hammering consumer spending , with shops , res...",,[14],and,","
1,2604,17,Coordination changes,yes,Jews and the US,"Jews , Americans",,[1],and,","
2,3675,17,Coordination changes,yes,"20 years , 63 days",20 years and 63 days,[7],,",",and
3,5343,17,Coordination changes,yes,"vegetables , fruits",fruits and vegetables,[9],,",",and
4,5343,17,Coordination changes,yes,"nuts , cereals",nuts and cereals,[15],,",",and


In [139]:
substitute(17, coord_punct, 8)

Double check type overwriting: row 3674, sentence 1
Common indices: [7] | Pre-existing types: ['3_0']


In [140]:
substitute(17, coord_subs, 1)

Double check type overwriting: row 3674, sentence 2
Common indices: [15] | Pre-existing types: ['3_0']


### Subordination and Nesting changes

In [141]:
subord = pd.DataFrame(columns = textual_paraphrases.columns, data = copy.deepcopy(textual_paraphrases.values))
subord = subord[subord['type_id'] == 18]
subord.drop(columns=['s1_scope', 's2_scope'], inplace=True)
subord.rename(columns={'key_s1': 's1_scope', 'key_s2': 's2_scope'}, inplace=True)

subord[:10]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
48,20,18,Subordination and nesting changes,yes,Sheena Young of Child,"Sheena Young , a spokesman for Child ,","[2, 3]","[3, 4, 5, 6]",of Child,a spokesman for Child
97,33,18,Subordination and nesting changes,yes,The AFL-CIO is waiting until October to decide,The AFL-CIO announced Wednesday that it will d...,,"[2, 4]",,announced that
105,36,18,Subordination and nesting changes,yes,Trading in Loral was halted yesterday ; the sh...,The New York Stock Exchange suspended trading ...,"[7, 8, 9, 10, 11, 12, 14]",[11],the shares closed on Monday at 3.01,which
131,43,18,Subordination and nesting changes,yes,", who faces charges of conspiracy lying to a g...",on charges of conspiracy and lying to a grand ...,[2],"[7, 8, 9]",who,on charges of
191,56,18,Subordination and nesting changes,yes,and allows developers,", which allows developers",[5],[12],and,which
227,69,18,Subordination and nesting changes,yes,a point system the U.S. Supreme Court found un...,the way it previously admitted undergraduates,"[16, 17, 18, 19, 20, 21]","[21, 22, 23, 24]",the U.S. Supreme Court found unconstitutional,it previously admitted undergraduates
406,112,18,Subordination and nesting changes,yes,The suite comes complete with a word processor...,"The suite includes a word processor , spreadsh...",[17],,while,
489,134,18,Subordination and nesting changes,yes,at the same time of the anthrax attacks,at the same time that real anthrax attacks,"[19, 20, 21, 22]",[19],of the anthrax attacks,that
497,135,18,Subordination and nesting changes,yes,"$ 200 billion annually , which Gephardt would ...",$ 200 billion annually and be paid for,[11],,which,
632,169,18,Subordination and nesting changes,yes,The Saudi newspaper Okaz reported Monday that ...,The newspaper Okaz reported that the six suspe...,"[8, 9, 10, 11, 12]","[11, 12]",who escaped Saturday 's raid,the raid


In [142]:
subord_adddel = subord[(subord['s1_scope'].isnull()) | (subord['s2_scope'].isnull())]
subord_subs = subord[~((subord['s1_scope'].isnull()) | (subord['s2_scope'].isnull()))]

In [143]:
subord_adddel[:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
97,33,18,Subordination and nesting changes,yes,The AFL-CIO is waiting until October to decide,The AFL-CIO announced Wednesday that it will d...,,"[2, 4]",,announced that
406,112,18,Subordination and nesting changes,yes,The suite comes complete with a word processor...,"The suite includes a word processor , spreadsh...",[17],,while,
497,135,18,Subordination and nesting changes,yes,"$ 200 billion annually , which Gephardt would ...",$ 200 billion annually and be paid for,[11],,which,
967,252,18,Subordination and nesting changes,yes,A hearing on the matter was held Thursday morn...,A hearing Thursday morning before Judge Elizab...,"[14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 2...",,", marking one of the early steps in deciding t...",
1507,394,18,Subordination and nesting changes,yes,a patent verdict that went against the company,a patent verdict against it,[14],,that,
1524,399,18,Subordination and nesting changes,yes,"in August 2000 , when Rambus accused Infineon","At that time , Rambus accused Infineon",[8],,when,
1698,443,18,Subordination and nesting changes,yes,"Kyi , a U.N. envoy says , as Japan adds to gro...",JAPAN added to growing international pressure ...,[14],,by,
1781,465,18,Subordination and nesting changes,yes,"Meanwhile , rival contender , General Electric...",Other contenders included General Electric 's ...,,[9],,which
1982,525,18,Subordination and nesting changes,yes,500 clergy sex abuse lawsuits,500 sex abuse lawsuits involving priests,,"[20, 21]",,involving priests
2079,549,18,Subordination and nesting changes,yes,"In the 2002 study , the margin of error ranged...",It has a margin of error of plus or minus thre...,[0],,In,


Isolate single words, since those are almost certainly function words   

In [144]:
def singleword(word):
    if not word:
        return False
    else:
        return len(word.split()) == 1

In [145]:
subord_adddel_funct = subord_adddel[subord_adddel.apply(lambda x: singleword(x.k1_text) or singleword(x.k2_text), axis=1)]

In [146]:
subord_adddel_funct[:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
406,112,18,Subordination and nesting changes,yes,The suite comes complete with a word processor...,"The suite includes a word processor , spreadsh...",[17],,while,
497,135,18,Subordination and nesting changes,yes,"$ 200 billion annually , which Gephardt would ...",$ 200 billion annually and be paid for,[11],,which,
1507,394,18,Subordination and nesting changes,yes,a patent verdict that went against the company,a patent verdict against it,[14],,that,
1524,399,18,Subordination and nesting changes,yes,"in August 2000 , when Rambus accused Infineon","At that time , Rambus accused Infineon",[8],,when,
1698,443,18,Subordination and nesting changes,yes,"Kyi , a U.N. envoy says , as Japan adds to gro...",JAPAN added to growing international pressure ...,[14],,by,
1781,465,18,Subordination and nesting changes,yes,"Meanwhile , rival contender , General Electric...",Other contenders included General Electric 's ...,,[9],,which
2079,549,18,Subordination and nesting changes,yes,"In the 2002 study , the margin of error ranged...",It has a margin of error of plus or minus thre...,[0],,In,
2106,554,18,Subordination and nesting changes,yes,"sales were flat , while the gross margin fell","sales were flat , with gross margin down",[12],,while,
2137,563,18,Subordination and nesting changes,yes,code which is from its Unix property,Unix intellectual property owned by SCO,[11],,which,
2334,612,18,Subordination and nesting changes,yes,A rebel who was captured,A captured rebel,[2],,who,


In [147]:
substitute(18, subord_adddel_funct, 1)

Double check type overwriting: row 790, sentence 1
Common indices: [8] | Pre-existing types: ['3_0']


Double check type overwriting: row 2879, sentence 2
Common indices: [10] | Pre-existing types: ['3_0']
Double check type overwriting: row 3152, sentence 1
Common indices: [26] | Pre-existing types: ['3_0']
Double check type overwriting: row 3658, sentence 1
Common indices: [18] | Pre-existing types: ['3_0']
Double check type overwriting: row 4132, sentence 1
Common indices: [18] | Pre-existing types: ['5_0']
Double check type overwriting: row 4844, sentence 2
Common indices: [3] | Pre-existing types: ['3_0']


In [148]:
subord_subs

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
48,20,18,Subordination and nesting changes,yes,Sheena Young of Child,"Sheena Young , a spokesman for Child ,","[2, 3]","[3, 4, 5, 6]",of Child,a spokesman for Child
105,36,18,Subordination and nesting changes,yes,Trading in Loral was halted yesterday ; the sh...,The New York Stock Exchange suspended trading ...,"[7, 8, 9, 10, 11, 12, 14]",[11],the shares closed on Monday at 3.01,which
131,43,18,Subordination and nesting changes,yes,", who faces charges of conspiracy lying to a g...",on charges of conspiracy and lying to a grand ...,[2],"[7, 8, 9]",who,on charges of
191,56,18,Subordination and nesting changes,yes,and allows developers,", which allows developers",[5],[12],and,which
227,69,18,Subordination and nesting changes,yes,a point system the U.S. Supreme Court found un...,the way it previously admitted undergraduates,"[16, 17, 18, 19, 20, 21]","[21, 22, 23, 24]",the U.S. Supreme Court found unconstitutional,it previously admitted undergraduates
...,...,...,...,...,...,...,...,...,...,...
21573,5660,18,Subordination and nesting changes,yes,the number one priority for David Jones,the number one issue David Jones would tackle,"[17, 18, 19]","[18, 19, 20, 21]",for David Jones,David Jones would tackle
21746,5710,18,Subordination and nesting changes,yes,Women who eat potatoes and other tuberous vege...,their mothers eating potatoes and other tubero...,[1],"[17, 18, 19, 20, 21, 22, 23, 24]",who,eating potatoes and other tuberous vegetables ...
21753,5711,18,Subordination and nesting changes,yes,State Senate Majority Leader Joseph Bruno,"Joseph L. Bruno , the State Senate majority le...","[7, 8]","[8, 9, 10, 11, 12]",Joseph Bruno,the State Senate majority leader
21767,5713,18,Subordination and nesting changes,yes,one subtype that represents a minority of cases,one subtype representing a minority of cases,[21],"[22, 23, 24, 25, 26]",that,representing a minority of cases


In [149]:
print_sents(5712)

"The £5m would give BA a considerable return on the £5 it originally paid the government for the aircraft."
The £5m gives BA a considerable return on the £5 they originally paid for Concorde.


In [150]:
textual_paraphrases[textual_paraphrases['pair_id'] == 5711]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
21750,5711,4,Spelling changes,yes,[0],[0],ALBANY,LBANY,,,,
21751,5711,7,Same Polarity Substitution (named ent.),yes,"[7, 8]","[4, 5, 6]",Joseph Bruno,Joseph L. Bruno,,,,
21752,5711,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","ALBANY , N.Y. State Senate Majority Leader Jos...","LBANY , Aug. 8 Joseph L. Bruno , the State Sen...",,[16],,that
21753,5711,18,Subordination and nesting changes,yes,"[3, 4, 5, 6, 7, 8]","[4, 5, 6, 7, 8, 9, 10, 11, 12, 13]",State Senate Majority Leader Joseph Bruno,"Joseph L. Bruno , the State Senate majority le...","[7, 8]","[8, 9, 10, 11, 12]",Joseph Bruno,the State Senate majority leader
21754,5711,8,Change of format,yes,"[5, 6]","[11, 12]",Majority Leader,majority leader,,,,
21755,5711,25,Addition/Deletion,yes,"[1, 2]",,", N.Y.",,,,,
21756,5711,25,Addition/Deletion,yes,,"[1, 2, 3]",,", Aug. 8",,,,
21757,5711,29,Identity,yes,"[9, 18]","[14, 21]",announced .,announced .\n,,,,
21758,5711,30,Non-paraphrase,yes,[10],[15],Friday,today,,,,
21759,5711,28,Semantic based,yes,"[11, 12, 13, 14, 15, 16, 17]","[17, 18, 19, 20]",he has been diagnosed with prostate cancer,he had prostate cancer,,,,


In [151]:
subord_adddel.drop(subord_adddel_funct.index.tolist(), inplace=True)
subord_adddel

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subord_adddel.drop(subord_adddel_funct.index.tolist(), inplace=True)


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
97,33,18,Subordination and nesting changes,yes,The AFL-CIO is waiting until October to decide,The AFL-CIO announced Wednesday that it will d...,,"[2, 4]",,announced that
967,252,18,Subordination and nesting changes,yes,A hearing on the matter was held Thursday morn...,A hearing Thursday morning before Judge Elizab...,"[14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 2...",,", marking one of the early steps in deciding t...",
1982,525,18,Subordination and nesting changes,yes,500 clergy sex abuse lawsuits,500 sex abuse lawsuits involving priests,,"[20, 21]",,involving priests
2812,744,18,Subordination and nesting changes,yes,The cleanup cost about $ 130 million,The $ 130 million cleanup,"[12, 13]",,cost about,
3000,792,18,Subordination and nesting changes,yes,Myanmar 's pro-democracy leader Aung San Suu K...,Burma pro-democracy leader Aung San Suu Kyi wi...,"[14, 15, 16, 17, 18, 19, 20, 21, 22, 23]",,following her release from a hospital where sh...,
5435,1475,18,Subordination and nesting changes,yes,The companies announced plans to collaborate,The two groups said they would collaborate,,"[4, 5, 6]",,they would collaborate
5604,1507,18,Subordination and nesting changes,yes,a Rhodes scholar at Oxford,an Rhodes Scholar he met while at Oxford,,"[16, 17]",,he met
6212,1695,18,Subordination and nesting changes,yes,the case of a nine-year-old girl who turned up...,the abduction of a 9-year-old who was found sa...,"[21, 22, 23, 24, 25, 26, 27]",,after being violently abducted from her home,
6698,1818,18,Subordination and nesting changes,yes,63 percent of home broadband users connected v...,63 percent of home broadband users had cable m...,,"[15, 16, 19]",,compared with who
7624,2060,18,Subordination and nesting changes,yes,Doctors have advised that the boy get chemothe...,Daren and Barbara Jensen refused to heed docto...,,"[5, 6, 7, 8, 9, 10, 11]",,to heed doctors ' recommendation of chemotherapy


In [152]:
substitute(18, subord_adddel, 2, False)

### Direct/Indirect Style Alternations (TODO)

In [153]:
textual_paraphrases[textual_paraphrases['type_id'] == 22][:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
521,141,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","`` This deal makes sense for both companies , ...","Brian Halla , CEO of NatSemi , claimed the dea...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]",,"`` This deal makes sense for both companies , ''",
745,197,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11]","[2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 15]",`` I just got carried away and started making ...,he got carried away and just `` started making...,"[0, 11]",,`` '',
872,225,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",Mr Abbas said : `` Every day without an agreem...,"His Palestinian counterpart , Mahmoud Abbas , ...",,[8],,that
2076,547,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",The draft of the report was forthright : `` Cl...,The original report had concluded that ''clima...,,[5],,that
2676,701,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",He added that those `` are not solely American...,`` These are not solely American principles no...,[2],,that,
3362,895,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",We need a certifiable pay as you go budget by ...,Texas lawmakers must close a $ 185.9 million b...,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",,We need a certifiable pay as you go budget by ...,
3988,1068,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 20, 21,...","[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...",`` There is no conscious policy of the United ...,there is no conscious policy by the United Sta...,"[0, 25]",,`` '',
4899,1323,22,Direct/indirect style alternations,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",The bank also said its offer was subject to th...,The offer is also subject to Goldman signing a...,"[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...",,its offer was subject to the agreement of Drax...,
4929,1331,22,Direct/indirect style alternations,yes,"[1, 2, 3, 4, 5, 6]","[0, 1, 2]",He said they were in distress,We 're asphyxiating,"[3, 4, 5, 6]",,they were in distress,
5003,1347,22,Direct/indirect style alternations,yes,"[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11]",the two men were `` defined by dedication and ...,`` [ They ] were defined by dedication and cou...,,"[0, 11]",,`` ''


In [154]:
print_sents(1346)

The $19.50-a-share bid, comes two days after PeopleSoft revised its bid for smaller rival J.D. Edwards & Co. JDEC.O to include cash as well as stock.
Oracle's $19.50-a-share bid comes two days after PeopleSoft added cash to its original all-share deal with smaller rival J.D. Edwards & Co. JDEC.O .


The annotation for this type seem to be all over the place. Nevertheless, an easy case to deal with seems to be 
additions of 'that':

In [155]:
direct = pd.DataFrame(columns = textual_paraphrases.columns, data = copy.deepcopy(textual_paraphrases.values))
direct = direct[direct['type_id'] == 22]
direct.drop(columns=['s1_scope', 's2_scope'], inplace=True)
direct.rename(columns={'key_s1': 's1_scope', 'key_s2': 's2_scope'}, inplace=True)

In [156]:
direct[:30]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
521,141,22,Direct/indirect style alternations,yes,"`` This deal makes sense for both companies , ...","Brian Halla , CEO of NatSemi , claimed the dea...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]",,"`` This deal makes sense for both companies , ''",
745,197,22,Direct/indirect style alternations,yes,`` I just got carried away and started making ...,he got carried away and just `` started making...,"[0, 11]",,`` '',
872,225,22,Direct/indirect style alternations,yes,Mr Abbas said : `` Every day without an agreem...,"His Palestinian counterpart , Mahmoud Abbas , ...",,[8],,that
2076,547,22,Direct/indirect style alternations,yes,The draft of the report was forthright : `` Cl...,The original report had concluded that ''clima...,,[5],,that
2676,701,22,Direct/indirect style alternations,yes,He added that those `` are not solely American...,`` These are not solely American principles no...,[2],,that,
3362,895,22,Direct/indirect style alternations,yes,We need a certifiable pay as you go budget by ...,Texas lawmakers must close a $ 185.9 million b...,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",,We need a certifiable pay as you go budget by ...,
3988,1068,22,Direct/indirect style alternations,yes,`` There is no conscious policy of the United ...,there is no conscious policy by the United Sta...,"[0, 25]",,`` '',
4899,1323,22,Direct/indirect style alternations,yes,The bank also said its offer was subject to th...,The offer is also subject to Goldman signing a...,"[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...",,its offer was subject to the agreement of Drax...,
4929,1331,22,Direct/indirect style alternations,yes,He said they were in distress,We 're asphyxiating,"[3, 4, 5, 6]",,they were in distress,
5003,1347,22,Direct/indirect style alternations,yes,the two men were `` defined by dedication and ...,`` [ They ] were defined by dedication and cou...,,"[0, 11]",,`` ''


In [157]:
print_sents(2059)

"No data exists to indicate that the situation with repair stations poses a safety concern."
However, FAA spokeswoman Kathleen Bergen said no data indicate that the situation poses safety problems.


In [158]:
direct_that = direct[(direct['k1_text'] == 'that') | (direct['k2_text'] == 'that')][:30]
direct_that

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
872,225,22,Direct/indirect style alternations,yes,Mr Abbas said : `` Every day without an agreem...,"His Palestinian counterpart , Mahmoud Abbas , ...",,[8],,that
2076,547,22,Direct/indirect style alternations,yes,The draft of the report was forthright : `` Cl...,The original report had concluded that ''clima...,,[5],,that
2676,701,22,Direct/indirect style alternations,yes,He added that those `` are not solely American...,`` These are not solely American principles no...,[2],,that,
5082,1373,22,Direct/indirect style alternations,yes,"O'Donnell wrote in her autobiography , `` Find...","In her autobiography , `` Find Me , '' O'Donne...",[11],,that,
5259,1422,22,Direct/indirect style alternations,yes,`` The discovery that the MAP bug is present i...,The researchers say that the fact the MAP bug ...,,[3],,that
6791,1838,22,Direct/indirect style alternations,yes,Neither military action nor large-scale briber...,"Indeed , Wolfowitz admitted Saturday that neit...",,[5],,that
7000,1892,22,Direct/indirect style alternations,yes,"`` If I was diagnosed today with CJD , I would...",He added that if he were diagnosed with vCJD `...,,[2],,that
7403,1996,22,Direct/indirect style alternations,yes,Gibson said last month in a press statement th...,Gibson said in a June statement that he and hi...,,[6],,that
8210,2211,22,Direct/indirect style alternations,yes,"In terms of a free trade area , we 've got a l...","As for a free trade area , the official stress...",,[10],,that
8816,2369,22,Direct/indirect style alternations,yes,The study found that only about one-third of p...,Only about one-third of parents of sexually ex...,[3],,that,


In [159]:
substitute(22, direct_that, 1)

In [160]:
direct_quotemarks = direct[(direct['k1_text'] == "`` ''") | (direct['k2_text'] == "`` ''")][:10]
# 'k2_text' in the row below has other elements of the sentence, and we don't want to mess with those
direct_quotemarks.loc[direct_quotemarks['pair_id'] == 2059, 'k2_text'] = None  
direct_quotemarks 

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
745,197,22,Direct/indirect style alternations,yes,`` I just got carried away and started making ...,he got carried away and just `` started making...,"[0, 11]",,`` '',
3988,1068,22,Direct/indirect style alternations,yes,`` There is no conscious policy of the United ...,there is no conscious policy by the United Sta...,"[0, 25]",,`` '',
5003,1347,22,Direct/indirect style alternations,yes,the two men were `` defined by dedication and ...,`` [ They ] were defined by dedication and cou...,,"[0, 11]",,`` ''
7622,2059,22,Direct/indirect style alternations,yes,`` No data exists to indicate that the situati...,"However , FAA spokeswoman Kathleen Bergen said...","[0, 17]","[7, 8, 9, 10, 11, 12, 13, 14, 15]",`` '',
16877,4454,22,Direct/indirect style alternations,yes,its contention that KBR had `` delivered fuel ...,`` We believe KBR delivered fuel to Iraq at th...,,"[0, 21]",,`` ''
17540,4612,22,Direct/indirect style alternations,yes,it believed `` the long-term prospects for the...,`` We believe the long-term prospects for the ...,,"[0, 16]",,`` ''


In [161]:
substitute(22, direct_quotemarks, 8)

Double check type overwriting: row 196, sentence 1
Common indices: [0, 11] | Pre-existing types: ['3_1' '3_1']
Double check type overwriting: row 1067, sentence 1
Common indices: [0, 25] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 1346, sentence 2
Common indices: [0, 11] | Pre-existing types: ['3_1' '3_1']
Double check type overwriting: row 2058, sentence 2
Common indices: [9, 15] | Pre-existing types: ['6_0' '4_0 & 6_1']
Double check type overwriting: row 4453, sentence 2
Common indices: [0, 21] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 4611, sentence 2
Common indices: [0, 16] | Pre-existing types: ['3_0' '3_0']


In [162]:
print_sents(2058)

"The NAFTA ruling confirms that Canadian producers dump lumber in to the U.S. market," Rusty Wood, chairman of the coalition, said in a release.
"The NAFTA ruling confirms that Canadian producers dump lumber into the U.S. market," said Rusty Wood, chairman of the Coalition for Fair Lumber Imports.


In [163]:
direct_that.index.tolist()

[872,
 2076,
 2676,
 5082,
 5259,
 6791,
 7000,
 7403,
 8210,
 8816,
 9848,
 13045,
 13164,
 14395,
 14774,
 14911,
 16950,
 17053,
 17788,
 18117,
 18955,
 19905,
 20192,
 20985,
 21929]

In [164]:
indices = direct_that.index.tolist() + direct_quotemarks.index.tolist()
indices
direct_remaining = direct.drop(index=indices)
direct_remaining

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
521,141,22,Direct/indirect style alternations,yes,"`` This deal makes sense for both companies , ...","Brian Halla , CEO of NatSemi , claimed the dea...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]",,"`` This deal makes sense for both companies , ''",
3362,895,22,Direct/indirect style alternations,yes,We need a certifiable pay as you go budget by ...,Texas lawmakers must close a $ 185.9 million b...,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",,We need a certifiable pay as you go budget by ...,
4899,1323,22,Direct/indirect style alternations,yes,The bank also said its offer was subject to th...,The offer is also subject to Goldman signing a...,"[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...",,its offer was subject to the agreement of Drax...,
4929,1331,22,Direct/indirect style alternations,yes,He said they were in distress,We 're asphyxiating,"[3, 4, 5, 6]",,they were in distress,
5663,1521,22,Direct/indirect style alternations,yes,The vast majority of trades will be priced at ...,Eurex said `` the vast majority '' of trades o...,,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...",,`` the vast majority '' of trades on Eurex US ...
6286,1710,22,Direct/indirect style alternations,yes,"`` It '' s absurd , '' Funny Cide 's trainer B...","Meanwhile , Funny Cide 's trainer , Barclay Ta...",,"[11, 12, 13, 14, 16]",,the allegations `` ridiculous ''\n
7561,2048,22,Direct/indirect style alternations,yes,`` We will work with the board to ensure a smo...,He said federal regulators would work with the...,,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]",,federal regulators would work with the corpora...
7632,2061,22,Direct/indirect style alternations,yes,IAAF council member Jose Maria Odriozola said ...,`` I have proposed to the [ IAAF ] council tha...,"[7, 8, 9, 10, 11, 12, 13]",,Drummond should be excluded from the champions...,
9039,2422,22,Direct/indirect style alternations,yes,"`` Frank Quattrone is innocent , '' Keker said...",Quattrone lawyer John W. Keker said his client...,"[0, 1, 2, 3, 4, 5, 6]","[6, 7, 8, 9]","`` Frank Quattrone is innocent , ''",his client is innocent
9790,2634,22,Direct/indirect style alternations,yes,But he confessed : `` There 's total fear to s...,But he said there was a `` total fear to start...,,"[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...",,there was a `` total fear to start with becaus...


### Sentence Modality Changes

Nothing needed here; this type has zero occurences in the ETPC

### Syntax/Discourse Structure Changes (TODO)

In [165]:
textual_paraphrases[textual_paraphrases['type_id'] == 24][:10]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
36,14,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",But he added group performance would improve i...,De Sole said in the results statement that gro...,,[7],,that
148,48,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",The company has said it plans to restate its e...,The company had announced in January that it w...,,[6],,that
217,67,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","The downtime , to take place in May and June ,...",The downtime is expected to take 60 million to...,"[14, 16]","[5, 13, 14]",cut by,take out of
226,69,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",The University of Michigan released a new unde...,The University of Michigan released today a ne...,"[11, 12]","[10, 11, 12, 13, 14, 15, 16]",", dropping",after the U.S. Supreme Court struck down
265,76,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","Thomas and Tauzin say , as do many doctors , t...","Like many doctors , Mr. Thomas and Mr. Tauzin ...",[10],,that,
283,80,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",The delegates said raising and distributing fu...,Bin Laden’s men pointed out that raising and d...,,[5],,that
342,100,24,Syntax/discourse structure changes,yes,"[24, 25, 26, 27, 28, 29]","[21, 22, 23, 24, 25, 26]",after it admitted falsifying inspection reports,after admitting it falsified inspection reports,"[25, 26, 27]","[22, 23, 24]",it admitted falsifying,admitting it falsified
532,143,24,Syntax/discourse structure changes,yes,"[11, 12, 13, 14, 15, 16, 17, 18, 19]","[10, 11, 12, 13, 14, 15, 16]","Swartz repaid it in full , with interest ,",that Swartz fully repaid it with interest,,[10],,that
562,153,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 10, 11]","[0, 1, 2, 3, 4]","In two new schemes , target families",Two new schemes target families,[0],,In,
567,155,24,Syntax/discourse structure changes,yes,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1...",Miss Novikova said while there is no standard ...,Ms. Novikova said that there was no standard w...,,[9],,that


In [166]:
syn_disc = duplicate_df(textual_paraphrases)
syn_disc = syn_disc[syn_disc['type_id'] == 24]
syn_disc.drop(columns=['s1_scope', 's2_scope'], inplace=True)
syn_disc.rename(columns={'key_s1': 's1_scope', 'key_s2': 's2_scope'}, inplace=True)

syn_disc_add, syn_disc_sub = split_add_sub(syn_disc)

In [167]:
syn_disc_add

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
36,14,24,Syntax/discourse structure changes,yes,But he added group performance would improve i...,De Sole said in the results statement that gro...,,[7],,that
148,48,24,Syntax/discourse structure changes,yes,The company has said it plans to restate its e...,The company had announced in January that it w...,,[6],,that
265,76,24,Syntax/discourse structure changes,yes,"Thomas and Tauzin say , as do many doctors , t...","Like many doctors , Mr. Thomas and Mr. Tauzin ...",[10],,that,
283,80,24,Syntax/discourse structure changes,yes,The delegates said raising and distributing fu...,Bin Laden’s men pointed out that raising and d...,,[5],,that
532,143,24,Syntax/discourse structure changes,yes,"Swartz repaid it in full , with interest ,",that Swartz fully repaid it with interest,,[10],,that
...,...,...,...,...,...,...,...,...,...,...
21229,5565,24,Syntax/discourse structure changes,yes,State Education Commissioner Kent King said We...,Missouri Education Commissioner Kent King said...,[7],,that,
21490,5637,24,Syntax/discourse structure changes,yes,"Colin Powell , the Secretary of State , said c...",Secretary of State Colin Powell said yesterday...,,[7],,that
21752,5711,24,Syntax/discourse structure changes,yes,"ALBANY , N.Y. State Senate Majority Leader Jos...","LBANY , Aug. 8 Joseph L. Bruno , the State Sen...",,[16],,that
21793,5722,24,Syntax/discourse structure changes,yes,Bush declared that the British government `` h...,"Bush said , `` The British government has lear...",[13],,that,


Of those, let's subset only additions/deletions of `'that'`, since we know those map nicely to addition/deletion 
of function words

In [168]:
syn_disc_add_that = syn_disc_add[(syn_disc_add['k1_text'] == 'that') | (syn_disc_add['k2_text'] == 'that')]
syn_disc_add_that

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
36,14,24,Syntax/discourse structure changes,yes,But he added group performance would improve i...,De Sole said in the results statement that gro...,,[7],,that
148,48,24,Syntax/discourse structure changes,yes,The company has said it plans to restate its e...,The company had announced in January that it w...,,[6],,that
265,76,24,Syntax/discourse structure changes,yes,"Thomas and Tauzin say , as do many doctors , t...","Like many doctors , Mr. Thomas and Mr. Tauzin ...",[10],,that,
283,80,24,Syntax/discourse structure changes,yes,The delegates said raising and distributing fu...,Bin Laden’s men pointed out that raising and d...,,[5],,that
532,143,24,Syntax/discourse structure changes,yes,"Swartz repaid it in full , with interest ,",that Swartz fully repaid it with interest,,[10],,that
...,...,...,...,...,...,...,...,...,...,...
21229,5565,24,Syntax/discourse structure changes,yes,State Education Commissioner Kent King said We...,Missouri Education Commissioner Kent King said...,[7],,that,
21490,5637,24,Syntax/discourse structure changes,yes,"Colin Powell , the Secretary of State , said c...",Secretary of State Colin Powell said yesterday...,,[7],,that
21752,5711,24,Syntax/discourse structure changes,yes,"ALBANY , N.Y. State Senate Majority Leader Jos...","LBANY , Aug. 8 Joseph L. Bruno , the State Sen...",,[16],,that
21793,5722,24,Syntax/discourse structure changes,yes,Bush declared that the British government `` h...,"Bush said , `` The British government has lear...",[13],,that,


In [169]:
substitute(24, syn_disc_add_that, 1)

Double check type overwriting: row 2334, sentence 1
Common indices: [4] | Pre-existing types: ['3_0']
Double check type overwriting: row 2368, sentence 2
Common indices: [10] | Pre-existing types: ['3_0']


Double check type overwriting: row 4792, sentence 2
Common indices: [12] | Pre-existing types: ['3_0']


In [170]:
positives.loc[2334]

idx                                                                    2334
sentence1                 "The only thing that I know for certain is tha...
sentence2                 But he added: "The only thing I know for certa...
sentence1_tokenized       [``, The, only, thing, that, I, know, for, cer...
sentence2_tokenized       [But, he, added, :, ``, The, only, thing, I, k...
etpc_label                                                                1
mrpc_label                                                                1
ept_names                 [Same Polarity Substitution (contextual), Same...
ept_ids                                              [6, 6, 24, 26, 25, 29]
sentence1_scope_etpc      [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 2...
sentence2_scope_etpc      [25, 6, 6, 26, 26, 26, 26, 26, 26, 26, 26, 26,...
sentence1_segment_text    [Bush, told, The only thing that I know for ce...
sentence2_segment_text    [he, added, The only thing I know for certain ...
sentence1_sc

Let's see what remains

In [171]:
syn_disc_add.drop(syn_disc_add_that.index.tolist(), inplace=True)
syn_disc_add

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syn_disc_add.drop(syn_disc_add_that.index.tolist(), inplace=True)


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
562,153,24,Syntax/discourse structure changes,yes,"In two new schemes , target families",Two new schemes target families,[0],,In,
613,164,24,Syntax/discourse structure changes,yes,She claimed all the babies were born full-term...,What she told our investigators was that all t...,,"[1, 6, 7]",,What was that
1204,308,24,Syntax/discourse structure changes,yes,"The new effort , Taxpayers Against the Recall ...","Called `` Taxpayers Against the Recall , '' it...",,"[0, 1, 7, 8]",,Called `` '' it
1731,450,24,Syntax/discourse structure changes,yes,the standards body warns,according to the W3C 's notice,,"[17, 18]",,according to
1821,474,24,Syntax/discourse structure changes,yes,Sendmail said,according to Sendmail,,"[10, 11]",,according to
2239,592,24,Syntax/discourse structure changes,yes,it 's a technique that 's been successful in p...,the technique has successfully predicted,"[3, 4, 7]",,it 's that,
2517,656,24,Syntax/discourse structure changes,yes,according to the report,the report noted,"[13, 14]",,according to,
2735,724,24,Syntax/discourse structure changes,yes,"For the third time in the past four years ,",It was the third time in four years that,,"[0, 1, 8]",,It was that
3037,802,24,Syntax/discourse structure changes,yes,The numbers highlight a conundrum :,"As stark as the numbers themselves , is the co...",,"[0, 2, 7]",,As as is
3160,843,24,Syntax/discourse structure changes,yes,US pressure had provoked,it was U.S. pressure which had provoked,,"[12, 13, 16]",,it was which


In [172]:
substitute(24, syn_disc_add, 2)

Double check type overwriting: row 307, sentence 2
Common indices: [8] | Pre-existing types: ['5_0']
Double check type overwriting: row 3975, sentence 1
Common indices: [9, 10] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 4499, sentence 1
Common indices: [17] | Pre-existing types: ['3_0']


In [173]:
syn_disc_sub[150:]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
21025,5513,24,Syntax/discourse structure changes,yes,a minibus detonated a bomb in a Baghdad traffi...,a bomb explosion in a Baghdad traffic tunnel k...,"[5, 6, 8, 15, 19]","[3, 4, 11, 15]",detonated a bomb killing wounding,a bomb killed wounded
21237,5566,24,Syntax/discourse structure changes,yes,", which has agreed to handle his surrender",about arranging his surrender,"[12, 13, 14, 15, 16]","[11, 12]",which has agreed to handle,about arranging
21348,5593,24,Syntax/discourse structure changes,yes,The company posted a profit of $ 54.3 million ...,"That was up from the year-ago quarter , when t...",[16],"[0, 1, 2, 3, 8]",in,That was up from when
21353,5600,24,Syntax/discourse structure changes,yes,"Among those waiting a turn was Jodie Singer , ...","Jodie Singer , a sixth-grader from Washington ...","[0, 1, 2, 3, 4, 5]","[10, 11, 12, 13]",Among those waiting a turn was,anxiously awaited her turn
21416,5614,24,Syntax/discourse structure changes,yes,California lost $ 937 million to corporate tax...,California 's lost tax revenue was mostly due ...,[8],"[5, 6, 7, 8]",to,was mostly due to
21515,5645,24,Syntax/discourse structure changes,yes,Another shooting linked to the spree occurred ...,The latest shooting linked to the spree was a ...,[6],"[7, 8, 11]",occurred,was a shooting
21527,5648,24,Syntax/discourse structure changes,yes,Congress is the best forum for weighing,Congress is the best forum to address,[8],[8],for,to
21535,5651,24,Syntax/discourse structure changes,yes,And because it is so far out in international ...,It is so far out in international water that t...,"[1, 10]",[8],"because ,",that
21543,5652,24,Syntax/discourse structure changes,yes,a set of guidelines to help public administrat...,guidelines to member governments on how to mig...,"[10, 13, 14, 23, 24]","[10, 11]",help decide whether or not,on how
21611,5672,24,Syntax/discourse structure changes,yes,Only Intel Corp. 's 0.3 percent yield was lower .,Only Intel Corp. has a lower dividend yield .\n,"[7, 8]","[3, 4, 5]",was lower,has a lower


In [174]:
print_sents(2540)

Wal-Mart, the nation's largest private employer, has expanded its antidiscrimination policy to protect gay and lesbian employees, company officials said Tuesday.
Wal-Mart Stores Inc., the nation's largest private employer, will now include gays and lesbians in its anti-discrimination policy, company officials said Wednesday.


In [175]:
positives.loc[438,:]

idx                                                                     438
sentence1                 The letter stated that a premature stillborn b...
sentence2                 According to the writer of the letter, the inf...
sentence1_tokenized       [The, letter, stated, that, a, premature, stil...
sentence2_tokenized       [According, to, the, writer, of, the, letter, ...
etpc_label                                                                1
mrpc_label                                                                1
ept_names                 [Same Polarity Substitution (contextual), Synt...
ept_ids                                             [6, 24, 25, 25, 29, 21]
sentence1_scope_etpc      [24, 24, 24, 24, 6, 6, 6, 6, 25, 25, 25, 25, 2...
sentence2_scope_etpc      [24, 24, 24, 24, 24, 24, 24, 24, 6, 6, 0, 0, 0...
sentence1_segment_text    [a premature stillborn baby, The letter stated...
sentence2_segment_text    [the infant, According to the writer of the le...
sentence1_sc

### Semantic Based

In [176]:
textual_paraphrases[textual_paraphrases['type_id'] == 28]

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
102,35,28,Semantic based,yes,"[0, 1, 2, 3, 4, 5]",[10],The largest gains were seen in,increased,,,,
129,41,28,Semantic based,yes,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","[3, 4, 5, 6, 7, 8, 9, 10]",the court upheld Cleveland 's school voucher p...,the court ruled 5-4 in an Ohio case,,,,
166,51,28,Semantic based,yes,"[13, 14, 15, 16]","[18, 19, 20, 21]",they were legally employed,they have legal status,,,,
214,64,28,Semantic based,yes,"[2, 3, 4, 5, 6, 7, 8]","[5, 6, 7, 8, 9]",what PeopleSoft management would have you believe,the contentions of PeopleSoft management,,,,
223,68,28,Semantic based,yes,"[4, 5, 6, 7, 8, 9, 10, 11, 12, 13]","[0, 1, 2, 3, 4]",Troy is expected to be sentenced to life in pr...,Troy faces life in prison,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
21906,5756,28,Semantic based,yes,"[20, 21]","[22, 25, 26, 27, 28, 29, 30, 31]",work for,be & apos ; s chief operating officer,,,,
21935,5767,28,Semantic based,yes,"[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 2...","[13, 14, 15, 16, 17, 18, 19, 20, 21, 22]","43,000 jobs in Santa Clara County and 18,000 j...","nearly 62,000 jobs in the Santa Clara County-S...",,,,
21953,5772,28,Semantic based,yes,[0],"[2, 3, 4]",After,into the study,,,,
21980,5781,28,Semantic based,yes,"[8, 9, 10, 11, 12, 13, 14, 15, 16]","[8, 9, 10, 11]",13 of the state 's 16 fatalities were reported,13 people were killed,,,,


In [177]:
print_sents(226)

We remain hopeful that the city will agree to work with us and engage in good-faith discussions on this issue."
Alhart said the governor "remains hopeful that the city will continue to work with us and engage in good-faith discussions."


### Ellipsis

In [178]:
ellip = duplicate_df(textual_paraphrases[textual_paraphrases['type_id'] == 16])
ellip['k1_text'] = ellip['k1_text'].str.lower()
ellip['k2_text'] = ellip['k2_text'].str.lower()
ellip.drop(columns=['s1_scope', 's2_scope'], inplace=True)
ellip.rename(columns={'key_s1': 's1_scope', 'key_s2': 's2_scope'}, inplace=True)
ellip

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
0,150,16,Ellipsis,yes,would take would require,would take require,"[3, 8]",[3],would would,would
1,273,16,Ellipsis,yes,short-lived or long-lived,short- or long-lived,"[13, 15]",[14],short-lived long-lived,long-lived
2,336,16,Ellipsis,yes,"The company will launch 800 hot spots , or `` ...",The service will launch later this summer with...,"[0, 1]","[0, 1, 12]",the company,the service sprint
3,470,16,Ellipsis,yes,with aggravated harassment and criminal posses...,with aggravated harassment in the phone call c...,[21],"[10, 19]",with,with with
4,594,16,Ellipsis,yes,"After three months , Atkins dieters had lost a...","Three months into the study , the Atkins group...","[12, 16]",[16],pounds pounds,pounds
...,...,...,...,...,...,...,...,...,...,...
61,5384,16,Ellipsis,yes,12-by-18-inch,12-inch-by-18-inch,[9],[11],12-by-18-inch,12-inch-by-18-inch
62,5744,16,Ellipsis,yes,to establish relationships and to make sure,to establish relationships and make sure,"[6, 10]",[6],to to,to
63,5749,16,Ellipsis,yes,not because of who she is but what she did,not because of who she is but because of what ...,"[6, 7]","[7, 8, 13, 14]",because of,because of because of
64,5772,16,Ellipsis,yes,14.7 pounds 5.8 pounds,15 pounds five,"[12, 16]",[16],pounds pounds,pounds


In [179]:
def same(string1, string2):
    return set(string1.split()) == set(string2.split())

In [180]:
samie = ellip[ellip.apply(lambda x: same(x.k1_text, x.k2_text), axis=1)]
ellip.drop(samie.index.tolist(), inplace=True)
samie

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
0,150,16,Ellipsis,yes,would take would require,would take require,"[3, 8]",[3],would would,would
3,470,16,Ellipsis,yes,with aggravated harassment and criminal posses...,with aggravated harassment in the phone call c...,[21],"[10, 19]",with,with with
4,594,16,Ellipsis,yes,"After three months , Atkins dieters had lost a...","Three months into the study , the Atkins group...","[12, 16]",[16],pounds pounds,pounds
5,627,16,Ellipsis,yes,We believe and will defend,We believe and we will defend,[0],"[1, 8]",we,we we
7,768,16,Ellipsis,yes,"Dell has about 32 percent of the U.S. market ,...",Dell has 32 percent of the PC market in the Un...,[1],"[1, 15]",has,has has
9,1207,16,Ellipsis,yes,was lying watching,was lying was watching,[10],"[11, 24]",was,was was
10,1357,16,Ellipsis,yes,"Of personal vehicles , 57 percent are cars or ...","Of all personal vehicles , 57 percent are cars...",[6],"[7, 15, 24]",are,are are are
17,1936,16,Ellipsis,yes,other producers ' server software can work wit...,other producers ' server software can connect ...,[16],"[17, 29]",can,can can
18,2232,16,Ellipsis,yes,About 10 percent of high school and 16 percent...,16 percent of elementary and middle school stu...,[11],"[10, 17]",students,students students
19,2240,16,Ellipsis,yes,will keep the Interwoven name and be headquart...,will be named Interwoven and will be headquart...,[3],"[3, 8]",will,will will


In [181]:
preps = ('at', 'from', 'in', 'the', 'to')
samie_preps = samie[(samie['k1_text'].isin(preps)) | (samie['k2_text'].isin(preps))]
samie_preps

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
20,2339,16,Ellipsis,yes,from multiple screen names and other POP and I...,"from multiple AOL screen names , as well as fr...",[6],"[5, 14]",from,from from
25,2945,16,Ellipsis,yes,at 160 on June 16 and at 120 on June 23,at 160 on June 16 and 120 at June 23,"[16, 22]",[4],at at,at
41,3873,16,Ellipsis,yes,Testing of the swimsuit at a state police lab ...,Testing at a Massachusetts State Police lab an...,"[4, 10]",[1],at at,at
45,4220,16,Ellipsis,yes,to resign or negotiate,to resign or to negotiate,[9],"[6, 10]",to,to to
52,4613,16,Ellipsis,yes,in Washington and in New York City,in Washington and New York,"[11, 18]",[9],in in,in
57,5106,16,Ellipsis,yes,the pledges and the minute of silence,the pledges and moment of silence,"[20, 23]",[21],the the,the
62,5744,16,Ellipsis,yes,to establish relationships and to make sure,to establish relationships and make sure,"[6, 10]",[6],to to,to


In [182]:
print_sents(4220)

The MDC called the strike to force Mr Mugabe to either resign or negotiate a settlement of the Zimbabwe crisis.
The MDC called the week-long protest to urge Mugabe either to resign or to negotiate a settlement of the crisis gripping the country.


Let's delete the first preposition

In [183]:
samie_preps['s1_scope'].apply(lambda x: x.pop(0))
samie_preps['s2_scope'].apply(lambda x: x.pop(0))

20     5
25     4
41     1
45     6
52     9
57    21
62     6
Name: s2_scope, dtype: int64

In [184]:
samie_preps

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
20,2339,16,Ellipsis,yes,from multiple screen names and other POP and I...,"from multiple AOL screen names , as well as fr...",[],[14],from,from from
25,2945,16,Ellipsis,yes,at 160 on June 16 and at 120 on June 23,at 160 on June 16 and 120 at June 23,[22],[],at at,at
41,3873,16,Ellipsis,yes,Testing of the swimsuit at a state police lab ...,Testing at a Massachusetts State Police lab an...,[10],[],at at,at
45,4220,16,Ellipsis,yes,to resign or negotiate,to resign or to negotiate,[],[10],to,to to
52,4613,16,Ellipsis,yes,in Washington and in New York City,in Washington and New York,[18],[],in in,in
57,5106,16,Ellipsis,yes,the pledges and the minute of silence,the pledges and moment of silence,[23],[],the the,the
62,5744,16,Ellipsis,yes,to establish relationships and to make sure,to establish relationships and make sure,[10],[],to to,to


In [185]:
samie_preps.loc[[20,45], 's1_scope'] = None

In [186]:
samie_preps.loc[[25,41,52,57,62], 's2_scope'] = None

In [187]:
samie_preps

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
20,2339,16,Ellipsis,yes,from multiple screen names and other POP and I...,"from multiple AOL screen names , as well as fr...",,[14],from,from from
25,2945,16,Ellipsis,yes,at 160 on June 16 and at 120 on June 23,at 160 on June 16 and 120 at June 23,[22],,at at,at
41,3873,16,Ellipsis,yes,Testing of the swimsuit at a state police lab ...,Testing at a Massachusetts State Police lab an...,[10],,at at,at
45,4220,16,Ellipsis,yes,to resign or negotiate,to resign or to negotiate,,[10],to,to to
52,4613,16,Ellipsis,yes,in Washington and in New York City,in Washington and New York,[18],,in in,in
57,5106,16,Ellipsis,yes,the pledges and the minute of silence,the pledges and moment of silence,[23],,the the,the
62,5744,16,Ellipsis,yes,to establish relationships and to make sure,to establish relationships and make sure,[10],,to to,to


In [188]:
substitute(16, samie_preps, 1)

In [189]:
samie.drop(samie_preps.index.tolist(), inplace=True)
samie

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samie.drop(samie_preps.index.tolist(), inplace=True)


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
0,150,16,Ellipsis,yes,would take would require,would take require,"[3, 8]",[3],would would,would
3,470,16,Ellipsis,yes,with aggravated harassment and criminal posses...,with aggravated harassment in the phone call c...,[21],"[10, 19]",with,with with
4,594,16,Ellipsis,yes,"After three months , Atkins dieters had lost a...","Three months into the study , the Atkins group...","[12, 16]",[16],pounds pounds,pounds
5,627,16,Ellipsis,yes,We believe and will defend,We believe and we will defend,[0],"[1, 8]",we,we we
7,768,16,Ellipsis,yes,"Dell has about 32 percent of the U.S. market ,...",Dell has 32 percent of the PC market in the Un...,[1],"[1, 15]",has,has has
9,1207,16,Ellipsis,yes,was lying watching,was lying was watching,[10],"[11, 24]",was,was was
10,1357,16,Ellipsis,yes,"Of personal vehicles , 57 percent are cars or ...","Of all personal vehicles , 57 percent are cars...",[6],"[7, 15, 24]",are,are are are
17,1936,16,Ellipsis,yes,other producers ' server software can work wit...,other producers ' server software can connect ...,[16],"[17, 29]",can,can can
18,2232,16,Ellipsis,yes,About 10 percent of high school and 16 percent...,16 percent of elementary and middle school stu...,[11],"[10, 17]",students,students students
19,2240,16,Ellipsis,yes,will keep the Interwoven name and be headquart...,will be named Interwoven and will be headquart...,[3],"[3, 8]",will,will will


In [190]:
print_sents(2995)

But I would rather be talking about high standards than low standards."
"I would rather be talking about positive numbers rather than negative.


In [191]:
samie['s1_scope'].apply(lambda x: x.pop(0))
samie['s2_scope'].apply(lambda x: x.pop(0))

0      3
3     10
4     16
5      1
7      1
9     11
10     7
17    17
18    10
19     3
22     1
23     3
24     9
26     9
28     8
29    17
30     3
37    15
42    11
43     2
44     6
46    16
47     5
48     4
49     5
50     1
53     7
58     5
59    10
60     9
63     7
64    16
Name: s2_scope, dtype: int64

In [192]:
samie

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
0,150,16,Ellipsis,yes,would take would require,would take require,[8],[],would would,would
3,470,16,Ellipsis,yes,with aggravated harassment and criminal posses...,with aggravated harassment in the phone call c...,[],[19],with,with with
4,594,16,Ellipsis,yes,"After three months , Atkins dieters had lost a...","Three months into the study , the Atkins group...",[16],[],pounds pounds,pounds
5,627,16,Ellipsis,yes,We believe and will defend,We believe and we will defend,[],[8],we,we we
7,768,16,Ellipsis,yes,"Dell has about 32 percent of the U.S. market ,...",Dell has 32 percent of the PC market in the Un...,[],[15],has,has has
9,1207,16,Ellipsis,yes,was lying watching,was lying was watching,[],[24],was,was was
10,1357,16,Ellipsis,yes,"Of personal vehicles , 57 percent are cars or ...","Of all personal vehicles , 57 percent are cars...",[],"[15, 24]",are,are are are
17,1936,16,Ellipsis,yes,other producers ' server software can work wit...,other producers ' server software can connect ...,[],[29],can,can can
18,2232,16,Ellipsis,yes,About 10 percent of high school and 16 percent...,16 percent of elementary and middle school stu...,[],[17],students,students students
19,2240,16,Ellipsis,yes,will keep the Interwoven name and be headquart...,will be named Interwoven and will be headquart...,[],[8],will,will will


In [193]:
samie['s1_scope'] = samie['s1_scope'].apply(lambda x: None if not x else x)
samie['s2_scope'] = samie['s2_scope'].apply(lambda x: None if not x else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samie['s1_scope'] = samie['s1_scope'].apply(lambda x: None if not x else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samie['s2_scope'] = samie['s2_scope'].apply(lambda x: None if not x else x)


In [194]:
samie

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
0,150,16,Ellipsis,yes,would take would require,would take require,[8],,would would,would
3,470,16,Ellipsis,yes,with aggravated harassment and criminal posses...,with aggravated harassment in the phone call c...,,[19],with,with with
4,594,16,Ellipsis,yes,"After three months , Atkins dieters had lost a...","Three months into the study , the Atkins group...",[16],,pounds pounds,pounds
5,627,16,Ellipsis,yes,We believe and will defend,We believe and we will defend,,[8],we,we we
7,768,16,Ellipsis,yes,"Dell has about 32 percent of the U.S. market ,...",Dell has 32 percent of the PC market in the Un...,,[15],has,has has
9,1207,16,Ellipsis,yes,was lying watching,was lying was watching,,[24],was,was was
10,1357,16,Ellipsis,yes,"Of personal vehicles , 57 percent are cars or ...","Of all personal vehicles , 57 percent are cars...",,"[15, 24]",are,are are are
17,1936,16,Ellipsis,yes,other producers ' server software can work wit...,other producers ' server software can connect ...,,[29],can,can can
18,2232,16,Ellipsis,yes,About 10 percent of high school and 16 percent...,16 percent of elementary and middle school stu...,,[17],students,students students
19,2240,16,Ellipsis,yes,will keep the Interwoven name and be headquart...,will be named Interwoven and will be headquart...,,[8],will,will will


In [195]:
samie_none = samie[(samie['s1_scope'].isnull()) | (samie['s2_scope'].isnull())]

Correcting annotation mistake (present in original ETPC)

In [196]:
samie_none.loc[26, 'key_s2'] = [9]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samie_none.loc[26, 'key_s2'] = [9]


In [197]:
samie_none

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text,key_s2
0,150,16,Ellipsis,yes,would take would require,would take require,[8],,would would,would,
3,470,16,Ellipsis,yes,with aggravated harassment and criminal posses...,with aggravated harassment in the phone call c...,,[19],with,with with,
4,594,16,Ellipsis,yes,"After three months , Atkins dieters had lost a...","Three months into the study , the Atkins group...",[16],,pounds pounds,pounds,
5,627,16,Ellipsis,yes,We believe and will defend,We believe and we will defend,,[8],we,we we,
7,768,16,Ellipsis,yes,"Dell has about 32 percent of the U.S. market ,...",Dell has 32 percent of the PC market in the Un...,,[15],has,has has,
9,1207,16,Ellipsis,yes,was lying watching,was lying was watching,,[24],was,was was,
10,1357,16,Ellipsis,yes,"Of personal vehicles , 57 percent are cars or ...","Of all personal vehicles , 57 percent are cars...",,"[15, 24]",are,are are are,
17,1936,16,Ellipsis,yes,other producers ' server software can work wit...,other producers ' server software can connect ...,,[29],can,can can,
18,2232,16,Ellipsis,yes,About 10 percent of high school and 16 percent...,16 percent of elementary and middle school stu...,,[17],students,students students,
19,2240,16,Ellipsis,yes,will keep the Interwoven name and be headquart...,will be named Interwoven and will be headquart...,,[8],will,will will,


In [198]:
substitute(16, samie_none, 2)

Double check type overwriting: row 3361, sentence 1
Common indices: [19] | Pre-existing types: ['3_0']
Double check type overwriting: row 4282, sentence 2
Common indices: [13] | Pre-existing types: ['5_0']


In [199]:
samie.drop(samie_none.index.tolist(), inplace=True)
samie

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samie.drop(samie_none.index.tolist(), inplace=True)


Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
22,2643,16,Ellipsis,yes,"`` She was crying and scared , '","`` She was crying and she was really scared , ''",[2],"[2, 5, 6]",she was,she was she was
23,2772,16,Ellipsis,yes,sales of grocery and other consumer packaged p...,sales of grocery and other consumer packaged p...,[3],"[4, 15, 16]",sales of,sales of sales of
28,3338,16,Ellipsis,yes,a profit of 30 cents a share and $ 1.31,a profit of 30 cents a share and $ 1.31 a shar...,[11],"[9, 17, 18]",a share,a share a share
43,4039,16,Ellipsis,yes,She was the only woman in her unit and a membe...,She was the only woman employed as a warehouse...,"[3, 4]","[3, 4, 15, 16, 17]",the only woman,the only woman the only woman
47,4256,16,Ellipsis,yes,the fourth most common in men and the eighth m...,the fourth most common in men and the eighth i...,"[9, 18, 19]",[6],most common most common,most common
50,4474,16,Ellipsis,yes,"`` I have lots of bad dreams , I have flashbac...","`` I have lots of bad dreams , flashbacks and ...","[2, 8, 9, 12, 13]",[2],i have i have i have,i have
59,5127,16,Ellipsis,yes,will be consolidated and will be based,will be consolidated and based,"[9, 16, 17]",[11],will be will be,will be
63,5749,16,Ellipsis,yes,not because of who she is but what she did,not because of who she is but because of what ...,[7],"[8, 13, 14]",because of,because of because of


In [200]:
samie.loc[[23,28,43,63], 's1_scope'] = None
samie.loc[[47,50,59], 's2_scope'] = None
samie

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
22,2643,16,Ellipsis,yes,"`` She was crying and scared , '","`` She was crying and she was really scared , ''",[2],"[2, 5, 6]",she was,she was she was
23,2772,16,Ellipsis,yes,sales of grocery and other consumer packaged p...,sales of grocery and other consumer packaged p...,,"[4, 15, 16]",sales of,sales of sales of
28,3338,16,Ellipsis,yes,a profit of 30 cents a share and $ 1.31,a profit of 30 cents a share and $ 1.31 a shar...,,"[9, 17, 18]",a share,a share a share
43,4039,16,Ellipsis,yes,She was the only woman in her unit and a membe...,She was the only woman employed as a warehouse...,,"[3, 4, 15, 16, 17]",the only woman,the only woman the only woman
47,4256,16,Ellipsis,yes,the fourth most common in men and the eighth m...,the fourth most common in men and the eighth i...,"[9, 18, 19]",,most common most common,most common
50,4474,16,Ellipsis,yes,"`` I have lots of bad dreams , I have flashbac...","`` I have lots of bad dreams , flashbacks and ...","[2, 8, 9, 12, 13]",,i have i have i have,i have
59,5127,16,Ellipsis,yes,will be consolidated and will be based,will be consolidated and based,"[9, 16, 17]",,will be will be,will be
63,5749,16,Ellipsis,yes,not because of who she is but what she did,not because of who she is but because of what ...,,"[8, 13, 14]",because of,because of because of


In [201]:
samie.loc[[47,50,59], 's1_scope'].apply(lambda x: x.pop(0))
samie.loc[[23,28,63], 's2_scope'].apply(lambda x: x.pop(0))
samie

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
22,2643,16,Ellipsis,yes,"`` She was crying and scared , '","`` She was crying and she was really scared , ''",[2],"[2, 5, 6]",she was,she was she was
23,2772,16,Ellipsis,yes,sales of grocery and other consumer packaged p...,sales of grocery and other consumer packaged p...,,"[15, 16]",sales of,sales of sales of
28,3338,16,Ellipsis,yes,a profit of 30 cents a share and $ 1.31,a profit of 30 cents a share and $ 1.31 a shar...,,"[17, 18]",a share,a share a share
43,4039,16,Ellipsis,yes,She was the only woman in her unit and a membe...,She was the only woman employed as a warehouse...,,"[3, 4, 15, 16, 17]",the only woman,the only woman the only woman
47,4256,16,Ellipsis,yes,the fourth most common in men and the eighth m...,the fourth most common in men and the eighth i...,"[18, 19]",,most common most common,most common
50,4474,16,Ellipsis,yes,"`` I have lots of bad dreams , I have flashbac...","`` I have lots of bad dreams , flashbacks and ...","[8, 9, 12, 13]",,i have i have i have,i have
59,5127,16,Ellipsis,yes,will be consolidated and will be based,will be consolidated and based,"[16, 17]",,will be will be,will be
63,5749,16,Ellipsis,yes,not because of who she is but what she did,not because of who she is but because of what ...,,"[13, 14]",because of,because of because of


In [202]:
samie.at[50, 's1_scope'] = [8, 9, 12, 13]
samie.at[43, 's2_scope'] = [15,16,17]
samie

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
22,2643,16,Ellipsis,yes,"`` She was crying and scared , '","`` She was crying and she was really scared , ''",[2],"[2, 5, 6]",she was,she was she was
23,2772,16,Ellipsis,yes,sales of grocery and other consumer packaged p...,sales of grocery and other consumer packaged p...,,"[15, 16]",sales of,sales of sales of
28,3338,16,Ellipsis,yes,a profit of 30 cents a share and $ 1.31,a profit of 30 cents a share and $ 1.31 a shar...,,"[17, 18]",a share,a share a share
43,4039,16,Ellipsis,yes,She was the only woman in her unit and a membe...,She was the only woman employed as a warehouse...,,"[15, 16, 17]",the only woman,the only woman the only woman
47,4256,16,Ellipsis,yes,the fourth most common in men and the eighth m...,the fourth most common in men and the eighth i...,"[18, 19]",,most common most common,most common
50,4474,16,Ellipsis,yes,"`` I have lots of bad dreams , I have flashbac...","`` I have lots of bad dreams , flashbacks and ...","[8, 9, 12, 13]",,i have i have i have,i have
59,5127,16,Ellipsis,yes,will be consolidated and will be based,will be consolidated and based,"[16, 17]",,will be will be,will be
63,5749,16,Ellipsis,yes,not because of who she is but what she did,not because of who she is but because of what ...,,"[13, 14]",because of,because of because of


In [203]:
substitute(16, samie, 2)

In [204]:
ellip

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
1,273,16,Ellipsis,yes,short-lived or long-lived,short- or long-lived,"[13, 15]",[14],short-lived long-lived,long-lived
2,336,16,Ellipsis,yes,"The company will launch 800 hot spots , or `` ...",The service will launch later this summer with...,"[0, 1]","[0, 1, 12]",the company,the service sprint
6,697,16,Ellipsis,yes,People who are high in positive emotions sleep...,"Happy people sleep better , have better diets ...","[1, 2, 3, 4, 5, 6, 7, 11, 16, 20]","[1, 2]",people who are high in positive emotions they ...,happy people
8,785,16,Ellipsis,yes,"Jacob has pushed consolidation for years , but...",Jacob has pushed consolidation for years but said,"[0, 8]",[0],jacob he,jacob
11,1458,16,Ellipsis,yes,the Fed will acknowledge risks are tilted towa...,the central bank will say risks are tilted tow...,"[10, 11, 18, 19]","[10, 11]",risks are they are,risks are
12,1500,16,Ellipsis,yes,immune systems suppressed by medications or by...,suppressed immune systems due to illness or me...,"[10, 13]","[10, 11]",by by,due to
13,1588,16,Ellipsis,yes,"He said it was a mistake , and he reimbursed t...",The governor said the use of the credit card w...,"[0, 8]","[0, 1]",he he,the governor
14,1598,16,Ellipsis,yes,some of the passengers,no passenger but some,[6],[4],passengers,passenger
15,1635,16,Ellipsis,yes,"11M , 22M , 33M , 44M and 55Mbit/sec","11 , 22 , 33 , 44 and 55Mbit/s","[18, 20, 22, 24]",[25],22m 33m 44m 55mbit/sec,55mbit/s
16,1830,16,Ellipsis,yes,someone strangled her and she may have been se...,Park appeared to have been strangled and may h...,"[4, 6]",[0],her she,park


In [205]:
ellip['s1_scope'], ellip['s2_scope'], ellip['k1_text'], ellip['k2_text'] = ellip.apply(lambda x: trim_duplicates(x.s1_scope, x.s2_scope, x.k1_text, x.k2_text), axis=1, result_type='expand').transpose().values
ellip

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
1,273,16,Ellipsis,yes,short-lived or long-lived,short- or long-lived,[13],[],short-lived,
2,336,16,Ellipsis,yes,"The company will launch 800 hot spots , or `` ...",The service will launch later this summer with...,[1],"[1, 12]",company,service sprint
6,697,16,Ellipsis,yes,People who are high in positive emotions sleep...,"Happy people sleep better , have better diets ...","[2, 3, 4, 5, 6, 7, 11, 16, 20]",[1],who are high in positive emotions they they they,happy
8,785,16,Ellipsis,yes,"Jacob has pushed consolidation for years , but...",Jacob has pushed consolidation for years but said,[8],[],he,
11,1458,16,Ellipsis,yes,the Fed will acknowledge risks are tilted towa...,the central bank will say risks are tilted tow...,[18],[],they,
12,1500,16,Ellipsis,yes,immune systems suppressed by medications or by...,suppressed immune systems due to illness or me...,"[10, 13]","[10, 11]",by by,due to
13,1588,16,Ellipsis,yes,"He said it was a mistake , and he reimbursed t...",The governor said the use of the credit card w...,"[0, 8]","[0, 1]",he he,the governor
14,1598,16,Ellipsis,yes,some of the passengers,no passenger but some,[6],[4],passengers,passenger
15,1635,16,Ellipsis,yes,"11M , 22M , 33M , 44M and 55Mbit/sec","11 , 22 , 33 , 44 and 55Mbit/s","[18, 20, 22, 24]",[25],22m 33m 44m 55mbit/sec,55mbit/s
16,1830,16,Ellipsis,yes,someone strangled her and she may have been se...,Park appeared to have been strangled and may h...,"[4, 6]",[0],her she,park


In [206]:
ellip['k1_text'] = ellip['k1_text'].apply(lambda x: None if x == '' else x)
ellip['k2_text'] = ellip['k2_text'].apply(lambda x: None if x == '' else x)
ellip['s1_scope'] = ellip['s1_scope'].apply(lambda x: None if list(x) == [] else x)
ellip['s2_scope'] = ellip['s2_scope'].apply(lambda x: None if list(x) == [] else x)
ellip

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
1,273,16,Ellipsis,yes,short-lived or long-lived,short- or long-lived,[13],,short-lived,
2,336,16,Ellipsis,yes,"The company will launch 800 hot spots , or `` ...",The service will launch later this summer with...,[1],"[1, 12]",company,service sprint
6,697,16,Ellipsis,yes,People who are high in positive emotions sleep...,"Happy people sleep better , have better diets ...","[2, 3, 4, 5, 6, 7, 11, 16, 20]",[1],who are high in positive emotions they they they,happy
8,785,16,Ellipsis,yes,"Jacob has pushed consolidation for years , but...",Jacob has pushed consolidation for years but said,[8],,he,
11,1458,16,Ellipsis,yes,the Fed will acknowledge risks are tilted towa...,the central bank will say risks are tilted tow...,[18],,they,
12,1500,16,Ellipsis,yes,immune systems suppressed by medications or by...,suppressed immune systems due to illness or me...,"[10, 13]","[10, 11]",by by,due to
13,1588,16,Ellipsis,yes,"He said it was a mistake , and he reimbursed t...",The governor said the use of the credit card w...,"[0, 8]","[0, 1]",he he,the governor
14,1598,16,Ellipsis,yes,some of the passengers,no passenger but some,[6],[4],passengers,passenger
15,1635,16,Ellipsis,yes,"11M , 22M , 33M , 44M and 55Mbit/sec","11 , 22 , 33 , 44 and 55Mbit/s","[18, 20, 22, 24]",[25],22m 33m 44m 55mbit/sec,55mbit/s
16,1830,16,Ellipsis,yes,someone strangled her and she may have been se...,Park appeared to have been strangled and may h...,"[4, 6]",[0],her she,park


In [207]:
ellip.at[2, 's1_scope'], ellip.at[2, 's2_scope'] = None, [12]
ellip.at[6, 's1_scope'], ellip.at[6, 's2_scope'] = [11,16,20], None
ellip.at[12, 's1_scope'], ellip.at[12, 's2_scope'] = [13], None
ellip.at[13, 's1_scope'], ellip.at[13, 's2_scope'] = [8], None
ellip.at[15, 's1_scope'], ellip.at[15, 's2_scope'] = [18,20,22,24], None
ellip.at[16, 's1_scope'], ellip.at[16, 's2_scope'] = [6], None
ellip.at[21, 's1_scope'], ellip.at[21, 's2_scope'] = None, [13]
ellip.at[27, 's1_scope'], ellip.at[27, 's2_scope'] = [11], None
ellip.at[32, 's1_scope'], ellip.at[32, 's2_scope'] = [10], None
ellip.at[39, 's1_scope'], ellip.at[39, 's2_scope'] = None, [11]
ellip.at[55, 's1_scope'], ellip.at[55, 's2_scope'] = [17], None
ellip.at[56, 's1_scope'], ellip.at[56, 's2_scope'] = None, [19]
ellip.at[65, 's1_scope'], ellip.at[65, 's2_scope'] = None, [12]
ellip

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
1,273,16,Ellipsis,yes,short-lived or long-lived,short- or long-lived,[13],,short-lived,
2,336,16,Ellipsis,yes,"The company will launch 800 hot spots , or `` ...",The service will launch later this summer with...,,[12],company,service sprint
6,697,16,Ellipsis,yes,People who are high in positive emotions sleep...,"Happy people sleep better , have better diets ...","[11, 16, 20]",,who are high in positive emotions they they they,happy
8,785,16,Ellipsis,yes,"Jacob has pushed consolidation for years , but...",Jacob has pushed consolidation for years but said,[8],,he,
11,1458,16,Ellipsis,yes,the Fed will acknowledge risks are tilted towa...,the central bank will say risks are tilted tow...,[18],,they,
12,1500,16,Ellipsis,yes,immune systems suppressed by medications or by...,suppressed immune systems due to illness or me...,[13],,by by,due to
13,1588,16,Ellipsis,yes,"He said it was a mistake , and he reimbursed t...",The governor said the use of the credit card w...,[8],,he he,the governor
14,1598,16,Ellipsis,yes,some of the passengers,no passenger but some,[6],[4],passengers,passenger
15,1635,16,Ellipsis,yes,"11M , 22M , 33M , 44M and 55Mbit/sec","11 , 22 , 33 , 44 and 55Mbit/s","[18, 20, 22, 24]",,22m 33m 44m 55mbit/sec,55mbit/s
16,1830,16,Ellipsis,yes,someone strangled her and she may have been se...,Park appeared to have been strangled and may h...,[6],,her she,park


In [208]:
ellip_none = ellip[(ellip['s1_scope'].isnull()) | (ellip['s2_scope'].isnull())]
ellip_none

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_text,s2_text,s1_scope,s2_scope,k1_text,k2_text
1,273,16,Ellipsis,yes,short-lived or long-lived,short- or long-lived,[13],,short-lived,
2,336,16,Ellipsis,yes,"The company will launch 800 hot spots , or `` ...",The service will launch later this summer with...,,[12],company,service sprint
6,697,16,Ellipsis,yes,People who are high in positive emotions sleep...,"Happy people sleep better , have better diets ...","[11, 16, 20]",,who are high in positive emotions they they they,happy
8,785,16,Ellipsis,yes,"Jacob has pushed consolidation for years , but...",Jacob has pushed consolidation for years but said,[8],,he,
11,1458,16,Ellipsis,yes,the Fed will acknowledge risks are tilted towa...,the central bank will say risks are tilted tow...,[18],,they,
12,1500,16,Ellipsis,yes,immune systems suppressed by medications or by...,suppressed immune systems due to illness or me...,[13],,by by,due to
13,1588,16,Ellipsis,yes,"He said it was a mistake , and he reimbursed t...",The governor said the use of the credit card w...,[8],,he he,the governor
15,1635,16,Ellipsis,yes,"11M , 22M , 33M , 44M and 55Mbit/sec","11 , 22 , 33 , 44 and 55Mbit/s","[18, 20, 22, 24]",,22m 33m 44m 55mbit/sec,55mbit/s
16,1830,16,Ellipsis,yes,someone strangled her and she may have been se...,Park appeared to have been strangled and may h...,[6],,her she,park
21,2555,16,Ellipsis,yes,Mauresmo has the confidence of having beaten S...,She has the confidence of having beaten her fo...,,[13],mauresmo,she she


In [209]:
substitute(16, ellip_none, 2, False)

### Addition/Deletion

In [210]:
substitute(25, textual_paraphrases[textual_paraphrases['type_id'] == 25], 2)

Double check type overwriting: row 37, sentence 2
Common indices: [12] | Pre-existing types: ['3_0']
Double check type overwriting: row 37, sentence 2
Common indices: [14, 15] | Pre-existing types: ['3_0' '3_0']
Double check type overwriting: row 53, sentence 1
Common indices: [12] | Pre-existing types: ['5_0']
Double check type overwriting: row 76, sentence 2
Common indices: [22, 23, 24] | Pre-existing types: ['3_0' '3_0' '3_0']
Double check type overwriting: row 124, sentence 2
Common indices: [2] | Pre-existing types: ['3_0']
Double check type overwriting: row 172, sentence 2
Common indices: [16] | Pre-existing types: ['3_0']
Double check type overwriting: row 174, sentence 2
Common indices: [0] | Pre-existing types: ['5_1']
Double check type overwriting: row 203, sentence 2
Common indices: [2] | Pre-existing types: ['7_0']
Double check type overwriting: row 213, sentence 2
Common indices: [2, 3, 4] | Pre-existing types: ['5_0' '5_0' '5_0']
Double check type overwriting: row 217, se

Double check type overwriting: row 1316, sentence 1
Common indices: [24] | Pre-existing types: ['2_1']
Double check type overwriting: row 1376, sentence 2
Common indices: [2, 3, 4, 5, 6, 7, 8] | Pre-existing types: ['5_0' '5_0' '5_0' '5_0' '5_0' '5_0' '5_0']
Double check type overwriting: row 1396, sentence 2
Common indices: [4] | Pre-existing types: ['5_0']
Double check type overwriting: row 1412, sentence 1
Common indices: [10] | Pre-existing types: ['3_0']
Double check type overwriting: row 1484, sentence 1
Common indices: [4] | Pre-existing types: ['3_0']
Double check type overwriting: row 1484, sentence 2
Common indices: [6] | Pre-existing types: ['3_0']
Double check type overwriting: row 1484, sentence 2
Common indices: [8, 9, 10] | Pre-existing types: ['3_0' '3_0' '3_0']
Double check type overwriting: row 1494, sentence 2
Common indices: [11] | Pre-existing types: ['3_0']
Double check type overwriting: row 1596, sentence 2
Common indices: [17] | Pre-existing types: ['3_0']
Doubl

## Adding zeroes

In [211]:
test = positives['sentence1_scope'][0]
np.where(test == '')

(array([ 4,  6,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18]),)

In [212]:
test

array(['3_0', '3_0', '3_0', '3_0', '', '4_0', '', '4_1', '', '', '', '',
       '', '', '', '', '', '', ''], dtype='<U64')

In [213]:
def fill_zeroes(arr):
    """Fill '0_0' in place of empty strings in a given array"""
    arr[np.where(arr == '')] = '0_0'

In [214]:
fill_zeroes(test)
test

array(['3_0', '3_0', '3_0', '3_0', '0_0', '4_0', '0_0', '4_1', '0_0',
       '0_0', '0_0', '0_0', '0_0', '0_0', '0_0', '0_0', '0_0', '0_0',
       '0_0'], dtype='<U64')

In [215]:
positives['sentence1_scope'].apply(fill_zeroes)
positives['sentence2_scope'].apply(fill_zeroes)
positives

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,ept_names,ept_ids,sentence1_scope_etpc,sentence2_scope_etpc,sentence1_segment_text,sentence2_segment_text,sentence1_scope,sentence2_scope
0,0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...",1,1,"[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]","[26, 26, 26, 26, 0, 5, 0, 6, 25, 25, 25, 25, 2...","[6, 5, 5, 0, 25, 0, 0, 0, 0, 0, 26, 26, 26, 26...","[whom, called, Amrozi accused his brother, `` ...","[to him, Referring, Amrozi accused his brother...","[3_0, 3_0, 3_0, 3_0, 0_0, 4_0, 0_0, 4_1, 0_0, ...","[4_1, 4_0, 4_0, 0_0, 2_0, 0_0, 0_0, 0_0, 0_0, ..."
2,2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...",1,1,"[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]","[6, 0, 0, 0, 0, 0, 0, 0, 26, 26, 26, 0, 0, 0, ...","[26, 26, 26, 26, 6, 6, 6, 6, 25, 25, 25, 25, 2...","[They, cargo, on June 10, , he added, had publ...","[the ship 's owners, explosives, On June 10 ,,...","[5_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 3_0, ...","[3_0, 3_0, 3_0, 3_0, 5_0, 5_0, 5_0, 5_0, 0_0, ..."
4,4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...",0,1,"[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 6, 5, 29, 30, 0, 29, 25, 30, 29, 0, 6, 6, ...","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3...","[The stock, rose, to close at, Friday, Friday,...","[PG & E Corp. shares, jumped, to, on Friday, o...","[5_0, 5_0, 4_0, 0_0, 0_0, 8_0, 0_0, 2_0, 0_0, ...","[5_0, 5_0, 5_0, 5_0, 5_0, 4_0, 0_0, 0_0, 0_0, ..."
5,5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...",1,1,"[Synthetic/analytic substitution, Addition/Del...","[11, 25, 29]","[25, 11, 11, 11, 11, 11, 11, 11, 25, 25, 25, 2...","[25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 11, 11...","[in the first quarter of the year, Revenue dro...","[the first quarter of the year, With the scand...","[0_0, 1_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[2_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, ..."
7,7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 5, 25, 25, 29]","[25, 4, 25, 25, 25, 25, 5, 25, 25, 25]","[25, 4, 4, 25, 25, 25, 25, 25, 5, 25, 25, 25]","[DVD-CCA, state, then, The appealed to the Sup...","[DVD CCA, U.S., that decision, The appealed to...","[0_0, 7_0, 2_0, 0_0, 0_0, 0_0, 4_0, 0_0, 0_0, ...","[0_0, 7_0, 7_0, 0_0, 2_1, 2_1, 0_0, 0_0, 4_0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5792,5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...",1,1,"[Addition/Deletion, Identity, Punctuation chan...","[25, 29, 21, 6, 11, 14, 26, 25]","[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 2...","[29, 29, 29, 29, 29, 29, 29, 26, 26, 26, 26, 2...","[authorities said, Gehring waived extradition ...",[Gehring waived extradition Monday during a he...,"[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 2_1, 2_1, ..."
5793,5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...",1,1,"[Same Polarity Substitution (contextual), Chan...","[6, 26, 25, 29, 21]","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 2...","[Silver, Silver, `` I am advised that certain ...","[the Silver statement, the Silver statement, ,...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ..."
5795,5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...",0,1,"[Same Polarity Substitution (habitual), Same P...","[5, 5, 18, 29, 30]","[29, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[29, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[deal, be completed, The deal , approved by bo...","[acquisition, close, The acquisition has been ...","[0_0, 4_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, ...","[0_0, 4_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ..."
5799,5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...",1,1,"[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[26, 26, 0, 7, 7, 6, 1, 25, 25, 4, 25, 25, 25,...","[25, 25, 25, 25, 7, 0, 6, 1, 0, 0, 4, 25, 0, 2...","[Corp, power station’s, US, owners, Last week,...","[Corp., Drax, American, owner, last week, The ...","[3_0, 3_0, 0_0, 5_1, 5_1, 5_0, 6_0, 0_0, 0_0, ...","[2_0, 2_0, 2_0, 2_0, 5_1, 0_0, 5_0, 6_0, 0_0, ..."


In [216]:
positives.drop(columns=['idx', 'etpc_label', 'mrpc_label', 'ept_names', 'ept_ids', 'sentence1_scope_etpc', 
                        'sentence2_scope_etpc', 'sentence1_segment_text', 'sentence2_segment_text'], inplace=True)
positives

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives.drop(columns=['idx', 'etpc_label', 'mrpc_label', 'ept_names', 'ept_ids', 'sentence1_scope_etpc',


Unnamed: 0,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,sentence1_scope,sentence2_scope
0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...","[3_0, 3_0, 3_0, 3_0, 0_0, 4_0, 0_0, 4_1, 0_0, ...","[4_1, 4_0, 4_0, 0_0, 2_0, 0_0, 0_0, 0_0, 0_0, ..."
2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...","[5_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 3_0, ...","[3_0, 3_0, 3_0, 3_0, 5_0, 5_0, 5_0, 5_0, 0_0, ..."
4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...","[5_0, 5_0, 4_0, 0_0, 0_0, 8_0, 0_0, 2_0, 0_0, ...","[5_0, 5_0, 5_0, 5_0, 5_0, 4_0, 0_0, 0_0, 0_0, ..."
5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...","[0_0, 1_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[2_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, ..."
7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...","[0_0, 7_0, 2_0, 0_0, 0_0, 0_0, 4_0, 0_0, 0_0, ...","[0_0, 7_0, 7_0, 0_0, 2_1, 2_1, 0_0, 0_0, 4_0, ..."
...,...,...,...,...,...,...
5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 2_1, 2_1, ..."
5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ..."
5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...","[0_0, 4_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, ...","[0_0, 4_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ..."
5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...","[3_0, 3_0, 0_0, 5_1, 5_1, 5_0, 6_0, 0_0, 0_0, ...","[2_0, 2_0, 2_0, 2_0, 5_1, 0_0, 5_0, 6_0, 0_0, ..."


# Last bit of cleanup

In [217]:
def search(search_str, lookup_array):
    return any(f"{search_str}_" in element for element in lookup_array) 

In [218]:
add_dels1 = positives[positives['sentence1_scope'].apply(lambda x: search('2', x))]
add_dels2 = positives[positives['sentence2_scope'].apply(lambda x: search('2', x))]

In [219]:
def get_mask(search_str, lookup_array):
    return np.where(np.char.find(lookup_array, f'{search_str}_') >= 0)

In [220]:
add_dels1 = add_dels1.loc[:,['sentence1_tokenized', 'sentence1_scope']]
add_dels2 = add_dels2.loc[:,['sentence2_tokenized', 'sentence2_scope']]

In [221]:
# List of function words
fwords = ['a', 'an', 'the',                                                     # Articles
          'and', 'but', 'for', 'or',                                            # Conjunctions
          'that', 'this', 'those', 'these',                                     # Demonstratives
          'at', 'by', 'from', 'in', 'into', 'of', 'on', 'out', 'to', 'with']    # Prepositions

## Add/Del - Sentence 1

In [222]:
# Find indices where there are Addition/Deletion labels
add_dels1['masks'] = add_dels1['sentence1_scope'].apply([lambda x: get_mask('2', x)[0]])
# Get words and labels at these indices
add_dels1['words'] = add_dels1.apply(lambda x: x.sentence1_tokenized[x.masks], axis=1)
add_dels1['ids'] = add_dels1.apply(lambda x: x.sentence1_scope[x.masks], axis=1)
# Convert words to lowercase
add_dels1['words'] = add_dels1['words'].apply(lambda x: x.astype(str))
add_dels1['words'] = add_dels1['words'].apply(np.char.lower)

In [223]:
def get_funcwords(words, indices):
    # There's probably a non for loop way of doing this but ¯\_(ツ)_/¯
    indices = []
    for i, word in enumerate(words):
        if word in fwords:
            indices.append(i)
    return indices if indices else None


In [224]:
# Get function word indices
add_dels1['function_word_idx'] = add_dels1.apply(lambda x: get_funcwords(x.words, x.masks), axis=1)
# Filter out null values
add_dels1_f = add_dels1[add_dels1['function_word_idx'].notnull()][:50]
# Get function words from indices
add_dels1_f['function_words'] = add_dels1.apply(lambda x: x.words[x.function_word_idx], axis=1)

In [225]:
def newids(old_ids, indices):
    subset = old_ids[indices]
    counter = 3
    for i, element in enumerate(subset):
        subset[i] = f'1_{counter}' # TODO: This is incorrect
        # The correct way to do this is to find the max instance of 1_ and have counter start after that
        # However, since we're not currently using instance counters for modelling, this will work for now
        counter += 1
    return subset


In [226]:
add_dels1_f['newids'] = add_dels1_f.apply(lambda x: newids(x.ids, x.function_word_idx), axis=1)

In [227]:
def subbie(scope, masks, fword_ids, new_ids):
    newmask = masks[fword_ids]
    scope[newmask] = new_ids

In [228]:
add_dels1_f.apply(lambda x: subbie(x.sentence1_scope, x.masks, x.function_word_idx, x.newids), axis=1)

4      None
13     None
22     None
28     None
39     None
42     None
55     None
58     None
66     None
67     None
74     None
80     None
93     None
99     None
102    None
103    None
105    None
111    None
115    None
117    None
122    None
125    None
126    None
127    None
133    None
140    None
141    None
144    None
146    None
152    None
155    None
157    None
160    None
162    None
164    None
173    None
180    None
183    None
188    None
199    None
202    None
203    None
205    None
208    None
212    None
225    None
245    None
251    None
256    None
259    None
dtype: object

## Add/Del - Sentence 2

In [229]:
# Find indices where there are Addition/Deletion labels
add_dels2['masks'] = add_dels2['sentence2_scope'].apply([lambda x: get_mask('2', x)[0]])
# Get words and labels at these indices
add_dels2['words'] = add_dels2.apply(lambda x: x.sentence2_tokenized[x.masks], axis=1)
add_dels2['ids'] = add_dels2.apply(lambda x: x.sentence2_scope[x.masks], axis=1)
# Convert words to lowercase
add_dels2['words'] = add_dels2['words'].apply(lambda x: x.astype(str))
add_dels2['words'] = add_dels2['words'].apply(np.char.lower)

In [230]:
# Get function word indices
add_dels2['function_word_idx'] = add_dels2.apply(lambda x: get_funcwords(x.words, x.masks), axis=1)
# Filter out null values
add_dels2_f = add_dels2[add_dels2['function_word_idx'].notnull()][:50]
# Get function words from indices
add_dels2_f['function_words'] = add_dels2.apply(lambda x: x.words[x.function_word_idx], axis=1)

In [231]:
add_dels2_f['newids'] = add_dels2_f.apply(lambda x: newids(x.ids, x.function_word_idx), axis=1)

In [232]:
add_dels2_f.apply(lambda x: subbie(x.sentence2_scope, x.masks, x.function_word_idx, x.newids), axis=1)

5      None
7      None
13     None
14     None
22     None
26     None
31     None
32     None
37     None
39     None
40     None
44     None
47     None
59     None
67     None
76     None
80     None
86     None
87     None
96     None
102    None
104    None
107    None
111    None
115    None
122    None
128    None
133    None
138    None
140    None
141    None
149    None
154    None
156    None
158    None
163    None
167    None
170    None
180    None
183    None
186    None
188    None
196    None
198    None
200    None
205    None
213    None
216    None
230    None
231    None
dtype: object

# Checking Impossible Label Combinations

In [None]:
def list_types(str_types: str) -> list:
    """Converts a string of types to a list"""
    return str_types.split(' & ')

multiple = lambda x : np.where(np.char.find(x, '&')>=0)[0]

In [None]:
sentence1_multiple = positives[positives['sentence1_scope'].apply(lambda x: len(x[multiple(x)])) > 0]
sentence1_multiple['s1_multi'] = sentence1_multiple['sentence1_scope'].apply(lambda x: x[multiple(x)])
sentence1_multiple['s1_multi_i'] = sentence1_multiple['sentence1_scope'].apply(multiple)
sentence1_multiple['s1_multi'] = sentence1_multiple['s1_multi'].apply(lambda x: [list_types(typee) for typee in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence1_multiple['s1_multi'] = sentence1_multiple['sentence1_scope'].apply(lambda x: x[multiple(x)])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence1_multiple['s1_multi_i'] = sentence1_multiple['sentence1_scope'].apply(multiple)


In [None]:
def contains_multiple(some_multiple, biglist):
    return all(x in biglist for x in some_multiple)

In [None]:
searchie = ['2_0', '5_0']
sentence1_multiple['s1_multi'].apply(lambda x: contains_multiple(searchie, x))
matches = sentence1_multiple[sentence1_multiple['s1_multi'].apply(lambda x: any(contains_multiple(searchie, item) for item in x))]
matches.loc[:,['sentence1_tokenized', 'sentence1_scope', 's1_multi', 's1_multi_i']]

Unnamed: 0,sentence1_tokenized,sentence1_scope,s1_multi,s1_multi_i
354,"[Intel, was, disappointed, and, assessing, its...","[3_0 & 5_0 & 2_0, 3_0, 3_0, 3_0, 3_0, 3_0, 3_0...","[[3_0, 5_0, 2_0]]",[0]
383,"[That, 's, the, highest, third-quarter, growth...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[[5_0, 2_0], [5_0, 2_0]]","[20, 21]"
576,"[The, winner, of, the, Williams-Mauresmo, matc...","[5_0, 5_0, 5_0, 5_0, 5_0, 5_0 & 2_0, 0_0, 0_0,...","[[5_0, 2_0]]",[5]
945,"[The, face, of, President, Saddam, Hussein, wa...","[5_0, 4_0, 5_0, 5_0 & 2_0, 5_0, 2_0, 0_0, 0_0,...","[[5_0, 2_0]]",[3]
1034,"[Schering-Plough, shares, fell, 72, cents, to,...","[5_0 & 2_0, 5_0, 0_0, 0_0, 0_0, 0_0, 2_2, 2_2,...","[[5_0, 2_0]]",[0]
1297,"[Brian, Florence, ,, 38, ,, died, Sept., 25, o...","[2_0 & 5_0, 5_0, 2_0, 2_0, 2_0, 0_0, 0_0, 0_0,...","[[2_0, 5_0]]",[0]
1338,"[``, For, customers, to, get, the, most, of, t...","[0_0, 0_0, 0_0, 0_0, 4_0, 0_0, 0_0, 0_0, 0_0, ...","[[2_0, 5_0]]",[17]
1622,"[He, also, noted, Tom, Siebel, had, turned, in...","[0_0, 0_0, 4_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[[5_0, 2_0], [5_0, 2_0], [5_0, 2_0]]","[18, 19, 20]"
2421,"[``, Frank, Quattrone, is, innocent, ,, '', Ke...","[0_0, 2_0 & 5_0, 5_0, 0_0, 0_0, 0_0, 0_0, 0_0,...","[[2_0, 5_0]]",[1]
2658,"[It, passed, only, after, Republicans, won, th...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[[5_0, 2_0], [3_0, 2_0, 5_0], [5_0, 2_1], [2_1...","[13, 14, 19, 20]"


In [None]:
def another_subbie(scope, mask,):
    scope[mask] = '5_0'

In [None]:
operands1 = matches.loc[[383, 576, 945, 1034, 1297, 1338, 1622, 2421, 3328, 3588, 4247, 5174, 5184, 5731],['sentence1_tokenized', 'sentence1_scope', 's1_multi', 's1_multi_i']]
operands1

Unnamed: 0,sentence1_tokenized,sentence1_scope,s1_multi,s1_multi_i
383,"[That, 's, the, highest, third-quarter, growth...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[[5_0, 2_0], [5_0, 2_0]]","[20, 21]"
576,"[The, winner, of, the, Williams-Mauresmo, matc...","[5_0, 5_0, 5_0, 5_0, 5_0, 5_0 & 2_0, 0_0, 0_0,...","[[5_0, 2_0]]",[5]
945,"[The, face, of, President, Saddam, Hussein, wa...","[5_0, 4_0, 5_0, 5_0 & 2_0, 5_0, 2_0, 0_0, 0_0,...","[[5_0, 2_0]]",[3]
1034,"[Schering-Plough, shares, fell, 72, cents, to,...","[5_0 & 2_0, 5_0, 0_0, 0_0, 0_0, 0_0, 2_2, 2_2,...","[[5_0, 2_0]]",[0]
1297,"[Brian, Florence, ,, 38, ,, died, Sept., 25, o...","[2_0 & 5_0, 5_0, 2_0, 2_0, 2_0, 0_0, 0_0, 0_0,...","[[2_0, 5_0]]",[0]
1338,"[``, For, customers, to, get, the, most, of, t...","[0_0, 0_0, 0_0, 0_0, 4_0, 0_0, 0_0, 0_0, 0_0, ...","[[2_0, 5_0]]",[17]
1622,"[He, also, noted, Tom, Siebel, had, turned, in...","[0_0, 0_0, 4_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[[5_0, 2_0], [5_0, 2_0], [5_0, 2_0]]","[18, 19, 20]"
2421,"[``, Frank, Quattrone, is, innocent, ,, '', Ke...","[0_0, 2_0 & 5_0, 5_0, 0_0, 0_0, 0_0, 0_0, 0_0,...","[[2_0, 5_0]]",[1]
3328,"[Advertising, and, circulation, revenues, from...","[0_0, 0_0, 0_0, 0_0, 4_0, 5_0, 5_0 & 2_0, 5_0,...","[[5_0, 2_0]]",[6]
3588,"[The, operating, revenues, were, $, 1.45, bill...","[1_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[[5_0, 2_0]]",[14]


In [None]:
operands1.apply(lambda x: another_subbie(x.sentence1_scope, x.s1_multi_i), axis=1)

383     None
576     None
945     None
1034    None
1297    None
1338    None
1622    None
2421    None
3328    None
3588    None
4247    None
5174    None
5184    None
5731    None
dtype: object

In [None]:
operands1['sentence1_scope'][576] == positives['sentence1_scope'][576]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

In [None]:
[354, 2658]
[4179]
[383, 576, 945, 1034, 1297, 1338, 1622, 2421, 3328, 3588, 4247, 5174, 5184, 5731]

In [None]:
sentence2_multiple = positives[positives['sentence2_scope'].apply(lambda x: len(x[multiple(x)])) > 0]
sentence2_multiple['s2_multi'] = sentence2_multiple['sentence2_scope'].apply(lambda x: x[multiple(x)])
sentence2_multiple['s2_multi_i'] = sentence2_multiple['sentence2_scope'].apply(multiple)

searchie = ['2_0', '5_0']
sentence2_multiple['s2_multi'].apply(lambda x: contains_multiple(searchie, x))
matches = sentence2_multiple[sentence2_multiple['s2_multi'].apply(lambda x: any(contains_multiple(searchie, item) for item in x))]
operands2 = matches.loc[:,['sentence2_tokenized', 'sentence2_scope', 's2_multi', 's2_multi_i']]
operands2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence2_multiple['s2_multi'] = sentence2_multiple['sentence2_scope'].apply(lambda x: x[multiple(x)])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence2_multiple['s2_multi_i'] = sentence2_multiple['sentence2_scope'].apply(multiple)


Unnamed: 0,sentence2_tokenized,sentence2_scope,s2_multi,s2_multi_i
213,"[A, downturn, in, services, activity, in, Marc...","[4_0, 5_0, 1_3, 5_0 & 2_0, 5_0 & 2_0, 5_0, 5_0...","[5_0 & 2_0, 5_0 & 2_0]","[3, 4]"
307,"[Called, ``, Taxpayers, Against, the, Recall, ...","[2_0, 2_0, 0_0, 0_0, 0_0, 0_0, 0_0, 2_0, 5_0 &...",[5_0 & 2_0],[8]
1162,"[On, Wednesday, ,, analysts, say, ,, UN, membe...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[5_0 & 2_0, 5_0 & 2_0, 5_0 & 2_0]","[17, 18, 19]"
1376,"[The, pressure, on, Peter, Hollingworth, to, r...","[5_0, 5_0, 5_0 & 2_0, 5_0 & 2_0, 5_0 & 2_0, 5_...","[5_0 & 2_0, 5_0 & 2_0, 5_0 & 2_0, 5_0 & 2_0, 5...","[2, 3, 4, 5, 6, 7, 8]"
1758,"[But, Mr, Kenny, said, his, advice, to, David,...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 2_0 & 5_0,...",[2_0 & 5_0],[7]
1801,"[GBI, spokesman, John, Bankhead, said, the, mu...","[5_0, 5_0, 2_0 & 5_0, 0_0, 0_0, 0_0, 4_0, 0_0,...",[2_0 & 5_0],[2]
2199,"[Schering-Plough, shares, fell, 3.8, percent, ...","[5_0 & 2_0 & 2_1, 5_0, 0_0, 0_0, 0_0, 0_0, 0_0...",[5_0 & 2_0 & 2_1],[0]
2375,"[``, I, know, of, nobody, who, pressured, anyb...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 6_0, 0_0, 0_0, ...",[2_0 & 5_0],[10]
2690,"[``, Prospects, for, the, whole, Canadian, aer...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[3_0 & 2_0 & 5_0, 3_0 & 5_0]","[26, 27]"
2966,"[Dave, Delainey, agreed, to, cooperate, with, ...","[2_0 & 5_0, 5_0, 6_0, 0_0, 0_0, 0_0, 0_0, 0_0,...",[2_0 & 5_0],[0]


In [None]:
operands2 = operands2.loc[[213, 307, 1162, 1376, 1758, 1801, 2375, 2966, 3014, 3280, 3782, 3862, 4282, 4656, 5460],:]
operands2

Unnamed: 0,sentence2_tokenized,sentence2_scope,s2_multi,s2_multi_i
213,"[A, downturn, in, services, activity, in, Marc...","[4_0, 5_0, 1_3, 5_0 & 2_0, 5_0 & 2_0, 5_0, 5_0...","[5_0 & 2_0, 5_0 & 2_0]","[3, 4]"
307,"[Called, ``, Taxpayers, Against, the, Recall, ...","[2_0, 2_0, 0_0, 0_0, 0_0, 0_0, 0_0, 2_0, 5_0 &...",[5_0 & 2_0],[8]
1162,"[On, Wednesday, ,, analysts, say, ,, UN, membe...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[5_0 & 2_0, 5_0 & 2_0, 5_0 & 2_0]","[17, 18, 19]"
1376,"[The, pressure, on, Peter, Hollingworth, to, r...","[5_0, 5_0, 5_0 & 2_0, 5_0 & 2_0, 5_0 & 2_0, 5_...","[5_0 & 2_0, 5_0 & 2_0, 5_0 & 2_0, 5_0 & 2_0, 5...","[2, 3, 4, 5, 6, 7, 8]"
1758,"[But, Mr, Kenny, said, his, advice, to, David,...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 2_0 & 5_0,...",[2_0 & 5_0],[7]
1801,"[GBI, spokesman, John, Bankhead, said, the, mu...","[5_0, 5_0, 2_0 & 5_0, 0_0, 0_0, 0_0, 4_0, 0_0,...",[2_0 & 5_0],[2]
2375,"[``, I, know, of, nobody, who, pressured, anyb...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 6_0, 0_0, 0_0, ...",[2_0 & 5_0],[10]
2966,"[Dave, Delainey, agreed, to, cooperate, with, ...","[2_0 & 5_0, 5_0, 6_0, 0_0, 0_0, 0_0, 0_0, 0_0,...",[2_0 & 5_0],[0]
3014,"[The, Virginia, attorney-general, ,, Jerry, Ki...","[2_1, 2_1, 2_1, 2_1, 2_0 & 5_0, 5_0, 0_0, 0_0,...",[2_0 & 5_0],[4]
3280,"[A, similar, Florida, straw, poll, in, 1991, b...","[5_0, 5_0 & 2_0, 5_0, 5_0, 5_0, 5_0, 5_0, 4_0,...",[5_0 & 2_0],[1]


In [None]:
operands2.apply(lambda x: another_subbie(x.sentence2_scope, x.s2_multi_i), axis=1)

213     None
307     None
1162    None
1376    None
1758    None
1801    None
2375    None
2966    None
3014    None
3280    None
3782    None
3862    None
4282    None
4656    None
5460    None
dtype: object

In [None]:
operands2['sentence2_scope'][213] == positives['sentence2_scope'][213]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [None]:
[2199, 2690, 5249]
[213, 307, 1162, 1376, 1758, 1801, 2375, 2966, 3014, 3280, 3782, 3862, 4282, 4656, 5460]

[213,
 307,
 1162,
 1376,
 1758,
 1801,
 2199,
 2375,
 2690,
 2966,
 3014,
 3280,
 3782,
 3862,
 4282,
 4656,
 5249,
 5460]

In [None]:
# TODO: rows to drop from textual_paraphrases:
[2196]

[2196]

# Collapsing labels for sequence classification

In [268]:
def collapse_multilabel(arr1, arr2):
    labels1 = {label[0] for label in arr1}
    labels2 = {label[0] for label in arr2}
    return sorted(list(labels1 | labels2))

In [269]:
positives['collapsed_labels'] = positives.apply(lambda x: collapse_multilabel(x.sentence1_scope, x.sentence2_scope), axis=1)
positives

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['collapsed_labels'] = positives.apply(lambda x: collapse_multilabel(x.sentence1_scope, x.sentence2_scope), axis=1)


Unnamed: 0,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,sentence1_scope,sentence2_scope,collapsed_labels,s1_token_labs,s2_token_labs
0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...","[3_0, 3_0, 3_0, 3_0, 0_0, 4_0, 0_0, 4_1, 0_0, ...","[4_1, 4_0, 4_0, 0_0, 2_0, 0_0, 0_0, 0_0, 0_0, ...","[0, 2, 3, 4]","[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0....","[[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], [0...."
2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...","[5_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 3_0, ...","[3_0, 3_0, 3_0, 3_0, 5_0, 5_0, 5_0, 5_0, 0_0, ...","[0, 2, 3, 4, 5]","[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0....","[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0...."
4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...","[5_0, 5_0, 4_0, 0_0, 0_0, 8_0, 0_0, 2_0, 0_0, ...","[5_0, 5_0, 5_0, 5_0, 5_0, 4_0, 0_0, 0_0, 0_0, ...","[0, 1, 2, 3, 4, 5, 8]","[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0....","[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0...."
5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...","[0_0, 1_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[1_3, 1_4, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, ...","[0, 1, 2]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1....","[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0...."
7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...","[0_0, 7_0, 2_0, 0_0, 0_0, 0_0, 4_0, 0_0, 0_0, ...","[0_0, 7_0, 7_0, 0_0, 1_3, 2_1, 0_0, 0_0, 4_0, ...","[0, 1, 2, 4, 7]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0...."
...,...,...,...,...,...,...,...,...,...
5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 2_1, 2_1, ...","[0, 2, 3, 8]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0...."
5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[0, 2, 3, 8]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0...."
5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...","[0_0, 4_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, ...","[0_0, 4_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[0, 2, 4]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0...."
5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...","[3_0, 3_0, 0_0, 5_1, 5_1, 5_0, 6_0, 0_0, 0_0, ...","[2_0, 2_0, 2_0, 2_0, 5_1, 0_0, 5_0, 6_0, 0_0, ...","[0, 2, 3, 5, 6, 7]","[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0....","[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0...."


# Collapsing labels for token classification

In [270]:
def paraop_label_str_to_list(str_arr) -> list:
    """Converts the strings in a Paraop label array to lists of ints, ommiting instance counters.
    Returns a list containing the converted indices."""
    output = []
    for i, each_str in enumerate(str_arr):
        labels = each_str.split(' & ')
        label_output = {int(label[0]) for label in labels}
        output.append(list(label_output))
    return output

# Example output
example = np.array(['0_0', '0_0', '3_0', '5_0 & 5_1', '5_0 & 6_1'])
paraop_label_str_to_list(example)

[[0], [0], [3], [5], [5, 6]]

In [271]:
def make_array(str_arr):
    output_arr = np.zeros((len(str_arr), 8))
    converted_arr = paraop_label_str_to_list(str_arr)
    for i, each_list in enumerate(converted_arr):
        if each_list == [0]: continue
        indices = np.array(each_list) - 1
        output_arr[i, indices] = 1
    return output_arr

# Example output
make_array(example)

array([[0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 1., 0., 0.]])

In [272]:
positives['s1_token_labs'] = positives['sentence1_scope'].apply(make_array)
positives['s2_token_labs'] = positives['sentence2_scope'].apply(make_array)
positives

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['s1_token_labs'] = positives['sentence1_scope'].apply(make_array)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives['s2_token_labs'] = positives['sentence2_scope'].apply(make_array)


Unnamed: 0,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,sentence1_scope,sentence2_scope,collapsed_labels,s1_token_labs,s2_token_labs
0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...","[3_0, 3_0, 3_0, 3_0, 0_0, 4_0, 0_0, 4_1, 0_0, ...","[4_1, 4_0, 4_0, 0_0, 2_0, 0_0, 0_0, 0_0, 0_0, ...","[0, 2, 3, 4]","[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0....","[[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], [0...."
2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...","[5_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 3_0, ...","[3_0, 3_0, 3_0, 3_0, 5_0, 5_0, 5_0, 5_0, 0_0, ...","[0, 2, 3, 4, 5]","[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0....","[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0...."
4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...","[5_0, 5_0, 4_0, 0_0, 0_0, 8_0, 0_0, 2_0, 0_0, ...","[5_0, 5_0, 5_0, 5_0, 5_0, 4_0, 0_0, 0_0, 0_0, ...","[0, 1, 2, 3, 4, 5, 8]","[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0....","[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0...."
5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...","[0_0, 1_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[1_3, 1_4, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, ...","[0, 1, 2]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1....","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1...."
7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...","[0_0, 7_0, 2_0, 0_0, 0_0, 0_0, 4_0, 0_0, 0_0, ...","[0_0, 7_0, 7_0, 0_0, 1_3, 2_1, 0_0, 0_0, 4_0, ...","[0, 1, 2, 4, 7]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0...."
...,...,...,...,...,...,...,...,...,...
5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 2_1, 2_1, ...","[0, 2, 3, 8]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0...."
5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[0, 2, 3, 8]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0...."
5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...","[0_0, 4_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, 2_0, ...","[0_0, 4_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...","[0, 2, 4]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0...."
5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...","[3_0, 3_0, 0_0, 5_1, 5_1, 5_0, 6_0, 0_0, 0_0, ...","[2_0, 2_0, 2_0, 2_0, 5_1, 0_0, 5_0, 6_0, 0_0, ...","[0, 2, 3, 5, 6, 7]","[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0....","[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0...."


# Exporting

In [273]:
positives.to_csv('datasets/etpc_reannotated.csv')

In [274]:
positives.to_pickle('datasets/etpc_reannotated.pkl')

# Garbage Pail

## Cleanup

## Other stuff

In [289]:
test = positives['sentence1_scope'][14]
test

array(['0_0', '0_0', '0_0', '0_0', '2_0', '0_0', '3_0 & 5_1',
       '3_0 & 5_0 & 5_1', '3_0', '3_0', '3_0', '3_0', '3_0', '3_0', '3_0',
       '3_0', '3_0', '3_0', '3_0', '3_0', '3_0', '3_0', '3_0', '3_0',
       '0_0'], dtype='<U64')

In [290]:
int(test[0][-1])

0

In [291]:
'3_0 & 4_0'.split(' & ')

['3_0', '4_0']

In [292]:
test

array(['0_0', '0_0', '0_0', '0_0', '2_0', '0_0', '3_0 & 5_1',
       '3_0 & 5_0 & 5_1', '3_0', '3_0', '3_0', '3_0', '3_0', '3_0', '3_0',
       '3_0', '3_0', '3_0', '3_0', '3_0', '3_0', '3_0', '3_0', '3_0',
       '0_0'], dtype='<U64')

In [293]:
mask = np.where(np.char.find(test, '5_') >= 0)
test[mask]

array(['3_0 & 5_1', '3_0 & 5_0 & 5_1'], dtype='<U64')

In [294]:
get_max_i(test, 5)

1

In [295]:
maxie = positives['sentence1_scope'].apply(lambda x: get_max_i(x, 7))

In [296]:
countie = lambda x: len(maxie[maxie >= x-1])/len(maxie[maxie >= 0])
print(f'{countie(2):.2%}')

21.09%


In [297]:
def printie(row, n):
    tokens = positives.loc[row, f'sentence{n}_tokenized']
    scopes = positives.loc[row, f'sentence{n}_scope']

    data = {'list1': tokens, 'list2': scopes}
    df = pd.DataFrame(data)

    return df.T

def displayie(n):
    df1 = printie(n, 1)
    df2 = printie(n, 2)

    center_hide = lambda x: x.style.set_properties(**{'text-align': 'center'}).hide(axis='index').hide(axis='columns')

    display(center_hide(df1)) 
    display(center_hide(df2)) 

In [298]:
displayie(1957)

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
Malvo,'s,attorneys,mounted,an,insanity,defense,",",arguing,that,Muhammad,'s,indoctrination,left,him,unable,to,tell,right,from,wrong,.
0_0,0_0,4_1,4_2 & 6_0,0_0,0_0,0_0,0_0,4_5,0_0,5_1,5_1,4_3,0_0,5_0,4_4,0_0,4_0,0_0,0_0,0_0,0_0


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
Malvo,'s,lawyers,have,presented,an,insanity,defense,",",saying,brainwashing,by,convicted,sniper,John,Allen,Muhammad,left,Malvo,incapable,of,knowing,right,from,wrong,.
0_0,0_0,4_1,4_2 & 6_0,4_2 & 6_0,0_0,0_0,0_0,0_0,4_5,4_3,5_1,5_1 & 2_0,5_1 & 2_0,2_0,2_0,5_1,0_0,5_0,4_4,0_0,4_0,0_0,0_0,0_0,0_0


In [299]:
def make_array(str_arr):
    output_arr = np.zeros((len(str_arr), 8))
    converted_arr = paraop_label_str_to_list(str_arr)
    for i, each_list in enumerate(converted_arr):
        if each_list == [0]: continue
        indices = np.array(each_list) - 1
        output_arr[i, indices] = 1
    return output_arr

    """for labie, cols in enumerate(col_idx_map):
        if labie == 0: continue
        max_i = len(cols)
        for instance, col in enumerate(cols):
            mask = np.where(np.char.find(str_arr, f'{labie}_{instance}') >= 0)
            #subset = str_arr[mask]
            output_arr[mask,col] = 1"""

    """for i, each_str in enumerate(str_arr):
        if each_str == '0_0':
            continue
        labels = each_str.split(' & ')
        for label in labels:
                if label[0] == str(i):
                    max_i = max(max_i, int(label[-1]))

        # Subset elements in array containing label
        mask = np.where(np.char.find(str_arr, f'{i}_') >= 0)
        subset = str_arr[mask]
        # Look for the maximum 
        for item in subset:
            labels = item.split(' & ')
            for label in labels:
                if label[0] == str(i):
                    max_i = max(max_i, int(label[-1]))"""
    
    return output_arr