# Imports

In [None]:
import pandas as pd
import numpy as np

# Reading the ETPC

In [None]:
# Unpickle etpc_raw
etpc = pd.read_pickle('datasets/etpc_raw.pkl')

# Cleanup

## Cleaning up Columns

In [None]:
etpc.rename(columns={'paraphrase_type_ids': 'ept_ids', 'paraphrase_types': 'ept_names'}, inplace=True)
etpc.drop(columns={'negation'}, axis=1, inplace=True)
etpc

## Remapping Paraphrase IDs


First, make a list with paraphrase types and IDs from the ETPC:

In [None]:
id_map = pd.read_xml('https://raw.githubusercontent.com/venelink/ETPC/master/Corpus/paraphrase_types.xml')
# Rename columns for clarity
id_map.rename(columns={'type_id': 'ept_id', 'type_name': 'ept_name'}, inplace=True)
# Drop unused data
id_map = id_map[['ept_id', 'ept_name']] # No use for type_category column
id_map.drop(id_map.tail(2).index,inplace=True) # Types don't appear in ETPC
id_map.style.hide(axis="index")
id_map

Now, make a list with paraphrase names and IDs for ParaOp types

In [None]:
data = [[0, 'No change'],
        [1, 'Addition/Deletion - Function Word'],
        [2, 'Addition/Deletion - Content Word'],
        [3, 'Change of Order'],
        [4, 'Substitution - Synonym'],
        [5, 'Substitution - Contextual Synonym'],
        [6, 'Substitution - Morphological'],
        [7, 'Substitution - Spelling and Format']
       ]
paraop_map = pd.DataFrame(data, columns = ['paraop_id', 'paraop_name'])
paraop_map.set_index('paraop_id', inplace=True)
paraop_map

## Mapping

We'll use the dataframe below for mapping. Each row will contain the name and ID of a paraphrase type in the ETPC, and the name and ID of the correspondent ParaOp type.

In [None]:
id_map['paraop_id'] = ''
id_map['paraop_name'] = ''
id_map

Here's where we do the mapping:

In [None]:
# Helper function to map an ETPC id to a Paraop id
def map_id(ept_id, paraop_id):
    """Given an EPT id and a Paraop id, look up the name of the Paraop id and 
    fill in the rows of id_map with paraop_id and the name."""
    id_map.loc[id_map['ept_id'] == ept_id, 'paraop_id'] = paraop_id
    id_map.loc[id_map['ept_id'] == ept_id, 'paraop_name'] = paraop_map.loc[paraop_id, 'paraop_name']

In [None]:
map_id(ept_id=1, paraop_id=6)
map_id(ept_id=3, paraop_id=6)
map_id(ept_id=26, paraop_id=3)
map_id(ept_id=29, paraop_id=0)
id_map

TODO: Figure out a way to hide index of map_id throughout whole notebook. For some reason this seems harder than it needs to be...

## Filtering


In [None]:
# Helper methods for filtering the ETPC dataframe based on paraphrase types

def filter_contains(df, search_ids):
  """Returns an ETPC dataframe with rows where paraphrase_types_ids contains
  the search_ids. Use this to search for paraphrase pairs containing specific
  ids"""
  return df[df['ept_ids'].apply(lambda x: np.isin(search_ids, x))]

def filter_equals(df, search_ids):
  """Returns an ETPC dataframe with rows where paraphrase_types_ids EXACTLY 
  MATCHES the search_ids."""
  return df[df['ept_ids'].apply(lambda x: np.array_equal(x, search_ids))]
  


In [None]:
filter_contains(etpc, '3')

In [None]:
filter_equals(etpc, ['25', '29'])

In [None]:
#TODO: Delete? Do I need this? I think I need this but it can be moved somewhere else
etpc['paraop_ids'] = etpc['ept_ids']
etpc

## Disambiguating duplicate types

### Reading XML files from the ETPC

In [None]:
textual_paraphrases = pd.read_xml('datasets/etpc/textual_paraphrases.xml')
# Convert scopes from strings to lists of ints
textual_paraphrases['s1_scope'] = textual_paraphrases['s1_scope'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)
textual_paraphrases['s2_scope'] = textual_paraphrases['s2_scope'].apply(lambda x: [int(n) for n in x.split(',')] if type(x) == str else x)

In [None]:
non_paraphrase_neg = pd.read_xml('https://raw.githubusercontent.com/venelink/ETPC/master/Corpus/textual_np_neg.xml')
non_paraphrase_neg[:10]

In [None]:
non_paraphrase_pos = pd.read_xml('https://raw.githubusercontent.com/venelink/ETPC/master/Corpus/textual_np_pos.xml')
non_paraphrase_pos[:10]

In [None]:
all_pairs = pd.read_xml('datasets/etpc/all_pairs.xml')

#### Delete?
No, don't delete, but relabel

In [None]:
pairs = pd.read_xml('datasets/etpc/text_pairs.xml')
pairs.drop(columns=['negation'], inplace=True)
pairs.set_index('pair_id', inplace=True)

In [None]:
pairs['sent1_indices'] = pairs['sent1_tokenized'].apply(lambda x: np.zeros(len(x.split())))
pairs['sent2_indices'] = pairs['sent2_tokenized'].apply(lambda x: np.zeros(len(x.split())))
pairs

### Helper Functions for Disambiguating

Helper functions to find duplicate paraphrase type ids given an array of ids

In [None]:
def get_duplicates(input_array):
    u, c = np.unique(input_array, return_counts=True)
    u = u[c > 1]
    c = c[c > 1]
    return dict(zip(u, c))

In [None]:
def has_duplicate(input_array):
    return len(get_duplicates(input_array)) != 0

In [None]:
etpc[etpc['ept_ids'].apply(has_duplicate)]

In [None]:
get_duplicates(etpc['ept_ids'][2])

**Disambiguate function:** given a row of the ETPC dataframe, look up any duplicate paraphrase types and annotate each separate instance of each type in the scope. The new scopes are floats, where the number before the period represents the paraphrase type, and the number after the period represents which instance of that type this scope refers to.

In [None]:
#TODO: Fix indices in ETPC. Remember to change the +1 here after that

def disambiguate_duplicate(idx, lookup_df):
    """"Disambiguates duplicate paraphrase types for a row (given its idx) of
    the ETPC dataframe. Returns a tuple containing the disambiguated scopes for
    each sentence in the pair."""
    # First, determine what are the duplicates
    dups = get_duplicates(etpc['ept_ids'][idx])
    # Convert array values to strings with '_0' appended to them
    s1_str = np.array([x + '_0' for x in etpc['sentence1_segment_location'][idx].astype(str)])
    s2_str = np.array([y + '_0' for y in etpc['sentence2_segment_location'][idx].astype(str)])
    # Disambiguate
    for iid, count in dups.items():
        # Subset the lookup df
        subset = lookup_df[(lookup_df['pair_id'] == idx+1) & (lookup_df['type_id'] == int(iid))]
        subset.reset_index(drop=True, inplace=True)
        for i in range(1, count): # Skip adding zeroes since they're already there
            if subset['s1_scope'][i] is not None:
                s1_str[subset['s1_scope'][i]] = str(iid) + f'_{str(i)}'
            if subset['s2_scope'][i] is not None:
                s2_str[subset['s2_scope'][i]] = str(iid) + f'_{str(i)}'
    return s1_str, s2_str

In [None]:
textual_paraphrases[:20]

In [None]:
disambiguate_duplicate(2, textual_paraphrases)

### Performing the disambiguation

**For testing**: Let's subset only the rows containing paraphrases (as labeled 
by mrpc_label), since those correspond to the `textual_paraphrases.xml` file 
from the ETPC

In [None]:
positives = etpc.loc[etpc['mrpc_label'] == 1]
positives.rename(columns={'sentence1_segment_location': 'sentence1_scope_etpc', 'sentence2_segment_location': 'sentence2_scope_etpc'}, inplace=True)
positives.drop(columns=['sentence1_segment_location_indices', 'sentence2_segment_location_indices'],inplace=True)
positives['idx'] = positives.index.to_series()
positives

In [None]:
series = positives['idx'].apply(disambiguate_duplicate, lookup_df=textual_paraphrases)
cols = pd.DataFrame(series.tolist(), columns=['sentence1', 'sentence2'])
positives['sentence1_scope_paraop'] = cols['sentence1'].values
positives['sentence2_scope_paraop'] = cols['sentence2'].values

## Reannotating types

In [None]:
# Helper function to get a Paraop id from an ETPC id
def ept_to_paraop(ept_id):
    return id_map.loc[id_map['ept_id'] == ept_id, 'paraop_id'].iloc[0]

ept_to_paraop(3)

In [None]:
def substitute_id(id_array, old):
    """Substitute the ETPC ids in id_array for their corresponding Paraop ids"""
    new = str(ept_to_paraop(old))
    copy = id_array.astype('U10') # To allow for more than 4 characters
    for i in range(len(copy)):
        iid, count = copy[i].split('_')
        if iid == str(old):
            # Any reannotated types will have a 10 appended to them. This is to
            # identify which types have already been reannotated, to avoid 
            # accidentally reannotating ids that have already been substituted. 
            copy[i] = f'10{new}_{count}'
    return copy

In [None]:
test = positives['sentence1_scope_paraop'][0]
test

In [None]:
substitute_id(test, 26)

# Reannotation, Continued

In [None]:
id_map = id_map.style.hide(axis="index")
id_map

In [None]:
positives

In [None]:
test = filter_contains(positives, '4')

In [None]:
def diagnose(row, typee):
    print('Sentences:')
    print(test['sentence1'][row])
    print(test['sentence2'][row])
    #print(test['sentence1_tokenized'][row])
    #print(test['sentence2_tokenized'][row])
    print()
    print(f'Words where type {typee} is found:')
    sent1 = test['sentence1_tokenized'][row]
    sent2 = test['sentence2_tokenized'][row]
    print(sent1[test['sentence1_scope_etpc'][row] == typee])
    print(sent2[test['sentence2_scope_etpc'][row] == typee])
    print()
    print('Scopes:')
    print(test['sentence1_scope_etpc'][row])
    print(test['sentence2_scope_etpc'][row])

In [None]:
diagnose(72, 4)

In [None]:
test[:10]

In [None]:
print(test['sentence1'][149])
print(test['sentence2'][149])

In [None]:
sent = test['sentence2_tokenized'][149]
sent

In [None]:
arr = test['sentence2_scope_etpc'][149]
arr

In [None]:
np.where(arr == 3)

In [None]:
sent[arr == 3]

In [None]:
def get_words(df, ept_type):
    

In [None]:
len(np.unique(etpc['sentence1_segment_location'][0]))

In [None]:
positives[positives['sentence1_scope_etpc'].apply(lambda x: (len(np.unique(x)) == 1))]

In [187]:
newpositives = positives.drop(columns=['idx', 'etpc_label', 'mrpc_label', 
                                       'sentence1_scope_etpc', 
                                       'sentence2_scope_etpc', 
                                       'sentence1_segment_text', 
                                       'sentence2_segment_text', 
                                       'sentence1_scope_paraop', 
                                       'sentence2_scope_paraop'])
newpositives

Unnamed: 0,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,ept_names,ept_ids,paraop_ids
0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...","[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]","[5, 6, 26, 25, 29]"
2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...","[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]","[6, 6, 26, 25, 29]"
4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...","[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]"
5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...","[Synthetic/analytic substitution, Addition/Del...","[11, 25, 29]","[11, 25, 29]"
7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...","[Spelling changes, Same Polarity Substitution ...","[4, 5, 25, 25, 29]","[4, 5, 25, 25, 29]"
...,...,...,...,...,...,...,...
5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...","[Addition/Deletion, Identity, Punctuation chan...","[25, 29, 21, 6, 11, 14, 26, 25]","[25, 29, 21, 6, 11, 14, 26, 25]"
5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...","[Same Polarity Substitution (contextual), Chan...","[6, 26, 25, 29, 21]","[6, 26, 25, 29, 21]"
5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...","[Same Polarity Substitution (habitual), Same P...","[5, 5, 18, 29, 30]","[5, 5, 18, 29, 30]"
5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...","[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[4, 7, 6, 1, 26, 25, 25, 25, 29]"


In [188]:
newpositives['sentence1_scope'] = newpositives['sentence1_tokenized'].apply(lambda x: np.array(['' for token in x]).astype('U10'))
newpositives['sentence2_scope'] = newpositives['sentence2_tokenized'].apply(lambda x: np.array(['' for token in x]).astype('U10'))

In [238]:
newpositives

Unnamed: 0,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,ept_names,ept_ids,paraop_ids,sentence1_scope,sentence2_scope
0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...","[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]","[5, 6, 26, 25, 29]","[, , , , , , , , 0_0, 0_0, 0_0, 0_0, 0_0, 0_0,...","[, , , , , , , , , , , , , , , , , , , ]"
2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...","[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]","[6, 6, 26, 25, 29]","[, , , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , ]"
4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...","[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[, , , , , , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , , ]"
5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...","[Synthetic/analytic substitution, Addition/Del...","[11, 25, 29]","[11, 25, 29]","[, , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , , ,..."
7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...","[Spelling changes, Same Polarity Substitution ...","[4, 5, 25, 25, 29]","[4, 5, 25, 25, 29]","[0_0, , , 0_0, 0_0, 0_0, , 0_0, 0_0, 0_0]","[, , , , , , , , , , , ]"
...,...,...,...,...,...,...,...,...,...
5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...","[Addition/Deletion, Identity, Punctuation chan...","[25, 29, 21, 6, 11, 14, 26, 25]","[25, 29, 21, 6, 11, 14, 26, 25]","[, , , , , , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , , , ]"
5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...","[Same Polarity Substitution (contextual), Chan...","[6, 26, 25, 29, 21]","[6, 26, 25, 29, 21]","[, , , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , , ,..."
5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...","[Same Polarity Substitution (habitual), Same P...","[5, 5, 18, 29, 30]","[5, 5, 18, 29, 30]","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , , ]"
5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...","[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,..."


In [272]:
def populate_identity(idx):
    #n = str(n)
    array = np.copy(newpositives['sentence1_scope'][idx])
    subset = textual_paraphrases[(textual_paraphrases['pair_id'] == idx+1)]
    scope = subset.loc[subset['type_id'] == 29, 's1_scope']
    if len(scope) > 0:
        array[scope.iloc[0]] = '0_0'
    #print(idx)
    return array

In [241]:
populate_identity(2, 1)

array(['', '0_0', '0_0', '0_0', '0_0', '0_0', '0_0', '0_0', '', '', '',
       '0_0', '0_0', '0_0', '', '0_0', '0_0', '', '', '', '0_0'],
      dtype='<U10')

In [243]:
newpositives['idx'] = newpositives.index
newpositives

Unnamed: 0,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,ept_names,ept_ids,paraop_ids,sentence1_scope,sentence2_scope,idx
0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...","[Same Polarity Substitution (habitual), Same P...","[5, 6, 26, 25, 29]","[5, 6, 26, 25, 29]","[, , , , , , , , 0_0, 0_0, 0_0, 0_0, 0_0, 0_0,...","[, , , , , , , , , , , , , , , , , , , ]",0
2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...","[Same Polarity Substitution (contextual), Same...","[6, 6, 26, 25, 29]","[6, 6, 26, 25, 29]","[, , , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , ]",2
4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...","[Same Polarity Substitution (contextual), Same...","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[, , , , , , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , , ]",4
5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...","[Synthetic/analytic substitution, Addition/Del...","[11, 25, 29]","[11, 25, 29]","[, , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , , ,...",5
7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...","[Spelling changes, Same Polarity Substitution ...","[4, 5, 25, 25, 29]","[4, 5, 25, 25, 29]","[0_0, , , 0_0, 0_0, 0_0, , 0_0, 0_0, 0_0]","[, , , , , , , , , , , ]",7
...,...,...,...,...,...,...,...,...,...,...
5792,Gehring waived extradition Monday during a hea...,Gehring waived extradition Monday during a hea...,"[Gehring, waived, extradition, Monday, during,...","[Gehring, waived, extradition, Monday, during,...","[Addition/Deletion, Identity, Punctuation chan...","[25, 29, 21, 6, 11, 14, 26, 25]","[25, 29, 21, 6, 11, 14, 26, 25]","[, , , , , , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , , , ]",5792
5793,"""I am advised that certain allegations of crim...","""I am advised that certain allegations of crim...","[``, I, am, advised, that, certain, allegation...","[``, I, am, advised, that, certain, allegation...","[Same Polarity Substitution (contextual), Chan...","[6, 26, 25, 29, 21]","[6, 26, 25, 29, 21]","[, , , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , , ,...",5793
5795,"The deal, approved by both companies' board of...",The acquisition has been approved by both comp...,"[The, deal, ,, approved, by, both, companies, ...","[The, acquisition, has, been, approved, by, bo...","[Same Polarity Substitution (habitual), Same P...","[5, 5, 18, 29, 30]","[5, 5, 18, 29, 30]","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , , ]",5795
5799,"Last week the power station’s US owners, AES C...","The news comes after Drax's American owner, AE...","[Last, week, the, power, station’s, US, owners...","[The, news, comes, after, Drax, 's, American, ...","[Spelling changes, Same Polarity Substitution ...","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[4, 7, 6, 1, 26, 25, 25, 25, 29]","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...",5799


In [273]:
newpositives['idx'].apply(populate_identity)

0       [, , , , , , , , 0_0, 0_0, 0_0, 0_0, 0_0, 0_0,...
2       [, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, , , , 0_...
4       [, , , 0_0, , , 0_0, , , 0_0, , , , , , 0_0, ,...
5       [0_0, , , , , , , , 0_0, 0_0, 0_0, 0_0, 0_0, 0...
7               [0_0, , , 0_0, 0_0, 0_0, , 0_0, 0_0, 0_0]
                              ...                        
5792    [0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...
5793    [0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, ...
5795    [0_0, , , , , , , , , , , , 0_0, 0_0, 0_0, , ,...
5799    [, , , , , , , 0_0, 0_0, , 0_0, 0_0, 0_0, 0_0,...
5800    [, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0, 0_0...
Name: idx, Length: 3900, dtype: object

In [257]:
newpositives[130:150]

Unnamed: 0,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,ept_names,ept_ids,paraop_ids,sentence1_scope,sentence2_scope,idx
191,"By 2007, antivirus solutions will carry a worl...","By 2007, antivirus solutions will carry a worl...","[By, 2007, ,, antivirus, solutions, will, carr...","[By, 2007, ,, antivirus, solutions, will, carr...","[Same Polarity Substitution (contextual), Chan...","[6, 26, 29]","[6, 26, 29]","[, , , , , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , ]",191
193,ConAgra stock closed Monday on the New York St...,ConAgra shares closed Monday at $21.63 a share...,"[ConAgra, stock, closed, Monday, on, the, New,...","[ConAgra, shares, closed, Monday, at, $, 21.63...","[Same Polarity Substitution (habitual), Change...","[5, 26, 29]","[5, 26, 29]","[, , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , ]",193
194,One of the features is the ability to delete d...,One of the features is the ability to delete d...,"[One, of, the, features, is, the, ability, to,...","[One, of, the, features, is, the, ability, to,...","[Same Polarity Substitution (contextual), Infl...","[6, 1, 26, 25, 29]","[6, 1, 26, 25, 29]","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , , ]",194
195,Last month Intel raised its revenue guidance f...,"At the end of the second quarter, Intel initia...","[Last, month, Intel, raised, its, revenue, gui...","[At, the, end, of, the, second, quarter, ,, In...",[Entailment],[31],[31],"[, , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , , , , , ]",195
196,"""I just got carried away and started making st...",Byrne says he got carried away with PowerPoint...,"[``, I, just, got, carried, away, and, started...","[Byrne, says, he, got, carried, away, with, Po...","[Inflectional Changes, Change of order, Change...","[1, 26, 26, 25, 29, 22]","[1, 26, 26, 25, 29, 22]","[, , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , ]",196
197,"Georgia cannot afford to not get funding,"" sai...","Georgia cannot afford to not get funding,"" sai...","[Georgia, can, not, afford, to, not, get, fund...","[Georgia, can, not, afford, to, not, get, fund...","[Subordination and nesting changes, Change of ...","[18, 8, 25, 29]","[18, 8, 25, 29]","[, , , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , ]",197
198,Handspring shareholders will get 0.09 shares o...,Handspring's shareholders will receive 0.09 Pa...,"[Handspring, shareholders, will, get, 0.09, sh...","[Handspring, 's, shareholders, will, receive, ...","[Same Polarity Substitution (habitual), Same P...","[5, 7, 11, 11, 25, 25, 29]","[5, 7, 11, 11, 25, 25, 29]","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , , ,...",198
199,Shares of Microsoft rose 50 cents Friday to cl...,"Microsoft's stock was up 50 cents, to $28.34 a...","[Shares, of, Microsoft, rose, 50, cents, Frida...","[Microsoft, 's, stock, was, up, 50, cents, ,, ...","[Same Polarity Substitution (habitual), Same P...","[5, 5, 6, 11, 25, 25, 25, 29, 30, 21]","[5, 5, 6, 11, 25, 25, 25, 29, 30, 21]","[, , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , ]",199
200,"Based on having at least one of the symptoms, ...",On average the students suffered at least one ...,"[Based, on, having, at, least, one, of, the, s...","[On, average, the, students, suffered, at, lea...","[Same Polarity Substitution (contextual), Same...","[6, 5, 25, 25, 29, 28]","[6, 5, 25, 25, 29, 28]","[, , , , , , , , , , , , , , , , , , , , , , ,...","[, , , , , , , , , , , , , , , , , , , , , ]",200
201,Tibco has used the Rendezvous name since 1994 ...,Tibco has used the Rendezvous name since 1994 ...,"[Tibco, has, used, the, Rendezvous, name, sinc...","[Tibco, has, used, the, Rendezvous, name, sinc...","[Same Polarity Substitution (contextual), Same...","[6, 5, 29]","[6, 5, 29]","[, , , , , , , , , , , , , , , , , , , , , , , ]","[, , , , , , , , , , , , , , , , , ]",201


In [231]:
ric = textual_paraphrases[textual_paraphrases['pair_id'] == 7+1]
ric

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
24,8,4,Spelling changes,yes,[1],"[1, 2]",DVD-CCA,DVD CCA,,,,
25,8,5,Same Polarity Substitution (habitual),yes,[6],[8],state,U.S.,,,,
26,8,25,Addition/Deletion,yes,[2],,then,,,,,
27,8,25,Addition/Deletion,yes,,"[4, 5]",,that decision,,,,
28,8,29,Identity,yes,"[0, 3, 4, 5, 7, 8, 9]","[0, 3, 6, 7, 9, 10, 11]",The appealed to the Supreme Court .,The appealed to the Supreme Court .\n,,,,


In [269]:
len(ric.loc[ric['type_id'] == 28, 's1_scope'])

0

In [215]:
ric.query('type_id == 29')['s1_scope']

4    [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
Name: s1_scope, dtype: object

In [204]:
29 in list(ric['type_id'])

True

In [190]:
test = newpositives['sentence1_scope'][0]
test

array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', ''], dtype='<U10')

In [191]:
test[textual_paraphrases['s1_scope'][4]] = '0_0'
test

array(['', '', '', '', '', '', '', '', '0_0', '0_0', '0_0', '0_0', '0_0',
       '0_0', '0_0', '0_0', '0_0', '0_0', '0_0'], dtype='<U10')

In [None]:
pairs.reset_index(inplace=True)
pairs.drop(columns=['sent1_indices', 'sent2_indices'], inplace=True)

In [178]:
textual_paraphrases

Unnamed: 0,pair_id,type_id,type_name,sense_preserving,s1_scope,s2_scope,s1_text,s2_text,key_s1,key_s2,k1_text,k2_text
0,1,5,Same Polarity Substitution (habitual),yes,[5],"[1, 2]",whom,to him,,,,
1,1,6,Same Polarity Substitution (contextual),yes,[7],[0],called,Referring,,,,
2,1,26,Change of order,yes,"[0, 1, 2, 3]","[10, 11, 12, 13]",Amrozi accused his brother,Amrozi accused his brother,,,,
3,1,25,Addition/Deletion,yes,,[4],,only,,,,
4,1,29,Identity,yes,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]","[5, 6, 7, 8, 9, 14, 15, 16, 17, 18, 19]","`` the witness '' , of deliberately distorting...","`` the witness '' , of deliberately distorting...",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
22059,5801,6,Same Polarity Substitution (contextual),yes,[16],[16],as,like,,,,
22060,5801,6,Same Polarity Substitution (contextual),yes,[21],[22],",",and,,,,
22061,5801,11,Synthetic/analytic substitution,yes,"[13, 14]",[14],such familiar,familiar,,,,
22062,5801,25,Addition/Deletion,yes,"[29, 30, 31, 32, 33, 35]",,`` Re : That Movie '',,,,,


In [181]:
textual_paraphrases['s1_scope'][4]

[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]

In [None]:
subset = lookup_df[(lookup_df['pair_id'] == idx+1) & (lookup_df['type_id'] == int(iid))]
        subset.reset_index(drop=True, inplace=True)
        for i in range(1, count): # Skip adding zeroes since they're already there
            if subset['s1_scope'][i] is not None:
                s1_str[subset['s1_scope'][i]] = str(iid) + f'_{str(i)}'
            if subset['s2_scope'][i] is not None:
                s2_str[subset['s2_scope'][i]] = str(iid) + f'_{str(i)}'