# Imports

In [1]:
import pandas as pd
import numpy as np

# Reading the ETPC

This is the ETPC dataset compiled by Wahle and posted on HuggingFace

In [2]:
# Unpickle etpc_raw
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
etpc = pd.read_pickle('datasets/etpc_raw.pkl')

These are the XML files from the ETPC github repo

In [None]:
# Parse the per-annotation XML into a DataFrame; later cells look rows up by
# its pair_id, type_id and s1_scope columns.
textual_paraphrases = pd.read_xml('datasets/etpc/textual_paraphrases.xml')

In [None]:
# Load the text pairs, discard the 'negation' column and index rows by pair_id
pairs = (
    pd.read_xml('datasets/etpc/text_pairs.xml')
    .drop(columns=['negation'])
    .set_index('pair_id')
)

# Cleanup

## Cleaning up Columns

In [None]:
# Shorten the paraphrase-type column names, then discard the 'negation' column
etpc.rename(
    columns={
        'paraphrase_type_ids': 'ept_ids',
        'paraphrase_types': 'ept_names',
    },
    inplace=True,
)
etpc.drop(columns=['negation'], inplace=True)
etpc

## Remapping Paraphrase IDs


First, make a list with paraphrase types and IDs from the ETPC:

In [None]:
# Load the ETPC paraphrase-type inventory and keep only each type's id and name.
# Built as one chain ending in .copy() so id_map is an independent frame that
# later cells can add columns to without chained-assignment warnings.
id_map = (
    pd.read_xml('https://raw.githubusercontent.com/venelink/ETPC/master/Corpus/paraphrase_types.xml')
    # Rename columns for clarity
    .rename(columns={'type_id': 'ept_id', 'type_name': 'ept_name'})
    # Drop unused data: no use for the type_category column
    .loc[:, ['ept_id', 'ept_name']]
    # The last two types don't appear in ETPC
    .iloc[:-2]
    .copy()
)
# Display without the index. The Styler is only shown, never assigned, so
# id_map itself stays a plain DataFrame.
id_map.style.hide(axis="index")

Now, make a list with paraphrase names and IDs for ParaOp types

In [None]:
# ParaOp type names, listed so that each name's position is its ParaOp id
paraop_names = [
    'No change',
    'Addition/Deletion - Function Word',
    'Addition/Deletion - Content Word',
    'Change of Order',
    'Substitution - Synonym',
    'Substitution - Contextual Synonym',
    'Substitution - Morphological',
    'Substitution - Spelling and Format',
]
# [id, name] records, in the same shape the lookup table is built from
data = [[paraop_id, paraop_name] for paraop_id, paraop_name in enumerate(paraop_names)]
# Lookup table: ParaOp id (index) -> ParaOp name
paraop_map = pd.DataFrame(data, columns=['paraop_id', 'paraop_name']).set_index('paraop_id')
paraop_map

### Mapping

We'll use the dataframe below for mapping. Each row will contain the name and ID of a paraphrase type in the ETPC, and the name and ID of the corresponding ParaOp type.

In [None]:
# Add the two ParaOp columns, blank for now; map_id fills them in row by row
id_map = id_map.assign(paraop_id='', paraop_name='')
id_map

Here's where we do the mapping:

In [None]:
# Helper function to map an ETPC id to a Paraop id
def map_id(ept_id, paraop_id):
    """Record in id_map that ETPC type `ept_id` corresponds to `paraop_id`.

    Every id_map row whose 'ept_id' equals `ept_id` gets `paraop_id` written to
    its 'paraop_id' column and the matching name from paraop_map written to its
    'paraop_name' column. The global id_map is modified in place; nothing is
    returned.
    """
    target_rows = id_map['ept_id'] == ept_id
    id_map.loc[target_rows, 'paraop_id'] = paraop_id
    # paraop_map is indexed by paraop_id, so this is a direct label lookup
    id_map.loc[target_rows, 'paraop_name'] = paraop_map.loc[paraop_id, 'paraop_name']

In [None]:
# Register the ETPC -> ParaOp correspondences decided so far; rows of id_map
# that are not listed here keep their blank ParaOp columns.
map_id(ept_id=1, paraop_id=6)
map_id(ept_id=3, paraop_id=6)
map_id(ept_id=26, paraop_id=3)
map_id(ept_id=29, paraop_id=0)
id_map

TODO: Figure out a way to hide the index of `id_map` throughout the whole notebook. For some reason this seems harder than it needs to be...

## Reannotating types

In [None]:
# Helper function to get a Paraop id from an ETPC id
def ept_to_paraop(ept_id):
    """Return the 'paraop_id' stored in id_map for the given ETPC id.

    The first matching row is used; an IndexError is raised when no row of
    id_map has this ept_id.
    """
    matching_rows = id_map[id_map['ept_id'] == ept_id]
    return matching_rows['paraop_id'].iloc[0]

# Quick check on ETPC type 3
ept_to_paraop(3)

In [None]:
def substitute_id(id_array, old):
    """Replace one ETPC id in id_array with its corresponding Paraop id.

    Each element of id_array is split on '_' into exactly two parts, an id and
    a count. Elements whose id equals `old` are rewritten as
    '10<paraop id>_<count>'; all other elements are left as they are. The input
    array is not modified: a string copy is returned.
    """
    replacement = str(ept_to_paraop(old))
    target = str(old)
    relabelled = id_array.astype('U10')  # To allow for more than 4 characters
    for position, label in enumerate(relabelled):
        type_id, count = label.split('_')
        if type_id == target:
            # Reannotated types are prefixed with '10'. This marks which types
            # have already been reannotated, so that an id that was already
            # substituted is not accidentally substituted again.
            relabelled[position] = f'10{replacement}_{count}'
    return relabelled

# Filtering

Helper methods for filtering the ETPC dataframe based on paraphrase types

In [None]:
def filter_contains(df, search_ids):
    """Return the rows of df whose 'ept_ids' contain every id in search_ids.

    Args:
        df: DataFrame with an 'ept_ids' column holding an array of ids per row.
        search_ids: A single id or a list/array of ids. A row is kept only when
            all of them appear in its 'ept_ids'.

    Returns:
        The matching rows of df.
    """
    # np.isin yields one flag per searched id; .all() collapses them to a single
    # boolean per row so the result is a valid boolean mask for any number of ids.
    mask = df['ept_ids'].apply(lambda ids: bool(np.isin(search_ids, ids).all()))
    return df[mask.astype(bool)]

def filter_equals(df, search_ids):
    """Return the rows of df whose 'ept_ids' EXACTLY MATCH search_ids.

    Args:
        df: DataFrame with an 'ept_ids' column holding an array of ids per row.
        search_ids: Sequence of ids. A row matches only when its 'ept_ids' has
            the same length and the same values in the same order.

    Returns:
        The matching rows of df.
    """
    mask = df['ept_ids'].apply(lambda ids: np.array_equal(ids, search_ids))
    return df[mask.astype(bool)]

In [None]:
# Example: every pair whose ept_ids include the id '3' (ids are passed as strings)
filter_contains(etpc, '3')

In [None]:
# Example: pairs whose ept_ids are exactly ['25', '29'], in that order
filter_equals(etpc, ['25', '29'])

# Reannotation, Continued

## Creating Positives

In [None]:
# Keep only the pairs labelled as paraphrases in MRPC (mrpc_label == 1).
# Built as one chain: rename/drop return new frames, so positives is an
# independent DataFrame rather than a slice of etpc that is then mutated in
# place (which is what triggers pandas' chained-assignment warnings).
positives = (
    etpc.loc[etpc['mrpc_label'] == 1]
    .rename(columns={
        'sentence1_segment_location': 'sentence1_scope_etpc',
        'sentence2_segment_location': 'sentence2_scope_etpc',
    })
    .drop(columns=['sentence1_segment_location_indices',
                   'sentence2_segment_location_indices'])
    # Mirror the index into a regular column so it can be passed to row-wise helpers
    .assign(idx=lambda frame: frame.index.to_series())
)
positives

In [None]:
# Show id_map without its index. The Styler must NOT be assigned back to
# id_map: map_id and ept_to_paraop index id_map as a DataFrame
# (id_map['ept_id'], id_map.loc[...]) and would fail on a Styler object.
id_map.style.hide(axis="index")

In [None]:
# Working subset for the diagnosis cells: positive pairs whose ept_ids include '4'
test = filter_contains(positives, '4')

## Diagnosing

In [None]:
def diagnose(row, typee, df=None):
    """Print one sentence pair, the tokens tagged with a given type, and its scopes.

    Args:
        row: Index label of the pair to inspect.
        typee: Value to look for in the pair's scope arrays; tokens at the
            positions where the scope equals this value are printed.
        df: DataFrame to read from. It needs the columns 'sentence1',
            'sentence2', 'sentence1_tokenized', 'sentence2_tokenized',
            'sentence1_scope_etpc' and 'sentence2_scope_etpc'. Defaults to the
            notebook-level `test` frame, so existing two-argument calls keep
            working; pass a frame explicitly to avoid depending on whatever
            `test` is currently bound to.

    Returns:
        None. Everything is written to stdout.
    """
    frame = test if df is None else df
    pair = frame.loc[row]

    print('Sentences:')
    print(pair['sentence1'])
    print(pair['sentence2'])
    print()

    print(f'Words where type {typee} is found:')
    tokens1 = pair['sentence1_tokenized']
    tokens2 = pair['sentence2_tokenized']
    scope1 = pair['sentence1_scope_etpc']
    scope2 = pair['sentence2_scope_etpc']
    # Boolean-mask the token arrays with the positions where the scope equals typee
    print(tokens1[scope1 == typee])
    print(tokens2[scope2 == typee])
    print()

    print('Scopes:')
    print(scope1)
    print(scope2)

In [None]:
# Inspect the pair with index label 72 for type 4 (reads the global `test` frame)
diagnose(72, 4)

In [None]:
# Preview the first ten rows of the working subset
test[:10]

In [None]:
def get_words(df, ept_type):
    """Placeholder that is not implemented yet.

    Both arguments are currently ignored and None is returned.
    """
    return None

In [None]:
# Select the positives whose sentence1 ETPC scope array contains exactly one
# distinct value (np.unique of the array has length 1).
positives[positives['sentence1_scope_etpc'].apply(lambda x: (len(np.unique(x)) == 1))]

In [None]:
# Preview positives without the label, scope and segment-text columns. The
# result is only displayed; positives itself is not modified.
# errors='ignore' skips any listed column that is absent, so the cell does not
# raise KeyError when, for example, the *_scope_paraop columns have not been
# created in the current session.
positives.drop(
    columns=[
        'idx', 'etpc_label', 'mrpc_label',
        'sentence1_scope_etpc',
        'sentence2_scope_etpc',
        'sentence1_segment_text',
        'sentence2_segment_text',
        'sentence1_scope_paraop',
        'sentence2_scope_paraop',
    ],
    errors='ignore',
)

In [None]:
# Give every pair two blank scope arrays: one empty-string slot per token,
# stored as 'U10' so labels of up to ten characters can be written in later.
positives['sentence1_scope'] = positives['sentence1_tokenized'].apply(
    lambda tokens: np.full(len(tokens), '', dtype='U10')
)
positives['sentence2_scope'] = positives['sentence2_tokenized'].apply(
    lambda tokens: np.full(len(tokens), '', dtype='U10')
)

In [None]:
# Display positives to check the newly added blank scope columns
positives

In [None]:
def populate_identity(idx):
    """Return a copy of a pair's sentence1 scope with type-29 tokens set to '0_0'.

    The row of positives labelled `idx` is matched to the textual_paraphrases
    rows whose pair_id equals idx + 1. If any of those rows has type_id 29, the
    positions given by the first such row's s1_scope are set to '0_0' in the
    copy. The array stored in positives is left untouched.
    """
    scope_array = np.copy(positives['sentence1_scope'][idx])
    same_pair = textual_paraphrases['pair_id'] == idx + 1
    is_type_29 = textual_paraphrases['type_id'] == 29
    identity_scopes = textual_paraphrases.loc[same_pair & is_type_29, 's1_scope']
    if not identity_scopes.empty:
        scope_array[identity_scopes.iloc[0]] = '0_0'
    return scope_array

In [None]:
# Try the helper on the pair with index label 2. populate_identity takes a
# single argument (the row label); passing a second one raises TypeError.
populate_identity(2)

In [None]:
# Mirror the index into an 'idx' column so populate_identity can be applied
# to it row by row in the next cell.
positives['idx'] = positives.index
positives

In [None]:
# Compute the updated sentence1 scope for every positive pair. The resulting
# Series is only displayed here; it is not stored back into positives.
positives['idx'].apply(populate_identity)

In [None]:
# Spot-check rows at positions 130-149 of positives
positives[130:150]

In [None]:
# Annotations for pair_id 8, written as 7 + 1 to mirror the idx + 1 offset
# that populate_identity uses.
ric = textual_paraphrases[textual_paraphrases['pair_id'] == 7+1]
ric

In [None]:
# Number of type-28 annotations for this pair
len(ric.loc[ric['type_id'] == 28, 's1_scope'])

In [None]:
# Same kind of lookup via query: the s1_scope of the type-29 annotations
ric.query('type_id == 29')['s1_scope']

In [None]:
# Take the sentence1 scope of the row labelled 0 for a manual experiment.
# NOTE(review): this rebinds `test`, which previously held a DataFrame that
# diagnose() reads as a global, so diagnose() will not work after this cell.
# NOTE(review): this is presumably the same array object that positives
# stores, so in-place edits to `test` would also change the frame. Confirm.
test = positives['sentence1_scope'][0]
test

In [None]:
# Write '0_0' at the positions given by the s1_scope of textual_paraphrases row 4
test[textual_paraphrases['s1_scope'][4]] = '0_0'
test

In [None]:
# Turn pair_id back into a regular column and drop the token-index columns.
# NOTE(review): both steps mutate pairs in place, so this cell is not
# idempotent: a second run fails at the drop because the columns are gone.
pairs.reset_index(inplace=True)
pairs.drop(columns=['sent1_indices', 'sent2_indices'], inplace=True)

In [None]:
# Display the annotation table for reference
textual_paraphrases

In [None]:
# Look at the raw s1_scope value of row 4 (the value used as an index above)
textual_paraphrases['s1_scope'][4]

In [None]:
def disambiguate_duplicate(idx, lookup_df):
    """Disambiguate duplicate paraphrase types for one row of the ETPC dataframe.

    Every entry of the row's two segment-location arrays is turned into a
    string of the form '<type id>_<occurrence>', with occurrence 0 everywhere
    to begin with. For each duplicated type, the tokens covered by its 2nd,
    3rd, ... annotation in lookup_df are relabelled with occurrence 1, 2, ...

    Args:
        idx: Index label of the row in the global etpc dataframe. The matching
            lookup_df rows are those whose pair_id equals idx + 1.
        lookup_df: DataFrame with 'pair_id', 'type_id', 's1_scope' and
            's2_scope' columns; the scope values are used to index the label
            arrays.

    Returns:
        Tuple (s1_str, s2_str) with the disambiguated label arrays for
        sentence 1 and sentence 2.

    Raises:
        KeyError: If lookup_df holds fewer annotations of a duplicated type
            than the count reported for it.
    """
    # First, determine what are the duplicates.
    # NOTE(review): get_duplicates is defined outside this view; it is used
    # here as a mapping from type id to number of occurrences. Confirm.
    dups = get_duplicates(etpc['ept_ids'][idx])

    # Convert array values to strings with '_0' appended to them. The fixed
    # 'U10' width (as in substitute_id) keeps NumPy from truncating a longer
    # label that is written into the array later on.
    s1_str = np.array(
        [f'{label}_0' for label in etpc['sentence1_segment_location'][idx].astype(str)],
        dtype='U10',
    )
    s2_str = np.array(
        [f'{label}_0' for label in etpc['sentence2_segment_location'][idx].astype(str)],
        dtype='U10',
    )

    # The pair filter does not depend on the type, so compute it once
    pair_rows = lookup_df[lookup_df['pair_id'] == idx + 1]

    # Disambiguate
    for iid, count in dups.items():
        # Annotations of this type for the pair, renumbered 0..n-1. A new frame
        # is created instead of resetting the filtered frame in place.
        subset = pair_rows[pair_rows['type_id'] == int(iid)].reset_index(drop=True)
        for i in range(1, count):  # Skip adding zeroes since they're already there
            s1_scope = subset.at[i, 's1_scope']
            s2_scope = subset.at[i, 's2_scope']
            if s1_scope is not None:
                s1_str[s1_scope] = f'{iid}_{i}'
            if s2_scope is not None:
                s2_str[s2_scope] = f'{iid}_{i}'
    return s1_str, s2_str