In [None]:
import plotly.express as px
import os
import sys
import pandas as pd
from collections import defaultdict
import numpy as np

# Make the directory two levels above the current working directory importable
# (presumably the project root that holds `utils` -- confirm against the repo layout).
module_path = os.path.abspath('../..')
already_on_path = module_path in sys.path
if not already_on_path:
    # Append rather than prepend so existing entries keep import priority.
    sys.path.append(module_path)

In [None]:
from utils import load_json, PROJECT_ROOT_DIR

In [None]:
def modify_to_add_edit_defeasible(verification_row):
    """
    Return the example text for a row, substituting the edited paraphrase if present.

    When ``verification_row.paraphrase_edit`` is null, ``paraphrase_example`` is
    returned untouched. Otherwise the last two newline-separated segments of
    ``paraphrase_example`` are dropped and a new ``Paraphrase:`` line holding the
    edit is appended in their place.
    """
    edited_paraphrase = verification_row.paraphrase_edit
    if pd.isnull(edited_paraphrase):
        return verification_row.paraphrase_example

    # Presumably the two trailing segments are the original "Paraphrase:" line
    # and the empty string after the final newline -- confirm against the data.
    segments = verification_row.paraphrase_example.split('\n')
    example_head = "\n".join(segments[:-2])

    replacement_line = '\n    Paraphrase: %s\n' % edited_paraphrase
    return example_head + replacement_line

In [None]:
def modify_to_add_edit_abductive(verification_row):
    """
    Return the example text for a row, substituting edited hypothesis paraphrases.

    If both ``paraphrase_edit_hyp1`` and ``paraphrase_edit_hyp2`` are null the
    original ``paraphrase_example`` is returned unchanged. Otherwise the last
    three newline-separated segments of the example are dropped and replaced by
    a blank line followed by the two hypothesis paraphrase lines; each line uses
    the edit when one exists and the corresponding original line otherwise.
    """
    edit_hyp1 = verification_row.paraphrase_edit_hyp1
    edit_hyp2 = verification_row.paraphrase_edit_hyp2
    if pd.isnull(edit_hyp1) and pd.isnull(edit_hyp2):
        return verification_row.paraphrase_example

    example_lines = verification_row.paraphrase_example.split('\n')
    example_head = "\n".join(example_lines[:-3])

    # NOTE(review): the fallbacks use fixed positions 8 and 9 while the head is
    # cut relative to the end; this assumes a fixed-length example layout --
    # confirm against the annotation files.
    chosen = [
        example_lines[fallback_index] if pd.isnull(edit) else edit
        for edit, fallback_index in ((edit_hyp1, 8), (edit_hyp2, 9))
    ]

    # Add the label (with indentation) only when the text does not carry it yet.
    labelled = []
    for text, label, prefix in (
        (chosen[0], 'Hyp 1 Paraphrase:', '    Hyp 1 Paraphrase: '),
        (chosen[1], 'Hyp 2 Paraphrase:', '    Hyp 2 Paraphrase: '),
    ):
        if text.strip().startswith(label):
            labelled.append(text)
        else:
            labelled.append(prefix + text)

    return example_head + '\n\n%s\n%s' % tuple(labelled)
    
    

In [None]:
def _sample_labelled_examples(data_source, n_invalid, n_valid, seed, modification_func):
    """
    Draw a shuffled sample of invalid and valid paraphrases from one annotation file.

    Parameters
    ----------
    data_source : pd.DataFrame
        Annotation rows; must contain a ``paraphrase_valid`` column holding the
        strings ``'invalid'`` / ``'valid'``.
    n_invalid, n_valid : int
        Number of rows to draw from each label. ``DataFrame.sample`` raises
        ``ValueError`` if a label has fewer rows than requested.
    seed : int
        ``random_state`` used when drawing rows from each label.
    modification_func : callable
        Row-wise function whose result replaces the ``paraphrase_example`` column.

    Returns
    -------
    pd.DataFrame
        The sampled rows in shuffled order with ``paraphrase_example`` rewritten.
    """
    invalid_rows = data_source[data_source['paraphrase_valid'] == 'invalid'].sample(n_invalid, random_state=seed)
    valid_rows = data_source[data_source['paraphrase_valid'] == 'valid'].sample(n_valid, random_state=seed)

    # Interleave the two labels; the shuffle seed is fixed at 42 for every sample.
    sampled = pd.concat([invalid_rows, valid_rows]).sample(frac=1, random_state=42)
    sampled['paraphrase_example'] = sampled.apply(modification_func, axis=1)
    return sampled


instruction_examples = []
all_verification_examples = []

validation_dir = os.path.join(PROJECT_ROOT_DIR, 'annotated_data/paraphrase_validation/validation_annotation_files/human/')

# os.listdir returns entries in arbitrary, filesystem-dependent order. Sorting
# makes the row order of the exported samples reproducible across machines,
# matching the fixed random seeds used below.
for validation_file in sorted(os.listdir(validation_dir)):
    print(validation_file)
    data_source = pd.read_csv(os.path.join(validation_dir, validation_file))

    # Files with 'anli' in their name use the two-hypothesis (abductive) layout;
    # everything else uses the single-paraphrase (defeasible) layout.
    modification_func = modify_to_add_edit_abductive if 'anli' in validation_file else modify_to_add_edit_defeasible

    ### SAMPLE FOR RACHEL (verification sample): 12 invalid + 13 valid per file
    all_verification_examples.append(
        _sample_labelled_examples(data_source, 12, 13, 42, modification_func)
    )

    ### SAMPLE FOR INSTRUCTIONS: 5 invalid + 5 valid per file
    # NOTE(review): drawn from the same rows with a different seed, so this
    # sample may overlap with the verification sample -- confirm that is intended.
    instruction_examples.append(
        _sample_labelled_examples(data_source, 5, 5, 123, modification_func)
    )
        



In [None]:
# Combine the per-file verification samples, keep only the exported columns,
# and write them to the data-characterization folder without the index.
sample_columns = ['paraphrase_id', 'paraphrase_example', 'original_example_id', 'paraphrase_valid']
all_sample = pd.concat(all_verification_examples)[sample_columns]

sample_csv_path = os.path.join(PROJECT_ROOT_DIR, 'experiments/data_characterization/validation_sample.csv')
all_sample.to_csv(sample_csv_path, index=False)

In [None]:
# Combine the per-file instruction samples, keep only the exported columns,
# and write them to the data-characterization folder without the index.
instruction_columns = ['paraphrase_id', 'paraphrase_example', 'original_example_id', 'paraphrase_valid']
instructions = pd.concat(instruction_examples)[instruction_columns]

instructions_csv_path = os.path.join(PROJECT_ROOT_DIR, 'experiments/data_characterization/validation_instructions.csv')
instructions.to_csv(instructions_csv_path, index=False)