## Load Trained Models

In [1]:
from models import TrainedModel
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Register tqdm's `progress_apply` / `progress_map` on pandas objects; the
# scoring cells below rely on them.
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the two checkpoints compared in this notebook. Both share the same
# `cache_dir` (presumably a Hugging Face download cache — confirm in
# `models.TrainedModel`).
roberta_base = TrainedModel(
    '../modeling/chkpts/roberta-base-anli/',
    cache_dir='../modeling/hf-cache/',
)

# Larger checkpoint; the analysis further down only uses this model.
roberta_large = TrainedModel(
    '../modeling/chkpts/roberta-large-anli/',
    cache_dir='../modeling/hf-cache/',
)

### Evaluate on full test split

In [None]:
# NOTE(review): absolute, machine-specific paths. The other inputs in the
# visible cells are addressed relative to the notebook (`../...`); consider
# doing the same here so the notebook runs on another machine.
test = pd.read_json('/Users/nehasrikanth/Documents/paraphrase-nlu/raw-data/anli/test.jsonl', lines=True)

# The label file has no header row. Take its first column explicitly rather
# than assigning a whole DataFrame to a single column.
test['label'] = pd.read_csv('/Users/nehasrikanth/Documents/paraphrase-nlu/raw-data/anli/test-labels.lst', header=None)[0]

In [None]:
# Score every test example with each model. The raw `_get_prediction` output
# is stored per row and reduced to a label in a later cell.
for _column, _model in (
    ('base_pred_prob', roberta_base),
    ('large_pred_prob', roberta_large),
):
    test[_column] = test.progress_apply(
        # Bind the current model as a default so each lambda keeps its own.
        lambda ex, _model=_model: _model._get_prediction(
            obs1=ex['obs1'], obs2=ex['obs2'], hyp1=ex['hyp1'], hyp2=ex['hyp2']
        ),
        axis=1,
    )

In [None]:
# Accuracy of each model on the test split. The scoring cell stores its output
# in `base_pred_prob` / `large_pred_prob`; argmax + 1 converts the index of the
# highest score into a label (labels appear to be 1-based — confirm).
print(accuracy_score(test.label, test['base_pred_prob'].map(lambda a: np.argmax(a) + 1)))
print(accuracy_score(test.label, test['large_pred_prob'].map(lambda a: np.argmax(a) + 1)))

Looks like RoBERTa large does significantly better, so let's proceed with the analysis using that model.

## Load paraphrase data and evaluate on original examples

In [2]:
# Load the pilot annotations and parse the serialized `paraphrases` column
# into Python objects, then drop the raw columns that are no longer needed.
# NOTE(review): `eval` executes whatever expression the CSV cell contains. If
# the column only ever holds Python literals, `ast.literal_eval` would be the
# safer parser — confirm before switching.
pilot_paraphrases = (
    pd.read_csv('../annotated-data/paraphrased_pilot.csv')
    .assign(paraphrases_by_worker=lambda frame: frame.paraphrases.map(eval))
    .drop(columns=['paraphrases', 'processed_assignments'])
)

In [None]:
# Flatten the per-worker paraphrase lists into one list per hypothesis,
# keeping worker order and, within a worker, paraphrase order.
pilot_paraphrases = pilot_paraphrases.assign(
    hyp1_paraphrases=lambda frame: frame.paraphrases_by_worker.map(
        lambda workers: [para for worker in workers for para in worker['hyp1_paraphrases']]
    ),
    hyp2_paraphrases=lambda frame: frame.paraphrases_by_worker.map(
        lambda workers: [para for worker in workers for para in worker['hyp2_paraphrases']]
    ),
)

In [None]:
# Score the original (un-paraphrased) pilot examples with RoBERTa-large.
pilot_paraphrases['large_pred_prob'] = pilot_paraphrases.progress_apply(
    lambda ex: roberta_large._get_prediction(
        **{field: ex[field] for field in ('obs1', 'obs2', 'hyp1', 'hyp2')}
    ),
    axis=1,
)

# Accuracy on the original examples: argmax + 1 turns the scores into a label.
print(
    accuracy_score(
        pilot_paraphrases.label,
        pilot_paraphrases['large_pred_prob'].map(lambda scores: np.argmax(scores) + 1),
    )
)

In [None]:
# Keep the predicted label (index of the highest score, shifted by one) as its
# own column so later cells can compare it with paraphrase predictions.
pilot_paraphrases['large_pred'] = pilot_paraphrases['large_pred_prob'].map(
    lambda scores: np.argmax(scores) + 1
)

In [None]:
import random

def get_random_paraphrased_example_h1_h2_intra_worker(row):
    """Build one example whose two hypotheses are paraphrases by the same worker.

    One worker entry is drawn uniformly from ``row.paraphrases_by_worker``; a
    ``hyp1`` and a ``hyp2`` paraphrase are then drawn independently from that
    worker's lists. Sizes come from the data rather than being hard-coded, so
    rows with a different number of workers or paraphrases are sampled over
    their full range.

    Args:
        row: Row-like object exposing ``paraphrases_by_worker`` as an attribute
            (a sequence of dicts with ``'hyp1_paraphrases'`` and
            ``'hyp2_paraphrases'`` lists) and ``'obs1'`` / ``'obs2'`` by key.

    Returns:
        dict with keys ``obs1``, ``obs2``, ``hyp1`` and ``hyp2``.

    Raises:
        IndexError: if the worker list or a chosen paraphrase list is empty.
    """
    worker_paraphrases = random.choice(row.paraphrases_by_worker)
    return {
        'obs1': row['obs1'], 'obs2': row['obs2'],
        'hyp1': random.choice(worker_paraphrases['hyp1_paraphrases']),
        'hyp2': random.choice(worker_paraphrases['hyp2_paraphrases']),
    }

def get_random_paraphrased_example_h1_h2_inter_worker(row):
    """Build one example whose hypotheses are paraphrases from independently drawn workers.

    The worker for ``hyp1`` and the worker for ``hyp2`` are drawn independently
    from ``row.paraphrases_by_worker``, so they may be the same worker. Sizes
    come from the data rather than being hard-coded.

    Args:
        row: Row-like object exposing ``paraphrases_by_worker`` as an attribute
            (a sequence of dicts with ``'hyp1_paraphrases'`` and
            ``'hyp2_paraphrases'`` lists) and ``'obs1'`` / ``'obs2'`` by key.

    Returns:
        dict with keys ``obs1``, ``obs2``, ``hyp1`` and ``hyp2``.

    Raises:
        IndexError: if the worker list or a chosen paraphrase list is empty.
    """
    # Draw order (worker, paraphrase, worker, paraphrase) is kept as before.
    h1_paraphrase = random.choice(random.choice(row.paraphrases_by_worker)['hyp1_paraphrases'])
    h2_paraphrase = random.choice(random.choice(row.paraphrases_by_worker)['hyp2_paraphrases'])
    return {
        'obs1': row['obs1'], 'obs2': row['obs2'],
        'hyp1': h1_paraphrase,
        'hyp2': h2_paraphrase,
    }
    
def get_random_paraphrased_example_h1(row):
    """Build one example with ``hyp1`` swapped for a random paraphrase; ``hyp2`` stays original.

    The paraphrase is drawn uniformly from the whole ``row.hyp1_paraphrases``
    list, whatever its length (no fixed count of nine is assumed).

    Args:
        row: Row-like object exposing ``hyp1_paraphrases`` as an attribute and
            ``'obs1'``, ``'obs2'`` and ``'hyp2'`` by key.

    Returns:
        dict with keys ``obs1``, ``obs2``, ``hyp1`` and ``hyp2``.

    Raises:
        IndexError: if ``row.hyp1_paraphrases`` is empty.
    """
    return {
        'obs1': row['obs1'], 'obs2': row['obs2'],
        'hyp1': random.choice(row.hyp1_paraphrases), 'hyp2': row['hyp2']
    }

def get_random_paraphrased_example_h2(row):
    """Build one example with ``hyp2`` swapped for a random paraphrase; ``hyp1`` stays original.

    The paraphrase is drawn uniformly from the whole ``row.hyp2_paraphrases``
    list, whatever its length (no fixed count of nine is assumed).

    Args:
        row: Row-like object exposing ``hyp2_paraphrases`` as an attribute and
            ``'obs1'``, ``'obs2'`` and ``'hyp1'`` by key.

    Returns:
        dict with keys ``obs1``, ``obs2``, ``hyp1`` and ``hyp2``.

    Raises:
        IndexError: if ``row.hyp2_paraphrases`` is empty.
    """
    return {
        'obs1': row['obs1'], 'obs2': row['obs2'],
        'hyp1': row['hyp1'], 'hyp2': random.choice(row.hyp2_paraphrases)
    }

def get_zipped_examples(row):
    """Pair the i-th ``hyp1`` paraphrase with the i-th ``hyp2`` paraphrase.

    Args:
        row: Row-like object exposing ``hyp1_paraphrases`` and
            ``hyp2_paraphrases`` as attributes and ``'obs1'`` / ``'obs2'`` by key.

    Returns:
        list of dicts with keys ``obs1``, ``obs2``, ``hyp1`` and ``hyp2``, one
        per aligned pair. Pairing stops at the shorter of the two lists.
    """
    examples = []
    for para_h1, para_h2 in zip(row.hyp1_paraphrases, row.hyp2_paraphrases):
        examples.append({
            'obs1': row['obs1'],
            'obs2': row['obs2'],
            'hyp1': para_h1,
            'hyp2': para_h2,
        })
    return examples

In [None]:
# Draw one same-worker paraphrased example per row.
# NOTE(review): no `random.seed(...)` is set in the visible cells, so the
# sampled paraphrases (and the accuracy / flip rate below) change on every run.
pilot_paraphrases['rand_intra_worker_paraphrased_h1_h2'] = pilot_paraphrases.apply(
    get_random_paraphrased_example_h1_h2_intra_worker, axis=1
)

In [None]:
# Predict a label for each sampled same-worker paraphrase example.
pilot_paraphrases['rand_intra_worker_paraphrased_h1_h2_pred'] = (
    pilot_paraphrases['rand_intra_worker_paraphrased_h1_h2']
    .progress_map(lambda example: np.argmax(roberta_large._get_prediction(**example)) + 1)
)

# Accuracy on the paraphrased examples against the original gold labels.
accuracy_score(
    pilot_paraphrases.label,
    pilot_paraphrases['rand_intra_worker_paraphrased_h1_h2_pred'],
)

In [None]:
def calculate_flip_rate(old_pred, new_pred):
    """Return the fraction of positions at which two prediction vectors differ.

    Args:
        old_pred: Predictions before the change. Must support element-wise
            ``!=`` against ``new_pred`` (e.g. pandas Series or NumPy arrays).
        new_pred: Predictions after the change, aligned with ``old_pred``.

    Returns:
        Number of differing positions divided by ``len(old_pred)``.

    Raises:
        ZeroDivisionError: if ``old_pred`` is empty.
    """
    flipped = old_pred != new_pred
    total = len(old_pred)
    return sum(flipped) / total

# Share of examples whose predicted label changed between the original
# hypotheses and the sampled same-worker paraphrases.
print(calculate_flip_rate(pilot_paraphrases['large_pred'], pilot_paraphrases['rand_intra_worker_paraphrased_h1_h2_pred']))

In [None]:
# Break the examples into the four (original correct?, paraphrase correct?)
# outcomes and print each group's share of the pilot set.
_original_correct = pilot_paraphrases.large_pred == pilot_paraphrases.label
_paraphrase_correct = (
    pilot_paraphrases.rand_intra_worker_paraphrased_h1_h2_pred == pilot_paraphrases.label
)

for _tag, _mask in (
    ('✅, ✅', _original_correct & _paraphrase_correct),
    ('❌, ✅', ~_original_correct & _paraphrase_correct),
    ('✅, ❌', _original_correct & ~_paraphrase_correct),
    ('❌, ❌', ~_original_correct & ~_paraphrase_correct),
):
    print(_tag, len(pilot_paraphrases[_mask]) / len(pilot_paraphrases))

In [None]:
# Draw one paraphrased example per row, choosing the worker for hyp1 and the
# worker for hyp2 independently.
# NOTE(review): no `random.seed(...)` is set in the visible cells, so this
# sample differs on every run.
pilot_paraphrases['rand_inter_worker_paraphrased_h1_h2'] = pilot_paraphrases.apply(
    get_random_paraphrased_example_h1_h2_inter_worker, axis=1
)

In [None]:
# Predict a label for each sampled cross-worker paraphrase example.
pilot_paraphrases['rand_inter_worker_paraphrased_h1_h2_pred'] = (
    pilot_paraphrases['rand_inter_worker_paraphrased_h1_h2']
    .progress_map(lambda example: np.argmax(roberta_large._get_prediction(**example)) + 1)
)

In [None]:
# Accuracy on the cross-worker paraphrased examples against the original gold labels.
accuracy_score(pilot_paraphrases.label, pilot_paraphrases['rand_inter_worker_paraphrased_h1_h2_pred'])

In [None]:

# Share of examples whose predicted label changed between the original
# hypotheses and the sampled cross-worker paraphrases. This cell belongs to
# the inter-worker section, so it reads the inter-worker prediction column.
print(calculate_flip_rate(pilot_paraphrases['large_pred'], pilot_paraphrases['rand_inter_worker_paraphrased_h1_h2_pred']))

### Transformation Analysis

In [None]:
# Examples the model got right originally but wrong on the sampled
# same-worker paraphrase.
_right_on_original = pilot_paraphrases.large_pred == pilot_paraphrases.label
_wrong_on_paraphrase = (
    pilot_paraphrases.rand_intra_worker_paraphrased_h1_h2_pred != pilot_paraphrases.label
)
flipped_wrong = pilot_paraphrases[_right_on_original & _wrong_on_paraphrase]

In [None]:
def _flipped_record(example):
    """Place the original hypotheses next to the paraphrases that flipped the prediction."""
    paraphrased = example.rand_intra_worker_paraphrased_h1_h2
    return {
        'obs1': example.obs1,
        'obs2': example.obs2,
        'hyp1': example.hyp1,
        'hyp1-para': paraphrased['hyp1'],
        'hyp2': example.hyp2,
        'hyp2-para': paraphrased['hyp2'],
    }


# Write one row per flipped example for manual inspection.
_flipped_records = flipped_wrong.apply(_flipped_record, axis=1)
pd.DataFrame(list(_flipped_records)).to_csv('flipped_ex.csv')

In [None]:
# For every pilot row, build the list of aligned (hyp1, hyp2) paraphrase examples.
zipped_examples = pilot_paraphrases.apply(get_zipped_examples, axis=1)

# Score every aligned example and put the per-row list of model outputs next
# to the gold label, the prediction on the original example, and the original
# hypotheses. The prediction column is `large_pred` (underscore), as created
# by the earlier cell.
zipped_examples = pd.concat([
    pilot_paraphrases[['label', 'large_pred', 'hyp1', 'hyp2']],
    zipped_examples.progress_map(lambda example_list: [roberta_large._get_prediction(**e) for e in example_list])
], axis=1
).rename(columns={0: 'para-preds'})  # the unnamed Series arrives as column 0

In [None]:
# Reduce each paraphrase's scores to a predicted label (argmax + 1).
zipped_examples['para-preds-argmax'] = zipped_examples['para-preds'].map(
    lambda score_lists: [np.argmax(scores) + 1 for scores in score_lists]
)

# Per example: the fraction of its aligned paraphrases predicted correctly.
_per_example_accuracy = zipped_examples.apply(
    lambda ex: len([pred for pred in ex['para-preds-argmax'] if pred == ex.label])
    / len(ex['para-preds-argmax']),
    axis=1,
)
_per_example_accuracy.plot.hist(bins=12)

In [None]:
# One record per aligned (hyp1, hyp2) paraphrase pair, carrying the original
# hypotheses and gold label alongside it.
zipped_sample = [
    {
        'id': i,
        'example_id': row.example_id,
        'obs1': row.obs1,
        'obs2': row.obs2,
        'original_h1': row.hyp1,
        'original_h2': row.hyp2,
        'hyp1': h1,
        'hyp2': h2,
        'label': row.label,
    }
    for i, row in pilot_paraphrases.iterrows()
    for h1, h2 in zip(row.hyp1_paraphrases, row.hyp2_paraphrases)
]

In [None]:
# One record per pilot row whose hyp1 and hyp2 paraphrases come from two
# different workers.
# NOTE(review): no `random.seed(...)` is set in the visible cells, so this
# sample differs on every run.
unaligned_sample = []
for i, row in pilot_paraphrases.iterrows():
    # Pick two distinct worker indices from however many workers this row has
    # (raises ValueError if the row has fewer than two workers).
    w1, w2 = random.sample(range(len(row.paraphrases_by_worker)), 2)

    unaligned_sample.append({
        'id': i,
        'example_id': row.example_id,
        'obs1': row.obs1,
        'obs2': row.obs2,
        'original_h1': row.hyp1,
        'original_h2': row.hyp2,
        'hyp1': random.choice(row.paraphrases_by_worker[w1]['hyp1_paraphrases']),
        'hyp2': random.choice(row.paraphrases_by_worker[w2]['hyp2_paraphrases']),
        'label': row.label
    })

In [None]:
# Export a fixed-seed (random_state=42) sample of 100 aligned paraphrase
# records; the file name suggests they are meant for validation.
# NOTE(review): sampling 100 rows without replacement requires at least 100
# records in `zipped_sample`.
pd.DataFrame(zipped_sample).sample(100, random_state=42).to_csv('zipped_examples_validation.csv', index=False)

In [None]:
# Export a fixed-seed (random_state=42) sample of 100 cross-worker paraphrase
# records; the file name suggests they are meant for validation.
# NOTE(review): sampling 100 rows without replacement requires at least 100
# records in `unaligned_sample`.
pd.DataFrame(unaligned_sample).sample(100, random_state=42).to_csv('unaligned_examples_validation.csv', index=False)