## Load Trained Models

In [1]:
from models import TrainedModel
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Register tqdm's `progress_apply` / `progress_map` methods on pandas objects;
# the prediction cells below rely on them.
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Both fine-tuned checkpoints are loaded with the same `cache_dir` argument.
hf_cache_kwargs = dict(cache_dir='../modeling/hf-cache/')

roberta_base = TrainedModel('../modeling/chkpts/roberta-base-anli/', **hf_cache_kwargs)
roberta_large = TrainedModel('../modeling/chkpts/roberta-large-anli/', **hf_cache_kwargs)

Loading model from ../modeling/chkpts/roberta-base-anli/
Loading model from ../modeling/chkpts/roberta-large-anli/


### Evaluate on full test split

In [None]:
# Raw `anli` data directory, expressed relative to the notebook like every other path
# used here (`../modeling/...`, `../annotated-data/...`) instead of a machine-specific
# absolute path.
# NOTE(review): assumes `raw-data/` sits next to `modeling/` and `annotated-data/` — confirm.
RAW_ANLI_DIR = '../raw-data/anli'

# The examples are stored as JSON lines (one record per line).
test = pd.read_json(f'{RAW_ANLI_DIR}/test.jsonl', lines=True)
# The labels file is read without a header row and attached as the `label` column.
test['label'] = pd.read_csv(f'{RAW_ANLI_DIR}/test-labels.lst', header=None)

In [None]:
# Run every test example through each model; each result column stores whatever
# `_get_prediction` returns for that row.
for column_name, model in (
    ('base_pred_prob', roberta_base),
    ('large_pred_prob', roberta_large),
):
    test[column_name] = test.progress_apply(
        # Bind `model` as a default so the lambda uses this iteration's model.
        lambda example, model=model: model._get_prediction(
            obs1=example['obs1'],
            obs2=example['obs2'],
            hyp1=example['hyp1'],
            hyp2=example['hyp2'],
        ),
        axis=1,
    )

In [None]:
# Accuracy of each model on the full test split (base first, then large).
# The prediction cell stores model outputs under `base_pred_prob` / `large_pred_prob`,
# so those are the columns read here. `np.argmax(...) + 1` converts the position of the
# highest score into the value that is compared against `test.label`.
for prob_column in ('base_pred_prob', 'large_pred_prob'):
    predicted_labels = test[prob_column].map(lambda scores: np.argmax(scores) + 1)
    print(accuracy_score(test.label, predicted_labels))

It looks like RoBERTa-large does significantly better, so let's proceed with the analysis using that model.

## Load paraphrase data and evaluate on original examples

In [3]:
# Load the pilot paraphrase annotations.
pilot_paraphrases = pd.read_csv('../annotated-data/abductive/paraphrased_pilot.csv')
# Parse the string-encoded `paraphrases` column back into Python objects.
# NOTE(review): `eval` executes whatever expression is stored in the CSV cell. If the
# column only holds Python literals, `ast.literal_eval` would be a safer parser — confirm.
pilot_paraphrases['paraphrases_by_worker'] = pilot_paraphrases.paraphrases.map(eval)
# Drop the raw string column (now parsed) together with `processed_assignments`.
pilot_paraphrases = pilot_paraphrases.drop(columns=['paraphrases', 'processed_assignments'])

In [4]:
# Load the annotated paraphrases and keep only the rows whose `invalid?` flag equals False.
annotated = (
    pd.read_csv('zipped_intra_worker_paraphrases_pilot_annotated.csv')
    .loc[lambda frame: frame['invalid?'] == False]
)

In [5]:
#annotated.to_csv('validated_intra_worker_paraphrases.csv')

In [None]:
# Score every remaining annotated paraphrase with RoBERTa-large ...
annotated['pred_prob'] = annotated.progress_apply(
    lambda example: roberta_large._get_prediction(
        **{field: example[field] for field in ('obs1', 'obs2', 'hyp1', 'hyp2')}
    ),
    axis=1,
)
# ... and turn the position of the highest score into a predicted label (argmax + 1).
annotated['pred'] = annotated['pred_prob'].map(lambda scores: np.argmax(scores) + 1)

In [None]:
# Display the annotated frame, which now carries the `pred_prob` and `pred` columns.
annotated

In [None]:
from collections import defaultdict

# Collect the predicted labels of all paraphrases belonging to the same original example.
# Keys are the integer prefix of `paraphrase_id` (the text before the first '.').
predictions = defaultdict(list)

for _, row in annotated.iterrows():
    if row['invalid?']:
        continue
    example_key = int(row.paraphrase_id.partition('.')[0])
    predictions[example_key].append(row.pred)

In [None]:
# Attach to each pilot example the list of predictions gathered for its paraphrases.
# `predictions` is a defaultdict(list), so an example_id without entries maps to an empty list.
pilot_paraphrases['predictions'] = pilot_paraphrases['example_id'].map(
    lambda example_id: predictions[example_id]
)

In [None]:
# Score the original (un-paraphrased) pilot examples with RoBERTa-large ...
pilot_paraphrases['pred_prob'] = pilot_paraphrases.progress_apply(
    lambda example: roberta_large._get_prediction(
        **{field: example[field] for field in ('obs1', 'obs2', 'hyp1', 'hyp2')}
    ),
    axis=1,
)
# ... and turn the position of the highest score into a predicted label (argmax + 1).
pilot_paraphrases['pred'] = pilot_paraphrases['pred_prob'].map(
    lambda scores: np.argmax(scores) + 1
)

In [None]:
from collections import Counter

# Scratch check: summing a Counter's values gives the total number of counted items.
sum(Counter([1, 1, 1, 1, 1, 1, 1, 1, 2]).values())

In [None]:
def _group_incorrect_rate(row):
    """Return the fraction of a row's paraphrase predictions that differ from its label.

    Args:
        row: A row exposing `predictions` (list of predicted labels) and `label`.

    Returns:
        1 - (number of predictions equal to `label`) / (number of predictions),
        or NaN when the row has no predictions. The NaN keeps such rows from raising
        ZeroDivisionError, and pandas' `describe()` / `plot.hist()` skip them.
    """
    group_predictions = row.predictions
    if len(group_predictions) == 0:
        return np.nan
    n_matching = Counter(group_predictions)[row.label]
    return 1 - (n_matching / len(group_predictions))


pilot_paraphrases['group_incorrect_rate'] = pilot_paraphrases.apply(
    _group_incorrect_rate, axis=1
)

In [None]:
# Histogram of the per-example incorrect rate over all pilot examples.
pilot_paraphrases.group_incorrect_rate.plot.hist()

In [None]:
# Summary statistics of the per-example incorrect rate over all pilot examples.
pilot_paraphrases.group_incorrect_rate.describe()

In [None]:
# Same histogram, restricted to examples whose original prediction equals the label.
originally_correct = pilot_paraphrases.pred == pilot_paraphrases.label
pilot_paraphrases[originally_correct].group_incorrect_rate.plot.hist()

In [None]:
# Same summary statistics, restricted to examples whose original prediction equals the label.
originally_correct = pilot_paraphrases.pred == pilot_paraphrases.label
pilot_paraphrases[originally_correct].group_incorrect_rate.describe()

In [None]:
# Rank the labels predicted within each example's paraphrase group, most frequent first,
# so the first (label, count) pair of each entry is that group's mode.
# The Counter must be built from the row's own list of predictions; counting the global
# `predictions` dict would give every row the same, meaningless value.
pilot_paraphrases['prediction_mode'] = pilot_paraphrases.predictions.map(
    lambda group_predictions: Counter(group_predictions).most_common()
)

In [None]:
# Scratch check: `most_common(1)` returns a one-element list holding the (value, count) pair
# of the most frequent item.
Counter([1, 1,]).most_common(1)

In [None]:
def calculate_flip_rate(old_pred, new_pred):
    """Return the fraction of positions at which `new_pred` differs from `old_pred`.

    Both inputs are converted to NumPy arrays and compared position by position, so
    plain lists, arrays and pandas Series (e.g. two columns of the same frame) all work.

    Args:
        old_pred: Sequence of predictions before the change.
        new_pred: Sequence of predictions after the change, same length and order.

    Returns:
        A float between 0 and 1, or NaN when the inputs are empty (no rate is defined).

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    old_values = np.asarray(old_pred)
    new_values = np.asarray(new_pred)
    if old_values.shape != new_values.shape:
        raise ValueError(
            f'old_pred and new_pred must have the same shape, '
            f'got {old_values.shape} and {new_values.shape}'
        )
    if old_values.size == 0:
        # Avoid dividing by zero; an empty comparison has no flip rate.
        return float('nan')
    return float(np.mean(old_values != new_values))

# Flip rate between the original predictions and the intra-worker paraphrase predictions.
# NOTE(review): neither `large_pred` nor `rand_intra_worker_paraphrased_h1_h2_pred` is created
# in the cells visible above (the original-example predictions are stored in `pred`) —
# confirm these columns exist on a fresh kernel before running.
print(calculate_flip_rate(pilot_paraphrases['large_pred'], pilot_paraphrases['rand_intra_worker_paraphrased_h1_h2_pred']))

In [None]:
# Cross-tabulate correctness on the original example (first mark) against correctness on
# the intra-worker paraphrase (second mark), each as a fraction of all pilot examples.
gold = pilot_paraphrases.label
n_examples = len(pilot_paraphrases)

original_right = pilot_paraphrases.large_pred == gold
original_wrong = pilot_paraphrases.large_pred != gold
paraphrase_right = pilot_paraphrases.rand_intra_worker_paraphrased_h1_h2_pred == gold
paraphrase_wrong = pilot_paraphrases.rand_intra_worker_paraphrased_h1_h2_pred != gold

for marks, selector in (
    ('✅, ✅', original_right & paraphrase_right),
    ('❌, ✅', original_wrong & paraphrase_right),
    ('✅, ❌', original_right & paraphrase_wrong),
    ('❌, ❌', original_wrong & paraphrase_wrong),
):
    print(marks, len(pilot_paraphrases[selector]) / n_examples)

In [None]:
# Apply the inter-worker sampling helper to every row and store its result per example.
# NOTE(review): `get_random_paraphrased_example_h1_h2_inter_worker` is not defined in the
# cells visible here — it must be defined or imported before this cell runs. Its result is
# later unpacked as keyword arguments to `_get_prediction`, so presumably it returns a
# dict of model inputs — confirm.
pilot_paraphrases['rand_inter_worker_paraphrased_h1_h2'] = pilot_paraphrases.apply(
    get_random_paraphrased_example_h1_h2_inter_worker, axis=1
)

In [None]:
# Predict a label (argmax + 1) for each stored inter-worker example; every entry is
# unpacked as keyword arguments to the model's `_get_prediction`.
inter_worker_examples = pilot_paraphrases['rand_inter_worker_paraphrased_h1_h2']
pilot_paraphrases['rand_inter_worker_paraphrased_h1_h2_pred'] = inter_worker_examples.progress_map(
    lambda example_kwargs: np.argmax(roberta_large._get_prediction(**example_kwargs)) + 1
)

In [None]:
# Accuracy of the inter-worker paraphrase predictions against the `label` column.
accuracy_score(pilot_paraphrases.label, pilot_paraphrases['rand_inter_worker_paraphrased_h1_h2_pred'])

In [None]:

# Flip rate between the original predictions and the inter-worker paraphrase predictions
# computed in this section (the intra-worker flip rate is already reported earlier).
# NOTE(review): `large_pred` is not created in the cells visible here — confirm it exists.
print(calculate_flip_rate(pilot_paraphrases['large_pred'], pilot_paraphrases['rand_inter_worker_paraphrased_h1_h2_pred']))

### Transformation Analysis

In [None]:
# Examples predicted correctly in their original form but incorrectly on the
# intra-worker paraphrase.
gold_labels = pilot_paraphrases.label
originally_correct = pilot_paraphrases.large_pred == gold_labels
paraphrase_incorrect = pilot_paraphrases.rand_intra_worker_paraphrased_h1_h2_pred != gold_labels
flipped_wrong = pilot_paraphrases[originally_correct & paraphrase_incorrect]