In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import plotly.express as px
import os
import sys
import numpy as np

In [3]:
# Put the directory two levels above the current working directory on the
# import path so that modules living there can be imported below.
# The membership check keeps re-runs of this cell from adding duplicates.
module_path = os.path.abspath('../..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [16]:
from utils import load_json
from collections import defaultdict
from data_selection.data_selection_utils import float_floor

from consistency import (
    dnli_human_bucket_predictions,
    dnli_test_set_predictions,
    anli_human_bucket_predictions,
    anli_test_set_predictions,
    construct_bucket_metadata, 
    plot_orig_v_bucket_conf, 
    plot_consistency_cdf,
    get_original_example_prediction_accuracy,
    calculate_weighted_consistency
)

In [19]:
# Report the consistency metrics of the specialized RoBERTa model:
# once per dataset in the δ-NLI bucket predictions, then once for α-NLI.
print('Specialized RoBERTa Models')
for dataname, human_preds in dnli_human_bucket_predictions.items():
    print(f'####### {dataname}#######')
    consistency = calculate_weighted_consistency(
        human_preds['specialized_roberta'],
        dnli_test_set_predictions[dataname]['specialized_roberta'],
        show_test_distribution=False,
    )
    print(dataname, consistency)

# Note: unlike the loop above, the α-NLI call passes show_test_distribution=True.
anli_consistency = calculate_weighted_consistency(
    anli_human_bucket_predictions['specialized_roberta'],
    anli_test_set_predictions['specialized_roberta'],
    show_test_distribution=True,
)
print('anli', anli_consistency)

Specialized RoBERTa Models
####### social#######
social {'weighted_consistency': 0.9087363234357935, 'mean_consistency': 0.6592190476190476}
####### snli#######
snli {'weighted_consistency': 0.9064806546026608, 'mean_consistency': 0.7052253968253969}
####### atomic#######
atomic {'weighted_consistency': 0.8616549681381606, 'mean_consistency': 0.7478507936507937}


anli {'weighted_consistency': 0.8744013294599748, 'mean_consistency': 0.6805238095238096}


## Specialized RoBERTa

In [31]:
def plot_roberta(name, bucket_preds):
    """Build, display and return the original-vs-bucket confidence plot.

    Args:
        name: Label forwarded as the second argument of
            ``plot_orig_v_bucket_conf``.
        bucket_preds: Bucket predictions; passed to
            ``get_original_example_prediction_accuracy`` and
            ``construct_bucket_metadata``.

    Returns:
        The object returned by ``plot_orig_v_bucket_conf``, after its
        ``show()`` method has been called.
    """
    # NOTE(review): the return value is discarded here — presumably the call
    # is kept for a side effect on ``bucket_preds``; confirm before removing.
    get_original_example_prediction_accuracy(bucket_preds)

    figure = plot_orig_v_bucket_conf(construct_bucket_metadata(bucket_preds), name)
    figure.show()
    return figure

In [32]:
# Show the α-NLI specialized RoBERTa plot and export it as a PDF
# (the path is relative to the current working directory).
anli_roberta_preds = anli_human_bucket_predictions['specialized_roberta']
plot = plot_roberta('α-NLI RoBERTa', anli_roberta_preds)
plot.write_image('anli-human-roberta.pdf')
# Disabled: the equivalent plot for the δ-SNLI specialized RoBERTa model.
#plot_roberta('δ-SNLI RoBERTa', dnli_human_bucket_predictions['snli']['specialized_roberta'])

In [33]:
from annotated_data.annotated_data import anli_human

In [45]:
# Dump every entry stored under 'anli.test.1063', one blank line after each.
for entry in anli_human['anli.test.1063']:
    print(entry, end='\n\n')

Hyp1:Tom persuaded his pal to go out for pizza with him.
Hyp2:Tim's pal desired to fight beside him.

Hyp1:Tom persuaded his friend to join him for pizza.
Hyp2:Tim's friend requested a boxing match.

Hyp1:Tom persuaded his friend to join him for pizza.
Hyp2:Tim's pal desired to fight beside him.

Hyp1:Tom persuaded his pal to go out for pizza with him.
Hyp2:Tim's friend requested a boxing match.

Hyp1:To grab pizza with him, Tom persuaded his pal.
Hyp2:A boxing match with Tim was desired by his pal.



In [38]:
# Dump every entry stored under 'anli.test.1105', one blank line after each.
for entry in anli_human['anli.test.1105']:
    print(entry, end='\n\n')

Hyp1:Sue got preoccupied with something else while the cookies were baking
Hyp2:After she put the cookies in the oven and went to watch a movie, Sue immediately darted from the kitchen when she smelled something burning.

Hyp1:sue was inattentive because she got busy with something else while she left cookies in the oven
Hyp2:sue went to watch a movie after putting the cookies into bake, but smelled a burning smell and left

Hyp1:Sue lost track of time while her cookies were baking.
Hyp2:Sue watched a movie while her cookies baked, smelled a burning smell and left the kitchen.

Hyp1:Sue got distracted while baking.
Hyp2:The cookies baked longer and smelled bad because Sue watched a movie, but she left the kitchen after she smelled something burning.

Hyp1:Sue focused on something else while baking.
Hyp2:Sue left the kitchen after noticing a burning smell and watched a movie.

Hyp1:Sue got distracted while the cookies baked
Hyp2:Sue, after placing the cookies in the oven and settling do

In [39]:
from abductive_data import anli_dataset

In [41]:
# Look up the original example 'anli.test.854' and print it.
original_example = anli_dataset.get_example_by_id('anli.test.854')
print(original_example)

Obs1:Kevin took his two sons to a park in Hawthorne.
Obs2:It was one of his closest friends from high school smoking in his car.
Hyp1:Kevin noticed someone unfamiliar and creepy.
Hyp2:Someone called Kevin's name.


In [37]:
# Dump every entry stored under 'anli.test.854', one blank line after each.
for entry in anli_human['anli.test.854']:
    print(entry, end='\n\n')

Hyp1:Kevin eyed a person that looked a little weird and it made him feel uncomfortable.
Hyp2:"Kevin!", someone shouted at a high volume

Hyp1:Kevin saw a weird-looking person 
Hyp2:Kevin's name was yelled out.

Hyp1:Someone strange and unsettling caught Kevin's attention.
Hyp2:A person exclaimed the name Kevin loudly.

Hyp1:Kevin saw an unfamiliar and unsettling person.
Hyp2:Kevin's name was shouted out.

Hyp1:Kevin observed a person who was strange and unsettling.
Hyp2:Kevin was summoned by name.

Hyp1:Kevin spotted someone who was creepy and unknown to him.
Hyp2:Kevin's name was called out.

Hyp1:A strange and unsettling person caught Kevin's attention.
Hyp2:The name "Kevin" was spoken by someone.

Hyp1:Kevin observed an individual he didn't recognize who made him feel uneasy.
Hyp2:Someone made the utterance "Kevin."

Hyp1:Kevin saw someone who he didn't know and it made him uncomfortable.
Hyp2:Kevin was addressed by an individual.



## FastText BOW

In [46]:
# For every δ-NLI dataset: print the accuracy of the full-input and
# partial-input lexical models, then show the original-vs-bucket confidence
# plot for each of the two models.
for b, dataset_buckets in dnli_human_bucket_predictions.items():
    full_input_preds = dataset_buckets['specialized_full_input_lexical']
    partial_input_preds = dataset_buckets['specialized_partial_input_lexical']

    print(b)
    print(
        'Specialized Lexical Model Accuracy:',
        get_original_example_prediction_accuracy(full_input_preds)
    )
    print(
        'Specialized Lexical Partial Input Model Accuracy:',
        get_original_example_prediction_accuracy(partial_input_preds)
    )
    print()

    metadata = construct_bucket_metadata(full_input_preds)
    plot = plot_orig_v_bucket_conf(metadata, f'δ-{b} Lexical Model')
    plot.show()
    #plot_consistency_cdf(metadata, f'{b} Specialized Defeasible Lexical Consistency CDF')

    metadata = construct_bucket_metadata(partial_input_preds)
    # Fix: this figure used to be built and then dropped. A bare expression
    # inside a loop is not auto-displayed, so it has to be shown explicitly.
    # A separate name keeps `plot` bound to the full-input figure, as before.
    partial_plot = plot_orig_v_bucket_conf(metadata, f'{b} Specialized Defeasible Lexical (Partial Input)')
    partial_plot.show()
    #plot_consistency_cdf(metadata, f'{b} Specialized Defeasible Lexical Consistency CDF')

social
Specialized Lexical Model Accuracy: 0.576
Specialized Lexical Partial Input Model Accuracy: 0.568



snli
Specialized Lexical Model Accuracy: 0.58
Specialized Lexical Partial Input Model Accuracy: 0.596



atomic
Specialized Lexical Model Accuracy: 0.492
Specialized Lexical Partial Input Model Accuracy: 0.528



## GPT-3 Curie

In [47]:
from utils import load_json

In [48]:
# For every δ-NLI dataset: print the GPT-3 Curie accuracy and show its
# original-vs-bucket confidence plot.
for b, dataset_buckets in dnli_human_bucket_predictions.items():
    print(b)
    curie_preds = dataset_buckets['gpt3-curie']
    curie_accuracy = get_original_example_prediction_accuracy(curie_preds)
    print('GPT-3 Curie Accuracy:', curie_accuracy)

    metadata = construct_bucket_metadata(dataset_buckets['gpt3-curie'])
    plot = plot_orig_v_bucket_conf(metadata, f'δ-{b} GPT-3 (Curie)')
    plot.show()

social
GPT-3 Curie Accuracy: 0.556


snli
GPT-3 Curie Accuracy: 0.528


atomic
GPT-3 Curie Accuracy: 0.52
