In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import plotly.express as px
import os
import sys
import numpy as np

In [3]:
# Resolve the directory two levels above the current working directory and
# make sure it is on the import path so modules located there can be imported.
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [4]:
from utils import load_json
from collections import defaultdict
from data_selection.data_selection_utils import float_floor

from consistency import (
    dnli_human_bucket_predictions,
    dnli_test_set_predictions,
    anli_human_bucket_predictions,
    anli_test_set_predictions,
    construct_bucket_metadata, 
    plot_orig_v_bucket_conf, 
    plot_consistency_cdf,
    get_original_example_prediction_accuracy,
    calculate_weighted_consistency
)

In [5]:
def get_consistencies(model_name):
    """Print consistency metrics for ``model_name`` on every prediction table.

    For each dataset key in ``dnli_human_bucket_predictions`` the model's
    human-bucket predictions and test-set predictions are looked up and passed
    to ``calculate_weighted_consistency``; the returned value is printed next
    to the dataset name, followed by a blank line. A model without an entry
    for a dataset is passed on as ``None``.

    After a separator line the same computation is printed for the ``anli_*``
    tables, but only when the model has an entry in *both* of them.

    Args:
        model_name: Key identifying the model in the prediction tables,
            e.g. ``'specialized_roberta'``.

    Returns:
        None. Everything is written to stdout.
    """
    for dataname, human_by_model in dnli_human_bucket_predictions.items():
        test_by_model = dnli_test_set_predictions[dataname]

        # ``.get`` yields None when the model has no predictions for this
        # dataset, which is what gets forwarded in that case.
        print(dataname, calculate_weighted_consistency(
            paraphrase_predictions=human_by_model.get(model_name),
            test_set_predictions=test_by_model.get(model_name),
            show_test_distribution=False,
        ))
        print()

    print('####### anli #######')

    # Require the model in both tables. Testing only the truthiness of the
    # test-set keys would let a model that exists solely in the human-bucket
    # table through and fail with a KeyError on the lookup below.
    if model_name in anli_human_bucket_predictions and model_name in anli_test_set_predictions:
        print('anli', calculate_weighted_consistency(
            anli_human_bucket_predictions[model_name],
            anli_test_set_predictions[model_name],
            show_test_distribution=False,
        ))

In [6]:
# Print the consistency metrics for the 'specialized_roberta' predictions.
get_consistencies('specialized_roberta')

snli {'mean_consistency': 0.7052253968253969, 'weighted_consistency': 0.9064806546026608}

atomic {'mean_consistency': 0.7478507936507937, 'weighted_consistency': 0.8616549681381606}

social {'mean_consistency': 0.6592190476190476, 'weighted_consistency': 0.9087363234357935}

####### anli #######
anli {'mean_consistency': 0.6805238095238096, 'weighted_consistency': 0.8744013294599748}


In [7]:
# Print the consistency metrics for the 'unified_roberta' predictions.
get_consistencies('unified_roberta')

snli {'mean_consistency': 0.7409476190476191, 'weighted_consistency': 0.8816991207553786}

atomic {'mean_consistency': 0.7947984126984127, 'weighted_consistency': 0.873993649318216}

social {'mean_consistency': 0.7110761904761905, 'weighted_consistency': 0.8669404287513931}

####### anli #######


In [None]:
# Print the consistency metrics for the 'specialized_full_input_lexical' predictions.
get_consistencies('specialized_full_input_lexical')

In [None]:
# Print the consistency metrics for the 'gpt3-curie' predictions.
get_consistencies('gpt3-curie')

## Specialized RoBERTa

In [None]:
def plot_roberta(name, bucket_preds):
    """Build the figure for one set of bucket predictions.

    Args:
        name: Title string forwarded to ``plot_orig_v_bucket_conf``.
        bucket_preds: Bucket predictions for a single model/dataset pair, as
            stored in the ``*_human_bucket_predictions`` tables.

    Returns:
        Whatever ``plot_orig_v_bucket_conf`` returns for the metadata derived
        from ``bucket_preds``.
    """
    # The accuracy helper is invoked first; its return value is not used here.
    get_original_example_prediction_accuracy(bucket_preds)
    return plot_orig_v_bucket_conf(construct_bucket_metadata(bucket_preds), name)

In [None]:
# Render and save the 'specialized_roberta' figure for α-NLI, then one per
# δ-NLI dataset. Predictions are looked up right before each plot is drawn.
plot = plot_roberta('α-NLI RoBERTa', anli_human_bucket_predictions['specialized_roberta'])
plot.write_image('anli-human-roberta.pdf')

for dataset_key, figure_title, pdf_path in (
    ('snli', 'δ-SNLI RoBERTa', 'snli-human-roberta.pdf'),
    ('social', 'δ-SOCIAL RoBERTa', 'social-human-roberta.pdf'),
    ('atomic', 'δ-ATOMIC RoBERTa', 'atomic-human-roberta.pdf'),
):
    plot = plot_roberta(
        figure_title,
        dnli_human_bucket_predictions[dataset_key]['specialized_roberta'],
    )
    plot.write_image(pdf_path)

In [None]:
from annotated_data.annotated_data import anli_human

In [None]:
# Print every entry stored under 'anli.test.1063', each followed by a blank line.
for entry in anli_human['anli.test.1063']:
    print(entry, end='\n\n')

In [None]:
# Print every entry stored under 'anli.test.1105', each followed by a blank line.
for entry in anli_human['anli.test.1105']:
    print(entry, end='\n\n')

In [None]:
from abductive_data import anli_dataset

In [None]:
# Look up the example with id 'anli.test.854' in anli_dataset and print it.
print(anli_dataset.get_example_by_id('anli.test.854'))

In [None]:
# Print every entry stored under 'anli.test.854', each followed by a blank line.
for entry in anli_human['anli.test.854']:
    print(entry, end='\n\n')

## FastText BOW

In [None]:
# For every δ-NLI dataset: print the values returned by the accuracy helper
# for the full-input and partial-input lexical models, then draw, show and
# save the figure for the full-input model.
# NOTE(review): the title uses the raw dataset key (e.g. 'snli'), whereas the
# RoBERTa figures use upper-cased names such as 'δ-SNLI' — confirm intended.
for dataset_name, predictions_by_model in dnli_human_bucket_predictions.items():
    print(dataset_name)

    full_input_accuracy = get_original_example_prediction_accuracy(
        predictions_by_model['specialized_full_input_lexical']
    )
    print('Specialized Lexical Model Accuracy:', full_input_accuracy)

    partial_input_accuracy = get_original_example_prediction_accuracy(
        predictions_by_model['specialized_partial_input_lexical']
    )
    print('Specialized Lexical Partial Input Model Accuracy:', partial_input_accuracy)
    print()

    metadata = construct_bucket_metadata(predictions_by_model['specialized_full_input_lexical'])
    plot = plot_orig_v_bucket_conf(metadata, f'δ-{dataset_name} Lexical Model')
    plot.show()
    plot.write_image(f'{dataset_name}-human-lexical.pdf')

    # Not produced here (previously commented out): the consistency CDF via
    # plot_consistency_cdf and the corresponding figure for
    # 'specialized_partial_input_lexical'.

## GPT-3 Curie

In [None]:
from utils import load_json

In [None]:
# For every δ-NLI dataset: print the value returned by the accuracy helper for
# the 'gpt3-curie' predictions, then draw, show and save the matching figure.
for dataset_name, predictions_by_model in dnli_human_bucket_predictions.items():
    print(dataset_name)

    curie_accuracy = get_original_example_prediction_accuracy(
        predictions_by_model['gpt3-curie']
    )
    print('GPT-3 Curie Accuracy:', curie_accuracy)

    metadata = construct_bucket_metadata(predictions_by_model['gpt3-curie'])
    plot = plot_orig_v_bucket_conf(metadata, f'δ-{dataset_name} GPT-3 (Curie)')
    plot.show()
    plot.write_image(f'{dataset_name}-human-gpt-3-curie.pdf')