In [None]:
import plotly.express as px
import os
import sys
import numpy as np

In [None]:
# Put the directory three levels above the working directory on the import
# path (once), so the project-local modules imported below can be resolved.
module_path = os.path.abspath('../../..')
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
from utils import load_json
from defeasible_consistency import dnli_test_set_predictions, dnli_human_bucket_predictions, construct_bucket_metadata
from collections import defaultdict
from data_selection.data_selection_utils import float_floor

In [None]:
def calculate_weighted_consistency(dataname, model):
    """Compute confidence-weighted and plain mean bucket consistency.

    For every test-set prediction, the ``confidence`` entry indexed by the
    prediction's ``label`` is collected and histogrammed into ten equal-width
    bins over [0, 1]. Each bin's share of the test set is then used as the
    weight for the mean ``bucket_consistency`` of the human-bucket rows whose
    ``original_confidence`` falls into that bin.

    As a side effect, a plotly histogram of the test-set confidences is shown.

    Args:
        dataname: Key selecting the dataset in ``dnli_test_set_predictions``
            and ``dnli_human_bucket_predictions``.
        model: Key selecting the model's predictions within that dataset.

    Returns:
        dict with two entries:
            ``'weighted_consistency'``: sum over the bins that contain bucket
            rows of (fraction of test-set confidences in the bin) *
            (mean bucket consistency in the bin).
            ``'mean_consistency'``: unweighted mean of ``bucket_consistency``
            over all rows of the bucket metadata.

    Raises:
        ValueError: If there are no test-set predictions for the requested
            dataset/model, since bin fractions would be undefined.
    """
    num_bins = 10

    test_set_confidences = [
        p['confidence'][p['label']]
        for p in dnli_test_set_predictions[dataname][model]
    ]
    if not test_set_confidences:
        raise ValueError(
            f'No test set predictions available for {dataname!r} / {model!r}'
        )

    bin_counts, _ = np.histogram(
        test_set_confidences, bins=num_bins, density=False, range=[0, 1]
    )
    # Fraction of test-set confidences per bin. Dividing by the full list
    # length means values outside [0, 1] (ignored by np.histogram) still count
    # in the denominator.
    confidence_densities = bin_counts / len(test_set_confidences)

    fig = px.histogram(test_set_confidences)
    # NOTE(review): renderers that honor width/height treat them as pixels, so
    # 5x5 looks unintentionally small -- confirm the intended figure size.
    fig.show(width=5, height=5)

    metadata = construct_bucket_metadata(dnli_human_bucket_predictions[dataname][model])

    # Group bucket consistencies by histogram bin.
    # Assumes float_floor returns the lower decile edge (0.0, 0.1, ..., 1.0) of
    # its argument, as the original indexing by int(10 * decile) implies --
    # TODO confirm against data_selection_utils.
    consistencies_by_bin = defaultdict(list)
    for original_confidence, bucket_consistency in zip(
        metadata['original_confidence'], metadata['bucket_consistency']
    ):
        decile = float_floor(original_confidence)
        # np.histogram's last bin is closed on the right, so a confidence of
        # exactly 1.0 belongs to the final bin; without the clamp it would
        # index one past the end of confidence_densities.
        bin_index = min(int(num_bins * decile), num_bins - 1)
        consistencies_by_bin[bin_index].append(bucket_consistency)

    weighted_consistency = sum(
        confidence_densities[bin_index] * np.mean(bin_consistencies)
        for bin_index, bin_consistencies in consistencies_by_bin.items()
    )

    return {
        'weighted_consistency': weighted_consistency,
        'mean_consistency': np.mean(metadata['bucket_consistency'])
    }
    

In [None]:
# Print the weighted/mean consistency of every dataset for each model family.
# Each entry pairs the heading to print with the model key to evaluate.
model_sections = (
    ('Specialized RoBERTa Models', 'specialized_roberta'),
    ('Unified RoBERTa Model', 'unified_roberta'),
    ('Full Input Lexical Model', 'specialized_full_input_lexical'),
)

for section_index, (heading, model_key) in enumerate(model_sections):
    # A blank line separates consecutive sections (none before the first).
    if section_index:
        print()
    print(heading)
    for dataname in dnli_human_bucket_predictions.keys():
        print(dataname, calculate_weighted_consistency(dataname, model_key))

## RoBERTa

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from defeasible_consistency import (
    dnli_human_bucket_predictions, 
    construct_bucket_metadata, 
    plot_orig_v_bucket_conf, 
    plot_consistency_cdf,
    get_original_example_prediction_accuracy,
)

In [22]:
# For each dataset: print get_original_example_prediction_accuracy for the
# specialized and unified RoBERTa predictions, and call
# plot_orig_v_bucket_conf on the specialized model's bucket metadata.
for b, dataset_buckets in dnli_human_bucket_predictions.items():
    specialized_accuracy = get_original_example_prediction_accuracy(
        dataset_buckets['specialized_roberta']
    )
    print('Specialized Defeasible RoBERTa Model Accuracy:', specialized_accuracy)

    metadata = construct_bucket_metadata(dataset_buckets['specialized_roberta'])
    plot_orig_v_bucket_conf(metadata, f'{b} Specialized Defeasible RoBERTa')
    # Disabled alternatives for the specialized model:
    #   plot_orig_v_bucket_conf_js_consistency(metadata, f'{b} Specialized Defeasible RoBERTa')
    #   plot_consistency_cdf(metadata, f'{b} Specialized Defeasible RoBERTa Consistency CDF')
    #   plot_mean_js_original_v_bucket_mean_distance(metadata, f'{b} Specialized Defeasible RoBERTa Mean JS')

    unified_accuracy = get_original_example_prediction_accuracy(
        dataset_buckets['unified_roberta']
    )
    print('General Defeasible RoBERTa Model Accuracy:', unified_accuracy)

    # The unified model's metadata is still built, but all of its plots are
    # currently disabled:
    #   plot_orig_v_bucket_conf(metadata, f'{b} General Defeasible RoBERTa')
    #   plot_consistency_cdf(metadata, f'{b} General Defeasible RoBERTa Consistency CDF')
    #   plot_mean_js_original_v_bucket_mean_distance(metadata, f'{b} General Defeasible RoBERTa Mean JS')
    metadata = construct_bucket_metadata(dataset_buckets['unified_roberta'])

Specialized Defeasible RoBERTa Model Accuracy: 0.516


General Defeasible RoBERTa Model Accuracy: 0.708
Specialized Defeasible RoBERTa Model Accuracy: 0.512


General Defeasible RoBERTa Model Accuracy: 0.66
Specialized Defeasible RoBERTa Model Accuracy: 0.536


General Defeasible RoBERTa Model Accuracy: 0.656


## FastText BOW

In [None]:
# For each dataset: print get_original_example_prediction_accuracy for the
# full-input and partial-input lexical predictions, then call
# plot_orig_v_bucket_conf on the bucket metadata of each.
for b, dataset_buckets in dnli_human_bucket_predictions.items():
    print(b)

    accuracy_reports = (
        ('Specialized Lexical Model Accuracy:', 'specialized_full_input_lexical'),
        ('Specialized Lexical Partial Input Model Accuracy:', 'specialized_partial_input_lexical'),
    )
    for accuracy_label, model_key in accuracy_reports:
        print(
            accuracy_label,
            get_original_example_prediction_accuracy(dataset_buckets[model_key])
        )
    print()

    plot_specs = (
        ('specialized_full_input_lexical', f'{b} Specialized Defeasible Lexical (Full Input)'),
        ('specialized_partial_input_lexical', f'{b} Specialized Defeasible Lexical (Partial Input)'),
    )
    for model_key, plot_title in plot_specs:
        metadata = construct_bucket_metadata(dataset_buckets[model_key])
        plot_orig_v_bucket_conf(metadata, plot_title)
        # Disabled alternatives for either model:
        #   plot_mean_js_original_v_bucket_mean_distance(metadata, plot_title)
        #   plot_consistency_cdf(metadata, f'{b} Specialized Defeasible Lexical Consistency CDF')

## GPT-3 Curie

In [None]:
from utils import load_json

In [None]:
# For each dataset: print get_original_example_prediction_accuracy for the
# GPT-3 Curie predictions and call plot_orig_v_bucket_conf on their bucket
# metadata.
for b, dataset_buckets in dnli_human_bucket_predictions.items():
    print(b)

    gpt3_accuracy = get_original_example_prediction_accuracy(dataset_buckets['gpt3-curie'])
    print('GPT-3 Curie Accuracy:', gpt3_accuracy)

    metadata = construct_bucket_metadata(dataset_buckets['gpt3-curie'])
    plot_title = f'{b} GPT-3 Curie'
    plot_orig_v_bucket_conf(metadata, plot_title)
    # Disabled alternatives:
    #   plot_mean_js_original_v_bucket_mean_distance(metadata, plot_title)
    #   plot_consistency_cdf(metadata, plot_title)