# Import

In [54]:
%matplotlib widget

In [55]:
import os
import pickle5 as pickle
import copy

import pandas as pd
import seaborn as sns
import numpy
import torch
import scipy
import scipy.stats

import pyro
import pyro.infer
import pyro.infer.mcmc
import pyro.distributions as dist
import torch.distributions.constraints as constraints
from tqdm.auto import tqdm

import matplotlib.pyplot as plot
import matplotlib.gridspec as gridspec
from matplotlib.lines import Line2D
import warnings
warnings.filterwarnings('ignore')
sns.set(style="whitegrid")

In [56]:
repo = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))

# Define

In [57]:
def sigmoid(x):
    return 1./(1.+torch.exp(-x))

def icc_best_deriv(alpha, beta, theta, model_names, gamma=None, col='mean'):
    '''
    Method to calculate the locally estimated headroom (LEH) score, defined as
    the derivative of the item characteristic curve w.r.t. the best performing model.
    
    Args:
        alpha:       DataFrame of discrimination parameter statistics for each item.
        beta:        DataFrame of difficulty parameter statistics for each item.
        theta:       DataFrame of ability parameter statistics for each responder.
        model_names: List of responder names.
        gamma:       DataFrame of guessing parameter statistics for each item.
        col:         DataFrame column name to use for calculating LEH scores.
    
    Returns:
        scores:      LEH scores for each item.    
    '''
    best_idx, best_value = theta[col].argmax(), theta[col].max()
    print(f'Best model: {model_names[best_idx]}\n{best_value}')
    
    a, b = torch.tensor(alpha[col].values), torch.tensor(beta[col].values)
    
    logits = (a*(best_value-b))
    sigmoids = sigmoid(logits)
    scores = sigmoids*(1.-sigmoids)*a
    
    print(f'No gamma: {scores.mean()}')
    if not gamma is None:
        g = torch.tensor(gamma[col].apply(lambda x: x.item()).values)
        scores = (1.-g)*scores
        print(f'With gamma: {scores.mean()}')
    
    return scores      
    
    

In [58]:
def get_model_guide(alpha_dist, theta_dist, alpha_transform, theta_transform):
    model = lambda obs: irt_model(obs, alpha_dist, theta_dist, alpha_transform = alpha_transform, theta_transform = theta_transform)
    guide = lambda obs: vi_posterior(obs, alpha_dist, theta_dist)
    
    return model, guide

In [59]:
def get_data_accuracies(data, verbose = False, get_cols = False):
    '''
    Method to reformat `data` and calculate item and responder accuracies.
    
    Args:
        data:                DataFrame of item responses.
        verbose:             Boolean value of whether to print statements.
        get_cols:            Boolean value of whether to return original column
                             values of `data`.
        
    Returns:
        new_data:            Reformatted `data`, dropping first column.
        accuracies:          Accuracy for each responder across examples.
        example_accuracies:  Accuracy for each example across responders.
        data.columns.values: Returns only if `get_cols` is True. Original column
                             values of `data`.
    '''
    new_data = numpy.array(data)
    new_data = new_data[:,1:]
    
    model_names = dict(data['userid'])
    accuracies = new_data.mean(-1)
    example_accuracies = new_data.mean(0)
    
    if verbose:
        print('\n'.join([f'{name}: {acc}' for name, acc in zip(model_names.values(),accuracies)]))
    
    if get_cols:
        return new_data, accuracies, example_accuracies, data.columns.values
    else:
        return new_data, accuracies, example_accuracies

In [60]:
def get_stats_CI(params, p=0.95, dist='normal'):
    '''
    Method to calculate lower and upper quantiles defined by `p`, mean, and variance of `param`
    
    Args:
        params: Dictionary of distribution parameters for each item keyed according to the 
                parametric distribution defined by `dist`.
        p:      Percent of distribution covered by the lower and upper interval values for each
                parameter.
        dist:   Name of parametric distribution
    
    Returns:
        return: {
            'lower': Lower interval values of each parameter,
            'upper': Upper interval values of each parameter,
            'mean' : Mean of each parameter,
            'var'  : Variance of each parameter
        }
    '''
    stats = {}
    if dist == 'normal':
        L,U = scipy.stats.norm.interval(p,loc=params['mu'], scale=torch.exp(params['logstd']))
        M,V = scipy.stats.norm.stats(loc=params['mu'], scale=torch.exp(params['logstd']))
    elif dist == 'log-normal':
        L,U = scipy.stats.lognorm.interval(p, s=torch.exp(params['logstd']), scale=torch.exp(params['mu']))
        M,V = scipy.stats.lognorm.stats(s=torch.exp(params['logstd']), scale=torch.exp(params['mu']))
    elif dist == 'beta':
        L,U = scipy.stats.beta.interval(p,a=params['alpha'], b=params['beta'])
        M,V = scipy.stats.beta.stats(a=params['alpha'], b=params['beta'])
    else:
        raise TypeError(f'Distribution type {dist} not supported.')
    
    return {
        'lower':[L],
        'upper':[U],
        'mean':[M],
        'var':[V],
    }

In [61]:
def get_plot_stats(exp_dir, alpha_dist, theta_dist, transforms, p = 0.95):
    '''
    Method to return plotting statistics for 3 parameter IRT model parameters.
    
    Args:
        exp_dir:          Path to 3 parameter IRT parameters and responses.
        alpha_dist:       Name of the item discrimination [a] distribution.
        theta_dist:       Name of the responder ability [t] distribution.
        transforms:       Dictionary of transformations to apply to each parameter type
                          where keys are parameter names and values are functions.
        p:                Percent of distribution covered by the lower and upper interval 
                          values for each parameter.
    
    Returns:
        param_plot_stats: Dictionary of parameter plot statistics where keys are parameter
                          names and values are plot statistics dictionaries as defined by
                          get_stats_CI().
    '''
    param_dists = {
        'a':alpha_dist,
        'b':'normal',
        'g':'normal',
        't':theta_dist,
    }

    dist_params = {
        'normal':['mu', 'logstd'],
        'log-normal':['mu', 'logstd'],
        'beta':['alpha', 'beta'],
    }

    pyro.clear_param_store()
    pyro.get_param_store().load(os.path.join(exp_dir, 'params.p'))

    with torch.no_grad():
        pyro_param_dict = dict(pyro.get_param_store().named_parameters())
    
    # get stats for plotting
    param_plot_stats = {}

    for param, param_dist in param_dists.items():
        temp_params = dist_params[param_dist]

        for idx, (p1_orig, p2_orig) in enumerate(zip(pyro_param_dict[f'{param} {temp_params[0]}'], pyro_param_dict[f'{param} {temp_params[1]}'])):
            p1, p2 = p1_orig.detach(), p2_orig.detach()
            
            temp_stats_df = pd.DataFrame.from_dict(
                get_stats_CI(
                    params = {
                        temp_params[0]:p1,
                        temp_params[1]:p2,
                    },
                    p=p,
                    dist = param_dist,
                )
            )
            
            temp_stats_df = temp_stats_df.applymap(transforms[param])
        
            if idx == 0:
                param_plot_stats[param] = temp_stats_df
            else:
                param_plot_stats[param] = param_plot_stats[param].append(temp_stats_df, ignore_index = True)
    
    return param_plot_stats

In [62]:
def sign_mult(df1, df2):
    newdf = copy.deepcopy(df2)
    
    for idx, row in df1.iterrows():
        if numpy.sign(row['mean']) < 0:
            newdf.loc[idx,'mean'] = -1*newdf.loc[idx,'mean']
            newdf.loc[idx,'lower'] = -1*newdf.loc[idx,'upper']
            newdf.loc[idx,'upper'] = -1*newdf.loc[idx,'lower']
    
    return newdf

In [63]:
def get_diff_by_set(diffs, item_ids):
    diff_by_set = {}
    id_split = '_'

    max_diff = -1e6
    min_diff = 1e6
    
    for idx, diff in enumerate(diffs):
        set_name = item_ids[idx].split(id_split)[0]

        if set_name in diff_by_set.keys():
            diff_by_set[set_name].append(diff)
        else:
            diff_by_set[set_name] = [diff]
            
        if diff < min_diff:
            min_diff = diff
            
        if diff > max_diff:
            max_diff = diff
    
    return diff_by_set, min_diff, max_diff

# Load Trimmed

## Get Tasks

In [64]:
from variational_irt import *

In [65]:
datasets="boolq,cb,commonsenseqa,copa,cosmosqa,hellaswag,adversarial-nli,rte,snli,wic,qamr,arct,mcscript,mctaco,mutual,mutual-plus,quoref,socialiqa,squad-v2,wsc,mnli,mrqa-nq,newsqa,abductive-nli,arc-easy,arc-challenge,piqa,quail,winogrande,anli"
data_names, responses, n_items = get_files(
    os.path.join(repo, 'data_trimmed_item'),
    "csv",
    set(datasets.split(','))
)

In [66]:
task_metadata = pd.read_csv('task_metadata.csv')
task_metadata.set_index("jiant_name", inplace=True)
task_list = [x for x in task_metadata.index if x in data_names]

In [67]:
total = 0
task_name = []
task_format = []

for tname, size in zip(data_names, n_items):
    name = task_metadata.loc[tname]['taskname']
    total += size
    task_name += [name for _ in range(size)]
    task_format += [task_metadata.loc[tname]['format'] for _ in range(size)]
    
task_name = pd.DataFrame(task_name, columns=['task_name'])
task_format = pd.DataFrame(task_format, columns=['format'])
task_name_format_trimmed = pd.concat([task_name, task_format], axis=1)

In [68]:
len(data_names)

28

## Get Params and Order

In [69]:
exp_dir = os.path.join(repo, 'params_trimmed_item', f'alpha-lognormal-identity_theta-normal-identity_nosubsample_1.00_0.30')
p = 0.95

with open(os.path.join(exp_dir, 'responses.p'), 'rb') as f:
    combined_responses = pickle.load(f).reset_index()

In [70]:
# Check accuracy of roberta-large models

extractmodel = 'roberta-large_best'
tie_break = 0

acc_by_dataset = {}

roberta_rp = combined_responses.loc[combined_responses['userid']==extractmodel, :]
if roberta_rp.shape[0] > 1:
    roberta_rp = roberta_rp.iloc[tie_break, :]

cols = combined_responses.columns.values

for item in cols[1:]:
    data_name = '_'.join(item.split('_')[:-1])
    resp = roberta_rp[item].item()
    
    if data_name in acc_by_dataset:
        acc_by_dataset[data_name]['correct'] += resp
        acc_by_dataset[data_name]['total'] += 1
    else:
        acc_by_dataset[data_name] = {'correct': resp, 'total': 1}

print(extractmodel)
print('='*90)
print(f'Overall acc: {roberta_rp.iloc[0, 1:].sum()/(roberta_rp.shape[1]-1):.4f}')        

for data_name, acc_dict in acc_by_dataset.items():
    print(f'{data_name} acc: {acc_dict["correct"]/acc_dict["total"]:.4f}')

roberta-large_best
Overall acc: 0.7661
abductive_nli acc: 0.8564
adversarial_nli acc: 0.4995
arc_challenge acc: 0.3319
arc_easy acc: 0.6299
arct acc: 0.8604
boolq acc: 0.8217
cb acc: 0.8571
commonsenseqa acc: 0.6759
copa acc: 0.8400
cosmosqa acc: 0.8000
hellaswag acc: 0.8420
mcscript acc: 0.9183
mctaco acc: 0.6010
mnli acc: 0.8995
mrqa_natural_questions acc: 0.7489
mutual_plus acc: 0.7314
mutual acc: 0.8668
newsqa acc: 0.6608
piqa acc: 0.7617
qamr acc: 0.7944
quail acc: 0.6691
quoref acc: 0.8241
rte acc: 0.8345
snli acc: 0.9192
socialiqa acc: 0.7738
squad_v2 acc: 0.4337
wic acc: 0.7085
winogrande acc: 0.7697
wsc acc: 0.6154


In [71]:
# set to False if run for the first time
# note that this will take sometimes to run if the datasets are big
load_from_cache = True

In [72]:
# distribution and transformation
alpha_dist = 'log-normal'
alpha_transf = 'standard'
theta_dist = 'normal'
theta_transf = 'standard'

exp_dir = os.path.join(repo, 'params_trimmed_item', f'alpha-lognormal-identity_theta-normal-identity_nosubsample_1.00_0.30')
p = 0.95

with open(os.path.join(exp_dir, 'responses.p'), 'rb') as f:
    combined_responses = pickle.load(f).reset_index()
data, accuracies, example_accuracies = get_data_accuracies(combined_responses)
column_names = combined_responses.columns[1:]
select_ts = {
    'standard':lambda x:x,
    'positive':lambda x:torch.log(1+torch.exp(torch.tensor(x))),
    'sigmoid':lambda x:sigmoid(torch.tensor(x)),
}

transforms = {
    'a':select_ts[alpha_transf],
    'b':select_ts['standard'],
    'g':select_ts['sigmoid'],
    't':select_ts[theta_transf],
}

if load_from_cache:
    param_plot_stats_trimmed = {}

    for key in transforms.keys():
        with open(os.path.join('plot_stats_pickles_trimmed_item', f'{key}.p'), 'rb') as f:
            param_plot_stats_trimmed[key] = pickle.load(f)
else:
    param_plot_stats = get_plot_stats(
        exp_dir,
        alpha_dist,
        theta_dist,
        transforms,
        p = 0.95
    )
    
    os.makedirs('plot_stats_pickles_trimmed_item', exist_ok=True)
    for key, value in param_plot_stats.items():
        with open(os.path.join('plot_stats_pickles_trimmed_item', f'{key}.p'), 'wb') as f:
            pickle.dump(value, f)

In [73]:
combined_responses

Unnamed: 0,userid,abductive_nli_0,abductive_nli_1,abductive_nli_2,abductive_nli_3,abductive_nli_4,abductive_nli_5,abductive_nli_6,abductive_nli_7,abductive_nli_8,...,wsc_42,wsc_43,wsc_44,wsc_45,wsc_46,wsc_47,wsc_48,wsc_49,wsc_50,wsc_51
0,roberta-base-10M-1_best,1,0,1,1,1,1,1,0,1,...,1,0,1,1,0,1,0,1,0,1
1,roberta-base-10M-1_1,0,0,0,1,1,0,1,0,1,...,1,0,1,0,0,1,0,0,0,1
2,roberta-base-10M-1_25,1,1,1,1,1,1,1,1,1,...,0,0,0,1,0,1,1,1,1,1
3,roberta-base-10M-1_50,1,0,1,1,1,1,1,0,1,...,1,0,0,1,0,1,1,1,0,1
4,roberta-base-10M-1_10,1,0,1,1,1,1,1,0,1,...,1,0,0,1,0,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,xlm-roberta-large_best,1,1,0,1,1,1,1,1,1,...,1,1,0,1,0,1,1,1,0,0
86,xlm-roberta-large_1,1,1,0,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,1,1
87,xlm-roberta-large_25,1,1,0,1,1,1,0,1,1,...,1,1,0,1,0,1,1,1,0,0
88,xlm-roberta-large_50,1,1,1,1,1,1,1,1,1,...,1,1,0,1,0,1,1,1,0,0


In [74]:
model_names = []
model_levels = []
for m in combined_responses['userid']:
    mname = m.split('_')[0]
    mlevel = m.split('_')[-1]
    if mname.endswith('-1') or mname.endswith('-2') or mname.endswith('-3'):
        mname = mname[:-2]
    model_names.append(mname)
    
    mlevel_append = '' if mlevel == 'best' else r'%'
    model_levels.append(mlevel+mlevel_append)

In [75]:
#  we will only use log mean for discriminative parameter
for param_key, param_stat in param_plot_stats_trimmed.items():
    param_stat['log_mean'] = numpy.log(param_stat['mean'])
    print(param_key, param_stat['log_mean'].isnull().sum())

a 0
b 35641
g 0
t 47


In [76]:
param_a = pd.concat([param_plot_stats_trimmed['a'], task_name_format_trimmed], axis=1)
param_b = pd.concat([param_plot_stats_trimmed['b'], task_name_format_trimmed], axis=1)

task_order = [task_metadata.loc[x]['taskname'] for x in task_list]

In [77]:
leh_scores = icc_best_deriv(
    param_plot_stats_trimmed['a'],
    param_plot_stats_trimmed['b'],
    param_plot_stats_trimmed['t'],
    model_names,
    gamma = param_plot_stats_trimmed['g'],
)

leh_scores_plot = pd.DataFrame(pd.Series(leh_scores), columns = ['mean'])
print(leh_scores_plot)

Best model: albert-xxlarge-v2
1.7490668296813965
No gamma: 0.14549648163855539
With gamma: 0.11648878309933183
           mean
0      0.156178
1      0.208147
2      0.069049
3      0.014967
4      0.102332
...         ...
79343  0.050886
79344  0.110832
79345  0.069405
79346  0.167813
79347  0.101622

[79348 rows x 1 columns]


In [78]:
leh_scores_plot_trimmed = pd.concat([leh_scores_plot, task_name_format_trimmed], axis=1)

In [79]:
task_metadata.set_index("taskname", inplace=True)

# Load Full

## Get Tasks

In [80]:
from variational_irt import *

In [81]:
datasets="boolq,cb,commonsenseqa,copa,cosmosqa,hellaswag,adversarial-nli,rte,snli,wic,qamr,arct,mcscript,mctaco,mutual,mutual-plus,quoref,socialiqa,squad-v2,wsc,mnli,mrqa-nq,newsqa,abductive-nli,arc-easy,arc-challenge,piqa,quail,winogrande,anli"
data_names, responses, n_items = get_files(
    os.path.join(repo, 'data'),
    "csv",
    set(datasets.split(','))
)

In [82]:
task_metadata = pd.read_csv('task_metadata.csv')
task_metadata.set_index("jiant_name", inplace=True)
task_list = [x for x in task_metadata.index if x in data_names]

In [83]:
total = 0
task_name = []
task_format = []

for tname, size in zip(data_names, n_items):
    name = task_metadata.loc[tname]['taskname']
    total += size
    task_name += [name for _ in range(size)]
    task_format += [task_metadata.loc[tname]['format'] for _ in range(size)]
    
task_name = pd.DataFrame(task_name, columns=['task_name'])
task_format = pd.DataFrame(task_format, columns=['format'])
task_name_format = pd.concat([task_name, task_format], axis=1)

In [84]:
len(data_names)

28

## Get Params and Order

In [85]:
exp_dir = os.path.join(repo, 'params_trimmed_item', f'alpha-lognormal-identity_theta-normal-identity_nosubsample_1.00_0.30')
p = 0.95

with open(os.path.join(exp_dir, 'responses.p'), 'rb') as f:
    combined_responses = pickle.load(f).reset_index()

In [86]:
# Check accuracy of roberta-large models

extractmodel = 'roberta-large_best'
tie_break = 0

acc_by_dataset = {}

roberta_rp = combined_responses.loc[combined_responses['userid']==extractmodel, :]
if roberta_rp.shape[0] > 1:
    roberta_rp = roberta_rp.iloc[tie_break, :]

cols = combined_responses.columns.values

for item in cols[1:]:
    data_name = '_'.join(item.split('_')[:-1])
    resp = roberta_rp[item].item()
    
    if data_name in acc_by_dataset:
        acc_by_dataset[data_name]['correct'] += resp
        acc_by_dataset[data_name]['total'] += 1
    else:
        acc_by_dataset[data_name] = {'correct': resp, 'total': 1}

print(extractmodel)
print('='*90)
print(f'Overall acc: {roberta_rp.iloc[0, 1:].sum()/(roberta_rp.shape[1]-1):.4f}')        

for data_name, acc_dict in acc_by_dataset.items():
    print(f'{data_name} acc: {acc_dict["correct"]/acc_dict["total"]:.4f}')

roberta-large_best
Overall acc: 0.7661
abductive_nli acc: 0.8564
adversarial_nli acc: 0.4995
arc_challenge acc: 0.3319
arc_easy acc: 0.6299
arct acc: 0.8604
boolq acc: 0.8217
cb acc: 0.8571
commonsenseqa acc: 0.6759
copa acc: 0.8400
cosmosqa acc: 0.8000
hellaswag acc: 0.8420
mcscript acc: 0.9183
mctaco acc: 0.6010
mnli acc: 0.8995
mrqa_natural_questions acc: 0.7489
mutual_plus acc: 0.7314
mutual acc: 0.8668
newsqa acc: 0.6608
piqa acc: 0.7617
qamr acc: 0.7944
quail acc: 0.6691
quoref acc: 0.8241
rte acc: 0.8345
snli acc: 0.9192
socialiqa acc: 0.7738
squad_v2 acc: 0.4337
wic acc: 0.7085
winogrande acc: 0.7697
wsc acc: 0.6154


In [87]:
# set to False if run for the first time
# note that this will take sometimes to run if the datasets are big
load_from_cache = True

In [88]:
# distribution and transformation
alpha_dist = 'log-normal'
alpha_transf = 'standard'
theta_dist = 'normal'
theta_transf = 'standard'

exp_dir = os.path.join(repo, 'params', f'alpha-lognormal-identity_theta-normal-identity_nosubsample_1.00_0.30')
p = 0.95

with open(os.path.join(exp_dir, 'responses.p'), 'rb') as f:
    combined_responses = pickle.load(f).reset_index()
data, accuracies, example_accuracies = get_data_accuracies(combined_responses)
column_names = combined_responses.columns[1:]
select_ts = {
    'standard':lambda x:x,
    'positive':lambda x:torch.log(1+torch.exp(torch.tensor(x))),
    'sigmoid':lambda x:sigmoid(torch.tensor(x)),
}

transforms = {
    'a':select_ts[alpha_transf],
    'b':select_ts['standard'],
    'g':select_ts['sigmoid'],
    't':select_ts[theta_transf],
}

if load_from_cache:
    param_plot_stats = {}

    for key in transforms.keys():
        with open(os.path.join('plot_stats_pickles', f'{key}.p'), 'rb') as f:
            param_plot_stats[key] = pickle.load(f)
else:
    param_plot_stats = get_plot_stats(
        exp_dir,
        alpha_dist,
        theta_dist,
        transforms,
        p = 0.95
    )
    
    os.makedirs('plot_stats_pickles', exist_ok=True)
    for key, value in param_plot_stats.items():
        with open(os.path.join('plot_stats_pickles', f'{key}.p'), 'wb') as f:
            pickle.dump(value, f)

In [89]:
combined_responses

Unnamed: 0,userid,abductive_nli_0,abductive_nli_1,abductive_nli_2,abductive_nli_3,abductive_nli_4,abductive_nli_5,abductive_nli_6,abductive_nli_7,abductive_nli_8,...,wsc_42,wsc_43,wsc_44,wsc_45,wsc_46,wsc_47,wsc_48,wsc_49,wsc_50,wsc_51
0,roberta-base-10M-1_best,1,0,1,1,1,1,1,0,1,...,1,0,1,1,0,1,0,1,0,1
1,roberta-base-10M-1_1,0,0,0,1,1,0,1,0,1,...,1,0,1,0,0,1,0,0,0,1
2,roberta-base-10M-1_25,1,1,1,1,1,1,1,1,1,...,0,0,0,1,0,1,1,1,1,1
3,roberta-base-10M-1_50,1,0,1,1,1,1,1,0,1,...,1,0,0,1,0,1,1,1,0,1
4,roberta-base-10M-1_10,1,0,1,1,1,1,1,0,1,...,1,0,0,1,0,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,xlm-roberta-large_best,1,1,0,1,1,1,1,1,1,...,1,1,0,1,0,1,1,1,0,0
86,xlm-roberta-large_1,1,1,0,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,1,1
87,xlm-roberta-large_25,1,1,0,1,1,1,0,1,1,...,1,1,0,1,0,1,1,1,0,0
88,xlm-roberta-large_50,1,1,1,1,1,1,1,1,1,...,1,1,0,1,0,1,1,1,0,0


In [90]:
model_names = []
model_levels = []
for m in combined_responses['userid']:
    mname = m.split('_')[0]
    mlevel = m.split('_')[-1]
    if mname.endswith('-1') or mname.endswith('-2') or mname.endswith('-3'):
        mname = mname[:-2]
    model_names.append(mname)
    
    mlevel_append = '' if mlevel == 'best' else r'%'
    model_levels.append(mlevel+mlevel_append)

In [91]:
#  we will only use log mean for discriminative parameter
for param_key, param_stat in param_plot_stats.items():
    param_stat['log_mean'] = numpy.log(param_stat['mean'])
    print(param_key, param_stat['log_mean'].isnull().sum())

a 0
b 35716
g 0
t 47


In [92]:
param_a = pd.concat([param_plot_stats['a'], task_name_format], axis=1)
param_b = pd.concat([param_plot_stats['b'], task_name_format], axis=1)

task_order = [task_metadata.loc[x]['taskname'] for x in task_list]

In [93]:
leh_scores = icc_best_deriv(
    param_plot_stats['a'],
    param_plot_stats['b'],
    param_plot_stats['t'],
    model_names,
    gamma = param_plot_stats['g'],
)

leh_scores_plot = pd.DataFrame(pd.Series(leh_scores), columns = ['mean'])
print(leh_scores_plot)

Best model: albert-xxlarge-v2
1.6105766296386719
No gamma: 0.14762448462619243
With gamma: 0.1185372240065298
           mean
0      0.171294
1      0.230789
2      0.064061
3      0.015200
4      0.090730
...         ...
82751  0.059779
82752  0.141603
82753  0.091485
82754  0.128162
82755  0.097277

[82756 rows x 1 columns]


In [94]:
leh_scores_plot = pd.concat([leh_scores_plot, task_name_format], axis=1)

In [95]:
task_metadata.set_index("taskname", inplace=True)

# Compare

## LEH

In [129]:
trimmed = leh_scores_plot_trimmed.groupby(by='task_name').quantile(q=0.75).rename(columns={'mean':'Trimmed'})
trimmed

Unnamed: 0_level_0,Trimmed
task_name,Unnamed: 1_level_1
ANLI,0.210905
ARC-C,0.231316
ARC-E,0.220743
ARCT,0.144571
AbductNLI,0.184625
BoolQ,0.141349
CB,0.108294
COPA,0.181559
CSQA,0.23525
CosmosQA,0.205929


In [130]:
full = leh_scores_plot.groupby(by='task_name').quantile(q=0.75).rename(columns={'mean':'Full'})
full

Unnamed: 0_level_0,Full
task_name,Unnamed: 1_level_1
ANLI,0.202093
ARC-C,0.224374
ARC-E,0.221798
ARCT,0.146663
AbductNLI,0.180351
BoolQ,0.126232
CB,0.102626
COPA,0.173248
CSQA,0.239099
CosmosQA,0.204523


In [131]:
combined = pd.concat([trimmed, full], axis=1)
combined['diff'] = combined['Trimmed'] - combined['Full']
combined['rel_diff'] = (combined['Trimmed'] - combined['Full'])/combined['Full']

print('median diff (magnitude)', combined['diff'].abs().median())
print('standard deviation diff (magnitude)', combined['diff'].abs().std())
print('median rel diff (magnitude)', combined['rel_diff'].abs().median())

median diff (magnitude) 0.006144598435523396
standard deviation diff (magnitude) 0.004611184132519173
median rel diff (magnitude) 0.039637497671869554


In [132]:
combined.loc[:,['Trimmed', 'Full', 'diff', 'rel_diff']].sort_values(by='diff')

Unnamed: 0_level_0,Trimmed,Full,diff,rel_diff
task_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RTE,0.165325,0.180724,-0.015399,-0.085208
WiC,0.2453,0.255731,-0.010431,-0.040788
Winogrande,0.200612,0.210921,-0.010309,-0.048874
MNLI,0.114458,0.12085,-0.006393,-0.052899
MCScript,0.129385,0.13548,-0.006095,-0.04499
SNLI,0.045599,0.050806,-0.005207,-0.102493
CSQA,0.23525,0.239099,-0.003849,-0.016097
HellaSwag,0.256265,0.259556,-0.003291,-0.012679
ARCT,0.144571,0.146663,-0.002092,-0.014264
ARC-E,0.220743,0.221798,-0.001055,-0.004756


In [133]:
combined.loc[:,['Trimmed','Full']].corr(method='pearson')

Unnamed: 0,Trimmed,Full
Trimmed,1.0,0.988994
Full,0.988994,1.0


## Discr

In [134]:
print(param_plot_stats_trimmed['a'].shape)

(79348, 5)


In [135]:
print(task_name_format_trimmed.shape)

(73167, 2)


In [136]:
trimmed = pd.concat([param_plot_stats_trimmed['a'], task_name_format_trimmed], axis=1) 
trimmed = trimmed.groupby(by='task_name').quantile(q=0.75).rename(columns={'log_mean':'Trimmed'})
trimmed

Unnamed: 0_level_0,lower,upper,mean,var,Trimmed
task_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ANLI,0.899919,2.226639,1.441112,0.125407,0.365415
ARC-C,0.822956,2.078902,1.324271,0.109164,0.280862
ARC-E,0.831715,2.070082,1.33263,0.109028,0.287154
ARCT,0.792842,2.039686,1.283021,0.118357,0.249218
AbductNLI,0.824605,2.203403,1.362311,0.135256,0.309183
BoolQ,0.830674,2.135282,1.34199,0.120325,0.294153
CB,0.902353,2.114346,1.383875,0.09645,0.32488
COPA,0.715233,2.144357,1.26129,0.139823,0.232093
CSQA,0.870722,2.138904,1.376232,0.114204,0.319349
CosmosQA,0.872706,2.162746,1.391881,0.117798,0.330656


In [137]:
full = pd.concat([param_plot_stats['a'], task_name_format], axis=1)
full = full.groupby(by='task_name').quantile(q=0.75).rename(columns={'log_mean':'Full'})
full

Unnamed: 0_level_0,lower,upper,mean,var,Full
task_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ANLI,0.915775,2.265168,1.467177,0.128683,0.38334
ARC-C,0.831736,2.101384,1.334468,0.114224,0.288532
ARC-E,0.838473,2.104504,1.347662,0.113631,0.298371
ARCT,0.788995,2.132945,1.314351,0.120544,0.273343
AbductNLI,0.840363,2.23631,1.39165,0.141433,0.33049
BoolQ,0.79713,2.123107,1.297305,0.124242,0.260289
CB,0.892566,2.254348,1.444007,0.125014,0.367416
COPA,0.691717,2.040448,1.212194,0.122549,0.192426
CSQA,0.865282,2.135357,1.386515,0.113656,0.326793
CosmosQA,0.876607,2.180389,1.398905,0.118627,0.33569


In [138]:
combined_a = pd.concat([trimmed, full], axis=1)
combined_a['diff'] = combined_a['Trimmed'] - combined_a['Full']
combined_a['rel_diff'] = (combined_a['Trimmed'] - combined_a['Full'])/combined_a['Full']

print('median diff (magnitude)', combined_a['diff'].abs().median())
print('standard deviation diff (magnitude)', combined_a['diff'].abs().std())
print('median rel diff (magnitude)', combined_a['rel_diff'].abs().median())

median diff (magnitude) 0.016389266234225514
standard deviation diff (magnitude) 0.015199419535840721
median rel diff (magnitude) 0.04174369125790653


In [139]:
combined_a.loc[:,['Trimmed', 'Full', 'diff', 'rel_diff']].sort_values(by='diff')

Unnamed: 0_level_0,Trimmed,Full,diff,rel_diff
task_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MC-TACO,0.399988,0.448584,-0.048596,-0.108332
CB,0.32488,0.367416,-0.042536,-0.115771
NewsQA,0.489936,0.520287,-0.030351,-0.058335
RTE,0.154438,0.179166,-0.024728,-0.138017
ARCT,0.249218,0.273343,-0.024125,-0.088259
AbductNLI,0.309183,0.33049,-0.021307,-0.064471
MNLI,0.409765,0.430545,-0.02078,-0.048265
MRQA-NQ,0.5071,0.526834,-0.019734,-0.037458
ANLI,0.365415,0.38334,-0.017925,-0.046759
Quoref,0.480489,0.495343,-0.014854,-0.029987


In [140]:
combined_a.loc[:,['Trimmed', 'Full']].corr(method='pearson')

Unnamed: 0,Trimmed,Full
Trimmed,1.0,0.972188
Full,0.972188,1.0
