In [322]:
import pandas as pd
import torch
from datasets import Dataset
from Exam.utils import tokenize_and_align_labels_adv, tokenize_and_align_labels_base
from transformers import  AutoModelForTokenClassification, AutoTokenizer, AutoConfig
from torch_scatter import scatter
from collections import defaultdict
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [323]:
# Names of the two fine-tuned checkpoints being compared. Each name is later
# expanded to `martincc98/<name>` and passed to `from_pretrained`.
MODEL_BASE = 'bert_a3_10ep'
MODEL_ADVANCED = 'bert_a3_10ep_adv'

In [324]:
def model_predict(model, test_dataset):
    """Run ``model`` on ``test_dataset`` and collect word-level gold/predicted label ids.

    The whole dataset is passed to the model in a single forward call. For every
    example, the token-level logits are averaged per ``scatter_idx`` group and the
    arg-max over the class dimension is taken as the prediction for that group;
    the token-level labels are reduced with ``max`` over the same groups.
    Groups whose reduced label equals -100 are skipped.

    Args:
        model: Callable accepting ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments and returning an object with a
            ``.logits`` attribute that can be iterated example by example.
        test_dataset: Mapping-like object providing the columns ``input_ids``,
            ``attention_mask``, ``token_type_ids``, ``labels`` and ``scatter_idx``.
            Each ``labels`` entry may be a plain sequence of ints or a tensor.

    Returns:
        tuple[list[int], list[int]]: ``(y_true, y_pred)``, two parallel lists of
        label ids, in dataset order.
    """
    y_true = []
    y_pred = []
    with torch.inference_mode():
        batch_logits = model(
            input_ids=test_dataset['input_ids'],
            attention_mask=test_dataset['attention_mask'],
            token_type_ids=test_dataset['token_type_ids'],
        ).logits
        per_example = zip(batch_logits, test_dataset['labels'], test_dataset['scatter_idx'])
        for token_logits, token_labels, scatter_idx in per_example:
            # Pool token logits into one row per scatter group, then pick the best class.
            group_logits = scatter(token_logits, scatter_idx, dim=0, reduce='mean')
            group_preds = torch.argmax(group_logits, dim=1)
            # `as_tensor` accepts both lists and tensors without the copy-construct
            # warning `torch.tensor` emits when it is handed an existing tensor.
            group_labels = scatter(torch.as_tensor(token_labels), scatter_idx, dim=0, reduce='max')
            for pred_id, gold_id in zip(group_preds.tolist(), group_labels.tolist()):
                # -100 marks groups without a gold label; they are not scored.
                if gold_id != -100:
                    y_true.append(gold_id)
                    y_pred.append(pred_id)
    return y_true, y_pred

In [325]:
# Column-wise accumulators (column name -> list of values) for the evaluation results:
# `full_dict` receives one entry per evaluated example,
# `output_dict` receives one entry per (model, capability, test) combination.
full_dict = defaultdict(list)
output_dict = defaultdict(list)

In [326]:
# Evaluate both fine-tuned models on every capability/test of the challenge set.
#
# The accumulators are (re)created here so that re-running this cell replaces the
# previous results instead of appending a second copy of every row to them.
full_dict = defaultdict(list)
output_dict = defaultdict(list)

# The challenge set is identical for every model, so it is read from disk once.
challenge_df = pd.read_csv('Exam/test_data/challenge_ds.csv')

for MODEL_NAME in [MODEL_BASE, MODEL_ADVANCED]:
    MODEL_URL = f'martincc98/{MODEL_NAME}'
    config = AutoConfig.from_pretrained(MODEL_URL)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_URL, config=config)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_URL)

    # Each model gets its own preprocessing function.
    if MODEL_NAME == MODEL_ADVANCED:
        tokenize_fn = tokenize_and_align_labels_adv
    else:
        tokenize_fn = tokenize_and_align_labels_base

    # Translate the role names into this model's label ids ('B-' prefix added here).
    # Work on a copy so the raw frame stays untouched for the next model.
    df = challenge_df.copy()
    df['label'] = df['label'].apply(lambda x: config.label2id['B-'+x])

    capabilities = df['capability'].unique()
    for capability in capabilities:
        print('Capability:', capability)

        cap_df: pd.DataFrame = df[df['capability'] == capability]
        tests = cap_df['test_name'].unique()
        for test in tests:
            print('Test:', test)
            cur_df = cap_df[cap_df['test_name'] == test]
            test_cnt = len(cur_df)

            # Parse dataset
            ds = Dataset.from_pandas(cur_df)
            test_dataset = ds.map(lambda x: tokenize_fn(tokenizer, x))

            test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label', 'token_type_ids', 'scatter_idx'])
            y_true, y_pred = model_predict(model, test_dataset)

            # The failure rate and the row lookup below both rely on exactly one
            # scored word per example; fail loudly instead of silently pairing
            # predictions with the wrong sentences.
            if len(y_true) != test_cnt:
                raise ValueError(
                    f'{MODEL_NAME} / {test}: got {len(y_true)} scored labels '
                    f'for {test_cnt} examples; expected exactly one per example'
                )

            success_cnt = sum(1 for t, p in zip(y_true, y_pred) if t == p)
            fr = 1 - (success_cnt / test_cnt)

            # One detailed record per example.
            for i, (y_t, y_p) in enumerate(zip(y_true, y_pred)):
                row = cur_df.iloc[i]
                full_dict['model_name'].append(MODEL_NAME)
                full_dict['capability'].append(capability)
                full_dict['test_name'].append(test)
                full_dict['test_n'].append(i)
                full_dict['pred'].append(config.id2label[y_p])
                full_dict['true'].append(config.id2label[y_t])
                full_dict['sent'].append(row['sent'])
                full_dict['predicate'].append(row['pred'])
                full_dict['arg_pos'].append(row['arg_pos'])
                full_dict['pred_pos'].append(row['pred_pos'])

            # One summary record per (model, capability, test).
            output_dict['model_name'].append(MODEL_NAME)
            output_dict['capability'].append(capability)
            output_dict['test_name'].append(test)
            output_dict['failure_rate'].append(fr)
            output_dict['example_cnt'].append(test_cnt)


Capability: PatientRight
Test: PatientRight


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Capability: AgentRole3D
Test: AgentRole3D


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Capability: PatientRole3D
Test: PatientRole3D


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Capability: TempAdv
Test: TempAdv


Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Capability: NegAdv
Test: NegAdv


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Capability: LocationVar
Test: LocationVar


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Capability: FreqAdvPast
Test: FreqAdvPast


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Capability: FreqAdvPresent
Test: FreqAdvPresent


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Capability: FreqAdvFuture
Test: FreqAdvFuture


Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Capability: VarDirTarget
Test: VarDirTarget


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Capability: VarGoalSource
Test: VarGoalSource


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Capability: PatientRight
Test: PatientRight


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Capability: AgentRole3D
Test: AgentRole3D


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Capability: PatientRole3D
Test: PatientRole3D


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Capability: TempAdv
Test: TempAdv


Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Capability: NegAdv
Test: NegAdv


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Capability: LocationVar
Test: LocationVar


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Capability: FreqAdvPast
Test: FreqAdvPast


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Capability: FreqAdvPresent
Test: FreqAdvPresent


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Capability: FreqAdvFuture
Test: FreqAdvFuture


Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Capability: VarDirTarget
Test: VarDirTarget


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Capability: VarGoalSource
Test: VarGoalSource


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [327]:
# Turn the per-example records into a DataFrame and save them as CSV.
# `to_csv` is called with its defaults, so the DataFrame index is written as well.
df_f = pd.DataFrame(full_dict)
df_f.to_csv('Exam/output/challenge_dataset_outputs.csv')

In [328]:
# Turn the per-test summary records into a DataFrame and save them as CSV.
# NOTE: this rebinds `df`, which the evaluation loop used for the challenge data.
df = pd.DataFrame(output_dict)
df.to_csv('Exam/output/challenge_dataset_summary.csv')

In [329]:
# Average the numeric summary columns (failure_rate, example_cnt) for every
# (capability, test_name, model_name) combination and display the result.
df.groupby(by=['capability', 'test_name', 'model_name']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,failure_rate,example_cnt
capability,test_name,model_name,Unnamed: 3_level_1,Unnamed: 4_level_1
AgentRole3D,AgentRole3D,bert_a3_10ep,0.0,25.0
AgentRole3D,AgentRole3D,bert_a3_10ep_adv,0.0,25.0
FreqAdvFuture,FreqAdvFuture,bert_a3_10ep,0.5,12.0
FreqAdvFuture,FreqAdvFuture,bert_a3_10ep_adv,0.0,12.0
FreqAdvPast,FreqAdvPast,bert_a3_10ep,0.0,16.0
FreqAdvPast,FreqAdvPast,bert_a3_10ep_adv,0.0,16.0
FreqAdvPresent,FreqAdvPresent,bert_a3_10ep,0.0,15.0
FreqAdvPresent,FreqAdvPresent,bert_a3_10ep_adv,0.0,15.0
LocationVar,LocationVar,bert_a3_10ep,0.033333,30.0
LocationVar,LocationVar,bert_a3_10ep_adv,0.0,30.0


In [331]:
# Display the VarDirTarget examples whose predicted label differs from the gold label.
df_f[(df_f['capability'] == 'VarDirTarget') & (df_f['pred'] != df_f['true'])]

Unnamed: 0,model_name,capability,test_name,test_n,pred,true,sent,predicate,arg_pos,pred_pos
239,bert_a3_10ep,VarDirTarget,VarDirTarget,21,B-ARG2,B-ARGM-DIR,The dog ran along the road to the gym,ran,5,2
241,bert_a3_10ep,VarDirTarget,VarDirTarget,23,B-ARG2,B-ARGM-DIR,The dog ran along the road to the university,ran,5,2
