In [1]:
from datasets import load_dataset
import evaluate
from transformers import AutoModelWithHeads, AutoTokenizer

import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxruntime import InferenceSession
import onnxruntime


import time
from typing import Tuple, Union
import torch
import numpy as np
import pandas as pd
import os

from huggingface_hub import hf_hub_download

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
def load_skills(skill_type, path="square_skills/impl_skills.csv"):
    all_skills = pd.read_csv(path)
    skills = all_skills[all_skills["Type"] == skill_type]
    return skills

#### Measure extractive qa

In [25]:
all_skills = pd.read_csv("square_skills/impl_skills.csv")
skill = "span-extraction"
skills = load_skills(skill)

In [4]:
def decode(
            start_: np.ndarray,
            end_: np.ndarray,
            topk: int,
            max_answer_len: int,
            undesired_tokens_: np.ndarray,
    ) -> Tuple:
    """
    Take the output of any :obj:`ModelForQuestionAnswering` and
        will generate probabilities for each span to be the
        actual answer.
    In addition, it filters out some unwanted/impossible cases
    like answer len being greater than max_answer_len or
    answer end position being before the starting position.
    The method supports output the k-best answer through
    the topk argument.
    Args:
        start_ (:obj:`np.ndarray`): Individual start
            probabilities for each token.
        end (:obj:`np.ndarray`): Individual end_ probabilities
            for each token.
        topk (:obj:`int`): Indicates how many possible answer
            span(s) to extract from the model output.
        max_answer_len (:obj:`int`): Maximum size of the answer
            to extract from the model"s output.
        undesired_tokens_ (:obj:`np.ndarray`): Mask determining
            tokens that can be part of the answer
    """
    # Ensure we have batch axis
    if start_.ndim == 1:
        start_ = start_[None]

    if end_.ndim == 1:
        end_ = end_[None]

    # Compute the score of each tuple(start_, end_) to be the real answer
    outer = np.matmul(np.expand_dims(start_, -1), np.expand_dims(end_, 1))

    # Remove candidate with end_ < start_ and end_ - start_ > max_answer_len
    candidates = np.tril(np.triu(outer), max_answer_len - 1)

    #  Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
    scores_flat = candidates.flatten()
    if topk == 1:
        idx_sort = [np.argmax(scores_flat)]
    elif len(scores_flat) < topk:
        idx_sort = np.argsort(-scores_flat)
    else:
        idx = np.argpartition(-scores_flat, topk)[0:topk]
        idx_sort = idx[np.argsort(-scores_flat[idx])]

    starts_, ends_ = np.unravel_index(idx_sort, candidates.shape)[1:]
    desired_spans = np.isin(starts_, undesired_tokens_.nonzero()) & np.isin(
        ends_, undesired_tokens_.nonzero()
    )
    starts_ = starts_[desired_spans]
    ends_ = ends_[desired_spans]
    scores_ = candidates[0, starts_, ends_]

    return starts_, ends_, scores_

In [5]:
def base_predict(
            model, input, tokenizer, preprocessing_kwargs, model_kwargs, batch_size=1, disable_gpu=True, output_features=False
    ) -> Union[dict, Tuple[dict, dict]]:
        """
        Inference on the input.
        Args:
         request: the request with the input and optional kwargs
         output_features: return the features of the input.
            Necessary if, e.g., attention mask is needed for post-processing.
        Returns:
             The model outputs and optionally the input features
        """

        all_predictions = []
        preprocessing_kwargs["padding"] = preprocessing_kwargs.get(
            "padding", True
        )
        preprocessing_kwargs["truncation"] = preprocessing_kwargs.get(
            "truncation", True
        )
        model.to(
            "cuda"
            if torch.cuda.is_available() and not disable_gpu
            else "cpu"
        )

        features = tokenizer(
            input, return_tensors="pt", **preprocessing_kwargs
        )

        for start_idx in range(0, len(input), batch_size):
            with torch.no_grad():
                input_features = {
                    k: features[k][start_idx: start_idx + batch_size]
                    for k in features.keys()
                }
                predictions = model(**input_features, **model_kwargs)
                all_predictions.append(predictions)

        keys = all_predictions[0].keys()
        final_prediction = {}
        for key in keys:
            # HuggingFace outputs for "attentions" and more is
            # returned as tuple of tensors
            # Tuple of tuples only exists for "past_key_values"
            # which is only relevant for generation.
            # Generation should NOT use this function
            if isinstance(all_predictions[0][key], tuple):
                tuple_of_lists = list(
                    zip(
                        *[
                            [
                                torch.stack(p).cpu()
                                if isinstance(p, tuple)
                                else p.cpu()
                                for p in tpl[key]
                            ]
                            for tpl in all_predictions
                        ]
                    )
                )
                final_prediction[key] = tuple(torch.cat(l) for l in tuple_of_lists)
            else:
                final_prediction[key] = torch.cat(
                    [p[key].cpu() for p in all_predictions]
                )
        if output_features:
            return final_prediction, features

        return final_prediction

def base_qa(model, tokenizer, input, preprocessing_kwargs, task_kwargs, model_kwargs):
    """
    Span-based question answering for a given question and context.
    We expect the input to use the (question, context) format for the text pairs.
    Args:
        request: the prediction request
    """    
    preprocessing_kwargs["truncation"] = "only_second"
    features = tokenizer(
        input, return_tensors="pt", **preprocessing_kwargs
    )
    predictions, features = base_predict(model, input, tokenizer, preprocessing_kwargs, model_kwargs, output_features=True)

    task_outputs = {
        "answers": [],
        "attributions": [],
        "adversarial": {
            "indices": [],
        },  # for hotflip, input_reduction and topk
    }

    for idx, (start, end, (_, context)) in enumerate(
            zip(predictions["start_logits"], predictions["end_logits"], input)
    ):
        # Ensure padded tokens & question tokens cannot
        # belong to the set of candidate answers.
        question_tokens = np.abs(np.array([s != 1 for s in features.sequence_ids(idx)]) - 1)
        # Unmask CLS token for "no answer"
        question_tokens[0] = 1
        undesired_tokens = question_tokens & features["attention_mask"][idx].numpy()

        # Generate mask
        undesired_tokens_mask = undesired_tokens == 0.0

        # Make sure non-context indexes in the tensor cannot
        # contribute to the softmax
        start = np.where(undesired_tokens_mask, -10000.0, start)
        end = np.where(undesired_tokens_mask, -10000.0, end)

        start = np.exp(start - np.log(np.sum(np.exp(start), axis=-1, keepdims=True)))
        end = np.exp(end - np.log(np.sum(np.exp(end), axis=-1, keepdims=True)))

        # Get score for "no answer" then mask for decoding step (CLS token
        no_answer_score = (start[0] * end[0]).item()
        start[0] = end[0] = 0.0

        starts, ends, scores = decode(
            start,
            end,
            task_kwargs.get("topk", 1),
            task_kwargs.get("max_answer_len", 128),
            undesired_tokens,
        )

        enc = features[idx]
        original_ans_start = enc.token_to_word(starts[0])
        original_ans_end = enc.token_to_word(ends[0])
        answers = [
            {
                "score": score.item(),
                "start": enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0],
                "end": enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1],
                "answer": context[
                            enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0]: enc.word_to_chars(
                                enc.token_to_word(e), sequence_index=1
                            )[1]
                            ],
            }
            for s, e, score in zip(starts, ends, scores)
        ]
        if task_kwargs.get("show_null_answers", True):
            answers.append({"score": no_answer_score, "start": 0, "end": 0, "answer": ""})
        answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: task_kwargs.get("topk", 1)]
        task_outputs["answers"].append(answers)

    return predictions, task_outputs, original_ans_start, original_ans_end

In [6]:
# Code from SQuARE ONNX QA Pipeline (note: some features like explainability and attack mode have been removed)
def question_answering(model_qa, tokenizer, input, preprocessing_kwargs, task_kwargs, model_kwargs):
    """
    Span-based question answering for a given question and context.
    We expect the input to use the (question, context) format for the text pairs.
    Args:
        request: the prediction request
    """    
    preprocessing_kwargs["truncation"] = "only_second"

    features = tokenizer(
        input, return_tensors="np", **preprocessing_kwargs
    )
    onnx_inputs = {key: np.array(features[key], dtype=np.int64) for key in features}
    
    predictions_onnx = model_qa.run(input_feed=onnx_inputs, output_names=None)
    predictions = {
        "start_logits": predictions_onnx[0],
        "end_logits": predictions_onnx[1]
    }

    task_outputs = {
        "answers": [],
        "attributions": [],
        "adversarial": {
            "indices": [],
        },  # for hotflip, input_reduction and topk
    }

    for idx, (start, end, (_, context)) in enumerate(
            zip(predictions["start_logits"], predictions["end_logits"], input)
    ):
        # Ensure padded tokens & question tokens cannot
        # belong to the set of candidate answers.
        question_tokens = np.abs(np.array([s != 1 for s in features.sequence_ids(idx)]) - 1)
        # Unmask CLS token for "no answer"
        question_tokens[0] = 1
        undesired_tokens = question_tokens & features["attention_mask"][idx]

        # Generate mask
        undesired_tokens_mask = undesired_tokens == 0.0

        # Make sure non-context indexes in the tensor cannot
        # contribute to the softmax
        start = np.where(undesired_tokens_mask, -10000.0, start)
        end = np.where(undesired_tokens_mask, -10000.0, end)

        start = np.exp(start - np.log(np.sum(np.exp(start), axis=-1, keepdims=True)))
        end = np.exp(end - np.log(np.sum(np.exp(end), axis=-1, keepdims=True)))

        # Get score for "no answer" then mask for decoding step (CLS token
        no_answer_score = (start[0] * end[0]).item()
        start[0] = end[0] = 0.0

        starts, ends, scores = decode(
            start,
            end,
            task_kwargs.get("topk", 1),
            task_kwargs.get("max_answer_len", 128),
            undesired_tokens,
        )

        enc = features[idx]
        original_ans_start = enc.token_to_word(starts[0])
        original_ans_end = enc.token_to_word(ends[0])
        answers = [
            {
                "score": score.item(),
                "start": enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0],
                "end": enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1],
                "answer": context[
                            enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0]: enc.word_to_chars(
                                enc.token_to_word(e), sequence_index=1
                            )[1]
                            ],
            }
            for s, e, score in zip(starts, ends, scores)
        ]
        if task_kwargs.get("show_null_answers", True):
            answers.append({"score": no_answer_score, "start": 0, "end": 0, "answer": ""})
        answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: task_kwargs.get("topk", 1)]
        task_outputs["answers"].append(answers)

    return predictions, task_outputs, original_ans_start, original_ans_end

In [7]:
preprocessing_kwargs = {"padding": True, "truncation": True}

task_kwargs = {"show_null_answers": False, "topk": 1, "max_answer_len": 128}

model_kwargs = {"": {}}

In [8]:
def load_model(model_onnx, model_onnx_quant, as_list=False):
    local_onnx_model = onnxruntime.InferenceSession(model_onnx, providers=["CPUExecutionProvider"])
    local_onnx_model_quant = onnxruntime.InferenceSession(model_onnx_quant, providers=["CPUExecutionProvider"])
    
    so = onnxruntime.SessionOptions()
    so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    
    local_onnx_model_opt = onnxruntime.InferenceSession(model_onnx, so)
    local_onnx_model_quant_opt = onnxruntime.InferenceSession(model_onnx_quant, so)
    
    if as_list:
        return [local_onnx_model, local_onnx_model_opt, local_onnx_model_quant, local_onnx_model_quant_opt]
    return local_onnx_model, local_onnx_model_opt, local_onnx_model_quant, local_onnx_model_quant_opt

def repo_builder(reader, adapter):
    repo_id = f"UKP-SQuARE/{reader}-pf-{adapter}-onnx"
    filename_onnx = "model.onnx"
    filename_onnx_quant = "model_quant.onnx"

    model_onnx = hf_hub_download(repo_id=repo_id, filename=filename_onnx)
    model_onnx_quant = hf_hub_download(repo_id=repo_id, filename=filename_onnx_quant)

    return model_onnx, model_onnx_quant

In [15]:
# def run_torch(model, inputs):
#     with torch.no_grad():
#         model(**inputs)

# def run_onnx(qa_model, onnx_inputs):
#     qa_model.run(output_names=["start_logits", "end_logits"], input_feed=dict(onnx_inputs))   

# def get_time_duration(func, model, inputs): 
#     st= time.time()
#     func(model, inputs)
#     et = time.time()
#     return 1000 * (et - st)

def save_df(df_new, path_to_logger_file = "logger_all.csv"):

    if os.path.exists(path_to_logger_file):
        df_fin = pd.concat([pd.read_csv(path_to_logger_file), df_new])
        df_fin.to_csv(path_to_logger_file,index=False)
    else: 
        df_new.to_csv(path_to_logger_file,index=False)

# def measure_time(perf_type, tokenizer, question, context, model):
#     if perf_type == "base":
#         inputs = tokenizer(question, context, return_tensors="pt", truncation=True)
#         mode = run_torch
#         # time_once = get_time_duration(run_torch, model, inputs)
    
#     elif perf_type == "seq_length":
#         inputs = tokenizer(question, context, return_tensors="np", truncation=True)
#         inputs = {key: np.array(inputs[key], dtype=np.int64) for key in inputs}
#         mode = run_onnx
#         # time_once = get_time_duration(run_onnx, model, inputs) 
    
#     time_once = get_time_duration(mode, model, inputs) 

#     return time_once

# def performance_log(perf_type, name, model, tokenizer, data, data_intervall = 0): 
#     df = pd.DataFrame(columns=["model_name", "time once (ms)", "average_time 50 times (ms)", "seq_length", "context", "question", "data_id"])
    
#     for i in range(0, len(data["context"]), data_intervall):
#         context = data["context"][i]
#         question = data["question"][i]
#         time_duration = measure_time(perf_type, tokenizer, question, context, model)
        
#         seq_length = len(context.split()) # TODO -> reduce stopwords? Real Tokenization?
        
#         df.loc[len(df)] = [name, time_duration, "", seq_length, context, question, data["id"][i]]
        
#         print("Model: {}, Input Length {}: {:.3f} ms".format(name, seq_length, time_duration))
#     save_df(df)

### evaluate all extractive qa model on squad

In [10]:
data_set_name = "squad"
data = load_dataset(data_set_name, split="validation[:100]")
metric = evaluate.load(data_set_name)

Found cached dataset squad (/Users/michaelhermann/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


In [11]:
def squad_evaluate(inference_func, model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs):
    examples = list(zip(data["question"], data["context"]))
    predictions = []
    for example in examples:
        _, task_outputs, _, _ = inference_func(model, tokenizer, [example], preprocessing_kwargs, task_kwargs, model_kwargs)
        predictions.append(task_outputs["answers"][0][0]["answer"])
    
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in zip(data["id"], predictions)]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in data]
    
    score = metric.compute(predictions=formatted_predictions, references=references)

    return score["f1"], score["exact_match"]

In [None]:
result = []
for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
    print(f"Loading: {reader} {adapter}")

    #load base model
    tokenizer = AutoTokenizer.from_pretrained(reader)
    default_model = AutoModelWithHeads.from_pretrained(reader)
    adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
    default_model.active_adapters = adapter_name

    # Test acc for base model
    f1, exact = squad_evaluate(base_qa, default_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)
    result.append(("Base", skill, reader, adapter, f1, exact, data_set_name))

    #load onnx models
    model_onnx, model_onnx_quant = repo_builder(reader, adapter)
    onnx_models_list = load_model(model_onnx, model_onnx_quant, as_list=True)
    onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "Quantized ONNX", "Quantized ONNX - OPT"]

    # Test acc for onnx models
    for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
        f1, exact = squad_evaluate(question_answering, onnx_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)
        result.append((onnx_model_name, skill, reader, adapter, f1, exact, data_set_name))   

In [None]:
df = pd.DataFrame(result, columns=["name", "skill", "reader", "adapter", "f1", "exact", "dataset"])

In [16]:
save_df(df, "accuracy.csv")

In [18]:
df

### evaluate extractive qa model on specific adapter - TODO

In [33]:
# result = []
# for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
#     print(f"Loading: {reader} {adapter}")

#     #load adapter specific dataset
#     data_set_name = adapter
#     if data_set_name in ["newsqa", "hotpot_qa"]:
#         continue
#     else: 
#         data = load_dataset(data_set_name, split="validation[:100]")

#     metric = evaluate.load(data_set_name)

#     #load base model
#     tokenizer = AutoTokenizer.from_pretrained(reader)
#     default_model = AutoModelWithHeads.from_pretrained(reader)
#     adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
#     default_model.active_adapters = adapter_name

#     # Test acc for base model
#     f1, exact = squad_evaluate(base_qa, default_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)
#     result.append(("Base", skill, reader, adapter, f1, exact, data_set_name))

#     #load onnx models
#     model_onnx, model_onnx_quant = repo_builder(reader, adapter)
#     onnx_models_list = load_model(model_onnx, model_onnx_quant, as_list=True)
#     onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "Quantized ONNX", "Quantized ONNX - OPT"]

#     # Test acc for onnx models
#     for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
#         f1, exact = squad_evaluate(question_answering, onnx_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)
#         result.append((onnx_model_name, skill, reader, adapter, f1, exact, data_set_name))   

In [None]:
df = pd.DataFrame(result, columns=["name", "skill", "reader", "adapter", "f1", "exact", "dataset"])

save_df(df, "accuracy.csv")

### evaluate categorical qa

In [36]:
all_skills = pd.read_csv("square_skills/impl_skills.csv")
skill = "categorical"
skills = load_skills(skill)

In [35]:
data_set_name = "boolq"
data = load_dataset(data_set_name, split="validation[:100]")

Found cached dataset boolq (/Users/michaelhermann/.cache/huggingface/datasets/boolq/default/0.1.0/bf0dd57da941c50de94ae3ce3cef7fea48c08f337a4b7aac484e9dddc5aa24e5)


In [37]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
adapter_name = model.load_adapter("AdapterHub/bert-base-uncased-pf-boolq", source="hf")
model.active_adapters = adapter_name

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 4100.67it/s]


In [57]:
def categorical_base_inference(model, tokenizer, question, context):
    
    raw_input = [[context, question]]
    inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="pt")
    
    outputs = model(**inputs)
    answer_idx = torch.argmax(outputs.logits)

    return bool(answer_idx)
    
def onnx_inference(onnx_model, tokenizer, question, context):

    inputs = tokenizer(question, context, padding=True, truncation=True, return_tensors="np")
    inputs = {key: np.array(inputs[key], dtype=np.int64) for key in inputs}

    outputs = onnx_model.run(input_feed=dict(inputs), output_names=None)

    return bool(np.argmax(outputs[0][0]))

In [60]:
def boolq_evaluate(model, tokenizer, data, inference_type):
    correct = 0
    for test_no in range(len(data)):
        question = data[test_no]["question"]
        correct_answer = data[test_no]["answer"]
        context = data[test_no]["passage"]

        answer = inference_type(model, tokenizer, question, context)

        if answer == correct_answer:
            correct += 1

    return correct/len(data)

In [65]:
result = []
for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
    print(f"Loading: {reader} {adapter}")

    #load base model
    tokenizer = AutoTokenizer.from_pretrained(reader)
    default_model = AutoModelWithHeads.from_pretrained(reader)
    adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
    default_model.active_adapters = adapter_name

    # Test acc for base model
    exact = boolq_evaluate(default_model, tokenizer, data, categorical_base_inference)
    result.append(("Base", skill, reader, adapter, "", exact, data_set_name))

    #load onnx models
    model_onnx, model_onnx_quant = repo_builder(reader, adapter)
    onnx_models_list = load_model(model_onnx, model_onnx_quant, as_list=True)
    onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "Quantized ONNX", "Quantized ONNX - OPT"]


    for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
        exact = boolq_evaluate(onnx_model, tokenizer, data, onnx_inference)
        result.append((onnx_model_name, skill, reader, adapter, "", exact, data_set_name))

Loading: bert-base-uncased boolq


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 2799.62it/s]


Loading: roberta-base boolq


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

In [66]:
df = pd.DataFrame(result, columns=["name", "skill", "reader", "adapter", "f1", "exact", "dataset"])

In [67]:
df

Unnamed: 0,name,skill,reader,adapter,f1,exact,dataset
0,Base,categorical,bert-base-uncased,boolq,,0.73,boolq
1,ONNX,categorical,bert-base-uncased,boolq,,0.66,boolq
2,ONNX-OPT,categorical,bert-base-uncased,boolq,,0.66,boolq
3,Quantized ONNX,categorical,bert-base-uncased,boolq,,0.47,boolq
4,Quantized ONNX - OPT,categorical,bert-base-uncased,boolq,,0.47,boolq
5,Base,categorical,roberta-base,boolq,,0.78,boolq
6,ONNX,categorical,roberta-base,boolq,,0.75,boolq
7,ONNX-OPT,categorical,roberta-base,boolq,,0.75,boolq
8,Quantized ONNX,categorical,roberta-base,boolq,,0.74,boolq
9,Quantized ONNX - OPT,categorical,roberta-base,boolq,,0.74,boolq


In [68]:
save_df(df, "accuracy.csv")