In [267]:
from datasets import load_dataset, load_metric
import evaluate
from transformers import AutoModelWithHeads, AutoTokenizer
from transformers.models.bert import BertOnnxConfig
from transformers.onnx import OnnxConfig, validate_model_outputs, export

import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxruntime import InferenceSession
import onnxruntime


import time
from typing import Tuple, Union
import torch
import numpy as np
import pandas as pd
import os
from typing import Mapping, OrderedDict

from huggingface_hub import hf_hub_download

In [268]:
def load_skills(skill_type, path="square_skills/impl_skills.csv"):
    all_skills = pd.read_csv(path)
    skills = all_skills[all_skills["Type"] == skill_type]
    return skills

## Measure Inference Time 

#### extractive qa

In [None]:
all_skills = pd.read_csv("square_skills/impl_skills.csv")
skill = "span-extraction"
skills = load_skills(skill)

In [None]:
def decode(
            start_: np.ndarray,
            end_: np.ndarray,
            topk: int,
            max_answer_len: int,
            undesired_tokens_: np.ndarray,
    ) -> Tuple:
    """
    Take the output of any :obj:`ModelForQuestionAnswering` and
        will generate probabilities for each span to be the
        actual answer.
    In addition, it filters out some unwanted/impossible cases
    like answer len being greater than max_answer_len or
    answer end position being before the starting position.
    The method supports output the k-best answer through
    the topk argument.
    Args:
        start_ (:obj:`np.ndarray`): Individual start
            probabilities for each token.
        end (:obj:`np.ndarray`): Individual end_ probabilities
            for each token.
        topk (:obj:`int`): Indicates how many possible answer
            span(s) to extract from the model output.
        max_answer_len (:obj:`int`): Maximum size of the answer
            to extract from the model"s output.
        undesired_tokens_ (:obj:`np.ndarray`): Mask determining
            tokens that can be part of the answer
    """
    # Ensure we have batch axis
    if start_.ndim == 1:
        start_ = start_[None]

    if end_.ndim == 1:
        end_ = end_[None]

    # Compute the score of each tuple(start_, end_) to be the real answer
    outer = np.matmul(np.expand_dims(start_, -1), np.expand_dims(end_, 1))

    # Remove candidate with end_ < start_ and end_ - start_ > max_answer_len
    candidates = np.tril(np.triu(outer), max_answer_len - 1)

    #  Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
    scores_flat = candidates.flatten()
    if topk == 1:
        idx_sort = [np.argmax(scores_flat)]
    elif len(scores_flat) < topk:
        idx_sort = np.argsort(-scores_flat)
    else:
        idx = np.argpartition(-scores_flat, topk)[0:topk]
        idx_sort = idx[np.argsort(-scores_flat[idx])]

    starts_, ends_ = np.unravel_index(idx_sort, candidates.shape)[1:]
    desired_spans = np.isin(starts_, undesired_tokens_.nonzero()) & np.isin(
        ends_, undesired_tokens_.nonzero()
    )
    starts_ = starts_[desired_spans]
    ends_ = ends_[desired_spans]
    scores_ = candidates[0, starts_, ends_]

    return starts_, ends_, scores_

In [None]:
def base_predict(
            model, input, tokenizer, preprocessing_kwargs, model_kwargs, batch_size=1, disable_gpu=True, output_features=False
    ) -> Union[dict, Tuple[dict, dict]]:
        """
        Inference on the input.
        Args:
         request: the request with the input and optional kwargs
         output_features: return the features of the input.
            Necessary if, e.g., attention mask is needed for post-processing.
        Returns:
             The model outputs and optionally the input features
        """

        all_predictions = []
        preprocessing_kwargs["padding"] = preprocessing_kwargs.get(
            "padding", True
        )
        preprocessing_kwargs["truncation"] = preprocessing_kwargs.get(
            "truncation", True
        )
        model.to(
            "cuda"
            if torch.cuda.is_available() and not disable_gpu
            else "cpu"
        )

        features = tokenizer(
            input, return_tensors="pt", **preprocessing_kwargs
        )

        for start_idx in range(0, len(input), batch_size):
            with torch.no_grad():
                input_features = {
                    k: features[k][start_idx: start_idx + batch_size]
                    for k in features.keys()
                }
                predictions = model(**input_features, **model_kwargs)
                all_predictions.append(predictions)

        keys = all_predictions[0].keys()
        final_prediction = {}
        for key in keys:
            # HuggingFace outputs for "attentions" and more is
            # returned as tuple of tensors
            # Tuple of tuples only exists for "past_key_values"
            # which is only relevant for generation.
            # Generation should NOT use this function
            if isinstance(all_predictions[0][key], tuple):
                tuple_of_lists = list(
                    zip(
                        *[
                            [
                                torch.stack(p).cpu()
                                if isinstance(p, tuple)
                                else p.cpu()
                                for p in tpl[key]
                            ]
                            for tpl in all_predictions
                        ]
                    )
                )
                final_prediction[key] = tuple(torch.cat(l) for l in tuple_of_lists)
            else:
                final_prediction[key] = torch.cat(
                    [p[key].cpu() for p in all_predictions]
                )
        if output_features:
            return final_prediction, features

        return final_prediction

def base_qa(model, tokenizer, input, preprocessing_kwargs, task_kwargs, model_kwargs):
    """
    Span-based question answering for a given question and context.
    We expect the input to use the (question, context) format for the text pairs.
    Args:
        request: the prediction request
    """    
    preprocessing_kwargs["truncation"] = "only_second"
    features = tokenizer(
        input, return_tensors="pt", **preprocessing_kwargs
    )
    predictions, features = base_predict(model, input, tokenizer, preprocessing_kwargs, model_kwargs, output_features=True)

    task_outputs = {
        "answers": [],
        "attributions": [],
        "adversarial": {
            "indices": [],
        },  # for hotflip, input_reduction and topk
    }

    for idx, (start, end, (_, context)) in enumerate(
            zip(predictions["start_logits"], predictions["end_logits"], input)
    ):
        # Ensure padded tokens & question tokens cannot
        # belong to the set of candidate answers.
        question_tokens = np.abs(np.array([s != 1 for s in features.sequence_ids(idx)]) - 1)
        # Unmask CLS token for "no answer"
        question_tokens[0] = 1
        undesired_tokens = question_tokens & features["attention_mask"][idx].numpy()

        # Generate mask
        undesired_tokens_mask = undesired_tokens == 0.0

        # Make sure non-context indexes in the tensor cannot
        # contribute to the softmax
        start = np.where(undesired_tokens_mask, -10000.0, start)
        end = np.where(undesired_tokens_mask, -10000.0, end)

        start = np.exp(start - np.log(np.sum(np.exp(start), axis=-1, keepdims=True)))
        end = np.exp(end - np.log(np.sum(np.exp(end), axis=-1, keepdims=True)))

        # Get score for "no answer" then mask for decoding step (CLS token
        no_answer_score = (start[0] * end[0]).item()
        start[0] = end[0] = 0.0

        starts, ends, scores = decode(
            start,
            end,
            task_kwargs.get("topk", 1),
            task_kwargs.get("max_answer_len", 128),
            undesired_tokens,
        )

        enc = features[idx]
        original_ans_start = enc.token_to_word(starts[0])
        original_ans_end = enc.token_to_word(ends[0])
        answers = [
            {
                "score": score.item(),
                "start": enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0],
                "end": enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1],
                "answer": context[
                            enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0]: enc.word_to_chars(
                                enc.token_to_word(e), sequence_index=1
                            )[1]
                            ],
            }
            for s, e, score in zip(starts, ends, scores)
        ]
        if task_kwargs.get("show_null_answers", True):
            answers.append({"score": no_answer_score, "start": 0, "end": 0, "answer": ""})
        answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: task_kwargs.get("topk", 1)]
        task_outputs["answers"].append(answers)

    return predictions, task_outputs, original_ans_start, original_ans_end

In [None]:
# Code from SQuARE ONNX QA Pipeline (note: some features like explainability and attack mode have been removed)
def question_answering(model_qa, tokenizer, input, preprocessing_kwargs, task_kwargs, model_kwargs):
    """
    Span-based question answering for a given question and context.
    We expect the input to use the (question, context) format for the text pairs.
    Args:
        request: the prediction request
    """    
    preprocessing_kwargs["truncation"] = "only_second"

    features = tokenizer(
        input, return_tensors="np", **preprocessing_kwargs
    )
    onnx_inputs = {key: np.array(features[key], dtype=np.int64) for key in features}
    
    predictions_onnx = model_qa.run(input_feed=onnx_inputs, output_names=None)
    predictions = {
        "start_logits": predictions_onnx[0],
        "end_logits": predictions_onnx[1]
    }

    task_outputs = {
        "answers": [],
        "attributions": [],
        "adversarial": {
            "indices": [],
        },  # for hotflip, input_reduction and topk
    }

    for idx, (start, end, (_, context)) in enumerate(
            zip(predictions["start_logits"], predictions["end_logits"], input)
    ):
        # Ensure padded tokens & question tokens cannot
        # belong to the set of candidate answers.
        question_tokens = np.abs(np.array([s != 1 for s in features.sequence_ids(idx)]) - 1)
        # Unmask CLS token for "no answer"
        question_tokens[0] = 1
        undesired_tokens = question_tokens & features["attention_mask"][idx]

        # Generate mask
        undesired_tokens_mask = undesired_tokens == 0.0

        # Make sure non-context indexes in the tensor cannot
        # contribute to the softmax
        start = np.where(undesired_tokens_mask, -10000.0, start)
        end = np.where(undesired_tokens_mask, -10000.0, end)

        start = np.exp(start - np.log(np.sum(np.exp(start), axis=-1, keepdims=True)))
        end = np.exp(end - np.log(np.sum(np.exp(end), axis=-1, keepdims=True)))

        # Get score for "no answer" then mask for decoding step (CLS token
        no_answer_score = (start[0] * end[0]).item()
        start[0] = end[0] = 0.0

        starts, ends, scores = decode(
            start,
            end,
            task_kwargs.get("topk", 1),
            task_kwargs.get("max_answer_len", 128),
            undesired_tokens,
        )

        enc = features[idx]
        original_ans_start = enc.token_to_word(starts[0])
        original_ans_end = enc.token_to_word(ends[0])
        answers = [
            {
                "score": score.item(),
                "start": enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0],
                "end": enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1],
                "answer": context[
                            enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0]: enc.word_to_chars(
                                enc.token_to_word(e), sequence_index=1
                            )[1]
                            ],
            }
            for s, e, score in zip(starts, ends, scores)
        ]
        if task_kwargs.get("show_null_answers", True):
            answers.append({"score": no_answer_score, "start": 0, "end": 0, "answer": ""})
        answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: task_kwargs.get("topk", 1)]
        task_outputs["answers"].append(answers)

    return predictions, task_outputs, original_ans_start, original_ans_end

In [None]:
preprocessing_kwargs = {"padding": True, "truncation": True}

task_kwargs = {"show_null_answers": False, "topk": 1, "max_answer_len": 128}

model_kwargs = {"": {}}

In [None]:
def load_model(model_onnx, model_onnx_quant, as_list=False):
    local_onnx_model = onnxruntime.InferenceSession(model_onnx, providers=["CPUExecutionProvider"])
    local_onnx_model_quant = onnxruntime.InferenceSession(model_onnx_quant, providers=["CPUExecutionProvider"])
    
    so = onnxruntime.SessionOptions()
    so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    
    local_onnx_model_opt = onnxruntime.InferenceSession(model_onnx, so)
    local_onnx_model_quant_opt = onnxruntime.InferenceSession(model_onnx_quant, so)
    
    if as_list:
        return [local_onnx_model, local_onnx_model_opt, local_onnx_model_quant, local_onnx_model_quant_opt]
    return local_onnx_model, local_onnx_model_opt, local_onnx_model_quant, local_onnx_model_quant_opt

def repo_builder(reader, adapter):
    repo_id = f"UKP-SQuARE/{reader}-pf-{adapter}-onnx"
    filename_onnx = "model.onnx"
    filename_onnx_quant = "model_quant.onnx"

    model_onnx = hf_hub_download(repo_id=repo_id, filename=filename_onnx)
    model_onnx_quant = hf_hub_download(repo_id=repo_id, filename=filename_onnx_quant)

    return model_onnx, model_onnx_quant

In [None]:
# def run_torch(model, inputs):
#     with torch.no_grad():
#         model(**inputs)

# def run_onnx(qa_model, onnx_inputs):
#     qa_model.run(output_names=["start_logits", "end_logits"], input_feed=dict(onnx_inputs))   

# def get_time_duration(func, model, inputs): 
#     st= time.time()
#     func(model, inputs)
#     et = time.time()
#     return 1000 * (et - st)

def save_df(df_new, path_to_logger_file = "logger_all.csv"):

    if os.path.exists(path_to_logger_file):
        df_fin = pd.concat([pd.read_csv(path_to_logger_file), df_new])
        df_fin.to_csv(path_to_logger_file,index=False)
    else: 
        df_new.to_csv(path_to_logger_file,index=False)

# def measure_time(perf_type, tokenizer, question, context, model):
#     if perf_type == "base":
#         inputs = tokenizer(question, context, return_tensors="pt", truncation=True)
#         mode = run_torch
#         # time_once = get_time_duration(run_torch, model, inputs)
    
#     elif perf_type == "seq_length":
#         inputs = tokenizer(question, context, return_tensors="np", truncation=True)
#         inputs = {key: np.array(inputs[key], dtype=np.int64) for key in inputs}
#         mode = run_onnx
#         # time_once = get_time_duration(run_onnx, model, inputs) 
    
#     time_once = get_time_duration(mode, model, inputs) 

#     return time_once

# def performance_log(perf_type, name, model, tokenizer, data, data_intervall = 0): 
#     df = pd.DataFrame(columns=["model_name", "time once (ms)", "average_time 50 times (ms)", "seq_length", "context", "question", "data_id"])
    
#     for i in range(0, len(data["context"]), data_intervall):
#         context = data["context"][i]
#         question = data["question"][i]
#         time_duration = measure_time(perf_type, tokenizer, question, context, model)
        
#         seq_length = len(context.split()) # TODO -> reduce stopwords? Real Tokenization?
        
#         df.loc[len(df)] = [name, time_duration, "", seq_length, context, question, data["id"][i]]
        
#         print("Model: {}, Input Length {}: {:.3f} ms".format(name, seq_length, time_duration))
#     save_df(df)

### categorical 

In [307]:
all_skills = pd.read_csv("square_skills/impl_skills.csv")
skill = "categorical"
skills = load_skills(skill)

In [308]:
def categorical_base_inference(model, tokenizer, question, context):
    
    raw_input = [[context, question]]
    inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="pt")
    
    outputs = model(**inputs)
    answer_idx = torch.argmax(outputs.logits)
    
    return bool(answer_idx)

def categorical_onnx_inference(onnx_model, tokenizer, question, context):

    inputs = tokenizer(question, context, padding=True, truncation=True, return_tensors="np")
    inputs = {key: np.array(inputs[key], dtype=np.int64) for key in inputs}

    outputs = onnx_model.run(input_feed=dict(inputs), output_names=None)

    return bool(np.argmax(outputs[0][0]))

In [319]:
def get_time_duration(func, model, tokenizer, question, context): 
    st= time.time()
    func(model, tokenizer, question, context)
    et = time.time()
    return 1000 * (et - st)

def save_df(df_new, path_to_logger_file = "logger_all.csv"):
    if os.path.exists(path_to_logger_file):
        df_fin = pd.concat([pd.read_csv(path_to_logger_file), df_new])
        df_fin.to_csv(path_to_logger_file,index=False)
    else: 
        df_new.to_csv(path_to_logger_file,index=False)

def performance_log(func, name, model, tokenizer, data, data_set_name, data_intervall = 0): 
    df = pd.DataFrame(columns=["model_name", "time once (ms)", "average_time 50 times (ms)", "seq_length", "context", "question", "data_id", "data_set_name"])
    
    for i in range(0, len(data["passage"]), data_intervall):
        context = data["passage"][i]
        question = data["question"][i]
        time_duration = get_time_duration(func, model, tokenizer, question, context)
        
        seq_length = len(context.split()) # TODO -> reduce stopwords
        
        df.loc[len(df)] = [name, time_duration, "", seq_length, context, question, i, data_set_name]
        
        print("Model: {}, Input Length {}: {:.3f} ms".format(name, seq_length, time_duration))
    save_df(df, path_to_logger_file="inference_time.csv")

In [321]:
runs = 250
data_set_name = "boolq"
data = load_dataset(data_set_name, split=f"validation[:{runs}]")

for i in range(5):
    for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
        print("Loading: {} {}".format(reader, adapter))
        
        #load base model
        tokenizer = AutoTokenizer.from_pretrained(reader)
        default_model = AutoModelWithHeads.from_pretrained(reader)
        adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
        default_model.active_adapters = adapter_name

        performance_log(categorical_base_inference, "Base", default_model, tokenizer, data, data_set_name, 1) 

        #load quant model
        quantized_base_model = torch.quantization.quantize_dynamic(default_model, {torch.nn.Linear}, dtype=torch.qint8)
        performance_log(categorical_base_inference, "Base Quantized", quantized_base_model, tokenizer, data, data_set_name, 1) 
        
        #load onnx models
        model_onnx, model_onnx_quant = repo_builder(reader, adapter)
        onnx_models_list = load_model(model_onnx, model_onnx_quant, as_list=True)
        onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "ONNX Quantized", "ONNX-OPT Quantized"]

        # eval onnx models
        for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
            performance_log(categorical_onnx_inference, onnx_model_name, onnx_model, tokenizer, data, data_set_name, 1) 


### MCQ

In [381]:
all_skills = pd.read_csv("square_skills/impl_skills.csv")
skill = "multiple-choice"
skills = load_skills(skill)

In [382]:
def mc_model_inference(model, tokenizer, question, context, choices):
    outputs = []
    raw_input = [[context, question + " " + choice] for choice in choices]
    inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="pt")
        
    with torch.no_grad():
        outputs = model(**inputs)

    answer_idx = torch.argmax(outputs.logits)

    return choices[answer_idx]

def mc_onnx_inference(onnx_model, tokenizer, question, context, choices):

    raw_input = [[context, question + " " + choice] for choice in choices]
    inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="np")

    inputs['input_ids'] =  np.expand_dims(inputs['input_ids'], axis=0)
    inputs['attention_mask'] =  np.expand_dims(inputs['attention_mask'], axis=0)

    if "token_type_ids" in inputs: #roberta does not use this
        inputs['token_type_ids'] = np.expand_dims(inputs['token_type_ids'], axis=0)

    outputs = onnx_model.run(input_feed=dict(inputs), output_names=None)
    answer_idx = np.argmax(outputs[0])
    return choices[answer_idx]

In [388]:
def get_time_duration(func, model, tokenizer, question, context, choices): 
    st= time.time()
    func(model, tokenizer, question, context, choices)
    et = time.time()
    return 1000 * (et - st)

def save_df(df_new, path_to_logger_file = "logger_all.csv"):
    if os.path.exists(path_to_logger_file):
        df_fin = pd.concat([pd.read_csv(path_to_logger_file), df_new])
        df_fin.to_csv(path_to_logger_file,index=False)
    else: 
        df_new.to_csv(path_to_logger_file,index=False)

def performance_log(func, name, model, tokenizer, preped_data_set, data_set_name, data_intervall=1, run_amount=10): 

    for i in range(run_amount):
        df = pd.DataFrame(columns=["model_name", "time once (ms)", "average_time 50 times (ms)", "seq_length", "context", "question", "choices", "data_id", "data_set_name"])
        
        for i in range(0, len(preped_data_set), data_intervall):
            question, context, choices = preped_data_set[i][0], preped_data_set[i][1], preped_data_set[i][2]
            time_duration = get_time_duration(func, model, tokenizer, question, context, choices)
            
            seq_length = len(context.split()) # TODO -> reduce stopwords
            
            df.loc[len(df)] = [name, time_duration, "", seq_length, context, question, choices, i, data_set_name]
            
            print("Model: {}, Input Length {}: {:.3f} ms".format(name, seq_length, time_duration))
        save_df(df, path_to_logger_file="inference_time_mcq_2.csv")

In [389]:
data_amount = 100
for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
    print("Loading: {} {}".format(reader, adapter))
    
    #load adapter specific dataset
    data_set_name = adapter
    if data_set_name in ["commonsense_qa", "social_i_qa", "multirc"]:
        continue
    elif data_set_name == "race":
        data = load_dataset(data_set_name, "all", split=f"validation[:{data_amount}]")
    else: 
        data = load_dataset(data_set_name, split=f"validation[:{data_amount}]")
    
    print(f"Loaded dataset: {data_set_name}")

    # build preped data
    preped_data_set = []
    for example in data:
        if data_set_name == "cosmos_qa":
            choices = [example["answer0"], example["answer1"], example["answer2"], example["answer3"]]
            preped_data_set.append((example["question"], example["context"], choices))
        elif data_set_name == "quail":
            preped_data_set.append((example["question"], example["context"], example["answers"]))
        elif data_set_name == "quartz":
            preped_data_set.append((example["question"], example["para"], example["choices"]["text"]))
        elif data_set_name =="race":
            preped_data_set.append((example["question"], example["article"], example["options"]))
            id_name = "example_id"
            
        else:
            print("Error. Not implemented data_set. Dont know how to build preped_data_set.")
            Exception
    print("Preped data")

    data_runs = 5
    data_intervall = 10
    
    #load and eval base model
    tokenizer = AutoTokenizer.from_pretrained(reader)
    default_model = AutoModelWithHeads.from_pretrained(reader)
    adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
    default_model.active_adapters = adapter_name
    performance_log(mc_model_inference, "Base", default_model, tokenizer, preped_data_set, data_set_name, data_intervall, data_runs) 
    
    #load and eval quant model
    quantized_base_model = torch.quantization.quantize_dynamic(default_model, {torch.nn.Linear}, dtype=torch.qint8)
    performance_log(mc_model_inference, "Base Quantized", quantized_base_model, tokenizer, preped_data_set, data_set_name, data_intervall, data_runs) 
    
    #load onnx models
    model_onnx, model_onnx_quant = repo_builder(reader, adapter)
    onnx_models_list = load_model(model_onnx, model_onnx_quant, as_list=True)
    onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "ONNX Quantized", "ONNX-OPT Quantized"]

    # eval onnx models
    for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
        performance_log(mc_onnx_inference, onnx_model_name, onnx_model, tokenizer, preped_data_set, data_set_name, data_intervall, data_runs) 

Loading: bert-base-uncased cosmos_qa


Found cached dataset cosmos_qa (/Users/michaelhermann/.cache/huggingface/datasets/cosmos_qa/default/0.1.0/3e18538cbfdb2c04189b16642715f0f6da3e97ed5df0aadcec3641245b2cf157)


Loaded dataset: cosmos_qa
Preped data


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3577.23it/s]


Model: Base, Input Length 92: 364.141 ms
Model: Base, Input Length 67: 240.724 ms
Model: Base, Input Length 117: 454.405 ms
Model: Base, Input Length 58: 268.950 ms
Model: Base, Input Length 62: 261.901 ms
Model: Base, Input Length 92: 326.817 ms
Model: Base, Input Length 112: 340.621 ms
Model: Base, Input Length 69: 195.496 ms
Model: Base, Input Length 107: 335.328 ms
Model: Base, Input Length 44: 186.007 ms
Model: Base, Input Length 92: 307.447 ms
Model: Base, Input Length 67: 189.204 ms
Model: Base, Input Length 117: 380.620 ms
Model: Base, Input Length 58: 257.042 ms
Model: Base, Input Length 62: 212.051 ms
Model: Base, Input Length 92: 270.642 ms
Model: Base, Input Length 112: 323.247 ms
Model: Base, Input Length 69: 195.751 ms
Model: Base, Input Length 107: 407.122 ms
Model: Base, Input Length 44: 217.303 ms
Model: Base, Input Length 92: 334.316 ms
Model: Base, Input Length 67: 222.130 ms
Model: Base, Input Length 117: 397.649 ms
Model: Base, Input Length 58: 224.278 ms
Model: Ba

Found cached dataset cosmos_qa (/Users/michaelhermann/.cache/huggingface/datasets/cosmos_qa/default/0.1.0/3e18538cbfdb2c04189b16642715f0f6da3e97ed5df0aadcec3641245b2cf157)


Loaded dataset: cosmos_qa
Preped data


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

Model: Base, Input Length 92: 259.745 ms
Model: Base, Input Length 67: 174.830 ms
Model: Base, Input Length 117: 311.686 ms
Model: Base, Input Length 58: 196.325 ms
Model: Base, Input Length 62: 201.475 ms
Model: Base, Input Length 92: 269.657 ms
Model: Base, Input Length 112: 295.798 ms
Model: Base, Input Length 69: 200.789 ms
Model: Base, Input Length 107: 349.566 ms
Model: Base, Input Length 44: 187.335 ms
Model: Base, Input Length 92: 281.679 ms
Model: Base, Input Length 67: 193.471 ms
Model: Base, Input Length 117: 317.115 ms
Model: Base, Input Length 58: 167.446 ms
Model: Base, Input Length 62: 235.951 ms
Model: Base, Input Length 92: 275.280 ms
Model: Base, Input Length 112: 298.218 ms
Model: Base, Input Length 69: 189.559 ms
Model: Base, Input Length 107: 353.011 ms
Model: Base, Input Length 44: 184.889 ms
Model: Base, Input Length 92: 303.838 ms
Model: Base, Input Length 67: 201.477 ms
Model: Base, Input Length 117: 335.673 ms
Model: Base, Input Length 58: 186.741 ms
Model: Ba

Found cached dataset quail (/Users/michaelhermann/.cache/huggingface/datasets/quail/quail/1.3.0/3cabab19c99e571b528209e14313cfff1debf772db9e24e19b4fcbeb8399336c)


Loaded dataset: quail
Preped data


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3302.17it/s]


Model: Base, Input Length 310: 1163.226 ms
Model: Base, Input Length 310: 1059.952 ms
Model: Base, Input Length 320: 1136.427 ms
Model: Base, Input Length 320: 1162.821 ms
Model: Base, Input Length 325: 1289.864 ms
Model: Base, Input Length 325: 1371.122 ms
Model: Base, Input Length 278: 1188.074 ms
Model: Base, Input Length 278: 1155.267 ms
Model: Base, Input Length 313: 1341.083 ms
Model: Base, Input Length 313: 1275.993 ms
Model: Base, Input Length 310: 1314.396 ms
Model: Base, Input Length 310: 1381.996 ms
Model: Base, Input Length 320: 1361.550 ms
Model: Base, Input Length 320: 1375.574 ms
Model: Base, Input Length 325: 1523.039 ms
Model: Base, Input Length 325: 1471.356 ms
Model: Base, Input Length 278: 1292.772 ms
Model: Base, Input Length 278: 1272.892 ms
Model: Base, Input Length 313: 1324.723 ms
Model: Base, Input Length 313: 1429.339 ms
Model: Base, Input Length 310: 1394.206 ms
Model: Base, Input Length 310: 1396.219 ms
Model: Base, Input Length 320: 1474.157 ms
Model: Base

Found cached dataset quail (/Users/michaelhermann/.cache/huggingface/datasets/quail/quail/1.3.0/3cabab19c99e571b528209e14313cfff1debf772db9e24e19b4fcbeb8399336c)


Loaded dataset: quail
Preped data


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

Model: Base, Input Length 310: 1227.966 ms
Model: Base, Input Length 310: 1319.022 ms
Model: Base, Input Length 320: 1443.268 ms
Model: Base, Input Length 320: 1188.583 ms
Model: Base, Input Length 325: 1245.538 ms
Model: Base, Input Length 325: 1323.734 ms
Model: Base, Input Length 278: 1169.779 ms
Model: Base, Input Length 278: 1432.532 ms
Model: Base, Input Length 313: 1338.215 ms
Model: Base, Input Length 313: 1386.475 ms
Model: Base, Input Length 310: 1785.359 ms
Model: Base, Input Length 310: 1737.205 ms
Model: Base, Input Length 320: 1553.691 ms
Model: Base, Input Length 320: 1644.744 ms
Model: Base, Input Length 325: 1806.277 ms
Model: Base, Input Length 325: 1815.897 ms
Model: Base, Input Length 278: 1469.884 ms
Model: Base, Input Length 278: 1761.851 ms
Model: Base, Input Length 313: 1850.211 ms
Model: Base, Input Length 313: 1590.867 ms
Model: Base, Input Length 310: 1814.666 ms
Model: Base, Input Length 310: 1956.559 ms
Model: Base, Input Length 320: 1670.192 ms
Model: Base

Found cached dataset quartz (/Users/michaelhermann/.cache/huggingface/datasets/quartz/default/0.1.0/6e5195fb88ecd7a75eda5d8f3549c262c8b15267366f38f9c153f40da92724a6)


Loaded dataset: quartz
Preped data


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

Model: Base, Input Length 19: 86.306 ms
Model: Base, Input Length 20: 64.568 ms
Model: Base, Input Length 20: 79.444 ms
Model: Base, Input Length 18: 79.097 ms
Model: Base, Input Length 18: 88.553 ms
Model: Base, Input Length 10: 69.967 ms
Model: Base, Input Length 16: 93.287 ms
Model: Base, Input Length 10: 113.341 ms
Model: Base, Input Length 22: 87.493 ms
Model: Base, Input Length 14: 70.494 ms
Model: Base, Input Length 19: 84.970 ms
Model: Base, Input Length 20: 72.066 ms
Model: Base, Input Length 20: 81.114 ms
Model: Base, Input Length 18: 81.951 ms
Model: Base, Input Length 18: 103.982 ms
Model: Base, Input Length 10: 79.603 ms
Model: Base, Input Length 16: 107.818 ms
Model: Base, Input Length 10: 114.892 ms
Model: Base, Input Length 22: 88.366 ms
Model: Base, Input Length 14: 71.104 ms
Model: Base, Input Length 19: 87.990 ms
Model: Base, Input Length 20: 72.964 ms
Model: Base, Input Length 20: 86.119 ms
Model: Base, Input Length 18: 108.206 ms
Model: Base, Input Length 18: 108.1

Found cached dataset race (/Users/michaelhermann/.cache/huggingface/datasets/race/all/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b)


Loaded dataset: race
Preped data


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 2838.14it/s]


Model: Base, Input Length 339: 1192.867 ms
Model: Base, Input Length 433: 1494.180 ms
Model: Base, Input Length 385: 1505.298 ms
Model: Base, Input Length 317: 1056.481 ms
Model: Base, Input Length 327: 1145.822 ms
Model: Base, Input Length 583: 1600.533 ms
Model: Base, Input Length 254: 921.250 ms
Model: Base, Input Length 347: 1616.987 ms
Model: Base, Input Length 305: 1182.015 ms
Model: Base, Input Length 278: 1176.651 ms
Model: Base, Input Length 339: 1595.228 ms
Model: Base, Input Length 433: 1808.810 ms
Model: Base, Input Length 385: 1794.946 ms
Model: Base, Input Length 317: 1259.386 ms
Model: Base, Input Length 327: 1352.276 ms
Model: Base, Input Length 583: 2050.027 ms
Model: Base, Input Length 254: 1084.414 ms
Model: Base, Input Length 347: 1815.329 ms
Model: Base, Input Length 305: 1223.675 ms
Model: Base, Input Length 278: 1237.223 ms
Model: Base, Input Length 339: 1690.683 ms
Model: Base, Input Length 433: 2092.458 ms
Model: Base, Input Length 385: 2091.062 ms
Model: Base,

Found cached dataset race (/Users/michaelhermann/.cache/huggingface/datasets/race/all/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b)


Loaded dataset: race
Preped data


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

Model: Base, Input Length 339: 1136.817 ms
Model: Base, Input Length 433: 1446.437 ms
Model: Base, Input Length 385: 1377.089 ms
Model: Base, Input Length 317: 1141.790 ms
Model: Base, Input Length 327: 1317.540 ms
Model: Base, Input Length 583: 1690.949 ms
Model: Base, Input Length 254: 987.686 ms
Model: Base, Input Length 347: 1624.416 ms
Model: Base, Input Length 305: 1117.442 ms
Model: Base, Input Length 278: 1201.291 ms
Model: Base, Input Length 339: 1518.966 ms
Model: Base, Input Length 433: 1798.779 ms
Model: Base, Input Length 385: 1682.984 ms
Model: Base, Input Length 317: 1285.956 ms
Model: Base, Input Length 327: 1483.394 ms
Model: Base, Input Length 583: 1961.537 ms
Model: Base, Input Length 254: 1140.689 ms
Model: Base, Input Length 347: 1788.475 ms
Model: Base, Input Length 305: 1460.456 ms
Model: Base, Input Length 278: 1586.548 ms
Model: Base, Input Length 339: 1839.404 ms
Model: Base, Input Length 433: 2057.986 ms
Model: Base, Input Length 385: 1830.118 ms
Model: Base,

In [391]:
df = pd.read_csv("inference_time_mcq_2.csv")
df



Unnamed: 0,model_name,time once (ms),average_time 50 times (ms),seq_length,context,question,choices,data_id,data_set_name
0,Base,364.140987,,92,Do i need to go for a legal divorce ? I wanted...,Why is this person asking about divorce ?,['If he gets married in the church he wo nt ha...,0,cosmos_qa
1,Base,240.724087,,67,I watched the first McCain / Obama debate last...,How would this person be classified ?,"['None of the above choices .', 'Liberal', 'Co...",10,cosmos_qa
2,Base,454.405308,,117,"So , while i was in the library in my old neig...",What did you do after realizing that your thin...,['I set about reporting the theft to the campu...,20,cosmos_qa
3,Base,268.949986,,58,At the beginning of the change there were jet ...,Why did jet airplanes allow us jump from one p...,['Because it was enough to add two or three co...,30,cosmos_qa
4,Base,261.901140,,62,Another thing I do n't appreciate is the shoot...,Why might I have problems with Jon playing a s...,"[""Because a shooting game involves violence bu...",40,cosmos_qa
...,...,...,...,...,...,...,...,...,...
2095,ONNX-OPT Quantized,940.914154,,583,What is one of the most boring and tiresome wo...,What can we learn about responsibility?,"[""It's of secondary importance to discipline.""...",50,race
2096,ONNX-OPT Quantized,500.789881,,254,Children have their own rules in playing games...,The writer believes that _ .,['children should make better rules for their ...,60,race
2097,ONNX-OPT Quantized,797.747374,,347,Before l tell you bow many hours a day people ...,"According to the poll, the time people spend v...","['one to three hours', 'four to six hours', 'o...",70,race
2098,ONNX-OPT Quantized,599.487305,,305,We are fortunate to be living in a time when a...,The writer's attitude toward the digital socie...,"['critical', 'positive', 'neutral', 'negative']",80,race


## Measure filesize

In [423]:
all_skills = pd.read_csv("square_skills/impl_skills.csv")
# skill = "multiple-choice"
# skills = load_skills(skill)

skills = all_skills

In [424]:
def get_size_of_model(model):
    # torch.save(model.state_dict(), "temp.p")
    # size_of_model = os.path.getsize("temp.p")/(1024*1024)
    # print('Size (MB):', os.path.getsize("temp.p")/(1024*1024))
    # os.remove('temp.p')

    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()

    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    size_of_model = (param_size + buffer_size) / 1024**2
    return size_of_model

def save_df(df_new, path_to_logger_file = "logger_all.csv"):
    if os.path.exists(path_to_logger_file):
        df_fin = pd.concat([pd.read_csv(path_to_logger_file), df_new])
        df_fin.to_csv(path_to_logger_file,index=False)
    else: 
        df_new.to_csv(path_to_logger_file,index=False)

In [426]:
df = pd.DataFrame(columns=["reader", "adapter", "base", "base_quant", "onnx", "onnx_quant"])

for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
    print("Loading: {} {}".format(reader, adapter))


    #load base model
    tokenizer = AutoTokenizer.from_pretrained(reader)
    default_model = AutoModelWithHeads.from_pretrained(reader)
    adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
    default_model.active_adapters = adapter_name
    # get base model size
    default_model_size = get_size_of_model(default_model)
    print(default_model_size)
    
    #get quant model size
    quantized_base_model = torch.quantization.quantize_dynamic(default_model, {torch.nn.Linear}, dtype=torch.qint8)
    quantized_base_model_size = get_size_of_model(quantized_base_model)
    print(quantized_base_model_size)

    try:
        #load onnx model
        onnx_model_size = os.path.getsize(f"onnx/{reader}-pf-{adapter}-onnx/model.onnx")/(1024*1024)
        print(onnx_model_size)

    except:
        print("error while exporting onnx")
        onnx_model_size = "error"
    
    try:
        # get onnx quant size 
        onnx_quant_model_size = os.path.getsize(f"onnx/{reader}-pf-{adapter}-onnx/model_quant.onnx")/(1024*1024)
        print(onnx_quant_model_size)

    except:
        print("error while exporting onnx quant")
        onnx_quant_model_size = "error"

        
    df.loc[len(df)] = [reader, adapter, default_model_size, quantized_base_model_size, onnx_model_size, onnx_quant_model_size]
save_df(df, path_to_logger_file="file_size_2.csv")

Loading: bert-base-uncased boolq


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3083.29it/s]


423.32056427001953
388.361328125
Loading: roberta-base boolq


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

481.16434478759766
446.2051086425781
Loading: bert-base-uncased cosmos_qa


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 4683.76it/s]


423.31763076782227
388.361328125
Loading: roberta-base cosmos_qa


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

481.1614112854004
446.2051086425781
Loading: bert-base-uncased drop


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 5154.82it/s]


421.06763458251953
388.361328125
Loading: roberta-base drop


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

478.91141510009766
446.2051086425781
Loading: bert-base-uncased hotpotqa


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 5136.93it/s]


421.06763458251953
388.361328125
Loading: roberta-base hotpotqa


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

478.91141510009766
446.2051086425781
Loading: bert-base-uncased multirc


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 4431.38it/s]


423.32056427001953
388.361328125
Loading: roberta-base multirc


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

481.16434478759766
446.2051086425781
Loading: bert-base-uncased newsqa


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 5272.54it/s]


421.06763458251953
388.361328125
Loading: roberta-base newsqa


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

478.91141510009766
446.2051086425781
Loading: bert-base-uncased quail


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 4918.08it/s]


423.31763076782227
388.361328125
Loading: roberta-base quail


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

481.1614112854004
446.2051086425781
Loading: roberta-base quartz


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

481.1614112854004
446.2051086425781
Loading: bert-base-uncased quoref


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3666.88it/s]


421.06763458251953
388.361328125
Loading: roberta-base quoref


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

478.91141510009766
446.2051086425781
Loading: bert-base-uncased race


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 4655.17it/s]


423.31763076782227
388.361328125
Loading: roberta-base race


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

481.1614112854004
446.2051086425781
Loading: bert-base-uncased squad


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3829.25it/s]


421.06763458251953
388.361328125
Loading: roberta-base squad


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

478.91141510009766
446.2051086425781
Loading: bert-base-uncased squad_v2


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 2830.16it/s]


421.06763458251953
388.361328125
Loading: roberta-base squad_v2


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

478.91141510009766
446.2051086425781


## Measure Accuracy 

### evaluate all extractive qa model on squad

In [None]:
data_set_name = "squad"
data = load_dataset(data_set_name, split="validation[:500]")
metric = evaluate.load(data_set_name)

In [None]:
def squad_evaluate(inference_func, model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs):
    examples = list(zip(data["question"], data["context"]))
    predictions = []
    for example in examples:
        _, task_outputs, _, _ = inference_func(model, tokenizer, [example], preprocessing_kwargs, task_kwargs, model_kwargs)
        predictions.append(task_outputs["answers"][0][0]["answer"])
    
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in zip(data["id"], predictions)]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in data]
    
    score = metric.compute(predictions=formatted_predictions, references=references)

    return score["f1"], score["exact_match"]

In [None]:
result = []
for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
    print(f"Loading: {reader} {adapter}")

    #load base model
    tokenizer = AutoTokenizer.from_pretrained(reader)
    default_model = AutoModelWithHeads.from_pretrained(reader)
    adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
    default_model.active_adapters = adapter_name

    # Test acc for base model
    f1, exact = squad_evaluate(base_qa, default_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)
    result.append(("Base", skill, reader, adapter, f1, exact, data_set_name))

    #load onnx models
    model_onnx, model_onnx_quant = repo_builder(reader, adapter)
    onnx_models_list = load_model(model_onnx, model_onnx_quant, as_list=True)
    onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "Quantized ONNX", "Quantized ONNX - OPT"]

    # Test acc for onnx models
    for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
        f1, exact = squad_evaluate(question_answering, onnx_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)
        result.append((onnx_model_name, skill, reader, adapter, f1, exact, data_set_name))   

In [None]:
df = pd.DataFrame(result, columns=["name", "skill", "reader", "adapter", "f1", "exact", "dataset"])

In [None]:
save_df(df, "accuracy.csv")

### evaluate extractive qa model on specific adapter

In [None]:
def accuracy_scoring(pred_list, true_val_list):
    hit = 0
    for pred, true_val in zip(pred_list, true_val_list):
        if pred in true_val:
            hit += 1
    return hit/len(pred_list)

In [None]:
def adapter_evaluate(adapter, inference_func, model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs):
    if adapter == "drop":
        context_name = "passage"
        id_name = "query_id"
        answers_name = "answers_spans"
    else:
        context_name = "context"
        id_name = "id"
        answers_name = "answers"

    examples = list(zip(data["question"], data[context_name]))
    
    predictions = []
    for example in examples:
        _, task_outputs, _, _ = inference_func(model, tokenizer, [example], preprocessing_kwargs, task_kwargs, model_kwargs)
        predictions.append(task_outputs["answers"][0][0]["answer"])

    return predictions

In [None]:
result = []
runs = 250
for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
    print(f"Loading: {reader} {adapter}")

    #load adapter specific dataset
    data_set_name = adapter
    if data_set_name in ["newsqa", "hotpot_qa"]:
        continue
    else: 
        data = load_dataset(data_set_name, split=f"validation[:{runs}]")
        print(f"Loaded dataset: {data_set_name}")
    
    #load base model
    tokenizer = AutoTokenizer.from_pretrained(reader)
    default_model = AutoModelWithHeads.from_pretrained(reader)
    adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
    default_model.active_adapters = adapter_name
    
    # Get base results
    base_model_result = adapter_evaluate(adapter, base_qa, default_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)

    #load and eval quant model 
    quantized_model = torch.quantization.quantize_dynamic(default_model, {torch.nn.Linear}, dtype=torch.qint8)
    quant_base_model_result = adapter_evaluate(adapter, base_qa, quantized_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)
    scoring = accuracy_scoring(base_model_result, quant_base_model_result)
    result.append(("Quantized Base Model", skill, reader, adapter, scoring, data_set_name, runs)) 
    
    #load onnx models
    model_onnx, model_onnx_quant = repo_builder(reader, adapter)
    onnx_models_list = load_model(model_onnx, model_onnx_quant, as_list=True)
    onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "Quantized ONNX", "Quantized ONNX - OPT"] 
    
    for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
        onnx = adapter_evaluate(adapter, question_answering, onnx_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)

        scoring = accuracy_scoring(base_model_result, onnx)

        result.append((onnx_model_name, skill, reader, adapter, scoring, data_set_name, runs))          

In [None]:
df = pd.DataFrame(result, columns=["base_name", "onnx_name", "skill", "reader", "adapter", "result" "dataset", "runs"])
df

In [None]:
save_df(df, "sim_base_to_onnx.csv")

### evaluate categorical qa

In [None]:
all_skills = pd.read_csv("square_skills/impl_skills.csv")
skill = "categorical"
skills = load_skills(skill)

In [None]:
def categorical_base_inference(model, tokenizer, question, context):
    
    raw_input = [[context, question]]
    inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="pt")
    
    outputs = model(**inputs)
    answer_idx = torch.argmax(outputs.logits)
    
    return bool(answer_idx)

def categorical_onnx_inference(onnx_model, tokenizer, question, context):

    inputs = tokenizer(question, context, padding=True, truncation=True, return_tensors="np")
    inputs = {key: np.array(inputs[key], dtype=np.int64) for key in inputs}

    outputs = onnx_model.run(input_feed=dict(inputs), output_names=None)

    return bool(np.argmax(outputs[0][0]))

In [None]:
def boolq_evaluate(model, tokenizer, data, inference_type):
    result = []
    for test_no in range(len(data)):
        question = data[test_no]["question"]
        context = data[test_no]["passage"]

        answer = inference_type(model, tokenizer, question, context)
        result.append(answer)

    return result

In [None]:
def boolq_accuracy_scoring(base_list, pred_list):
    hit = 0
    for base_pred, pred in zip(base_list, pred_list):
        if base_pred == pred:
            hit += 1
    return hit/len(base_list)

In [None]:
result = []
runs = 250

data_set_name = "boolq"
data = load_dataset(data_set_name, split=f"validation[:{runs}]")

for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
    print(f"Loading: {reader} {adapter}")
    
    #load base model
    tokenizer = AutoTokenizer.from_pretrained(reader)
    default_model = AutoModelWithHeads.from_pretrained(reader)
    adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
    default_model.active_adapters = adapter_name

    # Get base results
    base_model_result = boolq_evaluate(default_model, tokenizer, data, categorical_base_inference)

    #load quant model
    quantized_base_model = torch.quantization.quantize_dynamic(default_model, {torch.nn.Linear}, dtype=torch.qint8)
    
    #eval quant model
    quant_base_model_result = boolq_evaluate(default_model, tokenizer, data, categorical_base_inference)
    scoring = boolq_accuracy_scoring(base_model_result, quant_base_model_result)
    result.append(("Quantized Base Model", skill, reader, adapter, scoring, data_set_name, runs)) 
    
    
    #load onnx models
    model_onnx, model_onnx_quant = repo_builder(reader, adapter)
    onnx_models_list = load_model(model_onnx, model_onnx_quant, as_list=True)
    onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "Quantized ONNX", "Quantized ONNX - OPT"]

    # eval onnx models
    for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
        onnx_result = boolq_evaluate(onnx_model, tokenizer, data, categorical_onnx_inference)
        scoring = boolq_accuracy_scoring(base_model_result, onnx_result)
        result.append((onnx_model_name, skill, reader, adapter, scoring, data_set_name, runs))

In [None]:
df = pd.DataFrame(result, columns=["base_name", "onnx_name", "skill", "reader", "adapter", "result" "dataset", "runs"])
df

In [None]:
save_df(df, "sim_base_to_onnx.csv")

### evaluate mcq qa model on specific adapter

In [358]:
def mc_model_inference(model, tokenizer, preped_data_set):
    result = []
    i = 0
    for example in preped_data_set:
        if i % 10 == 0:
            print(i)
        i += 1
        question, context, choices = example[0], example[1], example[2]
        
        outputs = []
        raw_input = [[context, question + " " + choice] for choice in choices]
        inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="pt")
        
        with torch.no_grad():
            outputs = model(**inputs)

        answer_idx = torch.argmax(outputs.logits)
        result.append(choices[answer_idx])
    return result

def onnx_inference(onnx_model, tokenizer, preped_data_set):
    result = []
    i = 0
    for example in preped_data_set:
        if i % 10 == 0:
            print(i)
        i += 1
        question, context, choices= example[0], example[1], example[2]

        raw_input = [[context, question + " " + choice] for choice in choices]
        inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="np")

        inputs['input_ids'] =  np.expand_dims(inputs['input_ids'], axis=0)
        inputs['attention_mask'] =  np.expand_dims(inputs['attention_mask'], axis=0)

        if "token_type_ids" in inputs: #roberta does not use this
            inputs['token_type_ids'] = np.expand_dims(inputs['token_type_ids'], axis=0)

        outputs = onnx_model.run(input_feed=dict(inputs), output_names=None)

        answer_idx = np.argmax(outputs[0])
        result.append(choices[answer_idx])
    return result

def accuracy_scoring(base_list, pred_list):
    hit = 0
    for base_pred, pred in zip(base_list, pred_list):
        if base_pred == pred:
            hit += 1
    return hit/len(base_list)

In [300]:
all_skills = pd.read_csv("square_skills/impl_skills.csv")
skill = "multiple-choice"
skills = load_skills(skill)

In [301]:
result = []
runs = 250
for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
    print(f"Loading: {reader} {adapter}")

    #load adapter specific dataset
    data_set_name = adapter
    preped_data_set = []
    if data_set_name in ["commonsense_qa", "social_i_qa", "multirc", "quail"]:
        continue
    elif data_set_name == "race":
        data = load_dataset(data_set_name, "all", split=f"validation[:{runs}]")
    else: 
        data = load_dataset(data_set_name, split=f"validation[:{runs}]")
    print(f"Loaded dataset: {data_set_name}")
    
    # build preped data
    for example in data:
        if data_set_name == "cosmos_qa":
            choices = [example["answer0"], example["answer1"], example["answer2"], example["answer3"]]
            preped_data_set.append((example["question"], example["context"], choices))
        elif data_set_name == "quail":
            preped_data_set.append((example["question"], example["context"], example["answers"]))
        elif data_set_name == "quartz":
            preped_data_set.append((example["question"], example["para"], example["choices"]["text"]))
        elif data_set_name =="race":
            preped_data_set.append((example["question"], example["article"], example["options"]))
            
        else:
            print("Error. Not implemented data_set. Dont know how to build preped_data_set.")
            Exception
    print("Preped data")
    
    #  load base model
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    base_model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
    adapter_name = base_model.load_adapter(f"AdapterHub/bert-base-uncased-pf-{adapter}", source="hf")
    base_model.active_adapters = adapter_name
    
    # Get base results
    print("Doing base inference. ")
    base_model_result = mc_model_inference(base_model, tokenizer, preped_data_set)

    #load and eval quant model 
    quantized_model = torch.quantization.quantize_dynamic(base_model, {torch.nn.Linear}, dtype=torch.qint8)
    quant_base_model_result = mc_model_inference(quantized_model, tokenizer, preped_data_set)
    print("Getting scoring for quant base model.")
    scoring = accuracy_scoring(base_model_result, quant_base_model_result)
    result.append(("Quantized Base Model", skill, reader, adapter, scoring, data_set_name, runs)) 
    
    #load onnx models
    print("Doing ONNX loading. ")
    model_onnx, model_onnx_quant = repo_builder(reader, adapter)
    onnx_models_list = load_model(model_onnx, model_onnx_quant, as_list=True)
    onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "Quantized ONNX", "Quantized ONNX - OPT"] 
    
    for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
        print(f"Doing {reader}, {adapter} for {onnx_model_name}")
        onnx_results = onnx_inference(onnx_model, tokenizer, preped_data_set)
        scoring = accuracy_scoring(base_model_result, onnx_results)
        print("Done with scoring")
        result.append((onnx_model_name, skill, reader, adapter, scoring, data_set_name, runs))  

Loading: bert-base-uncased cosmos_qa


Found cached dataset cosmos_qa (/Users/michaelhermann/.cache/huggingface/datasets/cosmos_qa/default/0.1.0/3e18538cbfdb2c04189b16642715f0f6da3e97ed5df0aadcec3641245b2cf157)


Loaded dataset: cosmos_qa
Preped data


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 4104.02it/s]


Doing base inference. 
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Getting scoring for quant base model.
Doing ONNX loading. 
Doing bert-base-uncased, cosmos_qa for ONNX
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Doing bert-base-uncased, cosmos_qa for ONNX-OPT
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Doing bert-base-uncased, cosmos_qa for Quantized ONNX
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Doing bert-base-uncased, cosmos_qa for Quantized ONNX - OPT
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Loading: roberta-base cosmos_qa


Found cached dataset cosmos_qa (/Users/michaelhermann/.cache/huggingface/datasets/cosmos_qa/default/0.1.0/3e18538cbfdb2c04189b16642715f0f6da3e97ed5df0aadcec3641245b2cf157)


Loaded dataset: cosmos_qa
Preped data


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 4637.15it/s]


Doing base inference. 
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Getting scoring for quant base model.
Doing ONNX loading. 
Doing roberta-base, cosmos_qa for ONNX
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Doing roberta-base, cosmos_qa for ONNX-OPT
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Doing roberta-base, cosmos_qa for Quantized ONNX
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Doing roberta-base, cosmos_qa for Quantized ONNX - OPT
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Loading: bert-base-uncased multirc
Loading: roberta-base multirc
Loading: bert-base-uncased quail
Loading: roberta-base quail


Found cached dataset quartz (/Users/michaelhermann/.cache/huggingface/datasets/quartz/default/0.1.0/6e5195fb88ecd7a75eda5d8f3549c262c8b15267366f38f9c153f40da92724a6)


Loaded dataset: quartz
Preped data


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3484.61it/s]


Doing base inference. 
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Getting scoring for quant base model.
Doing ONNX loading. 
Doing roberta-base, quartz for ONNX
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Doing roberta-base, quartz for ONNX-OPT
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Doing roberta-base, quartz for Quantized ONNX
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Doing roberta-base, quartz for Quantized ONNX - OPT
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Loading: bert-base-uncased race


Found cached dataset race (/Users/michaelhermann/.cache/huggingface/datasets/race/all/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b)


Preped data


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 2391.51it/s]


Doing base inference. 
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Getting scoring for quant base model.
Doing ONNX loading. 
Doing bert-base-uncased, race for ONNX
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Doing bert-base-uncased, race for ONNX-OPT
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Doing bert-base-uncased, race for Quantized ONNX
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Doing bert-base-uncased, race for Quantized ONNX - OPT
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Loading: roberta-base race


Found cached dataset race (/Users/michaelhermann/.cache/huggingface/datasets/race/all/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b)


Preped data


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3155.98it/s]


Doing base inference. 
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Getting scoring for quant base model.
Doing ONNX loading. 
Doing roberta-base, race for ONNX
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Doing roberta-base, race for ONNX-OPT
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Doing roberta-base, race for Quantized ONNX
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring
Doing roberta-base, race for Quantized ONNX - OPT
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
Done with scoring


In [302]:
df = pd.DataFrame(result, columns=["base_name", "onnx_name", "skill", "reader", "adapter", "result" "dataset", "runs"])
df

Unnamed: 0,base_name,onnx_name,skill,reader,adapter,resultdataset,runs
0,Quantized Base Model,multiple-choice,bert-base-uncased,cosmos_qa,0.956,cosmos_qa,250
1,ONNX,multiple-choice,bert-base-uncased,cosmos_qa,1.0,cosmos_qa,250
2,ONNX-OPT,multiple-choice,bert-base-uncased,cosmos_qa,1.0,cosmos_qa,250
3,Quantized ONNX,multiple-choice,bert-base-uncased,cosmos_qa,0.76,cosmos_qa,250
4,Quantized ONNX - OPT,multiple-choice,bert-base-uncased,cosmos_qa,0.76,cosmos_qa,250
5,Quantized Base Model,multiple-choice,roberta-base,cosmos_qa,0.956,cosmos_qa,250
6,ONNX,multiple-choice,roberta-base,cosmos_qa,0.208,cosmos_qa,250
7,ONNX-OPT,multiple-choice,roberta-base,cosmos_qa,0.208,cosmos_qa,250
8,Quantized ONNX,multiple-choice,roberta-base,cosmos_qa,0.22,cosmos_qa,250
9,Quantized ONNX - OPT,multiple-choice,roberta-base,cosmos_qa,0.22,cosmos_qa,250


In [303]:
save_df(df, "sim_base_to_onnx.csv")