In [184]:
from datasets import load_dataset, load_metric
import evaluate
from transformers import AutoModelWithHeads, AutoTokenizer
from transformers.models.bert import BertOnnxConfig
from transformers.onnx import OnnxConfig, validate_model_outputs, export

import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxruntime import InferenceSession
import onnxruntime


import time
from typing import Tuple, Union
import torch
import numpy as np
import pandas as pd
import os
from typing import Mapping, OrderedDict
from tqdm import tqdm
from multiprocessing import Process

from huggingface_hub import hf_hub_download

### Functions

#### Basic Functions

In [167]:
# Load needed skills by skilltype (span-extraction, multiple-choice, categorical, abstractive)
def load_skills(skill_type, path="square_skills/impl_skills.csv"):
    all_skills = pd.read_csv(path)
    skills = all_skills[all_skills["Type"] == skill_type]
    return skills

In [168]:
def load_onnx_model(model_onnx, model_onnx_quant, as_list=False):
    onnx_model = onnxruntime.InferenceSession(model_onnx, providers=["CPUExecutionProvider"])
    onnx_model_quant = onnxruntime.InferenceSession(model_onnx_quant, providers=["CPUExecutionProvider"])
    
    so = onnxruntime.SessionOptions()
    so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    
    onnx_model_opt = onnxruntime.InferenceSession(model_onnx, so)
    onnx_model_quant_opt = onnxruntime.InferenceSession(model_onnx_quant, so)
    
    if as_list:
        return [onnx_model, onnx_model_opt, onnx_model_quant, onnx_model_quant_opt]
    return onnx_model, onnx_model_opt, onnx_model_quant, onnx_model_quant_opt

def repo_builder(reader, adapter):
    repo_id = f"UKP-SQuARE/{reader}-pf-{adapter}-onnx"
    filename_onnx = "model.onnx"
    filename_onnx_quant = "model_quant.onnx"

    model_onnx = hf_hub_download(repo_id=repo_id, filename=filename_onnx)
    model_onnx_quant = hf_hub_download(repo_id=repo_id, filename=filename_onnx_quant)

    return model_onnx, model_onnx_quant

In [169]:
def save_df(df_new, path_to_logger_file = "logs/logger_all.csv"):
    if os.path.exists(path_to_logger_file):
        df_fin = pd.concat([pd.read_csv(path_to_logger_file), df_new])
        df_fin.to_csv(path_to_logger_file,index=False)
    else: 
        df_new.to_csv(path_to_logger_file,index=False)

#### Inference Function Extractive 

In [6]:
# Inference extractive_qa models
# base model

def base_predict(
            model, input, tokenizer, preprocessing_kwargs, model_kwargs, batch_size=1, disable_gpu=True, output_features=False
    ) -> Union[dict, Tuple[dict, dict]]:
        """
        Inference on the input.
        Args:
         request: the request with the input and optional kwargs
         output_features: return the features of the input.
            Necessary if, e.g., attention mask is needed for post-processing.
        Returns:
             The model outputs and optionally the input features
        """

        all_predictions = []
        preprocessing_kwargs["padding"] = preprocessing_kwargs.get(
            "padding", True
        )
        preprocessing_kwargs["truncation"] = preprocessing_kwargs.get(
            "truncation", True
        )
        model.to(
            "cuda"
            if torch.cuda.is_available() and not disable_gpu
            else "cpu"
        )

        features = tokenizer(
            input, return_tensors="pt", **preprocessing_kwargs
        )

        for start_idx in range(0, len(input), batch_size):
            with torch.no_grad():
                input_features = {
                    k: features[k][start_idx: start_idx + batch_size]
                    for k in features.keys()
                }
                predictions = model(**input_features, **model_kwargs)
                all_predictions.append(predictions)

        keys = all_predictions[0].keys()
        final_prediction = {}
        for key in keys:
            # HuggingFace outputs for "attentions" and more is
            # returned as tuple of tensors
            # Tuple of tuples only exists for "past_key_values"
            # which is only relevant for generation.
            # Generation should NOT use this function
            if isinstance(all_predictions[0][key], tuple):
                tuple_of_lists = list(
                    zip(
                        *[
                            [
                                torch.stack(p).cpu()
                                if isinstance(p, tuple)
                                else p.cpu()
                                for p in tpl[key]
                            ]
                            for tpl in all_predictions
                        ]
                    )
                )
                final_prediction[key] = tuple(torch.cat(l) for l in tuple_of_lists)
            else:
                final_prediction[key] = torch.cat(
                    [p[key].cpu() for p in all_predictions]
                )
        if output_features:
            return final_prediction, features

        return final_prediction

def base_qa(model, tokenizer, input, preprocessing_kwargs, task_kwargs, model_kwargs):
    """
    Span-based question answering for a given question and context.
    We expect the input to use the (question, context) format for the text pairs.
    Args:
        request: the prediction request
    """    
    preprocessing_kwargs["truncation"] = "only_second"
    features = tokenizer(
        input, return_tensors="pt", **preprocessing_kwargs
    )
    predictions, features = base_predict(model, input, tokenizer, preprocessing_kwargs, model_kwargs, output_features=True)

    task_outputs = {
        "answers": [],
        "attributions": [],
        "adversarial": {
            "indices": [],
        },  # for hotflip, input_reduction and topk
    }

    for idx, (start, end, (_, context)) in enumerate(
            zip(predictions["start_logits"], predictions["end_logits"], input)
    ):
        # Ensure padded tokens & question tokens cannot
        # belong to the set of candidate answers.
        question_tokens = np.abs(np.array([s != 1 for s in features.sequence_ids(idx)]) - 1)
        # Unmask CLS token for "no answer"
        question_tokens[0] = 1
        undesired_tokens = question_tokens & features["attention_mask"][idx].numpy()

        # Generate mask
        undesired_tokens_mask = undesired_tokens == 0.0

        # Make sure non-context indexes in the tensor cannot
        # contribute to the softmax
        start = np.where(undesired_tokens_mask, -10000.0, start)
        end = np.where(undesired_tokens_mask, -10000.0, end)

        start = np.exp(start - np.log(np.sum(np.exp(start), axis=-1, keepdims=True)))
        end = np.exp(end - np.log(np.sum(np.exp(end), axis=-1, keepdims=True)))

        # Get score for "no answer" then mask for decoding step (CLS token
        no_answer_score = (start[0] * end[0]).item()
        start[0] = end[0] = 0.0

        starts, ends, scores = decode(
            start,
            end,
            task_kwargs.get("topk", 1),
            task_kwargs.get("max_answer_len", 128),
            undesired_tokens,
        )

        enc = features[idx]
        original_ans_start = enc.token_to_word(starts[0])
        original_ans_end = enc.token_to_word(ends[0])
        answers = [
            {
                "score": score.item(),
                "start": enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0],
                "end": enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1],
                "answer": context[
                            enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0]: enc.word_to_chars(
                                enc.token_to_word(e), sequence_index=1
                            )[1]
                            ],
            }
            for s, e, score in zip(starts, ends, scores)
        ]
        if task_kwargs.get("show_null_answers", True):
            answers.append({"score": no_answer_score, "start": 0, "end": 0, "answer": ""})
        answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: task_kwargs.get("topk", 1)]
        task_outputs["answers"].append(answers)

    return predictions, task_outputs, original_ans_start, original_ans_end

def decode(
            start_: np.ndarray,
            end_: np.ndarray,
            topk: int,
            max_answer_len: int,
            undesired_tokens_: np.ndarray,
    ) -> Tuple:
    """
    Take the output of any :obj:`ModelForQuestionAnswering` and
        will generate probabilities for each span to be the
        actual answer.
    In addition, it filters out some unwanted/impossible cases
    like answer len being greater than max_answer_len or
    answer end position being before the starting position.
    The method supports output the k-best answer through
    the topk argument.
    Args:
        start_ (:obj:`np.ndarray`): Individual start
            probabilities for each token.
        end (:obj:`np.ndarray`): Individual end_ probabilities
            for each token.
        topk (:obj:`int`): Indicates how many possible answer
            span(s) to extract from the model output.
        max_answer_len (:obj:`int`): Maximum size of the answer
            to extract from the model"s output.
        undesired_tokens_ (:obj:`np.ndarray`): Mask determining
            tokens that can be part of the answer
    """
    # Ensure we have batch axis
    if start_.ndim == 1:
        start_ = start_[None]

    if end_.ndim == 1:
        end_ = end_[None]

    # Compute the score of each tuple(start_, end_) to be the real answer
    outer = np.matmul(np.expand_dims(start_, -1), np.expand_dims(end_, 1))

    # Remove candidate with end_ < start_ and end_ - start_ > max_answer_len
    candidates = np.tril(np.triu(outer), max_answer_len - 1)

    #  Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
    scores_flat = candidates.flatten()
    if topk == 1:
        idx_sort = [np.argmax(scores_flat)]
    elif len(scores_flat) < topk:
        idx_sort = np.argsort(-scores_flat)
    else:
        idx = np.argpartition(-scores_flat, topk)[0:topk]
        idx_sort = idx[np.argsort(-scores_flat[idx])]

    starts_, ends_ = np.unravel_index(idx_sort, candidates.shape)[1:]
    desired_spans = np.isin(starts_, undesired_tokens_.nonzero()) & np.isin(
        ends_, undesired_tokens_.nonzero()
    )
    starts_ = starts_[desired_spans]
    ends_ = ends_[desired_spans]
    scores_ = candidates[0, starts_, ends_]

    return starts_, ends_, scores_

In [None]:
# Code from SQuARE ONNX QA Pipeline (note: some features like explainability and attack mode have been removed)
def question_answering(model_qa, tokenizer, input, preprocessing_kwargs, task_kwargs, model_kwargs):
    """
    Span-based question answering for a given question and context.
    We expect the input to use the (question, context) format for the text pairs.
    Args:
        request: the prediction request
    """    
    preprocessing_kwargs["truncation"] = "only_second"

    features = tokenizer(
        input, return_tensors="np", **preprocessing_kwargs
    )
    onnx_inputs = {key: np.array(features[key], dtype=np.int64) for key in features}
    
    predictions_onnx = model_qa.run(input_feed=onnx_inputs, output_names=None)
    predictions = {
        "start_logits": predictions_onnx[0],
        "end_logits": predictions_onnx[1]
    }

    task_outputs = {
        "answers": [],
        "attributions": [],
        "adversarial": {
            "indices": [],
        },  # for hotflip, input_reduction and topk
    }

    for idx, (start, end, (_, context)) in enumerate(
            zip(predictions["start_logits"], predictions["end_logits"], input)
    ):
        # Ensure padded tokens & question tokens cannot
        # belong to the set of candidate answers.
        question_tokens = np.abs(np.array([s != 1 for s in features.sequence_ids(idx)]) - 1)
        # Unmask CLS token for "no answer"
        question_tokens[0] = 1
        undesired_tokens = question_tokens & features["attention_mask"][idx]

        # Generate mask
        undesired_tokens_mask = undesired_tokens == 0.0

        # Make sure non-context indexes in the tensor cannot
        # contribute to the softmax
        start = np.where(undesired_tokens_mask, -10000.0, start)
        end = np.where(undesired_tokens_mask, -10000.0, end)

        start = np.exp(start - np.log(np.sum(np.exp(start), axis=-1, keepdims=True)))
        end = np.exp(end - np.log(np.sum(np.exp(end), axis=-1, keepdims=True)))

        # Get score for "no answer" then mask for decoding step (CLS token
        no_answer_score = (start[0] * end[0]).item()
        start[0] = end[0] = 0.0

        starts, ends, scores = decode(
            start,
            end,
            task_kwargs.get("topk", 1),
            task_kwargs.get("max_answer_len", 128),
            undesired_tokens,
        )

        enc = features[idx]
        original_ans_start = enc.token_to_word(starts[0])
        original_ans_end = enc.token_to_word(ends[0])
        answers = [
            {
                "score": score.item(),
                "start": enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0],
                "end": enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1],
                "answer": context[
                            enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0]: enc.word_to_chars(
                                enc.token_to_word(e), sequence_index=1
                            )[1]
                            ],
            }
            for s, e, score in zip(starts, ends, scores)
        ]
        if task_kwargs.get("show_null_answers", True):
            answers.append({"score": no_answer_score, "start": 0, "end": 0, "answer": ""})
        answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: task_kwargs.get("topk", 1)]
        task_outputs["answers"].append(answers)

    return predictions, task_outputs, original_ans_start, original_ans_end

#### Inference Function Categorical

In [7]:
def categorical_base_inference(model, tokenizer, question, context):
    
    raw_input = [[context, question]]
    inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="pt")
    
    outputs = model(**inputs)
    answer_idx = torch.argmax(outputs.logits)
    
    return bool(answer_idx), outputs.logits[0]

def categorical_onnx_inference(onnx_model, tokenizer, question, context):

    inputs = tokenizer(question, context, padding=True, truncation=True, return_tensors="np")
    inputs = {key: np.array(inputs[key], dtype=np.int64) for key in inputs}

    outputs = onnx_model.run(input_feed=dict(inputs), output_names=None)

    return bool(np.argmax(outputs[0][0])), outputs[0][0]

#### Inference Function Multiple Choice

In [170]:
def mc_base_inference(model, tokenizer, question, context, choices):
    outputs = []
    raw_input = [[context, question + " " + choice] for choice in choices]
    inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model(**inputs)

    answer_logits = outputs.logits
    answer_idx = torch.argmax(answer_logits)
    answer = choices[answer_idx]

    return answer, answer_logits

def mc_onnx_inference(onnx_model, tokenizer, context, question, choices):

    raw_input = [[context, question + " " + choice] for choice in choices]
    inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="np")

    inputs['input_ids'] =  np.expand_dims(inputs['input_ids'], axis=0)
    inputs['attention_mask'] =  np.expand_dims(inputs['attention_mask'], axis=0)
    if "token_type_ids" in inputs: #roberta does not use this
        inputs['token_type_ids'] = np.expand_dims(inputs['token_type_ids'], axis=0)
    
    outputs = onnx_model.run(input_feed=dict(inputs), output_names=None)

    answer_logits = outputs[0]
    answer_idx = np.argmax(answer_logits)
    answer = choices[answer_idx]
    
    return answer, outputs[0]


## Measure Inference Time 

#### extractive qa

In [42]:
all_skills = pd.read_csv("square_skills/impl_skills.csv")
skill = "span-extraction"
skills = load_skills(skill)

In [46]:
preprocessing_kwargs = {"padding": True, "truncation": True}
task_kwargs = {"show_null_answers": False, "topk": 1, "max_answer_len": 128}
model_kwargs = {"": {}}

In [47]:
def run_torch(model, inputs):
    with torch.no_grad():
        model(**inputs)

def run_onnx(qa_model, onnx_inputs):
    qa_model.run(output_names=["start_logits", "end_logits"], input_feed=dict(onnx_inputs))   

def get_time_duration(func, model, inputs): 
    st= time.time()
    func(model, inputs)
    et = time.time()
    return 1000 * (et - st)

def save_df(df_new, path_to_logger_file = "logger_all.csv"):

    if os.path.exists(path_to_logger_file):
        df_fin = pd.concat([pd.read_csv(path_to_logger_file), df_new])
        df_fin.to_csv(path_to_logger_file,index=False)
    else: 
        df_new.to_csv(path_to_logger_file,index=False)

def measure_time(perf_type, tokenizer, question, context, model):
    if perf_type == "base":
        inputs = tokenizer(question, context, return_tensors="pt", truncation=True)
        mode = run_torch
        # time_once = get_time_duration(run_torch, model, inputs)

    elif perf_type == "seq_length":
        inputs = tokenizer(question, context, return_tensors="np", truncation=True)
        inputs = {key: np.array(inputs[key], dtype=np.int64) for key in inputs}
        mode = run_onnx
        # time_once = get_time_duration(run_onnx, model, inputs) 
    
    time_once = get_time_duration(mode, model, inputs) 

    return time_once

def performance_log(reader, adapter, perf_type, name, model, tokenizer, data, data_intervall = 1): #TODO add truncacte
    df = pd.DataFrame(columns=["reader", "adapter", "model_name", "time once (ms)", "seq_length", "context", "question", "data_id"])
    
    if adapter == "drop":
        context_name = "passage"
        id_name = "query_id"
    else:
        context_name = "context"
        id_name = "id"

    for i in range(0, len(data[context_name]), data_intervall):
        context = data[context_name][i]
        question = data["question"][i]
        time_duration = measure_time(perf_type, tokenizer, question, context, model)
        
        seq_length = len(context.split()) # TODO -> reduce stopwords? Real Tokenization?

        df.loc[len(df)] = [reader, adapter, name, time_duration, seq_length, context, question, data[id_name][i]]
        
        print("Model: {}, Input Length {}: {:.3f} ms".format(name, seq_length, time_duration))
    save_df(df, path_to_logger_file="inference_time_extractive_2.csv")

In [48]:
runs = 100

for i in range(5):
    for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
        print("Loading: {} {}".format(reader, adapter))

        #load adapter specific dataset
        data_set_name = adapter
        if data_set_name in ["newsqa", "hotpotqa"]:
            continue
        else: 
            data = load_dataset(data_set_name, split=f"validation[:{runs}]")
            print(f"Loaded dataset: {data_set_name}")

        tokenizer = AutoTokenizer.from_pretrained(reader)
        default_model = AutoModelWithHeads.from_pretrained(reader)
        adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
        default_model.active_adapters = adapter_name
        # eval base
        performance_log(reader , adapter, "base", "Base", default_model, tokenizer, data)

        #load quant model
        quantized_base_model = torch.quantization.quantize_dynamic(default_model, {torch.nn.Linear}, dtype=torch.qint8)
        performance_log(reader , adapter, "base", "Base Quant", quantized_base_model, tokenizer, data)


        #load onnx models
        model_onnx, model_onnx_quant = repo_builder(reader, adapter)
        onnx_models_list = load_model(model_onnx, model_onnx_quant, as_list=True)
        onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "ONNX Quantized", "ONNX-OPT Quantized"]

        # eval onnx models
        for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
            performance_log(reader , adapter, "seq_length", onnx_model_name, onnx_model, tokenizer, data)

Loading: bert-base-uncased drop


KeyboardInterrupt: 

### categorical 

In [52]:
all_skills = pd.read_csv("square_skills/impl_skills.csv")
skill = "categorical"
skills = load_skills(skill)

In [54]:
def get_time_duration(func, model, tokenizer, question, context): 
    st= time.time()
    func(model, tokenizer, question, context)
    et = time.time()
    return 1000 * (et - st)

def save_df(df_new, path_to_logger_file = "logger_all.csv"):
    if os.path.exists(path_to_logger_file):
        df_fin = pd.concat([pd.read_csv(path_to_logger_file), df_new])
        df_fin.to_csv(path_to_logger_file,index=False)
    else: 
        df_new.to_csv(path_to_logger_file,index=False)

def performance_log(adapter, reader, func, name, model, tokenizer, data, data_set_name, data_intervall = 0): 
    df = pd.DataFrame(columns=["adapter", "reader", "model_name", "time once (ms)", "average_time 50 times (ms)", "seq_length", "context", "question", "data_id", "data_set_name"])
    
    for i in range(0, len(data["passage"]), data_intervall):
        context = data["passage"][i]
        question = data["question"][i]
        time_duration = get_time_duration(func, model, tokenizer, question, context)
        
        seq_length = len(context.split()) # TODO -> reduce stopwords
        
        df.loc[len(df)] = [adapter, reader, name, time_duration, "", seq_length, context, question, i, data_set_name]
        
        print("Model: {}, Input Length {}: {:.3f} ms".format(name, seq_length, time_duration))
    save_df(df, path_to_logger_file="inference_time_categorical.csv")

In [55]:
runs = 250
data_set_name = "boolq"
data = load_dataset(data_set_name, split=f"validation[:{runs}]")

for i in range(5):
    for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
        print("Loading: {} {}".format(reader, adapter))
        
        #load base model
        tokenizer = AutoTokenizer.from_pretrained(reader)
        default_model = AutoModelWithHeads.from_pretrained(reader)
        adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
        default_model.active_adapters = adapter_name

        performance_log(adapter, reader, categorical_base_inference, "Base", default_model, tokenizer, data, data_set_name, 1) 

        #load quant model
        quantized_base_model = torch.quantization.quantize_dynamic(default_model, {torch.nn.Linear}, dtype=torch.qint8)
        performance_log(adapter, reader, categorical_base_inference, "Base Quantized", quantized_base_model, tokenizer, data, data_set_name, 1) 
        
        #load onnx models
        model_onnx, model_onnx_quant = repo_builder(reader, adapter)
        onnx_models_list = load_model(model_onnx, model_onnx_quant, as_list=True)
        onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "ONNX Quantized", "ONNX-OPT Quantized"]

        # eval onnx models
        for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
            performance_log(adapter, reader, categorical_onnx_inference, onnx_model_name, onnx_model, tokenizer, data, data_set_name, 1) 


Found cached dataset boolq (/Users/michaelhermann/.cache/huggingface/datasets/boolq/default/0.1.0/bf0dd57da941c50de94ae3ce3cef7fea48c08f337a4b7aac484e9dddc5aa24e5)


Loading: bert-base-uncased boolq


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3115.35it/s]


Model: Base, Input Length 217: 308.926 ms
Model: Base, Input Length 191: 215.285 ms
Model: Base, Input Length 54: 119.329 ms
Model: Base, Input Length 85: 110.199 ms
Model: Base, Input Length 47: 91.601 ms
Model: Base, Input Length 55: 84.536 ms
Model: Base, Input Length 94: 109.555 ms
Model: Base, Input Length 110: 102.167 ms
Model: Base, Input Length 14: 47.015 ms
Model: Base, Input Length 85: 87.096 ms
Model: Base, Input Length 138: 135.540 ms
Model: Base, Input Length 85: 116.758 ms
Model: Base, Input Length 97: 113.000 ms
Model: Base, Input Length 90: 111.234 ms
Model: Base, Input Length 116: 117.523 ms
Model: Base, Input Length 55: 73.665 ms
Model: Base, Input Length 113: 120.170 ms
Model: Base, Input Length 49: 56.421 ms
Model: Base, Input Length 116: 105.916 ms
Model: Base, Input Length 46: 67.538 ms
Model: Base, Input Length 125: 123.035 ms
Model: Base, Input Length 160: 218.278 ms
Model: Base, Input Length 43: 68.123 ms
Model: Base, Input Length 53: 66.496 ms
Model: Base, Inp

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

Model: Base, Input Length 217: 222.542 ms
Model: Base, Input Length 191: 203.045 ms
Model: Base, Input Length 54: 71.490 ms
Model: Base, Input Length 85: 115.493 ms
Model: Base, Input Length 47: 114.893 ms
Model: Base, Input Length 55: 100.076 ms
Model: Base, Input Length 94: 125.194 ms
Model: Base, Input Length 110: 131.380 ms
Model: Base, Input Length 14: 41.098 ms
Model: Base, Input Length 85: 87.596 ms
Model: Base, Input Length 138: 127.965 ms
Model: Base, Input Length 85: 89.589 ms
Model: Base, Input Length 97: 104.654 ms
Model: Base, Input Length 90: 106.230 ms
Model: Base, Input Length 116: 123.246 ms
Model: Base, Input Length 55: 65.615 ms
Model: Base, Input Length 113: 112.935 ms
Model: Base, Input Length 49: 58.757 ms
Model: Base, Input Length 116: 129.689 ms
Model: Base, Input Length 46: 63.350 ms
Model: Base, Input Length 125: 140.565 ms
Model: Base, Input Length 160: 148.527 ms
Model: Base, Input Length 43: 60.254 ms
Model: Base, Input Length 53: 70.014 ms
Model: Base, Inp

### mcq

In [17]:
all_skills = pd.read_csv("square_skills/impl_skills.csv")
skill = "multiple-choice"
skills = load_skills(skill)

In [18]:
def mc_model_inference(model, tokenizer, question, context, choices):
    outputs = []
    raw_input = [[context, question + " " + choice] for choice in choices]
    inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="pt")
        
    with torch.no_grad():
        outputs = model(**inputs)

    answer_idx = torch.argmax(outputs.logits)

    return choices[answer_idx]

def mc_onnx_inference(onnx_model, tokenizer, question, context, choices):

    raw_input = [[context, question + " " + choice] for choice in choices]
    inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="np")

    inputs['input_ids'] =  np.expand_dims(inputs['input_ids'], axis=0)
    inputs['attention_mask'] =  np.expand_dims(inputs['attention_mask'], axis=0)

    if "token_type_ids" in inputs: #roberta does not use this
        inputs['token_type_ids'] = np.expand_dims(inputs['token_type_ids'], axis=0)

    outputs = onnx_model.run(input_feed=dict(inputs), output_names=None)
    answer_idx = np.argmax(outputs[0])
    return choices[answer_idx]

In [20]:
def get_time_duration(func, model, tokenizer, question, context, choices): 
    st= time.time()
    func(model, tokenizer, question, context, choices)
    et = time.time()
    return 1000 * (et - st)

def save_df(df_new, path_to_logger_file = "logger_all.csv"):
    if os.path.exists(path_to_logger_file):
        df_fin = pd.concat([pd.read_csv(path_to_logger_file), df_new])
        df_fin.to_csv(path_to_logger_file,index=False)
    else: 
        df_new.to_csv(path_to_logger_file,index=False)

def performance_log(adapter, reader, func, name, model, tokenizer, preped_data_set, data_set_name, data_intervall=1, run_amount=10): 

    for i in range(run_amount):
        df = pd.DataFrame(columns=["adapter", "reader", "model_name", "time once (ms)", "average_time 50 times (ms)", "seq_length", "context", "question", "choices", "data_id", "data_set_name"])
        
        for i in range(0, len(preped_data_set), data_intervall):
            question, context, choices = preped_data_set[i][0], preped_data_set[i][1], preped_data_set[i][2]
            time_duration = get_time_duration(func, model, tokenizer, question, context, choices)
            
            seq_length = len(context.split()) # TODO -> reduce stopwords
            
            df.loc[len(df)] = [adapter, reader, name, time_duration, "", seq_length, context, question, choices, i, data_set_name]
            
            print("Model: {}, Input Length {}: {:.3f} ms".format(name, seq_length, time_duration))
        save_df(df, path_to_logger_file="inference_time_mcq_3.csv")

In [21]:
data_amount = 100
for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
    print("Loading: {} {}".format(reader, adapter))
    
    #load adapter specific dataset
    data_set_name = adapter
    if data_set_name in ["commonsense_qa", "social_i_qa", "multirc"]:
        continue
    elif data_set_name == "race":
        data = load_dataset(data_set_name, "all", split=f"validation[:{data_amount}]")
    else: 
        data = load_dataset(data_set_name, split=f"validation[:{data_amount}]")
    
    print(f"Loaded dataset: {data_set_name}")

    # build preped data
    preped_data_set = []
    for example in data:
        if data_set_name == "cosmos_qa":
            choices = [example["answer0"], example["answer1"], example["answer2"], example["answer3"]]
            preped_data_set.append((example["question"], example["context"], choices))
        elif data_set_name == "quail":
            preped_data_set.append((example["question"], example["context"], example["answers"]))
        elif data_set_name == "quartz":
            preped_data_set.append((example["question"], example["para"], example["choices"]["text"]))
        elif data_set_name =="race":
            preped_data_set.append((example["question"], example["article"], example["options"]))
            id_name = "example_id"
            
        else:
            print("Error. Not implemented data_set. Dont know how to build preped_data_set.")
            Exception
    print("Preped data")

    data_runs = 5
    data_intervall = 10
    
    #load and eval base model
    tokenizer = AutoTokenizer.from_pretrained(reader)
    default_model = AutoModelWithHeads.from_pretrained(reader)
    adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
    default_model.active_adapters = adapter_name
    performance_log(adapter, reader, mc_model_inference, "Base", default_model, tokenizer, preped_data_set, data_set_name, data_intervall, data_runs) 
    
    #load and eval quant model
    quantized_base_model = torch.quantization.quantize_dynamic(default_model, {torch.nn.Linear}, dtype=torch.qint8)
    performance_log(adapter, reader, mc_model_inference, "Base Quantized", quantized_base_model, tokenizer, preped_data_set, data_set_name, data_intervall, data_runs) 
    
    #load onnx models
    model_onnx, model_onnx_quant = repo_builder(reader, adapter)
    onnx_models_list = load_model(model_onnx, model_onnx_quant, as_list=True)
    onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "ONNX Quantized", "ONNX-OPT Quantized"]

    # eval onnx models
    for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
        performance_log(adapter, reader, mc_onnx_inference, onnx_model_name, onnx_model, tokenizer, preped_data_set, data_set_name, data_intervall, data_runs) 

Loading: bert-base-uncased cosmos_qa


Found cached dataset cosmos_qa (/Users/michaelhermann/.cache/huggingface/datasets/cosmos_qa/default/0.1.0/3e18538cbfdb2c04189b16642715f0f6da3e97ed5df0aadcec3641245b2cf157)


Loaded dataset: cosmos_qa
Preped data


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3588.97it/s]


Model: Base, Input Length 92: 426.945 ms
Model: Base, Input Length 67: 250.775 ms
Model: Base, Input Length 117: 504.971 ms
Model: Base, Input Length 58: 332.886 ms
Model: Base, Input Length 62: 292.247 ms
Model: Base, Input Length 92: 342.463 ms
Model: Base, Input Length 112: 514.830 ms
Model: Base, Input Length 69: 278.529 ms
Model: Base, Input Length 107: 589.166 ms
Model: Base, Input Length 44: 255.814 ms
Model: Base, Input Length 92: 390.713 ms
Model: Base, Input Length 67: 277.638 ms
Model: Base, Input Length 117: 577.551 ms
Model: Base, Input Length 58: 307.321 ms
Model: Base, Input Length 62: 307.414 ms
Model: Base, Input Length 92: 365.235 ms
Model: Base, Input Length 112: 531.736 ms
Model: Base, Input Length 69: 260.557 ms
Model: Base, Input Length 107: 432.058 ms
Model: Base, Input Length 44: 276.357 ms
Model: Base, Input Length 92: 325.536 ms
Model: Base, Input Length 67: 276.707 ms
Model: Base, Input Length 117: 546.486 ms
Model: Base, Input Length 58: 288.886 ms
Model: Ba

Found cached dataset cosmos_qa (/Users/michaelhermann/.cache/huggingface/datasets/cosmos_qa/default/0.1.0/3e18538cbfdb2c04189b16642715f0f6da3e97ed5df0aadcec3641245b2cf157)


Loaded dataset: cosmos_qa
Preped data


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

Model: Base, Input Length 92: 280.029 ms
Model: Base, Input Length 67: 230.252 ms
Model: Base, Input Length 117: 434.275 ms
Model: Base, Input Length 58: 235.553 ms
Model: Base, Input Length 62: 253.420 ms
Model: Base, Input Length 92: 276.344 ms
Model: Base, Input Length 112: 311.104 ms
Model: Base, Input Length 69: 194.644 ms
Model: Base, Input Length 107: 322.256 ms
Model: Base, Input Length 44: 194.387 ms
Model: Base, Input Length 92: 301.184 ms
Model: Base, Input Length 67: 245.218 ms
Model: Base, Input Length 117: 406.497 ms
Model: Base, Input Length 58: 276.011 ms
Model: Base, Input Length 62: 222.802 ms
Model: Base, Input Length 92: 298.752 ms
Model: Base, Input Length 112: 430.356 ms
Model: Base, Input Length 69: 308.470 ms
Model: Base, Input Length 107: 471.311 ms
Model: Base, Input Length 44: 282.988 ms
Model: Base, Input Length 92: 414.957 ms
Model: Base, Input Length 67: 335.109 ms
Model: Base, Input Length 117: 630.408 ms
Model: Base, Input Length 58: 345.023 ms
Model: Ba

Found cached dataset quail (/Users/michaelhermann/.cache/huggingface/datasets/quail/quail/1.3.0/3cabab19c99e571b528209e14313cfff1debf772db9e24e19b4fcbeb8399336c)


Loaded dataset: quail
Preped data


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3010.63it/s]


Model: Base, Input Length 310: 1303.256 ms
Model: Base, Input Length 310: 1402.113 ms
Model: Base, Input Length 320: 1502.304 ms
Model: Base, Input Length 320: 1674.653 ms
Model: Base, Input Length 325: 1854.459 ms
Model: Base, Input Length 325: 1935.818 ms
Model: Base, Input Length 278: 1680.534 ms
Model: Base, Input Length 278: 1635.078 ms
Model: Base, Input Length 313: 1725.402 ms
Model: Base, Input Length 313: 1642.391 ms
Model: Base, Input Length 310: 1504.776 ms
Model: Base, Input Length 310: 1486.798 ms
Model: Base, Input Length 320: 1594.606 ms
Model: Base, Input Length 320: 1690.476 ms
Model: Base, Input Length 325: 1791.503 ms
Model: Base, Input Length 325: 1867.741 ms
Model: Base, Input Length 278: 1486.974 ms
Model: Base, Input Length 278: 1639.316 ms
Model: Base, Input Length 313: 1830.416 ms
Model: Base, Input Length 313: 1582.012 ms
Model: Base, Input Length 310: 1745.011 ms
Model: Base, Input Length 310: 1724.833 ms
Model: Base, Input Length 320: 1847.859 ms
Model: Base

Found cached dataset quail (/Users/michaelhermann/.cache/huggingface/datasets/quail/quail/1.3.0/3cabab19c99e571b528209e14313cfff1debf772db9e24e19b4fcbeb8399336c)


Loaded dataset: quail
Preped data


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

Model: Base, Input Length 310: 1404.750 ms
Model: Base, Input Length 310: 1334.878 ms
Model: Base, Input Length 320: 1241.838 ms
Model: Base, Input Length 320: 1235.162 ms
Model: Base, Input Length 325: 1366.376 ms
Model: Base, Input Length 325: 1482.925 ms
Model: Base, Input Length 278: 1269.972 ms
Model: Base, Input Length 278: 1279.464 ms
Model: Base, Input Length 313: 1277.459 ms
Model: Base, Input Length 313: 1303.000 ms
Model: Base, Input Length 310: 1381.914 ms
Model: Base, Input Length 310: 1483.488 ms
Model: Base, Input Length 320: 1535.306 ms
Model: Base, Input Length 320: 1498.500 ms
Model: Base, Input Length 325: 1583.278 ms
Model: Base, Input Length 325: 1775.756 ms
Model: Base, Input Length 278: 1561.959 ms
Model: Base, Input Length 278: 1557.847 ms
Model: Base, Input Length 313: 1587.280 ms
Model: Base, Input Length 313: 1710.099 ms
Model: Base, Input Length 310: 1688.143 ms
Model: Base, Input Length 310: 1847.453 ms
Model: Base, Input Length 320: 4352.211 ms
Model: Base

Found cached dataset quartz (/Users/michaelhermann/.cache/huggingface/datasets/quartz/default/0.1.0/6e5195fb88ecd7a75eda5d8f3549c262c8b15267366f38f9c153f40da92724a6)


Loaded dataset: quartz
Preped data


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

Model: Base, Input Length 19: 61.021 ms
Model: Base, Input Length 20: 49.230 ms
Model: Base, Input Length 20: 82.056 ms
Model: Base, Input Length 18: 119.725 ms
Model: Base, Input Length 18: 63.673 ms
Model: Base, Input Length 10: 49.744 ms
Model: Base, Input Length 16: 63.643 ms
Model: Base, Input Length 10: 80.672 ms
Model: Base, Input Length 22: 60.755 ms
Model: Base, Input Length 14: 54.604 ms
Model: Base, Input Length 19: 63.584 ms
Model: Base, Input Length 20: 47.030 ms
Model: Base, Input Length 20: 56.905 ms
Model: Base, Input Length 18: 65.986 ms
Model: Base, Input Length 18: 94.128 ms
Model: Base, Input Length 10: 71.926 ms
Model: Base, Input Length 16: 87.300 ms
Model: Base, Input Length 10: 111.518 ms
Model: Base, Input Length 22: 84.403 ms
Model: Base, Input Length 14: 45.700 ms
Model: Base, Input Length 19: 64.764 ms
Model: Base, Input Length 20: 49.461 ms
Model: Base, Input Length 20: 58.390 ms
Model: Base, Input Length 18: 55.564 ms
Model: Base, Input Length 18: 63.956 m

Found cached dataset race (/Users/michaelhermann/.cache/huggingface/datasets/race/all/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b)


Loaded dataset: race
Preped data


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 2443.05it/s]


Model: Base, Input Length 339: 1102.949 ms
Model: Base, Input Length 433: 1404.662 ms
Model: Base, Input Length 385: 1284.259 ms
Model: Base, Input Length 317: 934.925 ms
Model: Base, Input Length 327: 997.387 ms
Model: Base, Input Length 583: 1400.525 ms
Model: Base, Input Length 254: 708.003 ms
Model: Base, Input Length 347: 1271.092 ms
Model: Base, Input Length 305: 889.786 ms
Model: Base, Input Length 278: 886.947 ms
Model: Base, Input Length 339: 1176.637 ms
Model: Base, Input Length 433: 1405.618 ms
Model: Base, Input Length 385: 1348.178 ms
Model: Base, Input Length 317: 940.470 ms
Model: Base, Input Length 327: 999.911 ms
Model: Base, Input Length 583: 1464.257 ms
Model: Base, Input Length 254: 780.219 ms
Model: Base, Input Length 347: 1329.559 ms
Model: Base, Input Length 305: 907.678 ms
Model: Base, Input Length 278: 945.687 ms
Model: Base, Input Length 339: 1227.425 ms
Model: Base, Input Length 433: 1513.969 ms
Model: Base, Input Length 385: 1319.548 ms
Model: Base, Input Le

Found cached dataset race (/Users/michaelhermann/.cache/huggingface/datasets/race/all/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b)


Loaded dataset: race
Preped data


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

Model: Base, Input Length 339: 1231.804 ms
Model: Base, Input Length 433: 1379.646 ms
Model: Base, Input Length 385: 1271.117 ms
Model: Base, Input Length 317: 977.235 ms
Model: Base, Input Length 327: 1041.774 ms
Model: Base, Input Length 583: 1421.205 ms
Model: Base, Input Length 254: 708.533 ms
Model: Base, Input Length 347: 1233.355 ms
Model: Base, Input Length 305: 904.011 ms
Model: Base, Input Length 278: 962.422 ms
Model: Base, Input Length 339: 1198.474 ms
Model: Base, Input Length 433: 1355.160 ms
Model: Base, Input Length 385: 1281.995 ms
Model: Base, Input Length 317: 996.682 ms
Model: Base, Input Length 327: 1168.753 ms
Model: Base, Input Length 583: 1438.865 ms
Model: Base, Input Length 254: 775.726 ms
Model: Base, Input Length 347: 1338.522 ms
Model: Base, Input Length 305: 1007.953 ms
Model: Base, Input Length 278: 1038.411 ms
Model: Base, Input Length 339: 1456.903 ms
Model: Base, Input Length 433: 1587.107 ms
Model: Base, Input Length 385: 1500.965 ms
Model: Base, Inpu

In [391]:
df = pd.read_csv("inference_time_mcq_2.csv")
df



Unnamed: 0,model_name,time once (ms),average_time 50 times (ms),seq_length,context,question,choices,data_id,data_set_name
0,Base,364.140987,,92,Do i need to go for a legal divorce ? I wanted...,Why is this person asking about divorce ?,['If he gets married in the church he wo nt ha...,0,cosmos_qa
1,Base,240.724087,,67,I watched the first McCain / Obama debate last...,How would this person be classified ?,"['None of the above choices .', 'Liberal', 'Co...",10,cosmos_qa
2,Base,454.405308,,117,"So , while i was in the library in my old neig...",What did you do after realizing that your thin...,['I set about reporting the theft to the campu...,20,cosmos_qa
3,Base,268.949986,,58,At the beginning of the change there were jet ...,Why did jet airplanes allow us jump from one p...,['Because it was enough to add two or three co...,30,cosmos_qa
4,Base,261.901140,,62,Another thing I do n't appreciate is the shoot...,Why might I have problems with Jon playing a s...,"[""Because a shooting game involves violence bu...",40,cosmos_qa
...,...,...,...,...,...,...,...,...,...
2095,ONNX-OPT Quantized,940.914154,,583,What is one of the most boring and tiresome wo...,What can we learn about responsibility?,"[""It's of secondary importance to discipline.""...",50,race
2096,ONNX-OPT Quantized,500.789881,,254,Children have their own rules in playing games...,The writer believes that _ .,['children should make better rules for their ...,60,race
2097,ONNX-OPT Quantized,797.747374,,347,Before l tell you bow many hours a day people ...,"According to the poll, the time people spend v...","['one to three hours', 'four to six hours', 'o...",70,race
2098,ONNX-OPT Quantized,599.487305,,305,We are fortunate to be living in a time when a...,The writer's attitude toward the digital socie...,"['critical', 'positive', 'neutral', 'negative']",80,race


## Measure filesize

In [423]:
all_skills = pd.read_csv("square_skills/impl_skills.csv")
# skill = "multiple-choice"
# skills = load_skills(skill)

skills = all_skills

In [424]:
def get_size_of_model(model):
    # torch.save(model.state_dict(), "temp.p")
    # size_of_model = os.path.getsize("temp.p")/(1024*1024)
    # print('Size (MB):', os.path.getsize("temp.p")/(1024*1024))
    # os.remove('temp.p')

    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()

    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    size_of_model = (param_size + buffer_size) / 1024**2
    return size_of_model

def save_df(df_new, path_to_logger_file = "logger_all.csv"):
    if os.path.exists(path_to_logger_file):
        df_fin = pd.concat([pd.read_csv(path_to_logger_file), df_new])
        df_fin.to_csv(path_to_logger_file,index=False)
    else: 
        df_new.to_csv(path_to_logger_file,index=False)

In [426]:
df = pd.DataFrame(columns=["reader", "adapter", "base", "base_quant", "onnx", "onnx_quant"])

for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
    print("Loading: {} {}".format(reader, adapter))


    #load base model
    tokenizer = AutoTokenizer.from_pretrained(reader)
    default_model = AutoModelWithHeads.from_pretrained(reader)
    adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
    default_model.active_adapters = adapter_name
    # get base model size
    default_model_size = get_size_of_model(default_model)
    print(default_model_size)
    
    #get quant model size
    quantized_base_model = torch.quantization.quantize_dynamic(default_model, {torch.nn.Linear}, dtype=torch.qint8)
    quantized_base_model_size = get_size_of_model(quantized_base_model)
    print(quantized_base_model_size)

    try:
        #load onnx model
        onnx_model_size = os.path.getsize(f"onnx/{reader}-pf-{adapter}-onnx/model.onnx")/(1024*1024)
        print(onnx_model_size)

    except:
        print("error while exporting onnx")
        onnx_model_size = "error"
    
    try:
        # get onnx quant size 
        onnx_quant_model_size = os.path.getsize(f"onnx/{reader}-pf-{adapter}-onnx/model_quant.onnx")/(1024*1024)
        print(onnx_quant_model_size)

    except:
        print("error while exporting onnx quant")
        onnx_quant_model_size = "error"

        
    df.loc[len(df)] = [reader, adapter, default_model_size, quantized_base_model_size, onnx_model_size, onnx_quant_model_size]
save_df(df, path_to_logger_file="file_size_2.csv")

Loading: bert-base-uncased boolq


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3083.29it/s]


423.32056427001953
388.361328125
Loading: roberta-base boolq


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

481.16434478759766
446.2051086425781
Loading: bert-base-uncased cosmos_qa


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 4683.76it/s]


423.31763076782227
388.361328125
Loading: roberta-base cosmos_qa


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

481.1614112854004
446.2051086425781
Loading: bert-base-uncased drop


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 5154.82it/s]


421.06763458251953
388.361328125
Loading: roberta-base drop


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

478.91141510009766
446.2051086425781
Loading: bert-base-uncased hotpotqa


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 5136.93it/s]


421.06763458251953
388.361328125
Loading: roberta-base hotpotqa


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

478.91141510009766
446.2051086425781
Loading: bert-base-uncased multirc


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 4431.38it/s]


423.32056427001953
388.361328125
Loading: roberta-base multirc


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

481.16434478759766
446.2051086425781
Loading: bert-base-uncased newsqa


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 5272.54it/s]


421.06763458251953
388.361328125
Loading: roberta-base newsqa


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

478.91141510009766
446.2051086425781
Loading: bert-base-uncased quail


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 4918.08it/s]


423.31763076782227
388.361328125
Loading: roberta-base quail


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

481.1614112854004
446.2051086425781
Loading: roberta-base quartz


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

481.1614112854004
446.2051086425781
Loading: bert-base-uncased quoref


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3666.88it/s]


421.06763458251953
388.361328125
Loading: roberta-base quoref


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

478.91141510009766
446.2051086425781
Loading: bert-base-uncased race


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 4655.17it/s]


423.31763076782227
388.361328125
Loading: roberta-base race


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

481.1614112854004
446.2051086425781
Loading: bert-base-uncased squad


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3829.25it/s]


421.06763458251953
388.361328125
Loading: roberta-base squad


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

478.91141510009766
446.2051086425781
Loading: bert-base-uncased squad_v2


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 2830.16it/s]


421.06763458251953
388.361328125
Loading: roberta-base squad_v2


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

478.91141510009766
446.2051086425781


## Measure Accuracy 
compare base model prediction to exported model

### evaluate all extractive qa model on squad

In [None]:
data_set_name = "squad"
data = load_dataset(data_set_name, split="validation[:500]")
metric = evaluate.load(data_set_name)

In [None]:
def squad_evaluate(inference_func, model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs):
    examples = list(zip(data["question"], data["context"]))
    predictions = []
    for example in examples:
        _, task_outputs, _, _ = inference_func(model, tokenizer, [example], preprocessing_kwargs, task_kwargs, model_kwargs)
        predictions.append(task_outputs["answers"][0][0]["answer"])
    
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in zip(data["id"], predictions)]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in data]
    
    score = metric.compute(predictions=formatted_predictions, references=references)

    return score["f1"], score["exact_match"]

In [None]:
result = []
for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
    print(f"Loading: {reader} {adapter}")

    #load base model
    tokenizer = AutoTokenizer.from_pretrained(reader)
    default_model = AutoModelWithHeads.from_pretrained(reader)
    adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
    default_model.active_adapters = adapter_name

    # Test acc for base model
    f1, exact = squad_evaluate(base_qa, default_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)
    result.append(("Base", skill, reader, adapter, f1, exact, data_set_name))

    #load onnx models
    model_onnx, model_onnx_quant = repo_builder(reader, adapter)
    onnx_models_list = load_model(model_onnx, model_onnx_quant, as_list=True)
    onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "Quantized ONNX", "Quantized ONNX - OPT"]

    # Test acc for onnx models
    for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
        f1, exact = squad_evaluate(question_answering, onnx_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)
        result.append((onnx_model_name, skill, reader, adapter, f1, exact, data_set_name))   

In [None]:
df = pd.DataFrame(result, columns=["name", "skill", "reader", "adapter", "f1", "exact", "dataset"])

In [None]:
save_df(df, "accuracy.csv")

### evaluate extractive qa model on specific adapter

In [180]:
skill =  "span-extraction"
skills_df = load_skills(skill)
skills_df

Unnamed: 0,Name,Retrieval Model,Datastore,Reader Model,Reader Adapter,Type,Code
4,DROP BERT Adapter,,,bert-base-uncased,drop,span-extraction,code
5,DROP RoBERTa Adapter,,,roberta-base,drop,span-extraction,code
6,HotpotQA BERT Adapter,,,bert-base-uncased,hotpotqa,span-extraction,code
7,HotpotQA RoBERTa Adapter,,,roberta-base,hotpotqa,span-extraction,code
8,NewsQA BERT Adapter,,,bert-base-uncased,newsqa,span-extraction,code
9,NewsQA RoBERTa Adapter,,,roberta-base,newsqa,span-extraction,code
14,Quoref BERT Adapter,,,bert-base-uncased,quoref,span-extraction,code
15,Quoref RoBERTa Adapter,,,roberta-base,quoref,span-extraction,code
18,SQuAD 1.1 BERT Adapter,,,bert-base-uncased,squad,span-extraction,code
19,SQuAD 1.1 RoBERTa Adapter,,,roberta-base,squad,span-extraction,code


In [181]:
df = pd.DataFrame(columns=[
        "skill", "reader", "adapter", 
        "timestamp", 
        "answer_base", 
        "answer_quantized_model", 
        "answer_onnx_model", 
        "answer_onnx_opt_model", 
        "answer_quant_onnx_model", 
        "answer_quant_onnx_opt_model", 
        "data_id", "dataset", "question", "context", "answer_dataset"
    ])

In [182]:
def extractive_get_results(adapter, inference_func, model, question, context, tokenizer):
    _, task_outputs, _, _ = inference_func(model, tokenizer, question, context)
    prediction = task_outputs["answers"][0][0]["answer"]
    return prediction

In [None]:
result = []
runs = 10
for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
    print(f"Loading: {reader} {adapter}")

    #load adapter specific dataset
    data_set_name = adapter
    if data_set_name in ["newsqa", "hotpot_qa"]:
        continue
    else: 
        data = load_dataset(data_set_name, split=f"validation[:{runs}]")
        print(f"Loaded dataset: {data_set_name}")
    
    if adapter == "drop":
        context_name = "passage"
        id_name = "query_id"
        answers_name = "answers_spans"
    else:
        context_name = "context"
        id_name = "id"
        answers_name = "answers"
    
    #load base model
    tokenizer = AutoTokenizer.from_pretrained(reader)
    default_model = AutoModelWithHeads.from_pretrained(reader)
    adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
    default_model.active_adapters = adapter_name
    
    # Get base results
    base_model_result = extractive_get_results(adapter, base_qa, default_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)

    #load and eval quant model 
    quantized_model = torch.quantization.quantize_dynamic(default_model, {torch.nn.Linear}, dtype=torch.qint8)
    quant_base_model_result = extractive_get_results(adapter, base_qa, quantized_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)

    result.append(("Quantized Base Model", skill, reader, adapter, scoring, data_set_name, runs)) 
    
    #load onnx models
    model_onnx, model_onnx_quant = repo_builder(reader, adapter)
    onnx_models_list = load_onnx_model(model_onnx, model_onnx_quant, as_list=True)
    onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "Quantized ONNX", "Quantized ONNX - OPT"] 
    
    for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
        onnx = extractive_get_results(adapter, question_answering, onnx_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)

        scoring = accuracy_scoring(base_model_result, onnx)

        result.append((onnx_model_name, skill, reader, adapter, scoring, data_set_name, runs))          

In [None]:
def run_inf(
        preped_data_set, modelname, run_func, input_model, tokenizer,
    ):    

    
    df = pd.DataFrame(columns=[
            "skill", "reader", "adapter", 
            "timestamp", 
            "answer_base", 
            "answer_quantized_model", 
            "answer_onnx_model", 
            "answer_onnx_opt_model", 
            "answer_quant_onnx_model", 
            "answer_quant_onnx_opt_model", 
            "data_id", "dataset", "question", "context", "answer_dataset"
        ])

    for data_id in tqdm(range(len(preped_data_set))):
        
        example_id = preped_data_set[data_id][0]
        question = preped_data_set[data_id][1]
        context = preped_data_set[data_id][2]
        choices = preped_data_set[data_id][3]
        answer_dataset = preped_data_set[data_id][4]
        
        answer, answer_logits = run_func(input_model, tokenizer, question, context, choices)   
        data_set_name = adapter

        df.loc[len(df)] = [
            skill, reader, adapter, modelname,
            pd.Timestamp.now(),
            answer, answer_logits,
            example_id, data_set_name, question, context[:90], choices, answer_dataset
        ]
    
    save_df(df, f"temp/{adapter}_{reader}_{modelname}.csv")

In [None]:
extractive_get_results(adapter, base_qa, quantized_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)

In [None]:
example_amount = 1

skipping_adapters = ["newsqa", "hotpot_qa"] 
# TODO quail for roberta
for adapter in skills_df["Reader Adapter"].unique():

    if adapter in skipping_adapters:
        print(f"Skipping {adapter}")
        continue
    
    adapter_df = skills_df[skills_df["Reader Adapter"] == adapter]

    #load adapter specific dataset
    data_set_name = adapter
    if example_amount == 0:
        data = load_dataset(data_set_name, split=f"validation")
    else: 
        data = load_dataset(data_set_name, split=f"validation[:{runs}]")

    if adapter == "drop":
        context_name = "passage"
        id_name = "query_id"
        answers_name = "answers_spans"
    else:
        context_name = "context"
        id_name = "id"
        answers_name = "answers"

    print(f"Loaded and preped dataset: {data_set_name} with {len(data)} example questions")

    # load models
    for reader in adapter_df["Reader Model"].unique():
        print(f"Loading: {reader} {adapter}")
        
        #  load base model
        tokenizer = AutoTokenizer.from_pretrained(reader)
        base_model = AutoModelWithHeads.from_pretrained(reader)
        adapter_name = base_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
        base_model.active_adapters = adapter_name
        
        #load and eval quant model 
        quantized_base_model = torch.quantization.quantize_dynamic(base_model, {torch.nn.Linear}, dtype=torch.qint8)

        #load onnx models
        model_onnx, model_onnx_quant = repo_builder(reader, adapter)
        onnx_model, onnx_model_opt, onnx_model_quant, onnx_model_quant_opt = load_onnx_model(model_onnx, model_onnx_quant)
        
        base_p = Process(target=run_inf, args=(preped_data_set, "base", mc_base_inference, base_model, tokenizer))
        quant_base_p = Process(target=run_inf, args=(preped_data_set, "quant_base", mc_base_inference, quantized_base_model, tokenizer,))
        onnx_p = Process(target=run_inf, args=(preped_data_set, "onnx", mc_onnx_inference, onnx_model, tokenizer))
        onn_opt_p = Process(target=run_inf, args=(preped_data_set, "onnx_opt", mc_onnx_inference, onnx_model_opt, tokenizer))
        quant_onnx_p = Process(target=run_inf, args=(preped_data_set, "quant_onnx", mc_onnx_inference, onnx_model_quant, tokenizer))
        quant_onnx_opt_p = Process(target=run_inf, args=(preped_data_set, "quant_onnx_opt", mc_onnx_inference, onnx_model_quant_opt, tokenizer))
    
        base_p.start()
        quant_base_p.start()
        onnx_p.start()
        onn_opt_p.start()
        quant_onnx_p.start()
        quant_onnx_opt_p.start()

        base_p.join()
        quant_base_p.join()
        onnx_p.join()
        onn_opt_p.join()
        quant_onnx_p.join()
        quant_onnx_opt_p.join()

In [None]:
save_df(df, "sim_base_to_onnx.csv")

### evaluate categorical qa

In [35]:
skill = "categorical"
skills_df = load_skills(skill)
skills_df.head()

Unnamed: 0,Name,Retrieval Model,Datastore,Reader Model,Reader Adapter,Type,Code
0,BoolQ BERT Adapter,,,bert-base-uncased,boolq,categorical,code
1,BoolQ RoBERTa Adapter,,,roberta-base,boolq,categorical,code


In [36]:
df = pd.DataFrame(columns=["skill", "reader", "adapter", "timestamp", 
                "answer_base", "logits_answer_base",
                "answer_quantized_model", "logits_answer_quantized_model", 
                "answer_onnx_model", "logits_answer_onnx_model",
                "answer_onnx_opt_model", "logits_answer_onnx_opt_model",
                "answer_quant_onnx_model", "logits_answer_quant_onnx_model",
                "answer_quant_onnx_opt_model", "logits_answer_quant_onnx_opt_model",
                "data_id", "dataset", "question", "context", "answer_dataset"])

In [34]:
example_amount = 1

for adapter in skills_df["Reader Adapter"].unique():
    
    # load dataset
    data_set_name = adapter
    # data = load_dataset(data_set_name, split=f"validation[:{example_amount}]")
    data = load_dataset(data_set_name, split="validation")
    print(f"Loaded dataset: {data_set_name} with {len(data)} example questions")

    # load models
    for reader in skills_df["Reader Model"]:
        print(f"Loading: {reader} {adapter}")

        #load base model
        tokenizer = AutoTokenizer.from_pretrained(reader)
        default_model = AutoModelWithHeads.from_pretrained(reader)
        adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
        default_model.active_adapters = adapter_name

        #load quant model
        quantized_base_model = torch.quantization.quantize_dynamic(default_model, {torch.nn.Linear}, dtype=torch.qint8)
        
        #load onnx models
        model_onnx, model_onnx_quant = repo_builder(reader, adapter)
        onnx_model, onnx_model_opt, onnx_model_quant, onnx_model_quant_opt = load_onnx_model(model_onnx, model_onnx_quant)
        
        for data_id in tqdm(range(len(data))):

            question = data[data_id]["question"]
            context = data[data_id]["passage"]
            answer_dataset = data[data_id]["answer"]

            # Get base results
            base_model_answer, base_model_answer_logit = categorical_base_inference(default_model, tokenizer, question, context)
            
            # #eval quant model
            quant_base_model_answer, quant_base_model_logit = categorical_base_inference(quantized_base_model, tokenizer, question, context)
            
            # eval onnx models
            onnx_model_answer, onnx_model_answer_logit = categorical_onnx_inference(onnx_model, tokenizer, question, context)
            onnx_opt_model_answer, onnx_opt_model_answer_logit = categorical_onnx_inference(onnx_model_opt, tokenizer, question, context)
            quant_onnx_model_answer, quant_onnx_model_answer_logit = categorical_onnx_inference(onnx_model_quant, tokenizer, question, context)
            quant_onnx_opt_model_answer, quant_onnx_opt_model_answer_logit = categorical_onnx_inference(onnx_model_quant_opt, tokenizer, question, context)

            df.loc[len(df)] = [
                skill, reader, adapter,
                pd.Timestamp.now(),
                base_model_answer, base_model_answer_logit.detach().numpy(), # returned tensor
                quant_base_model_answer, quant_base_model_logit.detach().numpy(), # returned tensor
                onnx_model_answer, onnx_model_answer_logit,
                onnx_opt_model_answer, onnx_opt_model_answer_logit,
                quant_onnx_model_answer, quant_onnx_model_answer_logit,
                quant_onnx_opt_model_answer, quant_onnx_opt_model_answer_logit,
                data_id, data_set_name, question, context, answer_dataset

            ]

Found cached dataset cosmos_qa (/Users/michaelhermann/.cache/huggingface/datasets/cosmos_qa/default/0.1.0/3e18538cbfdb2c04189b16642715f0f6da3e97ed5df0aadcec3641245b2cf157)


Loaded dataset: cosmos_qa with 2985 example questions
Loading: bert-base-uncased cosmos_qa


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3036.05it/s]
  0%|          | 0/2985 [00:00<?, ?it/s]


KeyError: 'passage'

In [149]:
save_df(df, "sim_base_to_onnx_cat.csv")

In [None]:
# TODO get results of acc.

### evaluate mcq qa on specific adapter

In [111]:
skill =  "multiple-choice"
skills_df = load_skills(skill)
skills_df

Unnamed: 0,Name,Retrieval Model,Datastore,Reader Model,Reader Adapter,Type,Code
2,CosmosQA BERT,,,bert-base-uncased,cosmos_qa,multiple-choice,code
3,CosmosQA RoBERTa Adapter,,,roberta-base,cosmos_qa,multiple-choice,code
10,QuAIL BERT Adapter,,,bert-base-uncased,quail,multiple-choice,code
11,QuAIL RoBERTa Adapter,,,roberta-base,quail,multiple-choice,code
12,QuaRTz BERT Adapter,,,bert-base-uncased,quartz,multiple-choice,code
13,QuaRTz RoBERTa Adapter,,,roberta-base,quartz,multiple-choice,code
16,RACE BERT Adapter,,,bert-base-uncased,race,multiple-choice,code
17,RACE RoBERTa Adapter,,,roberta-base,race,multiple-choice,code
23,Social-IQA BERT Adapter,,,bert-base-uncased,social_i_qa,multiple-choice,code
24,Social-IQA RoBERTa Adapter,,,roberta-base,social_i_qa,multiple-choice,code


In [38]:
### Some Models are not implemented for testing
# "commonsense_qa", 
# non mcq: "multirc" is categorical (True/False)
# TODO "social_i_qa",

In [159]:
def load_and_prep_dataset(data_set_name, example_amount=0):
    if example_amount == 0:
        print(f"Loading all example data of {data_set_name} dataset")
        split_size = f"validation" #loading complete dataset
    else:
        print(f"Loading just {example_amount} example of {data_set_name} dataset")
        split_size = f"validation[:{example_amount}]" #loading only a part of the dataset
        
    preped_data_set = []
    print("Now laoding dataset.")

    if data_set_name in ["cosmos_qa", "quail", "quartz"]:
        data = load_dataset(data_set_name, split=split_size)
    elif data_set_name == "race":
        data = load_dataset(data_set_name, "middle", split=split_size)
    elif data_set_name in ["multi_rc", "commonsense_qa", "social_i_qa"]: #social_i_qa not implemented
        print("Error. Not implemented data_set. Don't know how to build preped_data_set.")
        return False
    else: 
        print("Error. Not implemented data_set. Cant load dataset.")
        return False
    
    print(f"Loaded dataset: {data_set_name}. Now preping dataset")
    
    # build preped data 
    i = 0 #helper varibale for social_i_qa dataset
    for example in data:
        if data_set_name == "cosmos_qa":
            example_id = example["id"]
            question = example["question"]
            context = example["context"]
            choices = [example["answer0"], example["answer1"], example["answer2"], example["answer3"]]
            correct_answer = choices[example["label"]]
        elif data_set_name == "quail":
            example_id = example["id"]
            question = example["question"]
            context = example["context"]
            choices = example["answers"]
            correct_answer = example["answers"][example["correct_answer_id"]]
        elif data_set_name == "quartz":
            example_id = example["id"]
            question = example["question"]
            context = example["para"]
            choices = example["choices"]["text"]
            correct_answer = example["choices"]["text"][ord(example["answerKey"])-65] # convert ASCII char to Int.
        elif data_set_name =="race":
            example_id = example["example_id"]
            question = example["question"]
            context = example["article"]
            choices = example["options"]
            correct_answer = example["options"][ord(example["answer"])-65] # convert ASCII char to Int.  
        elif data_set_name == "social_i_qa":
            example_id = i
            i+=1
            question = example["question"]
            context = example["context"]
            choices = [example["answerA"], example["answerB"], example["answerC"]]
            correct_answer = choices[int(example["label"])-1]
        else:
            print("Error. Not implemented data_set. Don't know how to build preped_data_set.")
            Exception
        
        preped_data_set.append((example_id, question, context, choices, correct_answer))
    return preped_data_set

In [161]:
df = pd.DataFrame(columns=[
        "skill", "reader", "adapter", 
        "timestamp", 
        "answer_base", "logits_answer_base",
        "answer_quantized_model", "logits_answer_quantized_model", 
        "answer_onnx_model", "logits_answer_onnx_model",
        "answer_onnx_opt_model", "logits_answer_onnx_opt_model",
        "answer_quant_onnx_model", "logits_answer_quant_onnx_model",
        "answer_quant_onnx_opt_model", "logits_answer_quant_onnx_opt_model",
        "data_id", "dataset", "question", "context", "choices", "answer_dataset"
    ])

In [163]:
example_amount = 0

skipping_adapters = ["cosmos_qa", "quail", "quartz"] 
# TODO quail for roberta
for adapter in skills_df["Reader Adapter"].unique():

    if adapter in skipping_adapters:
        print(f"Skipping {adapter}")
        continue

    adapter_df = skills_df[skills_df["Reader Adapter"] == adapter]
    # load dataset
    data_set_name = adapter
    preped_data_set = load_and_prep_dataset(data_set_name, example_amount=example_amount)
    
    if not preped_data_set:
        continue
    print(f"Loaded and preped dataset: {data_set_name} with {len(preped_data_set)} example questions")

    # load models
    for reader in adapter_df["Reader Model"].unique():
        print(f"Loading: {reader} {adapter}")
        
        #  load base model
        tokenizer = AutoTokenizer.from_pretrained(reader)
        base_model = AutoModelWithHeads.from_pretrained(reader)
        adapter_name = base_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
        base_model.active_adapters = adapter_name
        
        #load and eval quant model 
        quantized_base_model = torch.quantization.quantize_dynamic(base_model, {torch.nn.Linear}, dtype=torch.qint8)

        #load onnx models
        model_onnx, model_onnx_quant = repo_builder(reader, adapter)
        onnx_model, onnx_model_opt, onnx_model_quant, onnx_model_quant_opt = load_onnx_model(model_onnx, model_onnx_quant)
        
        for data_id in tqdm(range(len(preped_data_set))):
            
            example_id = preped_data_set[data_id][0]
            question = preped_data_set[data_id][1]
            context = preped_data_set[data_id][2]
            choices = preped_data_set[data_id][3]
            answer_dataset = preped_data_set[data_id][4]
            
            # Get base results
            base_model_answer, base_model_answer_logit = mc_base_inference(base_model, tokenizer, question, context, choices)
            
            # #eval quant model
            quant_base_model_answer, quant_base_model_logit = mc_base_inference(quantized_base_model, tokenizer, question, context, choices)
            
            # eval onnx models
            onnx_model_answer, onnx_model_answer_logit = mc_onnx_inference(onnx_model, tokenizer, question, context, choices)
            onnx_opt_model_answer, onnx_opt_model_answer_logit = mc_onnx_inference(onnx_model_opt, tokenizer, question, context, choices)
            quant_onnx_model_answer, quant_onnx_model_answer_logit = mc_onnx_inference(onnx_model_quant, tokenizer, question, context, choices)
            quant_onnx_opt_model_answer, quant_onnx_opt_model_answer_logit = mc_onnx_inference(onnx_model_quant_opt, tokenizer, question, context, choices)
            
            df.loc[len(df)] = [
                skill, reader, adapter,
                pd.Timestamp.now(),
                base_model_answer, base_model_answer_logit.detach().numpy(), # returned tensor
                quant_base_model_answer, quant_base_model_logit.detach().numpy(), # returned tensor
                onnx_model_answer, onnx_model_answer_logit,
                onnx_opt_model_answer, onnx_opt_model_answer_logit,
                quant_onnx_model_answer, quant_onnx_model_answer_logit,
                quant_onnx_opt_model_answer, quant_onnx_opt_model_answer_logit,
                example_id, data_set_name, question, context[:90], choices, answer_dataset
            ]

In [135]:
df

Unnamed: 0,skill,reader,adapter,timestamp,answer_base,logits_answer_base,answer_quantized_model,logits_answer_quantized_model,answer_onnx_model,logits_answer_onnx_model,...,answer_quant_onnx_model,logits_answer_quant_onnx_model,answer_quant_onnx_opt_model,logits_answer_quant_onnx_opt_model,data_id,dataset,question,context,choices,answer_dataset
0,multiple-choice,bert-base-uncased,quartz,2023-01-23 14:05:46.014108,increase,"[[-0.32308546, -0.30430058]]",decrease,"[[-0.308659, -0.31289622]]",increase,"[[-0.4425696, -0.42104053]]",...,decrease,"[[-0.3296341, -0.37187508]]",decrease,"[[-0.3296341, -0.37187508]]",QRQA-10372-1-flip,quartz,If Jim moves some particles of matter farther ...,"When particles of matter are closer together, ...","[decrease, increase]",decrease
1,multiple-choice,bert-base-uncased,quartz,2023-01-23 14:05:46.371430,decreased,"[[-0.4190234, -0.41873103]]",increased,"[[-0.41074574, -0.42236027]]",decreased,"[[-0.3726902, -0.35832736]]",...,decreased,"[[-0.09447962, -0.052512124]]",decreased,"[[-0.09447962, -0.052512124]]",QRQA-10371-4-flip,quartz,Long ago the surface of Venus warmed enough th...,An increase in greenhouse gases leads to great...,"[increased, decreased]",decreased
2,multiple-choice,bert-base-uncased,quartz,2023-01-23 14:05:46.770601,less,"[[-0.23802754, -0.16951808]]",less,"[[-0.25703636, -0.19954418]]",more,"[[-0.14772616, -0.27151406]]",...,more,"[[0.04064788, -0.005649729]]",more,"[[0.04064788, -0.005649729]]",QRQA-10296-1-flip,quartz,If less waters falls on an area of land it wil...,"As more water covered the land, sand and silt ...","[more, less]",less
3,multiple-choice,bert-base-uncased,quartz,2023-01-23 14:05:47.174047,open,"[[-0.4421321, -0.43315184]]",open,"[[-0.43794847, -0.43287364]]",open,"[[-0.4584921, -0.42407662]]",...,open,"[[-0.18492502, -0.16221972]]",open,"[[-0.18492502, -0.16221972]]",QRQA-10115-3,quartz,Rich applies a solution to the dish that incre...,The increase in turgor pressure of the guard c...,"[close, open]",open
4,multiple-choice,bert-base-uncased,quartz,2023-01-23 14:05:47.519285,younger,"[[-0.25191066, -0.23725833]]",younger,"[[-0.2438518, -0.21938437]]",older,"[[-0.29735819, -0.30204332]]",...,younger,"[[0.000687332, 0.018745586]]",younger,"[[0.000687332, 0.018745586]]",QRQA-10082-1,quartz,Simon was digging in his yard and found that t...,"Therefore, deeper rock layers must be older th...","[older, younger]",older
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1475,multiple-choice,bert-base-uncased,race,2023-01-23 15:33:21.106616,in the city's Royal Infirmary,"[[-3.917631, -6.4893255, -5.9038916, -8.029111]]",in the city's Royal Infirmary,"[[-4.3647966, -6.3341856, -5.713045, -8.156919]]",in the city's Royal Infirmary,"[[-0.6763504, -0.72670203, -0.7026611, -0.7307...",...,in the city's Royal Infirmary,"[[-1.1538754, -1.6065819, -1.6095581, -1.49463...",in the city's Royal Infirmary,"[[-1.1538754, -1.6065819, -1.6095581, -1.49463...",high4931.txt,race,"Carmen Blake, the 27-year-old mother, gave gir...",Mother-of-three Carmen Blake called her midwif...,"[in the city's Royal Infirmary, in the ambulan...",in the street on her way to hospital
1476,multiple-choice,bert-base-uncased,race,2023-01-23 15:33:25.883003,there were not enough ambulance in the Royal I...,"[[-4.259598, -6.2071323, -6.157572, -5.3511953]]",there were not enough ambulance in the Royal I...,"[[-4.143508, -6.141632, -6.0479107, -5.3279276]]",there were not enough ambulance in the Royal I...,"[[-1.0495282, -1.0743983, -1.3698975, -1.27539...",...,the maternity ward said Ms Blake ought to call...,"[[-1.5933855, -1.8198922, -1.6915305, -1.33661...",the maternity ward said Ms Blake ought to call...,"[[-1.5933855, -1.8198922, -1.6915305, -1.33661...",high4931.txt,race,It can be inferred that _ .,Mother-of-three Carmen Blake called her midwif...,[there were not enough ambulance in the Royal ...,the maternity ward said Ms Blake only needed a...
1477,multiple-choice,bert-base-uncased,race,2023-01-23 15:33:30.686327,failing to send an ambulance to help her,"[[-2.0323603, -9.8245, -5.6678953, -9.328723]]",failing to send an ambulance to help her,"[[-2.2095993, -9.937094, -5.8011723, -8.906239]]",failing to send an ambulance to help her,"[[-0.6847511, -0.8641702, -0.93433905, -0.7369...",...,having killed her newly-born baby,"[[-1.7267761, -1.033859, -1.3351793, -1.3934815]]",having killed her newly-born baby,"[[-1.7267761, -1.033859, -1.3351793, -1.3934815]]",high4931.txt,race,Carmen Blake accused the Royal Infirmary of _ .,Mother-of-three Carmen Blake called her midwif...,"[failing to send an ambulance to help her, hav...",failing to send an ambulance to help her
1478,multiple-choice,bert-base-uncased,race,2023-01-23 15:33:38.350756,some experts in education,"[[-2.3059714, -0.6309271, -1.1633444, 0.229082...",some experts in education,"[[-1.9526169, -0.77610993, -1.026824, 0.719092...",the Ban bossy campaigners,"[[2.3862607, 2.43561, 2.2959297, 2.2583227]]",...,the Oxford English Dictionary,"[[2.4741068, 1.9389747, 1.8998454, 2.3113618]]",the Oxford English Dictionary,"[[2.4741068, 1.9389747, 1.8998454, 2.3113618]]",high19651.txt,race,"More evidence is provided to show"" bossy"" is m...",Face-book chief operating officer Sheryl Sandb...,"[the Oxford English Dictionary, the Ban bossy ...",the Ban bossy campaigners


In [136]:
save_df(df, "sim_base_to_onnx_mcq.csv")

### Abstractive qa

In [97]:
all_skills = pd.read_csv("square_skills/impl_skills.csv")
skill = "abstractive"
skills = load_skills(skill)

In [98]:
result = []
runs = 250
reader = "facebook/bart-base"
adapter = "narrativeqa"

In [99]:
#load adapter specific dataset
data_set_name = adapter
data = load_dataset(data_set_name, split=f"validation[:{runs}]")
print(f"Loaded dataset: {data_set_name}")

Found cached dataset narrativeqa (/Users/michaelhermann/.cache/huggingface/datasets/narrativeqa/default/0.0.0/daef7ccc51ec258bef464658d11751bb20f033da9b4c219fd84563b3a4af0422)


Loaded dataset: narrativeqa


In [60]:
#load base model
tokenizer = AutoTokenizer.from_pretrained(reader)
default_model = AutoModelWithHeads.from_pretrained(reader)
adapter_name = default_model.load_adapter(f"AdapterHub/{adapter}", source="hf", set_active=True)
default_model.active_adapters = adapter_name

Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3303.90it/s]


In [79]:
def accuracy_scoring(pred_list, true_val_list):
    hit = 0
    for pred, true_val in zip(pred_list, true_val_list):
        if pred in true_val:
            hit += 1
    return hit/len(pred_list)

def base_inference(tokenizer, model, data):
    results = []
    i = 0
    for example in data:
        # print(i)
        try:
            text = example["document"]["summary"]["text"]
            question = example["question"]["text"]
            id = example["document"]["id"]

            prompt = question + "</s>" + text + "</s>"

            if len(prompt.split()) > 1024:
                continue
            
            encoding = tokenizer(prompt, return_tensors='pt', padding=False)
            input_ids = encoding['input_ids']
            attention_mask = encoding['attention_mask']

            answer = model.generate(input_ids, attention_mask=attention_mask, num_beams=4, max_length=128, early_stopping=True)
            answer = tokenizer.decode(answer[0], skip_special_tokens=True)
            results.append((i, id, answer, question))

        except:
            print("error")
            results.append((i, "", "", ""))
        i += 1

    return results


def onnx_inference(onnx_model, tokenizer, data):
    result = []
    i = 0
    for example in preped_data_set:
        if i % 10 == 0:
            print(i)
        i += 1

        text = example["document"]["summary"]["text"]
        question = example["question"]["text"]


        encoding = tokenizer(question, text, return_tensors='np')
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        
        outputs = onnx_model.run(input_feed=dict(encoding), output_names=None)

        answer_idx = np.argmax(outputs[0])
        result.append(outputs)
    return result

In [80]:
# text = data[3]["document"]["summary"]["text"]
# question = data[3]["question"]["text"]
# id = data[3]["document"]["id"]

# print(text)
# print(question)

In [81]:
# Get base results
base_model_result = base_inference(tokenizer, default_model, data)

Token indices sequence length is longer than the specified maximum sequence length for this model (1145 > 1024). Running this sequence through the model will result in indexing errors


error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error


In [83]:
#load and eval quant model 
quantized_model = torch.quantization.quantize_dynamic(default_model, {torch.nn.Linear}, dtype=torch.qint8)
quant_base_model_result = base_inference(tokenizer, quantized_model, data)

error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error


In [91]:
#eval quant base 
scoring = accuracy_scoring([a[2] for a in base_model_result], [a[2] for a in quant_base_model_result])
# result.append(("Quantized Base Model", skill, reader, adapter, scoring, data_set_name, runs))

In [92]:
scoring

0.744

In [9]:
#load onnx models
model_onnx, model_onnx_quant = repo_builder(reader, adapter)
onnx_models_list = load_model(model_onnx, model_onnx_quant, as_list=True)
onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "Quantized ONNX", "Quantized ONNX - OPT"] 

for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
    onnx = onnx_inference(adapter, question_answering, onnx_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)

    scoring = accuracy_scoring([a[2] for a in base_model_result], onnx)

    result.append((onnx_model_name, skill, reader, adapter, scoring, data_set_name, runs))          

Loading: facebook/bart-base narrativeqa


Found cached dataset narrativeqa (/Users/michaelhermann/.cache/huggingface/datasets/narrativeqa/default/0.0.0/daef7ccc51ec258bef464658d11751bb20f033da9b4c219fd84563b3a4af0422)


Loaded dataset: narrativeqa
