In [2]:
from transformers import QuestionAnsweringPipeline, AutoAdapterModel, AutoModelWithHeads, AutoTokenizer, AutoConfig
from transformers.onnx import OnnxConfig, validate_model_outputs, export
from transformers.models.bert import BertOnnxConfig

import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxruntime import InferenceSession
import onnxruntime

from onnx_opcounter import calculate_params

import os
import time
import torch
import numpy as np

from datasets import load_metric, load_dataset

from typing import Mapping, OrderedDict
from pathlib import Path
import random
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


### Extractive QA (DROP BERT)

In [4]:
# Basic extractive QA example: one question/context pair answered by
# bert-base-uncased with the DROP adapter activated.
question = "What is Mary's job?"
context = "Thomas likes animals. Mary likes strawberry ice and is a physician"

# `tokenizer` and `model` are notebook-level globals; base_model_inference
# and onnx_inference below read them directly.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
# Load the adapter from the Hugging Face hub (source="hf") and make it the active adapter.
adapter_name = model.load_adapter("AdapterHub/bert-base-uncased-pf-drop", source="hf")
model.active_adapters = adapter_name

def base_model_inference(question, context):
    """Answer ``question`` from ``context`` with the notebook-level PyTorch model.

    Uses the global ``tokenizer`` and ``model``. The answer span runs from the
    highest-scoring start position to the highest-scoring end position
    (inclusive) of the first sequence in the batch, and is decoded back to text.
    """
    encoded = tokenizer(question, context, padding=True, truncation=True, return_tensors="pt")

    # Inference only: no gradients needed.
    with torch.no_grad():
        prediction = model(**encoded)

    first_token = int(prediction.start_logits.argmax(dim=1)[0])
    # +1 turns the inclusive end position into an exclusive slice bound.
    last_token = int(prediction.end_logits.argmax(dim=1)[0]) + 1
    answer_ids = encoded['input_ids'][0, first_token:last_token]
    return tokenizer.decode(answer_ids)

# Run the PyTorch model on the example pair defined above and show the decoded answer.
answer = base_model_inference(question, context)
print(answer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 4826.59it/s]

physician





In [118]:
# Export to ONNX with custom config
class DropBertOnnxConfig(OnnxConfig):
    """ONNX export configuration declaring the model inputs and their dynamic axes."""

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Map each input name to its dynamic axes (axis index -> axis label).

        The "multiple-choice" task gets an extra "choice" axis; every other
        task uses (batch, sequence). All three inputs share the same axes.
        """
        dynamic_axis = (
            {0: "batch", 1: "choice", 2: "sequence"}
            if self.task == "multiple-choice"
            else {0: "batch", 1: "sequence"}
        )
        input_names = ("input_ids", "attention_mask", "token_type_ids")
        return OrderedDict([(input_name, dynamic_axis) for input_name in input_names])

# Build the export configuration from the bert-base-uncased model config.
config = AutoConfig.from_pretrained("bert-base-uncased")
onnx_config = DropBertOnnxConfig(config, task="question-answering")

onnx_path = Path("onnx/dropbert/model.onnx")
# Make sure the destination directory exists so the cell also works on a
# fresh checkout / fresh kernel (no-op when the directory is already there).
onnx_path.parent.mkdir(parents=True, exist_ok=True)

# Export the adapter-equipped PyTorch model using the config's default opset.
onnx_inputs, onnx_outputs = export(tokenizer, model, onnx_config, onnx_config.default_onnx_opset, onnx_path)

# Reload the exported graph and run the ONNX structural checker on it.
onnx_model = onnx.load(onnx_path)
onnx.checker.check_model(onnx_model)

  if context.output_adapter_gating_scores:
  if tensor is not None and hidden_states.shape[0] != tensor.shape[0]:
  if getattr(ctx, "output_" + attr, False):


In [119]:
# ONNX model inference
def onnx_inference(onnx_path, question, context):
    """Answer ``question`` from ``context`` using the ONNX model stored at ``onnx_path``.

    A new CPU inference session is created on every call. Inputs are built with
    the notebook-level ``tokenizer`` and cast to int64 before being fed to the
    session. The first two outputs are treated as start and end scores.
    """
    session = onnxruntime.InferenceSession(
        str(onnx_path), providers=["CPUExecutionProvider"]
    )

    encoded = tokenizer(question, context, padding=True, truncation=True, return_tensors="np")
    feed = {name: np.array(encoded[name], dtype=np.int64) for name in encoded}
    outputs = session.run(input_feed=dict(feed), output_names=None)

    start_scores, end_scores = outputs[0], outputs[1]
    # argmax without an axis works on the flattened scores.
    # +1 makes the inclusive end position an exclusive slice bound.
    answer_span = slice(np.argmax(start_scores), np.argmax(end_scores) + 1)
    return tokenizer.decode(feed['input_ids'][0, answer_span])

# Second example pair; note this rebinds the notebook-level `question`,
# `context` and `answer` variables set by the earlier PyTorch example.
question = 'Where do I live?'
context = 'My name is Paul and I live next to Darmstadt'

# Answer the question with the ONNX model exported to `onnx_path`.
answer = onnx_inference(onnx_path, question, context)
print(answer)

next to darmstadt


### Model Quantization

In [3]:
# Quantize base model: dynamic quantization of the torch.nn.Linear modules to qint8.
# NOTE(review): relies on `model` created in the extractive QA example cell;
# run that cell first on a fresh kernel.
quantized_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

def print_size_of_model(model):
    """Print the serialized size of ``model``'s state dict in MB.

    The state dict is saved into a private temporary directory that is
    removed afterwards (even if saving fails), so no file in the current
    working directory is created, overwritten or deleted.

    Args:
        model: object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
    """
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Keep the original file name so the serialized archive is unchanged.
        checkpoint_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), checkpoint_path)
        size_mb = os.path.getsize(checkpoint_path)/(1024*1024)
    print('Size (MB):', size_mb)

# Compare serialized sizes: original model first, dynamically quantized model second.
print_size_of_model(model)
print_size_of_model(quantized_model)

Size (MB): 421.1566400527954
Size (MB): 396.7126741409302


In [121]:
def quantize_onnx_model(onnx_model_path, quantized_model_path):
    """Dynamically quantize an ONNX model with QInt8 weights and report the output path.

    Args:
        onnx_model_path: path of the ONNX model to quantize.
        quantized_model_path: path the quantized model is written to.
    """
    quantize_dynamic(
        onnx_model_path,
        quantized_model_path,
        weight_type=QuantType.QInt8,
    )

    confirmation = f"Quantized model saved to:{quantized_model_path}"
    print(confirmation)

# Quantize the exported DROP BERT model, writing the result next to the original.
quantize_onnx_model('onnx/dropbert/model.onnx', 'onnx/dropbert/model_quant.onnx')

# Report on-disk size of both ONNX files in MB (bytes / 1024^2).
print('ONNX full precision model size (MB):', os.path.getsize("onnx/dropbert/model.onnx")/(1024*1024))
print('ONNX quantized model size (MB):', os.path.getsize("onnx/dropbert/model_quant.onnx")/(1024*1024))

Ignore MatMul due to non constant B: /[/bert/encoder/layer.0/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.0/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.1/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.1/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.2/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.2/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.3/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.3/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.4/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.4/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.5/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.5/atten

In [122]:
# Parameter count of the PyTorch model, as reported by model.num_parameters().
print('Number of base model params:', model.num_parameters())

def count_params_onnx(model_path):
    """Load the ONNX model at ``model_path`` and return ``calculate_params`` of it."""
    return calculate_params(onnx.load(model_path))

# Parameter counts of the exported and the quantized ONNX files, for comparison
# with the PyTorch count printed above.
print("ONNX number of full precision model params:", count_params_onnx("onnx/dropbert/model.onnx"))
print("ONNX number of quantized model params:", count_params_onnx("onnx/dropbert/model_quant.onnx"))

Number of base model params: 110378306
ONNX number of full precision model params: 109788226
ONNX number of quantized model params: 109788758.0


### Measure Performance on Dataset

In [28]:
import matplotlib.pyplot as plt
from huggingface_hub import hf_hub_download

In [29]:
# SQuAD validation split; `data` is the notebook-level dataset used by the benchmark helpers below.
data = load_dataset("squad", split='validation')

Found cached dataset squad (/Users/michaelhermann/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


In [39]:
def run_torch(model, inputs):
    """Run one forward pass of ``model`` on ``inputs`` with gradients disabled.

    ``inputs`` is unpacked as keyword arguments; the model output is
    discarded and nothing is returned.
    """
    grad_disabled = torch.no_grad()
    with grad_disabled:
        model(**inputs)

def run_onnx(qa_model, onnx_inputs):
    """Run ``qa_model`` once, requesting the start/end logits; the result is discarded.

    ``onnx_inputs`` is copied into a plain dict before being passed as the input feed.
    """
    requested_outputs = ["start_logits", "end_logits"]
    feed = dict(onnx_inputs)
    qa_model.run(output_names=requested_outputs, input_feed=feed)

def get_time_duration(func, model, inputs):
    """Time a single call of ``func(model, inputs)``.

    Uses ``time.perf_counter()``, a monotonic high-resolution clock, so the
    measurement is not distorted by system clock adjustments.

    Args:
        func: callable taking ``(model, inputs)``; its return value is ignored.
        model: first argument forwarded to ``func``.
        inputs: second argument forwarded to ``func``.

    Returns:
        Elapsed time in milliseconds as a float.
    """
    started = time.perf_counter()
    func(model, inputs)
    elapsed_seconds = time.perf_counter() - started
    return 1000 * elapsed_seconds

def save_df(df_new, path_to_logger_file="logger_all.csv"):
    """Append ``df_new`` to a CSV log, creating the file when needed.

    If the log already exists and is non-empty, its rows are read back,
    ``df_new`` is concatenated after them, and the file is rewritten.
    Otherwise (missing or zero-byte file) ``df_new`` alone is written.
    The index is never written.

    Args:
        df_new: DataFrame with the rows to add.
        path_to_logger_file: CSV path; defaults to "logger_all.csv" in the
            current working directory.
    """
    has_previous_rows = (
        os.path.exists(path_to_logger_file)
        and os.path.getsize(path_to_logger_file) > 0  # read_csv raises on a zero-byte file
    )
    if has_previous_rows:
        combined = pd.concat([pd.read_csv(path_to_logger_file), df_new])
    else:
        combined = df_new
    combined.to_csv(path_to_logger_file, index=False)

def get_index_of_data_id(data, data_id):
    """Return ``(context, question)`` of the entry whose id equals ``data_id``.

    Despite its name, the function returns the entry's fields rather than
    its index. ``data["id"].index`` raises ``ValueError`` for an unknown id.
    """
    position = data["id"].index(data_id)
    return tuple(data[column][position] for column in ("context", "question"))

def measure_time(perf_type, tokenizer, question, context, model):
    """Tokenize one question/context pair and time a single inference call.

    Args:
        perf_type: "base" feeds PyTorch tensors to ``run_torch``;
            "seq_length" feeds int64 NumPy arrays to ``run_onnx``.
        tokenizer: callable producing the model inputs (truncation enabled).
        question: question text.
        context: context text.
        model: the model or session handed to the selected runner.

    Returns:
        Duration of the single run in milliseconds, as reported by
        ``get_time_duration``.

    Raises:
        ValueError: if ``perf_type`` is not one of the supported values.
    """
    if perf_type == "base":
        inputs = tokenizer(question, context, return_tensors="pt", truncation=True)
        mode = run_torch
    elif perf_type == "seq_length":
        inputs = tokenizer(question, context, return_tensors="np", truncation=True)
        # The ONNX runner is fed int64 arrays.
        inputs = {key: np.array(inputs[key], dtype=np.int64) for key in inputs}
        mode = run_onnx
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            "Unknown perf_type {!r}; expected 'base' or 'seq_length'".format(perf_type)
        )

    time_once = get_time_duration(mode, model, inputs)
    return time_once

def test_specific_id(perf_type, tokenizer, model, data_id):
    """Time one inference run for the entry of the notebook-level ``data`` with id ``data_id``.

    Returns the duration in milliseconds as produced by ``measure_time``.
    """
    context, question = get_index_of_data_id(data, data_id)
    return measure_time(perf_type, tokenizer, question, context, model)  # TODO

def performance_plot(perf_type, name, model, tokenizer, data, data_intervall = 100):
    """Scatter-plot single-run inference time against context length.

    Every ``data_intervall``-th entry of ``data`` is timed once with
    ``measure_time``; the points are added to the current matplotlib figure
    under the label ``name``.

    Args:
        perf_type: runner selector forwarded to ``measure_time``.
        name: legend label for the scatter series.
        model: model or session forwarded to ``measure_time``.
        tokenizer: tokenizer forwarded to ``measure_time``.
        data: mapping-like object with "context" and "question" columns.
        data_intervall: step between sampled entries.
    """
    # Look the columns up once instead of on every iteration; the lookup is
    # loop-invariant and may be costly depending on the type of `data`.
    contexts = data["context"]
    questions = data["question"]

    X = []
    time_measurements = []
    for i in range(0, len(contexts), data_intervall):
        context = contexts[i]
        question = questions[i]

        time_duration = measure_time(perf_type, tokenizer, question, context, model)

        # Whitespace-separated word count of the context, not the tokenized length.
        seq_length = len(context.split()) # TODO -> reduce stopwords? Real Tokenization?
        X.append(seq_length)
        time_measurements.append(time_duration)
    plt.scatter(X, time_measurements, label=name)


def performance_log(perf_type, name, model, tokenizer, data, data_intervall = 100): # TODO add truncate
    """Time every ``data_intervall``-th entry once, print each result and append all to the CSV log.

    One row is produced per sampled entry with the columns listed below; the
    "average_time 50 times (ms)" column is left as an empty string. The rows
    are handed to ``save_df`` once, after the loop.

    Args:
        perf_type: runner selector forwarded to ``measure_time``.
        name: model name stored in the "model_name" column and printed.
        model: model or session forwarded to ``measure_time``.
        tokenizer: tokenizer forwarded to ``measure_time``.
        data: mapping-like object with "context", "question" and "id" columns.
        data_intervall: step between sampled entries.
    """
    columns = ["model_name", "time once (ms)", "average_time 50 times (ms)", "seq_length", "context", "question", "data_id"]

    # Column lookups are loop-invariant; do them once up front.
    contexts = data["context"]
    questions = data["question"]
    data_ids = data["id"]

    # Collect plain rows and build the DataFrame once, instead of growing
    # it row by row inside the loop.
    rows = []
    for i in range(0, len(contexts), data_intervall):
        context = contexts[i]
        question = questions[i]
        time_duration = measure_time(perf_type, tokenizer, question, context, model)

        # Whitespace-separated word count of the context, not the tokenized length.
        seq_length = len(context.split()) # TODO -> reduce stopwords? Real Tokenization?

        rows.append([name, time_duration, "", seq_length, context, question, data_ids[i]])

        print("Model: {}, Input Length {}: {:.3f} ms".format(name, seq_length, time_duration))
    save_df(pd.DataFrame(rows, columns=columns))

In [40]:
def load_model(model_onnx, model_onnx_quant):
    """Create four ONNX Runtime sessions for benchmarking.

    Args:
        model_onnx: path of the full-precision ONNX model.
        model_onnx_quant: path of the quantized ONNX model.

    Returns:
        Tuple ``(plain, plain_opt, quant, quant_opt)`` of inference sessions:
        the two model files loaded with default session options, and the same
        two files loaded with ``graph_optimization_level`` set to ORT_ENABLE_ALL.
    """
    # Use the same execution provider for every session so the timings are
    # comparable, and so session creation does not depend on the build's
    # default provider selection.
    providers = ["CPUExecutionProvider"]

    local_onnx_model = onnxruntime.InferenceSession(model_onnx, providers=providers)
    local_onnx_model_quant = onnxruntime.InferenceSession(model_onnx_quant, providers=providers)

    # NOTE(review): ORT_ENABLE_ALL appears to be onnxruntime's default level,
    # so the "plain" sessions above may already be fully optimized — confirm
    # that this is the intended baseline for the comparison.
    so = onnxruntime.SessionOptions()
    so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL

    local_onnx_model_opt = onnxruntime.InferenceSession(model_onnx, so, providers=providers)
    local_onnx_model_quant_opt = onnxruntime.InferenceSession(model_onnx_quant, so, providers=providers)

    return local_onnx_model, local_onnx_model_opt, local_onnx_model_quant, local_onnx_model_quant_opt

def repo_builder(reader, adapter):
    """Download the ONNX files of a reader/adapter pair from the Hugging Face hub.

    The repository id is built as ``UKP-SQuARE/{reader}-pf-{adapter}-onnx``.

    Returns:
        ``(model_onnx, model_onnx_quant)``: the values returned by
        ``hf_hub_download`` for "model.onnx" and "model_quant.onnx".
    """
    repo_id = f"UKP-SQuARE/{reader}-pf-{adapter}-onnx"

    # Full-precision file first, quantized file second.
    model_onnx, model_onnx_quant = (
        hf_hub_download(repo_id=repo_id, filename=filename)
        for filename in ("model.onnx", "model_quant.onnx")
    )

    return model_onnx, model_onnx_quant

In [41]:
# Benchmark every reader/adapter pair listed in the skills CSV: the PyTorch
# model with its adapter, then the four ONNX session variants.
skills = pd.read_csv('square_skills/extractive_qa_skills.csv')

for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
    print("Loading: {} {}".format(reader, adapter))
    model_onnx, model_onnx_quant = repo_builder(reader, adapter)
    onnx_model, onnx_model_opt, onnx_model_quant, onnx_model_quant_opt = load_model(model_onnx, model_onnx_quant)

    # Tokenizer and adapter-equipped PyTorch model for the current reader.
    tokenizer = AutoTokenizer.from_pretrained(reader)
    default_model = AutoModelWithHeads.from_pretrained(reader)
    adapter_name = default_model.load_adapter(f"AdapterHub/{reader}-pf-{adapter}", source="hf")
    default_model.active_adapters = adapter_name

    data_intervall = 250 # of 10370 ?

    # (perf_type, log name, model/session) in the order the runs are executed.
    benchmark_runs = [
        ("base", f"{reader} {adapter} Base", default_model),
        ("seq_length", f"{reader} {adapter} ONNX", onnx_model),
        ("seq_length", f"{reader} {adapter} ONNX-OPT", onnx_model_opt),
        ("seq_length", f"{reader} {adapter} ONNX Quantized", onnx_model_quant),
        ("seq_length", f"{reader} {adapter} ONNX Quantized - OPT", onnx_model_quant_opt),
    ]
    for perf_type, run_name, runner in benchmark_runs:
        performance_log(perf_type, run_name, runner, tokenizer, data, data_intervall=data_intervall)


Loading: bert-base-uncased drop


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 4528.67it/s]


Model: bert-base-uncased drop Base, Input Length 124: 140.297 ms
Model: bert-base-uncased drop Base, Input Length 65: 81.481 ms
Model: bert-base-uncased drop Base, Input Length 149: 161.599 ms
Model: bert-base-uncased drop Base, Input Length 117: 133.288 ms
Model: bert-base-uncased drop Base, Input Length 152: 150.195 ms
Model: bert-base-uncased drop Base, Input Length 65: 79.839 ms
Model: bert-base-uncased drop Base, Input Length 41: 66.550 ms
Model: bert-base-uncased drop Base, Input Length 115: 148.819 ms
Model: bert-base-uncased drop Base, Input Length 247: 286.772 ms
Model: bert-base-uncased drop Base, Input Length 172: 185.303 ms
Model: bert-base-uncased drop Base, Input Length 140: 245.451 ms
Model: bert-base-uncased drop Base, Input Length 47: 78.456 ms
Model: bert-base-uncased drop Base, Input Length 112: 137.201 ms
Model: bert-base-uncased drop Base, Input Length 64: 72.941 ms
Model: bert-base-uncased drop Base, Input Length 188: 188.233 ms
Model: bert-base-uncased drop Base,

Downloading config.json: 100%|██████████| 481/481 [00:00<00:00, 224kB/s]
Downloading vocab.json: 100%|██████████| 878k/878k [00:00<00:00, 1.39MB/s] 
Downloading merges.txt: 100%|██████████| 446k/446k [00:00<00:00, 822kB/s] 
Downloading tokenizer.json: 100%|██████████| 1.29M/1.29M [00:00<00:00, 1.89MB/s]
Downloading pytorch_model.bin: 100%|██████████| 478M/478M [01:12<00:00, 6.88MB/s] 
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to 

Model: roberta-base drop Base, Input Length 124: 152.329 ms
Model: roberta-base drop Base, Input Length 65: 81.104 ms
Model: roberta-base drop Base, Input Length 149: 165.203 ms
Model: roberta-base drop Base, Input Length 117: 146.144 ms
Model: roberta-base drop Base, Input Length 152: 162.561 ms
Model: roberta-base drop Base, Input Length 65: 73.710 ms
Model: roberta-base drop Base, Input Length 41: 60.340 ms
Model: roberta-base drop Base, Input Length 115: 137.119 ms
Model: roberta-base drop Base, Input Length 247: 263.317 ms
Model: roberta-base drop Base, Input Length 172: 162.008 ms
Model: roberta-base drop Base, Input Length 140: 173.190 ms
Model: roberta-base drop Base, Input Length 47: 57.944 ms
Model: roberta-base drop Base, Input Length 112: 91.863 ms
Model: roberta-base drop Base, Input Length 64: 59.503 ms
Model: roberta-base drop Base, Input Length 188: 183.039 ms
Model: roberta-base drop Base, Input Length 133: 125.550 ms
Model: roberta-base drop Base, Input Length 30: 53.

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 421kB/s]
Downloading: 100%|██████████| 2.25k/2.25k [00:00<00:00, 869kB/s

Model: bert-base-uncased hotpotqa Base, Input Length 124: 120.040 ms
Model: bert-base-uncased hotpotqa Base, Input Length 65: 56.512 ms
Model: bert-base-uncased hotpotqa Base, Input Length 149: 113.740 ms
Model: bert-base-uncased hotpotqa Base, Input Length 117: 118.608 ms
Model: bert-base-uncased hotpotqa Base, Input Length 152: 129.423 ms
Model: bert-base-uncased hotpotqa Base, Input Length 65: 69.606 ms
Model: bert-base-uncased hotpotqa Base, Input Length 41: 46.040 ms
Model: bert-base-uncased hotpotqa Base, Input Length 115: 113.224 ms
Model: bert-base-uncased hotpotqa Base, Input Length 247: 227.856 ms
Model: bert-base-uncased hotpotqa Base, Input Length 172: 167.060 ms
Model: bert-base-uncased hotpotqa Base, Input Length 140: 197.250 ms
Model: bert-base-uncased hotpotqa Base, Input Length 47: 70.950 ms
Model: bert-base-uncased hotpotqa Base, Input Length 112: 124.925 ms
Model: bert-base-uncased hotpotqa Base, Input Length 64: 73.812 ms
Model: bert-base-uncased hotpotqa Base, Inpu

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

Model: roberta-base hotpotqa Base, Input Length 124: 105.291 ms
Model: roberta-base hotpotqa Base, Input Length 65: 62.245 ms
Model: roberta-base hotpotqa Base, Input Length 149: 105.152 ms
Model: roberta-base hotpotqa Base, Input Length 117: 95.336 ms
Model: roberta-base hotpotqa Base, Input Length 152: 129.623 ms
Model: roberta-base hotpotqa Base, Input Length 65: 67.609 ms
Model: roberta-base hotpotqa Base, Input Length 41: 55.921 ms
Model: roberta-base hotpotqa Base, Input Length 115: 133.331 ms
Model: roberta-base hotpotqa Base, Input Length 247: 248.288 ms
Model: roberta-base hotpotqa Base, Input Length 172: 199.955 ms
Model: roberta-base hotpotqa Base, Input Length 140: 139.403 ms
Model: roberta-base hotpotqa Base, Input Length 47: 59.614 ms
Model: roberta-base hotpotqa Base, Input Length 112: 81.121 ms
Model: roberta-base hotpotqa Base, Input Length 64: 52.680 ms
Model: roberta-base hotpotqa Base, Input Length 188: 164.857 ms
Model: roberta-base hotpotqa Base, Input Length 133:

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 609kB/s]
Downloading: 100%|██████████| 2.24k/2.24k [00:00<00:00, 1.40MB/

Model: bert-base-uncased newsqa Base, Input Length 124: 104.918 ms
Model: bert-base-uncased newsqa Base, Input Length 65: 82.974 ms
Model: bert-base-uncased newsqa Base, Input Length 149: 145.072 ms
Model: bert-base-uncased newsqa Base, Input Length 117: 113.235 ms
Model: bert-base-uncased newsqa Base, Input Length 152: 141.350 ms
Model: bert-base-uncased newsqa Base, Input Length 65: 71.701 ms
Model: bert-base-uncased newsqa Base, Input Length 41: 56.547 ms
Model: bert-base-uncased newsqa Base, Input Length 115: 143.266 ms
Model: bert-base-uncased newsqa Base, Input Length 247: 211.103 ms
Model: bert-base-uncased newsqa Base, Input Length 172: 132.735 ms
Model: bert-base-uncased newsqa Base, Input Length 140: 161.056 ms
Model: bert-base-uncased newsqa Base, Input Length 47: 60.655 ms
Model: bert-base-uncased newsqa Base, Input Length 112: 96.781 ms
Model: bert-base-uncased newsqa Base, Input Length 64: 55.872 ms
Model: bert-base-uncased newsqa Base, Input Length 188: 145.164 ms
Model:

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

Model: roberta-base newsqa Base, Input Length 124: 149.117 ms
Model: roberta-base newsqa Base, Input Length 65: 108.605 ms
Model: roberta-base newsqa Base, Input Length 149: 158.378 ms
Model: roberta-base newsqa Base, Input Length 117: 134.327 ms
Model: roberta-base newsqa Base, Input Length 152: 162.418 ms
Model: roberta-base newsqa Base, Input Length 65: 72.900 ms
Model: roberta-base newsqa Base, Input Length 41: 67.568 ms
Model: roberta-base newsqa Base, Input Length 115: 153.458 ms
Model: roberta-base newsqa Base, Input Length 247: 306.491 ms
Model: roberta-base newsqa Base, Input Length 172: 238.436 ms
Model: roberta-base newsqa Base, Input Length 140: 240.069 ms
Model: roberta-base newsqa Base, Input Length 47: 78.784 ms
Model: roberta-base newsqa Base, Input Length 112: 127.019 ms
Model: roberta-base newsqa Base, Input Length 64: 82.661 ms
Model: roberta-base newsqa Base, Input Length 188: 251.251 ms
Model: roberta-base newsqa Base, Input Length 133: 157.545 ms
Model: roberta-ba

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 633kB/s]
Downloading: 100%|██████████| 2.27k/2.27k [00:00<00:00, 1.17MB/

Model: bert-base-uncased squad_v2 Base, Input Length 124: 106.587 ms
Model: bert-base-uncased squad_v2 Base, Input Length 65: 55.659 ms
Model: bert-base-uncased squad_v2 Base, Input Length 149: 106.271 ms
Model: bert-base-uncased squad_v2 Base, Input Length 117: 106.950 ms
Model: bert-base-uncased squad_v2 Base, Input Length 152: 121.584 ms
Model: bert-base-uncased squad_v2 Base, Input Length 65: 53.609 ms
Model: bert-base-uncased squad_v2 Base, Input Length 41: 46.662 ms
Model: bert-base-uncased squad_v2 Base, Input Length 115: 97.843 ms
Model: bert-base-uncased squad_v2 Base, Input Length 247: 204.948 ms
Model: bert-base-uncased squad_v2 Base, Input Length 172: 142.910 ms
Model: bert-base-uncased squad_v2 Base, Input Length 140: 161.376 ms
Model: bert-base-uncased squad_v2 Base, Input Length 47: 76.685 ms
Model: bert-base-uncased squad_v2 Base, Input Length 112: 124.011 ms
Model: bert-base-uncased squad_v2 Base, Input Length 64: 67.125 ms
Model: bert-base-uncased squad_v2 Base, Input

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3608.00it/s]


Model: bert-base-uncased squad_v2 Base, Input Length 124: 129.954 ms
Model: bert-base-uncased squad_v2 Base, Input Length 65: 93.262 ms
Model: bert-base-uncased squad_v2 Base, Input Length 149: 136.117 ms
Model: bert-base-uncased squad_v2 Base, Input Length 117: 136.122 ms
Model: bert-base-uncased squad_v2 Base, Input Length 152: 146.389 ms
Model: bert-base-uncased squad_v2 Base, Input Length 65: 110.585 ms
Model: bert-base-uncased squad_v2 Base, Input Length 41: 63.985 ms
Model: bert-base-uncased squad_v2 Base, Input Length 115: 170.446 ms
Model: bert-base-uncased squad_v2 Base, Input Length 247: 370.848 ms
Model: bert-base-uncased squad_v2 Base, Input Length 172: 211.298 ms
Model: bert-base-uncased squad_v2 Base, Input Length 140: 210.021 ms
Model: bert-base-uncased squad_v2 Base, Input Length 47: 74.398 ms
Model: bert-base-uncased squad_v2 Base, Input Length 112: 137.801 ms
Model: bert-base-uncased squad_v2 Base, Input Length 64: 83.465 ms
Model: bert-base-uncased squad_v2 Base, Inp

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 2914.06it/s]


Model: bert-base-uncased squad_v2 Base, Input Length 124: 94.130 ms
Model: bert-base-uncased squad_v2 Base, Input Length 65: 71.412 ms
Model: bert-base-uncased squad_v2 Base, Input Length 149: 135.116 ms
Model: bert-base-uncased squad_v2 Base, Input Length 117: 103.304 ms
Model: bert-base-uncased squad_v2 Base, Input Length 152: 139.842 ms
Model: bert-base-uncased squad_v2 Base, Input Length 65: 61.853 ms
Model: bert-base-uncased squad_v2 Base, Input Length 41: 52.884 ms
Model: bert-base-uncased squad_v2 Base, Input Length 115: 128.304 ms
Model: bert-base-uncased squad_v2 Base, Input Length 247: 226.858 ms
Model: bert-base-uncased squad_v2 Base, Input Length 172: 202.111 ms
Model: bert-base-uncased squad_v2 Base, Input Length 140: 154.624 ms
Model: bert-base-uncased squad_v2 Base, Input Length 47: 59.243 ms
Model: bert-base-uncased squad_v2 Base, Input Length 112: 107.720 ms
Model: bert-base-uncased squad_v2 Base, Input Length 64: 60.832 ms
Model: bert-base-uncased squad_v2 Base, Input

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 476kB/s]
Downloading: 100%|██████████| 2.24k/2.24k [00:00<00:00, 1.08MB/

Model: bert-base-uncased quoref Base, Input Length 124: 116.606 ms
Model: bert-base-uncased quoref Base, Input Length 65: 63.928 ms
Model: bert-base-uncased quoref Base, Input Length 149: 129.087 ms
Model: bert-base-uncased quoref Base, Input Length 117: 104.233 ms
Model: bert-base-uncased quoref Base, Input Length 152: 130.315 ms
Model: bert-base-uncased quoref Base, Input Length 65: 57.959 ms
Model: bert-base-uncased quoref Base, Input Length 41: 46.338 ms
Model: bert-base-uncased quoref Base, Input Length 115: 111.476 ms
Model: bert-base-uncased quoref Base, Input Length 247: 219.430 ms
Model: bert-base-uncased quoref Base, Input Length 172: 181.861 ms
Model: bert-base-uncased quoref Base, Input Length 140: 137.451 ms
Model: bert-base-uncased quoref Base, Input Length 47: 53.801 ms
Model: bert-base-uncased quoref Base, Input Length 112: 92.489 ms
Model: bert-base-uncased quoref Base, Input Length 64: 54.882 ms
Model: bert-base-uncased quoref Base, Input Length 188: 164.758 ms
Model:

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

Model: roberta-base quoref Base, Input Length 124: 277.090 ms
Model: roberta-base quoref Base, Input Length 65: 183.142 ms
Model: roberta-base quoref Base, Input Length 149: 300.091 ms
Model: roberta-base quoref Base, Input Length 117: 250.935 ms
Model: roberta-base quoref Base, Input Length 152: 323.831 ms
Model: roberta-base quoref Base, Input Length 65: 129.762 ms
Model: roberta-base quoref Base, Input Length 41: 112.349 ms
Model: roberta-base quoref Base, Input Length 115: 224.476 ms
Model: roberta-base quoref Base, Input Length 247: 473.247 ms
Model: roberta-base quoref Base, Input Length 172: 340.396 ms
Model: roberta-base quoref Base, Input Length 140: 361.689 ms
Model: roberta-base quoref Base, Input Length 47: 128.937 ms
Model: roberta-base quoref Base, Input Length 112: 222.326 ms
Model: roberta-base quoref Base, Input Length 64: 143.466 ms
Model: roberta-base quoref Base, Input Length 188: 426.572 ms
Model: roberta-base quoref Base, Input Length 133: 310.962 ms
Model: robert

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 2598.43it/s]


Model: bert-base-uncased squad Base, Input Length 124: 257.386 ms
Model: bert-base-uncased squad Base, Input Length 65: 153.091 ms
Model: bert-base-uncased squad Base, Input Length 149: 285.788 ms
Model: bert-base-uncased squad Base, Input Length 117: 273.396 ms
Model: bert-base-uncased squad Base, Input Length 152: 283.908 ms
Model: bert-base-uncased squad Base, Input Length 65: 149.564 ms
Model: bert-base-uncased squad Base, Input Length 41: 105.916 ms
Model: bert-base-uncased squad Base, Input Length 115: 257.647 ms
Model: bert-base-uncased squad Base, Input Length 247: 501.380 ms
Model: bert-base-uncased squad Base, Input Length 172: 303.966 ms
Model: bert-base-uncased squad Base, Input Length 140: 305.084 ms
Model: bert-base-uncased squad Base, Input Length 47: 178.284 ms
Model: bert-base-uncased squad Base, Input Length 112: 307.374 ms
Model: bert-base-uncased squad Base, Input Length 64: 204.366 ms
Model: bert-base-uncased squad Base, Input Length 188: 519.843 ms
Model: bert-bas

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

Model: roberta-base squad Base, Input Length 124: 129.348 ms
Model: roberta-base squad Base, Input Length 65: 70.272 ms
Model: roberta-base squad Base, Input Length 149: 147.098 ms
Model: roberta-base squad Base, Input Length 117: 115.745 ms
Model: roberta-base squad Base, Input Length 152: 141.902 ms
Model: roberta-base squad Base, Input Length 65: 49.958 ms
Model: roberta-base squad Base, Input Length 41: 59.073 ms
Model: roberta-base squad Base, Input Length 115: 105.851 ms
Model: roberta-base squad Base, Input Length 247: 217.190 ms
Model: roberta-base squad Base, Input Length 172: 143.796 ms
Model: roberta-base squad Base, Input Length 140: 207.963 ms
Model: roberta-base squad Base, Input Length 47: 53.535 ms
Model: roberta-base squad Base, Input Length 112: 112.396 ms
Model: roberta-base squad Base, Input Length 64: 64.092 ms
Model: roberta-base squad Base, Input Length 188: 181.547 ms
Model: roberta-base squad Base, Input Length 133: 136.148 ms
Model: roberta-base squad Base, In

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 2379.97it/s]


Model: bert-base-uncased squad_v2 Base, Input Length 124: 363.778 ms
Model: bert-base-uncased squad_v2 Base, Input Length 65: 212.231 ms
Model: bert-base-uncased squad_v2 Base, Input Length 149: 359.966 ms
Model: bert-base-uncased squad_v2 Base, Input Length 117: 269.155 ms
Model: bert-base-uncased squad_v2 Base, Input Length 152: 365.187 ms
Model: bert-base-uncased squad_v2 Base, Input Length 65: 206.530 ms
Model: bert-base-uncased squad_v2 Base, Input Length 41: 178.367 ms
Model: bert-base-uncased squad_v2 Base, Input Length 115: 303.521 ms
Model: bert-base-uncased squad_v2 Base, Input Length 247: 599.686 ms
Model: bert-base-uncased squad_v2 Base, Input Length 172: 458.799 ms
Model: bert-base-uncased squad_v2 Base, Input Length 140: 392.675 ms
Model: bert-base-uncased squad_v2 Base, Input Length 47: 163.759 ms
Model: bert-base-uncased squad_v2 Base, Input Length 112: 264.210 ms
Model: bert-base-uncased squad_v2 Base, Input Length 64: 166.017 ms
Model: bert-base-uncased squad_v2 Base,

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

Model: roberta-base squad_v2 Base, Input Length 124: 122.568 ms
Model: roberta-base squad_v2 Base, Input Length 65: 84.329 ms
Model: roberta-base squad_v2 Base, Input Length 149: 161.502 ms
Model: roberta-base squad_v2 Base, Input Length 117: 132.132 ms
Model: roberta-base squad_v2 Base, Input Length 152: 156.511 ms
Model: roberta-base squad_v2 Base, Input Length 65: 73.705 ms
Model: roberta-base squad_v2 Base, Input Length 41: 47.218 ms
Model: roberta-base squad_v2 Base, Input Length 115: 116.498 ms
Model: roberta-base squad_v2 Base, Input Length 247: 230.104 ms
Model: roberta-base squad_v2 Base, Input Length 172: 164.456 ms
Model: roberta-base squad_v2 Base, Input Length 140: 170.507 ms
Model: roberta-base squad_v2 Base, Input Length 47: 67.002 ms
Model: roberta-base squad_v2 Base, Input Length 112: 111.521 ms
Model: roberta-base squad_v2 Base, Input Length 64: 67.071 ms
Model: roberta-base squad_v2 Base, Input Length 188: 190.830 ms
Model: roberta-base squad_v2 Base, Input Length 13

In [42]:
# Read "logger_all.csv" (path relative to the current working directory) into a DataFrame.
# NOTE(review): presumably the per-model timing log produced by the benchmark cells — confirm.
df = pd.read_csv(filepath_or_buffer="logger_all.csv")

In [22]:
# # Test all other extractive QA models
# from huggingface_hub import hf_hub_download
# #test other models for extractive qa
# roberta_drop_onnx = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-drop-onnx', filename='model.onnx')
# roberta_drop_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-drop-onnx', filename='model_quant.onnx')

# roberta_squad2_onnx = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-squad_v2-onnx', filename='model.onnx')
# roberta_squad2_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-squad_v2-onnx', filename='model_quant.onnx')

# bert_squad2_onnx = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-squad_v2-onnx', filename='model.onnx')
# bert_squad2_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-squad_v2-onnx', filename='model_quant.onnx')

# roberta_squad_onnx = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-squad-onnx', filename='model.onnx')
# roberta_squad_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-squad-onnx', filename='model_quant.onnx')

# bert_squad_onnx = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-squad-onnx', filename='model.onnx')
# bert_squad_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-squad-onnx', filename='model_quant.onnx')

# roberta_quoref_onnx = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-quoref-onnx', filename='model.onnx')
# roberta_quoref_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-quoref-onnx', filename='model_quant.onnx')

# bert_quoref_onnx = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-quoref-onnx', filename='model.onnx')
# bert_quoref_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-quoref-onnx', filename='model_quant.onnx')

# roberta_newsqa_onnx = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-newsqa-onnx', filename='model.onnx')
# roberta_newsqa_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-newsqa-onnx', filename='model_quant.onnx')

# bert_newsqa_onnx = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-newsqa-onnx', filename='model.onnx')
# bert_newsqa_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-newsqa-onnx', filename='model_quant.onnx')

# roberta_hotpotqa_onnx = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-hotpotqa-onnx', filename='model.onnx')
# roberta_hotpotqa_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-hotpotqa-onnx', filename='model_quant.onnx')

# bert_hotpotqa_onnx = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-hotpotqa-onnx', filename='model.onnx')
# bert_hotpotqa_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-hotpotqa-onnx', filename='model_quant.onnx')

In [27]:
# def run_n_times(func, model, inputs, n = 50): 
#     st= time.time()
#     for _ in range(n):
#         func(model, inputs)
#     et = time.time()
#     avg_inference_time = 1000 * (et - st) / n
#     # message = "Average inference time for %s (n=%d): %.2fms " % (desc, n, avg_inference_time)
#     return avg_inference_time

# def run_torch(model, inputs):
#     with torch.no_grad():
#         model(**inputs)

# def run_onnx(qa_model, onnx_inputs):
#     qa_model.run(output_names=["start_logits", "end_logits"], input_feed=dict(onnx_inputs))   

In [18]:
# import matplotlib.pyplot as plt

# def performance_all(perf_type, name, model, data_id=None): #TODO add truncation
#     print(f"Starting: {name}")
#     X = []
#     time_measurements = []
#     df = pd.DataFrame(columns=["model_name", "time once (ms)", "average_time 50 times (ms)", "seq_length", "context", "question", "data_id"])

#     # for i in range(6):
#     #     context = (100*i+1)*"World "
#     #     question = "Hello"
    

#     for i in range(0, len(data["context"]), 25):
#         context = data["context"][i]
#         question = data["question"][i]

#         #test specific id
#         if data_id: context, question = test_specific_id(data, data_id)


#         if perf_type == "base":
#             inputs = tokenizer(question, context, return_tensors="pt")
#             time_once = run_n_times(run_torch, model, inputs, 1)
#             # average_time = run_n_times(run_torch, model, inputs, 50)
        
#         elif perf_type == "seq_length":
#             inputs = tokenizer(question, context, return_tensors="np")
#             inputs = {key: np.array(inputs[key], dtype=np.int64) for key in inputs}
#             time_once = run_n_times(run_onnx, model, inputs, 1) # just run once
#             # average_time = run_n_times(run_onnx, model, inputs, 50) # just run once
        
#         seq_length = len(context.split()) # TODO -> reduce stopwords? Real Tokenization?
#         X.append(seq_length)
#         # time_measurements.append(average_time)
#         average_time = ""
#         time_measurements.append(time_once)

#         # print("Model: {}, Input Length {}: {:.3f} ms".format(name, seq_length, average_time))
#         print("Model: {}, Input Length {}: {:.3f} ms".format(name, seq_length, time_once))

#         df.loc[len(df)] = [name, time_once, average_time, seq_length, context, question, data["id"][i]]

#         #test specific id
#         if data_id: break
   
        
        
#     save_df(df)
    
#     plt.scatter(X, time_measurements, label=name)
#     print(f"Done: {name}")

In [21]:
# performance_all("base", "DROP BERT Base", model)

# performance_all("seq_length", "DROP BERT ONNX", onnx_model)
# performance_all("seq_length", "DROP BERT ONNX - OPT", opt_session_onnx)

# performance_all("seq_length", "DROP BERT ONNX Quantized", onnx_model_quant)
# performance_all("seq_length", "DROP BERT ONNX Quantized - OPT", opt_session_onnx_quant)

# plt.xlabel("Sequence Length (tokens)")
# plt.ylabel("Average Inference Time (ms)")
# plt.legend()
# plt.show()

In [9]:
# import matplotlib.pyplot as plt

# def performance_seq_length(name, model):
#     X = []
#     time_measurements = []

#     for i in range(6):
#         context = (100*i+1)*"World "
#         question = "Hello"

#         inputs = tokenizer(question, context, return_tensors="np")
#         inputs = {key: np.array(inputs[key], dtype=np.int64) for key in inputs}
#         average_time = run_n_times(run_onnx, model, inputs)

#         seq_length = 100*i+5
#         print("Input Length {}: {:.3f} ms".format(seq_length, average_time))
#         X.append(seq_length)
#         time_measurements.append(average_time)

#     plt.scatter(X, time_measurements, label=name)

# def base_performance(name, model):
#     X = []
#     time_measurements = []

#     for i in range(6):
#         context = (100*i+1)*"World "
#         question = "Hello"

#         inputs = tokenizer(question, context, return_tensors="pt")
#         average_time = run_n_times(run_torch, model, inputs)
        
#         seq_length = 100*i+5 
#         print("Input Length {}: {:.3f} ms".format(seq_length, average_time))
#         X.append(seq_length)
#         time_measurements.append(average_time)

#     plt.scatter(X, time_measurements, label=name)


# onnx_model = InferenceSession(
#     "onnx/dropbert/model.onnx", providers=["CPUExecutionProvider"]
# )

# onnx_model_quant = InferenceSession(
#     "onnx/dropbert/model_quant.onnx", providers=["CPUExecutionProvider"]
# )

# base_performance("DROP BERT Base", model)
# performance_seq_length("DROP BERT ONNX", onnx_model)
# performance_seq_length("DROP BERT ONNX Quantized", onnx_model_quant)

# plt.xlabel("Sequence Length (tokens)")
# plt.ylabel("Average Inference Time (ms)")
# plt.legend()
# plt.show()

### Multiple-Choice (CosmosQA BERT)

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
# adapter_name = model.load_adapter("AdapterHub/bert-base-uncased-pf-cosmos_qa", source="hf")
# model.active_adapters = adapter_name

# def mc_model_inference(question, context, choices):
#     outputs = []

#     raw_input = [[context, question + " " + choice] for choice in choices]
#     inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="pt")

#     with torch.no_grad():
#         outputs = model(**inputs)

#     answer_idx = torch.argmax(outputs.logits)
#     return choices[answer_idx]

# question = "What animal has the most hair?"
# context = "Fish are typically not hairy. Cats have 10g of hair. Tigers have 50g of hair. Horses have 100g of hair."
# answer0 = "Tiger"
# answer1= "Cat"
# answer2= "Horse"
# answer3= "Fish"
# choices = [answer0, answer1, answer2, answer3]
# answer = mc_model_inference(question, context, choices)
# print(answer)

In [None]:
# config = AutoConfig.from_pretrained("bert-base-uncased")
# onnx_config = DropBertOnnxConfig(config, task="question-answering")

# onnx_path = Path("onnx/cosmosqabert/model.onnx")

# onnx_inputs, onnx_outputs = export(tokenizer, model, onnx_config, onnx_config.default_onnx_opset, onnx_path)

# onnx_model = onnx.load(onnx_path)
# onnx.checker.check_model(onnx_model)

In [None]:
# def onnx_inference(onnx_path, question, context, choices):
#     onnx_model = InferenceSession(
#         str(onnx_path), providers=["CPUExecutionProvider"]
#     )

#     raw_input = [[context, question + " " + choice] for choice in choices]
#     inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="np")

#     outputs = onnx_model.run(input_feed=dict(inputs), output_names=None)

#     answer_idx = np.argmax(np.abs(np.sum(outputs[0], axis=1)))
#     return choices[answer_idx]

# answer = onnx_inference(onnx_path, question, context, choices)
# print(answer)

In [9]:
# def sorter(lst):
#     ls = [0]*len(lst)
#     for i in range(len(lst)):
#         space = 0
#         t = lst[i]
#         for j in t:
#             if j == " ":
#                 space += 1
#         ls[i] = space+1
#     return ls

# def sorting_data(lst):
#     lst.sort(key=len)
#     return lst

# def sorting_data2(lst):
#     h = [i.split() for i in lst]
#     m =  sorted(h, key=len)
#     return [" ".join(n) for n in m]
    
# def get_question(sorted_context_set_index, set_of_sorted_context_list):
#     original_index = data["context"].index(set_of_sorted_context_list[sorted_context_set_index])
#     return data["question"][original_index]

# def get_random_indexes_list(amount=41, intervals=50): #  2067 unique indexes
#     max = 2066 # TODO
#     index_list = []
#     for i in range(amount):
#         rand_index = random.randint(i*intervals, (i+1)*intervals)
#         if rand_index > max:
#             rand_index = max
#         index_list.append(rand_index)
#     return index_list

# def get_specific_indexes_list(start, intervals):
#     max = 2066 # TODO
#     index_list = []
#     for i in range(start, max, intervals):
#         rand_index = random.randint(i, i+intervals)
#         if rand_index > max:
#             rand_index = max
#         index_list.append(rand_index)
#     return index_list

In [None]:
# import matplotlib.pyplot as plt


# # Idea -> only use each context once; that still leaves > 2k unique contexts,
# # but then only one of the multiple questions per context is used
# sorted_context_list = sorting_data(data["context"])
# set_of_sorted_context_list = sorting_data(list(set(sorted_context_list)))

# def performance(perf_type, name, model, random_indexes):
#     X = []
#     time_measurements = []

#     for random_index in random_indexes:

#         context = set_of_sorted_context_list[random_index]
#         question = get_question(random_index, set_of_sorted_context_list)  


#         if perf_type == "base":
#             inputs = tokenizer(question, context, return_tensors="pt")
#             average_time = run_n_times(run_torch, model, inputs)
#         elif perf_type == "seq_length":
#             inputs = tokenizer(question, context, return_tensors="np")
#             inputs = {key: np.array(inputs[key], dtype=np.int64) for key in inputs}
#             average_time = run_n_times(run_onnx, model, inputs)
            
#         seq_length = len(context.split()) # TODO -> reduce stopwords? Real Tokenization?
#         print("Input Length {}: {:.3f} ms".format(seq_length, average_time))
#         X.append(seq_length)
#         time_measurements.append(average_time)

#     plt.scatter(X, time_measurements, label=name)

In [4]:
# from huggingface_hub import hf_hub_download
# #test other models for extractive qa
# roberta_drop_onnx = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-drop-onnx', filename='model.onnx')
# roberta_drop_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-drop-onnx', filename='model_quant.onnx')

# roberta_squad2_onnx = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-squad_v2-onnx', filename='model.onnx')
# roberta_squad2_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-squad_v2-onnx', filename='model_quant.onnx')

# bert_squad2_onnx = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-squad_v2-onnx', filename='model.onnx')
# bert_squad2_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-squad_v2-onnx', filename='model_quant.onnx')


# roberta_squad_onnx = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-squad-onnx', filename='model.onnx')
# roberta_squad_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-squad-onnx', filename='model_quant.onnx')


# bert_squad_onnx = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-squad-onnx', filename='model.onnx')
# bert_squad_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-squad-onnx', filename='model_quant.onnx')


# roberta_quoref_onnx = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-quoref-onnx', filename='model.onnx')
# roberta_quoref_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-quoref-onnx', filename='model_quant.onnx')


# bert_quoref_onnx = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-quoref-onnx', filename='model.onnx')
# bert_quoref_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-quoref-onnx', filename='model_quant.onnx')


# roberta_newsqa_onnx = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-newsqa-onnx', filename='model.onnx')
# roberta_newsqa_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-newsqa-onnx', filename='model_quant.onnx')


# bert_newsqa_onnx = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-newsqa-onnx', filename='model.onnx')
# bert_newsqa_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-newsqa-onnx', filename='model_quant.onnx')


# roberta_hotpotqa_onnx = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-hotpotqa-onnx', filename='model.onnx')
# roberta_hotpotqa_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-hotpotqa-onnx', filename='model_quant.onnx')


# bert_hotpotqa_onnx = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-hotpotqa-onnx', filename='model.onnx')
# bert_hotpotqa_onnx_quant = hf_hub_download(repo_id='UKP-SQuARE/bert-base-uncased-pf-hotpotqa-onnx', filename='model_quant.onnx')

Downloading: 100%|██████████| 500M/500M [01:14<00:00, 6.74MB/s] 
Downloading: 100%|██████████| 126M/126M [00:20<00:00, 6.15MB/s] 
Downloading: 100%|██████████| 439M/439M [01:02<00:00, 7.04MB/s] 
Downloading: 100%|██████████| 111M/111M [00:15<00:00, 6.95MB/s] 
Downloading: 100%|██████████| 500M/500M [01:11<00:00, 7.00MB/s] 
Downloading: 100%|██████████| 126M/126M [00:18<00:00, 6.97MB/s] 
Downloading: 100%|██████████| 439M/439M [01:02<00:00, 7.06MB/s] 
Downloading: 100%|██████████| 111M/111M [00:16<00:00, 6.88MB/s] 
Downloading: 100%|██████████| 500M/500M [01:12<00:00, 6.94MB/s] 
Downloading: 100%|██████████| 126M/126M [00:18<00:00, 6.90MB/s] 
Downloading: 100%|██████████| 439M/439M [01:03<00:00, 6.92MB/s] 
Downloading: 100%|██████████| 111M/111M [00:16<00:00, 6.89MB/s] 
Downloading: 100%|██████████| 500M/500M [01:10<00:00, 7.07MB/s] 
Downloading: 100%|██████████| 126M/126M [00:18<00:00, 6.96MB/s] 
Downloading: 100%|██████████| 439M/439M [01:02<00:00, 7.07MB/s] 
Downloading: 100%|███████