## Extractive QA Pipeline

In [1]:
from datasets import load_dataset
import evaluate
from transformers import AutoModelWithHeads, AutoTokenizer

from onnxruntime import InferenceSession

import time
from typing import Tuple, Union
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def timing(f):
    """
    Decorator that measures and prints how long a call to ``f`` takes.

    Args:
        f: the callable to time.
    Returns:
        A wrapper that calls ``f`` with the given arguments, prints the
        elapsed time in seconds and returns the tuple
        ``(result_of_f, elapsed_seconds)``.
    """
    import functools

    # Preserve the wrapped function's name and docstring.
    @functools.wraps(f)
    def wrap(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        started = time.perf_counter()
        ret = f(*args, **kwargs)
        elapsed = time.perf_counter() - started

        print("Time: {:.3f} s".format(elapsed))
        return ret, elapsed
    return wrap

In [3]:
def preprocessing(question, context, tokenizer):
    """
    Tokenize a (question, context) pair into NumPy model inputs.

    The tokenizer is called with padding and truncation enabled and is
    asked to return NumPy tensors; its result is returned unchanged.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "return_tensors": "np",
    }
    return tokenizer(question, context, **tokenizer_options)

def postprocessing(outputs, inputs, tokenizer):
    """
    Turn raw start/end scores into a decoded answer string.

    ``outputs[0]`` holds the start scores and ``outputs[1]`` the end scores.
    The highest-scoring start and end positions delimit the answer span
    (end inclusive), which is cut out of the first row of
    ``inputs['input_ids']`` and passed to ``tokenizer.decode``.
    """
    first_token = np.argmax(outputs[0])
    # +1 because the end position is inclusive while slicing is exclusive.
    stop_token = np.argmax(outputs[1]) + 1
    answer_ids = inputs['input_ids'][0, first_token:stop_token]
    return tokenizer.decode(answer_ids)

def onnx_inference(onnx_path, tokenizer, question, context):
    """
    Answer ``question`` from ``context`` with the ONNX model at ``onnx_path``.

    A new inference session is created on every call; the inputs are
    tokenized, run through the session and decoded into the answer text.
    """
    session = get_onnx_model(onnx_path)

    encoded = preprocessing(question, context, tokenizer)
    # output_names=None requests every output of the model.
    raw_outputs = session.run(output_names=None, input_feed=dict(encoded))
    return postprocessing(raw_outputs, encoded, tokenizer)

def get_onnx_model(onnx_path):
    """Create a CPU-only ONNX Runtime session for the model file at ``onnx_path``."""
    model_file = str(onnx_path)
    cpu_providers = ["CPUExecutionProvider"]
    return InferenceSession(model_file, providers=cpu_providers)

In [5]:
# API Parameters
# `base_model` selects the tokenizer; `head` is currently only referenced by
# the TODO below (the ONNX file path is hard-coded).
base_model = 'bert-base-uncased'
head = 'AdapterHub/bert-base-uncased-pf-drop'
context = 'ONNX is an open format built to represent machine learning models. The key benefits of using ONNX are interoperability of frameworks and HARDware optimization.'
question = 'What are advantages of ONNX?'

# The tokenizer created here is reused by the later cells.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# TODO Replace hardcoded string with directory structure
# (str.join takes a single iterable, so the parts must be wrapped in a list)
# onnx_path = "/".join(["onnx", base_model, head + ".onnx"])
answer = onnx_inference("onnx/dropbert/model.onnx", tokenizer, question, context)
print(answer)

interoperability of frameworks and hardware optimization


In [6]:
def decode(
    start_: np.ndarray,
    end_: np.ndarray,
    topk: int,
    max_answer_len: int,
    undesired_tokens_: np.ndarray,
) -> Tuple:
    """
    Take the output of any :obj:`ModelForQuestionAnswering` and
        will generate probabilities for each span to be the
        actual answer.
    In addition, it filters out some unwanted/impossible cases
    like answer len being greater than max_answer_len or
    answer end position being before the starting position.
    The method supports output the k-best answer through
    the topk argument.
    Args:
        start_ (:obj:`np.ndarray`): Individual start
            probabilities for each token.
        end_ (:obj:`np.ndarray`): Individual end probabilities
            for each token.
        topk (:obj:`int`): Indicates how many possible answer
            span(s) to extract from the model output.
        max_answer_len (:obj:`int`): Maximum size of the answer
            to extract from the model's output.
        undesired_tokens_ (:obj:`np.ndarray`): Mask determining
            tokens that can be part of the answer. Positions with a
            non-zero value are the ones allowed as span boundaries.
    Returns:
        A tuple ``(starts_, ends_, scores_)`` of arrays holding the start
        token index, end token index and score of every kept span, ordered
        by decreasing score. Scores are read from the first batch item only.
    """
    # Ensure we have batch axis
    if start_.ndim == 1:
        start_ = start_[None]

    if end_.ndim == 1:
        end_ = end_[None]

    # Compute the score of each tuple(start_, end_) to be the real answer
    outer = np.matmul(np.expand_dims(start_, -1), np.expand_dims(end_, 1))

    # Remove candidate with end_ < start_ and end_ - start_ > max_answer_len
    candidates = np.tril(np.triu(outer), max_answer_len - 1)

    #  Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
    scores_flat = candidates.flatten()
    if topk == 1:
        idx_sort = [np.argmax(scores_flat)]
    elif len(scores_flat) <= topk:
        # `<=` rather than `<`: argpartition requires kth < len(scores_flat),
        # so asking for exactly as many spans as exist must take this branch.
        idx_sort = np.argsort(-scores_flat)
    else:
        idx = np.argpartition(-scores_flat, topk)[0:topk]
        idx_sort = idx[np.argsort(-scores_flat[idx])]

    # Drop the batch index and keep only (start, end) token positions.
    starts_, ends_ = np.unravel_index(idx_sort, candidates.shape)[1:]

    # Keep only spans whose two boundaries are both allowed positions.
    allowed_positions = undesired_tokens_.nonzero()
    desired_spans = np.isin(starts_, allowed_positions) & np.isin(
        ends_, allowed_positions
    )
    starts_ = starts_[desired_spans]
    ends_ = ends_[desired_spans]
    scores_ = candidates[0, starts_, ends_]

    return starts_, ends_, scores_

In [11]:
# Code from SQuARE ONNX QA Pipeline (note: some features like explainability and attack mode have been removed)
def question_answering(model_qa, tokenizer, input, preprocessing_kwargs, task_kwargs, model_kwargs):
    """
    Span-based question answering for a given question and context.
    We expect the input to use the (question, context) format for the text pairs.
    Args:
        model_qa: ONNX inference session; its first two outputs are used as
            start and end logits.
        tokenizer: tokenizer used to encode the text pairs. The returned
            encoding must provide ``sequence_ids`` and per-example
            encodings (presumably a fast tokenizer - TODO confirm).
        input: list of (question, context) pairs.
        preprocessing_kwargs: extra tokenizer arguments. ``truncation`` is
            always overridden with ``"only_second"``; the caller's dict is
            not modified.
        task_kwargs: optional ``topk`` (default 1), ``max_answer_len``
            (default 128) and ``show_null_answers`` (default True).
        model_kwargs: unused, kept for signature compatibility.
    Returns:
        A tuple ``(predictions, task_outputs, original_ans_start,
        original_ans_end)``. ``task_outputs["answers"]`` holds one list of
        answer dicts per input pair. The last two values are the word
        indices of the best span of the LAST input pair, or ``None`` when
        there is no input or no span survived filtering.
    """
    # Copy instead of mutating the dict owned by the caller.
    tokenizer_kwargs = {**preprocessing_kwargs, "truncation": "only_second"}
    topk = task_kwargs.get("topk", 1)
    max_answer_len = task_kwargs.get("max_answer_len", 128)

    features = tokenizer(
        input, return_tensors="np", **tokenizer_kwargs
    )

    predictions_onnx = model_qa.run(input_feed=dict(features), output_names=None)
    predictions = {
        "start_logits": predictions_onnx[0],
        "end_logits": predictions_onnx[1]
    }

    task_outputs = {
        "answers": [],
        "attributions": [],
        "adversarial": {
            "indices": [],
        },  # for hotflip, input_reduction and topk
    }

    # Defined up front so the return statement works for empty input.
    original_ans_start = original_ans_end = None

    for idx, (start, end, (_, context)) in enumerate(
            zip(predictions["start_logits"], predictions["end_logits"], input)
    ):
        # Ensure padded tokens & question tokens cannot
        # belong to the set of candidate answers.
        # After this line the array is 1 where the sequence id equals 1
        # (second text of the pair, i.e. the context) and 0 elsewhere.
        question_tokens = np.abs(np.array([s != 1 for s in features.sequence_ids(idx)]) - 1)
        # Unmask CLS token for 'no answer'
        question_tokens[0] = 1
        undesired_tokens = question_tokens & features["attention_mask"][idx]

        # Generate mask
        undesired_tokens_mask = undesired_tokens == 0.0

        # Make sure non-context indexes in the tensor cannot
        # contribute to the softmax
        start = np.where(undesired_tokens_mask, -10000.0, start)
        end = np.where(undesired_tokens_mask, -10000.0, end)

        # Softmax over the token axis.
        start = np.exp(start - np.log(np.sum(np.exp(start), axis=-1, keepdims=True)))
        end = np.exp(end - np.log(np.sum(np.exp(end), axis=-1, keepdims=True)))

        # Get score for 'no answer' then mask for decoding step (CLS token)
        no_answer_score = (start[0] * end[0]).item()
        start[0] = end[0] = 0.0

        starts, ends, scores = decode(
            start,
            end,
            topk,
            max_answer_len,
            undesired_tokens,
        )

        enc = features[idx]
        if len(starts) > 0:
            original_ans_start = enc.token_to_word(starts[0])
            original_ans_end = enc.token_to_word(ends[0])
        else:
            # No span survived the filtering in decode().
            original_ans_start = original_ans_end = None

        answers = []
        for s, e, score in zip(starts, ends, scores):
            # Map token positions back to character offsets in the context.
            char_start = enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0]
            char_end = enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1]
            answers.append(
                {
                    "score": score.item(),
                    "start": char_start,
                    "end": char_end,
                    "answer": context[char_start:char_end],
                }
            )
        if task_kwargs.get("show_null_answers", True):
            answers.append({"score": no_answer_score, "start": 0, "end": 0, "answer": ""})
        answers = sorted(answers, key=lambda x: x["score"], reverse=True)[:topk]
        task_outputs["answers"].append(answers)

    return predictions, task_outputs, original_ans_start, original_ans_end

In [12]:
# Tokenizer options shared by the QA functions. question_answering forces
# truncation="only_second" for its own tokenizer call regardless of the
# value given here.
preprocessing_kwargs = {
    'padding': True, 'truncation': True 
}

# Decoding options: return only the single best span and no 'no answer' entry.
task_kwargs = {
    "show_null_answers": False,
    'topk': 1,
    'max_answer_len': 128
}

# NOTE(review): the empty-string key looks like a placeholder; base_predict
# forwards these kwargs to the model call - confirm this is intended.
model_kwargs = {
    "": {}
}

# Three questions asked against the same context, as (question, context) pairs.
context = "Angela Merkel (CDU) was the chancelor of Germany. The current chancelor is Olaf Scholz (SPD)."
inputs = [["Who was the chancelor of Germany?", context], ["Who is the current chancelor of Germany?", context], ["Whats the name of Angela Merkel's party?", context]]

# CPU-only ONNX Runtime session for the exported model file.
model_qa = InferenceSession(
        str("onnx/squadbert/model.onnx"), providers=["CPUExecutionProvider"]
)

# Only the task outputs (the answers) are needed here.
_, task_outputs, _, _ = question_answering(model_qa, tokenizer, inputs, preprocessing_kwargs, task_kwargs, model_kwargs)

# Print each question followed by its answers and their scores.
for i, q in enumerate(task_outputs["answers"]):
    print(inputs[i][0])
    for a in q:
        print("%s (score: %.2f)" % (a['answer'], a['score']))

    print()

Who was the chancelor of Germany?
Angela Merkel (score: 0.79)

Who is the current chancelor of Germany?
Olaf Scholz (score: 0.83)

Whats the name of Angela Merkel's party?
CDU (score: 0.53)



### Accuracy on SQuAD

In [14]:
# First 100 examples of the SQuAD validation split, plus the SQuAD metric
# (reports exact match and F1, as printed by squad_evaluate).
data = load_dataset("squad", split='validation[:100]')
metric = evaluate.load("squad")


# PyTorch baseline: base model with the SQuAD adapter loaded and activated.
model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
adapter_name = model.load_adapter("AdapterHub/bert-base-uncased-pf-squad", source="hf")
model.active_adapters = adapter_name

# CPU-only ONNX Runtime session for the exported model file.
onnx_model = InferenceSession(
        str("onnx/squadbert/model.onnx"), providers=["CPUExecutionProvider"]
)

# CPU-only session for the quantized model file.
onnx_model_quant = InferenceSession(
        str("onnx/squadbert/model_quant.onnx"), providers=["CPUExecutionProvider"]
)

Found cached dataset squad (/home/daedalus/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertFo

#### Measure Baseline

In [15]:
def base_predict(
    model, input, tokenizer, preprocessing_kwargs, model_kwargs, batch_size=1, disable_gpu=True, output_features=False
) -> Union[dict, Tuple[dict, dict]]:
    """
    Run batched PyTorch inference on the input.
    Args:
        model: the PyTorch model to call.
        input: the texts (or text pairs) to tokenize and predict on.
        tokenizer: tokenizer used to build the model inputs.
        preprocessing_kwargs: extra tokenizer arguments. ``padding`` and
            ``truncation`` default to ``True`` when absent; the caller's
            dict is not modified.
        model_kwargs: extra keyword arguments forwarded to the model call.
        batch_size: number of examples per forward pass.
        disable_gpu: when ``False`` and CUDA is available, run on the GPU.
        output_features: return the features of the input.
            Necessary if, e.g., attention mask is needed for post-processing.
    Returns:
         The model outputs, concatenated over all batches and moved to the
         CPU, and optionally the input features
    """
    # Apply the defaults on a copy instead of mutating the caller's dict.
    tokenizer_kwargs = {"padding": True, "truncation": True, **preprocessing_kwargs}

    device = "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu"
    model.to(device)

    features = tokenizer(
        input, return_tensors="pt", **tokenizer_kwargs
    )

    all_predictions = []
    with torch.no_grad():
        for start_idx in range(0, len(input), batch_size):
            # The batch must live on the same device as the model.
            input_features = {
                k: v[start_idx: start_idx + batch_size].to(device)
                for k, v in features.items()
            }
            all_predictions.append(model(**input_features, **model_kwargs))

    final_prediction = {}
    for key in all_predictions[0].keys():
        # HuggingFace outputs for 'attentions' and more is
        # returned as tuple of tensors
        # Tuple of tuples only exists for 'past_key_values'
        # which is only relevant for generation.
        # Generation should NOT use this function
        if isinstance(all_predictions[0][key], tuple):
            # Regroup so that each entry collects one tuple position
            # across all batches, then concatenate along the batch axis.
            tuple_of_lists = list(
                zip(
                    *[
                        [
                            torch.stack(p).cpu()
                            if isinstance(p, tuple)
                            else p.cpu()
                            for p in tpl[key]
                        ]
                        for tpl in all_predictions
                    ]
                )
            )
            final_prediction[key] = tuple(torch.cat(parts) for parts in tuple_of_lists)
        else:
            final_prediction[key] = torch.cat(
                [p[key].cpu() for p in all_predictions]
            )
    if output_features:
        return final_prediction, features

    return final_prediction


def base_qa(model, tokenizer, input, preprocessing_kwargs, task_kwargs, model_kwargs):
    """
    Span-based question answering for a given question and context.
    We expect the input to use the (question, context) format for the text pairs.
    Args:
        model: PyTorch model whose output provides ``start_logits`` and
            ``end_logits``.
        tokenizer: tokenizer used to encode the text pairs. The returned
            encoding must provide ``sequence_ids`` and per-example
            encodings (presumably a fast tokenizer - TODO confirm).
        input: list of (question, context) pairs.
        preprocessing_kwargs: extra tokenizer arguments. ``truncation`` is
            always overridden with ``"only_second"``; the caller's dict is
            not modified.
        task_kwargs: optional ``topk`` (default 1), ``max_answer_len``
            (default 128) and ``show_null_answers`` (default True).
        model_kwargs: extra keyword arguments forwarded to the model call.
    Returns:
        A tuple ``(predictions, task_outputs, original_ans_start,
        original_ans_end)``. ``task_outputs["answers"]`` holds one list of
        answer dicts per input pair. The last two values are the word
        indices of the best span of the LAST input pair, or ``None`` when
        there is no input or no span survived filtering.
    """
    # Copy instead of mutating the dict owned by the caller.
    tokenizer_kwargs = {**preprocessing_kwargs, "truncation": "only_second"}
    topk = task_kwargs.get("topk", 1)
    max_answer_len = task_kwargs.get("max_answer_len", 128)

    # base_predict tokenizes the input itself and hands the features back,
    # so there is no need to tokenize a second time here.
    predictions, features = base_predict(model, input, tokenizer, tokenizer_kwargs, model_kwargs, output_features=True)

    task_outputs = {
        "answers": [],
        "attributions": [],
        "adversarial": {
            "indices": [],
        },  # for hotflip, input_reduction and topk
    }

    # Defined up front so the return statement works for empty input.
    original_ans_start = original_ans_end = None

    for idx, (start, end, (_, context)) in enumerate(
            zip(predictions["start_logits"], predictions["end_logits"], input)
    ):
        # Ensure padded tokens & question tokens cannot
        # belong to the set of candidate answers.
        # After this line the array is 1 where the sequence id equals 1
        # (second text of the pair, i.e. the context) and 0 elsewhere.
        question_tokens = np.abs(np.array([s != 1 for s in features.sequence_ids(idx)]) - 1)
        # Unmask CLS token for 'no answer'
        question_tokens[0] = 1
        undesired_tokens = question_tokens & features["attention_mask"][idx].numpy()

        # Generate mask
        undesired_tokens_mask = undesired_tokens == 0.0

        # Make sure non-context indexes in the tensor cannot
        # contribute to the softmax
        start = np.where(undesired_tokens_mask, -10000.0, start)
        end = np.where(undesired_tokens_mask, -10000.0, end)

        # Softmax over the token axis.
        start = np.exp(start - np.log(np.sum(np.exp(start), axis=-1, keepdims=True)))
        end = np.exp(end - np.log(np.sum(np.exp(end), axis=-1, keepdims=True)))

        # Get score for 'no answer' then mask for decoding step (CLS token)
        no_answer_score = (start[0] * end[0]).item()
        start[0] = end[0] = 0.0

        starts, ends, scores = decode(
            start,
            end,
            topk,
            max_answer_len,
            undesired_tokens,
        )

        enc = features[idx]
        if len(starts) > 0:
            original_ans_start = enc.token_to_word(starts[0])
            original_ans_end = enc.token_to_word(ends[0])
        else:
            # No span survived the filtering in decode().
            original_ans_start = original_ans_end = None

        answers = []
        for s, e, score in zip(starts, ends, scores):
            # Map token positions back to character offsets in the context.
            char_start = enc.word_to_chars(enc.token_to_word(s), sequence_index=1)[0]
            char_end = enc.word_to_chars(enc.token_to_word(e), sequence_index=1)[1]
            answers.append(
                {
                    "score": score.item(),
                    "start": char_start,
                    "end": char_end,
                    "answer": context[char_start:char_end],
                }
            )
        if task_kwargs.get("show_null_answers", True):
            answers.append({"score": no_answer_score, "start": 0, "end": 0, "answer": ""})
        answers = sorted(answers, key=lambda x: x["score"], reverse=True)[:topk]
        task_outputs["answers"].append(answers)

    return predictions, task_outputs, original_ans_start, original_ans_end

In [16]:
@timing
def squad_evaluate(name, inference_func, model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs):
    """
    Evaluate a QA inference function on SQuAD-style data and print the scores.

    Args:
        name: label printed in front of the scores.
        inference_func: callable with the same signature and return shape as
            ``question_answering`` / ``base_qa``.
        model: model or inference session passed through to ``inference_func``.
        data: dataset providing "id", "question", "context" and "answers".
        tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs: passed
            through to ``inference_func`` unchanged.
    Returns:
        The score dict computed by the module-level ``metric``. Because of
        the ``@timing`` decorator, callers receive ``(score, elapsed_seconds)``.
    """
    predictions = []
    # Examples are run one at a time so the timing reflects single-example
    # latency rather than batched throughput.
    for example in zip(data["question"], data["context"]):
        _, task_outputs, _, _ = inference_func(model, tokenizer, [example], preprocessing_kwargs, task_kwargs, model_kwargs)
        answers = task_outputs["answers"][0]
        # Fall back to an empty prediction when no answer was produced.
        predictions.append(answers[0]["answer"] if answers else "")

    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in zip(data["id"], predictions)]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in data]
    score = metric.compute(predictions=formatted_predictions, references=references)
    print("{} exact match: {:.1f}%".format(name, score['exact_match']))
    print("{} f1: {:.1f}%".format(name, score['f1']))
    return score

In [17]:
# Evaluate the PyTorch baseline, the ONNX export and the quantized ONNX
# export on the same data. Each call prints exact match and F1, followed by
# the elapsed time printed by the @timing decorator; the bare print() calls
# only add a blank line between the reports.
squad_evaluate("Base Model", base_qa, model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)
print()
squad_evaluate("ONNX", question_answering, onnx_model, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)
print()
squad_evaluate("Quantized ONNX", question_answering, onnx_model_quant, data, tokenizer, preprocessing_kwargs, task_kwargs, model_kwargs)
print()

Base Model exact match: 87.0%
Base Model f1: 91.9%
Time: 10.276 s

ONNX exact match: 87.0%
ONNX f1: 91.9%
Time: 7.075 s

Quantized ONNX exact match: 81.0%
Quantized ONNX f1: 85.8%
Time: 5.299 s

