In [134]:
from transformers import QuestionAnsweringPipeline, AutoAdapterModel, AutoModelWithHeads, AutoTokenizer, AutoConfig
from transformers.onnx import OnnxConfig, validate_model_outputs, export
from transformers.models.bert import BertOnnxConfig
from transformers.models.bart import BartOnnxConfig

import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxruntime import InferenceSession
import onnxruntime

from onnx_opcounter import calculate_params

import os
import time
import torch
import numpy as np

from datasets import load_metric, load_dataset

from typing import Mapping, OrderedDict
from pathlib import Path
import random
import pandas as pd

In [135]:
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = AutoModelWithHeads.from_pretrained("facebook/bart-base")
adapter_name = model.load_adapter("AdapterHub/narrativeqa", source="hf", set_active=True)
model.set_active_adapters(adapter_name)

Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 5180.28it/s]


In [136]:
question, text = "What does Sara hate?", "Sara hates taxes. She loves vanilla ice cream."
prompt = text + "</s>" + question + "</s>"

encoding = tokenizer(prompt, return_tensors='pt', padding=False)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

answer = model.generate(input_ids, attention_mask=attention_mask, num_beams=4, max_length=128, early_stopping=True)
answer = tokenizer.decode(answer[0], skip_special_tokens=True)
answer


' Taxes \n'

In [137]:
# second test 

question = "WHAT NAME WAS CYNTHIA MORE FAMOUSLY KNOWN BY?"
text = 'The play begins with three pages disputing over the black cloak usually worn by the actor who delivers the prologue. They draw lots for the cloak, and one of the losers, Anaides, starts telling the audience what happens in the play to come; the others try to suppress him, interrupting him and putting their hands over his mouth. Soon they are fighting over the cloak and criticizing the author and the spectators as well. In the play proper, the goddess Diana, also called Cynthia, has ordained a "solemn revels" in the valley of Gargaphie in Greece. The gods Cupid and Mercury appear, and they too start to argue. Mercury has awakened Echo, who weeps for Narcissus, and states that a drink from Narcissus\'s spring causes the drinkers to "Grow dotingly enamored of themselves." The courtiers and ladies assembled for the Cynthia\'s revels all drink from the spring. Asotus, a foolish spendthrift who longs to become a courtier and a master of fashion and manners, also drinks from the spring; emboldened by vanity and self-love, he challenges all comers to a competition of "court compliment." The competition is held, in four phases, and the courtiers are beaten. Two symbolic masques are performed within the play for the assembled revelers. At their conclusion, Cynthia (representing Queen Elizabeth) has the dancers unmask and shows that vices have masqueraded as virtues. She sentences them to make reparation and to purify themselves by bathing in the spring at Mount Helicon. The figure of Actaeon in the play may represent Robert Devereux, 2nd Earl of Essex, while Cynthia\'s lady in waiting Arete may be Lucy, Countess of Bedford, one of Elizabeth\'s ladies in waiting as well as Jonson\'s patroness\' The play is notably rich in music, as is typical for the theatre of the boys\' companies, which originated as church choirs.'

prompt = text + "</s>" + question + "</s>"

encoding = tokenizer(prompt, return_tensors='pt', padding=False)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

answer = model.generate(input_ids, attention_mask=attention_mask, num_beams=4, max_length=128, early_stopping=True)
answer = tokenizer.decode(answer[0], skip_special_tokens=True)
answer


' Anaides \n'

In [138]:
question_2 = "WHO DOES ECHO WEEP FOR?"

prompt = text + "</s>" + question_2 + "</s>"

encoding = tokenizer(prompt, return_tensors='pt', padding=False)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

answer = model.generate(input_ids, attention_mask=attention_mask, num_beams=4, max_length=128, early_stopping=True)
answer = tokenizer.decode(answer[0], skip_special_tokens=True)
answer

' Anaides and Anaides \n'

In [139]:
question_3 = "WHAT DOES A DRINK FROM NARCISSUS'S SPRING CAUSE THE DRINKER TO DO?"

prompt = text + "</s>" + question_3 + "</s>"

encoding = tokenizer(prompt, return_tensors='pt', padding=False)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

answer = model.generate(input_ids, attention_mask=attention_mask, num_beams=4, max_length=128, early_stopping=True)
answer1 = tokenizer.decode(answer[0], skip_special_tokens=True)
answer1

' They are fighting over the cloak. \n'

In [142]:
logits = model(input_ids, attention_mask=attention_mask).logits[0]
print(logits.shape)
print(logits.argmax())

torch.Size([433, 50265])
tensor(1115979)


In [143]:
cur_len = 0
max_length = 10
eos_token_id = (
    tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id
)
features = encoding
input_ids = logits
generated_ids = []
unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
scores = ()

In [144]:
# We need manual generation for the ONNX model (greedy generation)
# https://github.com/huggingface/transformers/blob/main/examples/research_projects/onnx/summarization/bart_onnx/generation_onnx.py
while cur_len < max_length:
    input_data = features.copy()
    input_data["input_ids"] = torch.cat(
        (
            features["input_ids"],
            torch.tensor(generated_ids, dtype=int).unsqueeze(dim=0),
        ),
        dim=1,
    )
    input_data["attention_mask"] = torch.ones(input_data["input_ids"].shape, dtype=torch.int64)

    predictions = model(input_data["input_ids"], attention_mask=input_data["attention_mask"])
    
    next_token_logits = predictions["logits"][:, -1, :]
    scores += (next_token_logits,)

    # argmax
    next_tokens = torch.argmax(next_token_logits, dim=-1)
    # update generated ids, model inputs, and length for next step
    generated_ids.append(next_tokens[:, None].item())
    cur_len = cur_len + 1
    
    if eos_token_id is not None:
        unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long())
        # stop when each sentence is finished, or if we exceed the maximum length
    if unfinished_sequences.max() == 0:
        break


In [224]:
config = AutoConfig.from_pretrained("facebook/bart-base")
onnx_config = BartOnnxConfig.from_model_config(config, task="causal-lm")

onnx_path = Path("onnx/narrativeqabart/model.onnx")

onnx_inputs, onnx_outputs = export(tokenizer, model, onnx_config, onnx_config.default_onnx_opset, onnx_path)

onnx_model = onnx.load(onnx_path)
onnx.checker.check_model(onnx_model)

  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attention_mask.size() != (bsz, 1, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
  if context.output_adapter_gating_scores:
  if tensor is not None and hidden_states.shape[0] != tensor.shape[0]:
  if input_shape[-1] > 1:
  mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
  if getattr(ctx, "output_" + attr, False):
  if input_ids is not None and x.shape[1] == input_ids.shape[1]:
  if len(torch.unique(eos_mask.sum(1))) > 1:


In [279]:
onnx_model = InferenceSession(
    str(onnx_path), providers=["CPUExecutionProvider"]
)

encoding = tokenizer(question, text, return_tensors='np')
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

outputs = onnx_model.run(input_feed=dict(encoding), output_names=None)

In [280]:
onnx_config.outputs

OrderedDict([('logits', {0: 'batch', 1: 'sequence'})])

In [283]:
encoding["input_ids"]

array([[    0, 44628, 42374, 26817, 32854,   487,  3732,  2889,  3001,
        32113,  5061, 11160,   975, 30105, 14356, 10786,   116,     2,
            2,   133,   310,  3772,    19,   130,  6052,  2982, 34618,
           81,     5,   909, 40725,  2333, 10610,    30,     5,  2701,
           54,  8806,     5, 41255, 10149,     4,   252,  2451,  3739,
           13,     5, 40725,     6,     8,    65,     9,     5, 19113,
            6, 14493,  4376,     6,  2012,  2758,     5,  2437,    99,
         2594,    11,     5,   310,     7,   283,   131,     5,   643,
          860,     7, 23192,   123,     6, 22749,   154,   123,     8,
         2057,    49,  1420,    81,    39,  6085,     4,  9561,    51,
           32,  2190,    81,     5, 40725,     8, 18384,     5,  2730,
            8,     5, 17596,    25,   157,     4,    96,     5,   310,
         4692,     6,     5, 34293, 10670,     6,    67,   373, 16589,
            6,    34, 36433,    10,    22, 37870, 17673, 20853,    29,
      

In [289]:
with torch.no_grad():
    num_beams = 4
    ort_out = onnx_model.run(
        None,
        {
            "input_ids": encoding["input_ids"],
            "attention_mask": encoding["attention_mask"],
        },
    )

In [291]:
len(ort_out)

2

In [292]:
inputs = tokenizer(prompt, max_length=1024, return_tensors="pt").to(model.device)
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    num_beams=num_beams,
    max_length=max_length,
    early_stopping=True,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [295]:
summary_ids.cpu().numpy()

array([[    2,     0,   252,    32,  2190,    81,     5, 40725,     8,
            2]])

In [252]:
from torch.nn import functional as F
next_token_logits = torch.Tensor(outputs[0][:, -1, :])
top_k = 12
top_n_logits, top_n_tokens = torch.topk(next_token_logits, top_k, dim=1)
top_n_probs = F.softmax(top_n_logits, dim=-1)
words = [
    tokenizer.decode(x, skip_special_tokens=True)
    for x in top_n_tokens[0]
]
new_past = np.array(outputs[1:])

print(words)

['', ' suscept', '-', 'abled', ' "', ' \xad', ' �', 'GoldMagikarp', ' ', 'OK', ' OK', 'inged']


AssertionError: 
Not equal to tolerance rtol=0.001, atol=0.001

(shapes (1, 10), (1, 426, 50265) mismatch)
 x: array([[    2,     0,   252,    32,  2190,    81,     5, 40725,     8,
            2]])
 y: array([[[30.044197,  3.914951, 11.842628, ...,  3.704317,  3.88825 ,
         -3.931822],
        [-6.724422, -5.136206,  1.204158, ..., -4.670123, -4.980473,...

In [253]:
top_n_logits

tensor([[4.9881, 4.2705, 3.8324, 3.7781, 3.7627, 3.6943, 3.6371, 3.5584, 3.4352,
         3.3904, 3.3634, 3.3491]])

In [179]:
tokenizer.decode(answer[0], skip_special_tokens=True)

' They are fighting over the cloak. \n'