In [None]:
# Install the notebook's dependencies into the running kernel's environment.
# NOTE(review): no versions are pinned here - consider pinning them for reproducible runs.
%pip install "optimum-intel[openvino, nncf]" datasets evaluate[evaluator] ipywidgets

In [None]:
import time
import warnings
from copy import deepcopy
from pathlib import Path

import datasets
import evaluate
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from evaluate import evaluator
from openvino.runtime import Core
from optimum.intel.openvino import OVModelForQuestionAnswering, OVQuantizer, OVConfig
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Restrict the transformers and datasets loggers to error-level messages.
transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()

In [3]:
# Checkpoint to quantize and the dataset it is calibrated / evaluated on.
# MODEL_ID = "csarron/bert-base-uncased-squad-v1"
MODEL_ID = "deepset/tinyroberta-squad2"
DATASET_NAME = "squad"
VERSION_2_WITH_NEGATIVE = False

# Output locations: siblings of the base model path, suffixed by precision.
base_model_path = Path(f"models/{MODEL_ID}")
fp32_model_path = base_model_path.parent / (base_model_path.name + "_FP32")
int8_ptq_model_path = base_model_path.parent / (base_model_path.name + "_INT8_PTQ")

In [None]:
# Load the question-answering checkpoint named by MODEL_ID together with its tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# See how the tokenizer for the given model converts input text to model input values
print(tokenizer("hello world!"))

In [None]:
# Load the configured dataset (indexed by split name) and display one training example.
dataset = datasets.load_dataset(DATASET_NAME)
dataset["train"][31415]

In [6]:
def preprocess_fn(examples, tokenizer):
    """Tokenize question/context pairs into the format the model expects.

    `examples` must provide "question" and "context" entries. They are handed to
    `tokenizer` as a text pair with padding and truncation enabled and
    `max_length=384`. Whatever `tokenizer` returns is returned unchanged.
    """
    tokenizer_options = {"padding": True, "truncation": True, "max_length": 384}
    questions, contexts = examples["question"], examples["context"]
    return tokenizer(questions, contexts, **tokenizer_options)

In [21]:
# The first NUM_TRAIN_ITEMS "Super_Bowl_50" examples of the validation split form the
# train subset; the remaining ones are held out as the validation subset.
NUM_TRAIN_ITEMS = 600


def _tokenize_batch(batch):
    """Run `preprocess_fn` on one batch of examples with the notebook's tokenizer."""
    return preprocess_fn(batch, tokenizer)


filtered_examples = dataset["validation"].filter(
    lambda example: example["title"].startswith("Super_Bowl_50")
)

train_examples = filtered_examples.select(range(NUM_TRAIN_ITEMS))
train_dataset = train_examples.map(_tokenize_batch, batched=True)

validation_examples = filtered_examples.select(range(NUM_TRAIN_ITEMS, len(filtered_examples)))
validation_dataset = validation_examples.map(_tokenize_batch, batched=True)

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [22]:
# Start from the default OVConfig and set the compression "overflow_fix" option to "enable".
quantization_config = OVConfig()
quantization_config.compression["overflow_fix"] = "enable"

# Three independent deep copies of the base config, kept under separate names.
bit8_config, bit_4_4_config, bit_8_4_config = (
    deepcopy(quantization_config) for _ in range(3)
)




In [24]:
# Loss criterion and a shuffled, batched loader over the tokenized train subset.
# `torch` and `torch.nn` (as `nn`) are imported in the notebook's import cell.
loss_criterion = nn.CrossEntropyLoss()
BATCH_SIZE = 64
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)


In [27]:
# quantization_config.compression['weights']={'bits':4}
# quantization_config.compression['activations']={'bits':8}

# Initializer settings for the compression config: range statistics, batch-norm
# adaptation, and an AutoQ precision search over 2/4/8-bit candidates.
#
# Only the "range", "batchnorm_adaptation" and "precision" sections are kept. The config
# is schema-validated, and a run with an extra nested "params" entry (which also embedded
# the live `train_dataloader` / `loss_criterion` objects) is recorded as failing with
# "Invalid NNCF config supplied!" / ValidationError.
# NOTE(review): the AutoQ search presumably still needs a data loader and criterion; these
# have to be supplied through NNCF's init-args registration rather than through this dict.
# Also confirm that target_device "TRIAL" is accepted together with AutoQ.
quantization_config.compression["initializer"] = {
    "range": {
        "num_init_samples": 300,
        "type": "mean_min_max",
    },
    "batchnorm_adaptation": {
        "num_bn_adaptation_samples": 0,
    },
    "precision": {
        "type": "autoq",
        "bits": [2, 4, 8],
        "iter_number": 300,
        "compression_ratio": 0.15,
        "eval_subset_ratio": 0.20,
    },
}
quantization_config.target_device = "TRIAL"

In [30]:
# NOTE(review): this quantization step is commented out, yet the following cell loads a
# model from `int8_ptq_model_path`. In the visible notebook this step is the only place
# that writes to that directory, so a fresh-kernel run needs it re-enabled (or a
# previously saved model present at that path).
# warnings.simplefilter("ignore")

# # Quantize the model
# quantizer = OVQuantizer.from_pretrained(model)

# quantizer.quantize(
#     calibration_dataset=train_dataset,
#     save_directory=int8_ptq_model_path,
#     quantization_config=quantization_config,
# )

ERROR:nncf:Invalid NNCF config supplied!


ValidationError: ignored

In [36]:
# Load the OpenVINO model stored at `int8_ptq_model_path` and a fresh copy of the original
# checkpoint, then wrap each one in a question-answering pipeline using the shared tokenizer.
quantized_model_ptq = OVModelForQuestionAnswering.from_pretrained(int8_ptq_model_path)
original_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_ID)
ov_qa_pipeline_ptq = pipeline("question-answering", model=quantized_model_ptq, tokenizer=tokenizer)
hf_qa_pipeline = pipeline("question-answering", model=original_model, tokenizer=tokenizer)


KeyboardInterrupt: ignored

In [22]:
# Single place to change where both models are exported.
# NOTE(review): the path looks like a mounted Google Drive folder - adjust it when running
# outside that environment.
EXPORT_DIR = Path("/content/drive/MyDrive/GSOC/model")

quantized_model_ptq.save_pretrained(EXPORT_DIR / "QA_quantized_model_ptq")
original_model.save_pretrained(EXPORT_DIR / "QA_original_model")

In [23]:
# Take the passage of one held-out example and pair it with a hand-written question.
context = validation_examples[200]["context"]
question = "Who won the game?"
print(context)

Super Bowl 50 featured numerous records from individuals and teams. Denver won despite being massively outgained in total yards (315 to 194) and first downs (21 to 11). Their 194 yards and 11 first downs were both the lowest totals ever by a Super Bowl winning team. The previous record was 244 yards by the Baltimore Ravens in Super Bowl XXXV. Only seven other teams had ever gained less than 200 yards in a Super Bowl, and all of them had lost. The Broncos' seven sacks tied a Super Bowl record set by the Chicago Bears in Super Bowl XX. Kony Ealy tied a Super Bowl record with three sacks. Jordan Norwood's 61-yard punt return set a new record, surpassing the old record of 45 yards set by John Taylor in Super Bowl XXIII. Denver was just 1-of-14 on third down, while Carolina was barely better at 3-of-15. The two teams' combined third down conversion percentage of 13.8 was a Super Bowl low. Manning and Newton had quarterback passer ratings of 56.6 and 55.4, respectively, and their added total

In [24]:
# Answer produced by the pipeline built on the original model.
hf_qa_pipeline({"question": question, "context": context})["answer"]


'Denver'

In [25]:
# Answer produced by the OpenVINO pipeline for the same question and context.
# NOTE(review): the recorded output is not 'Denver' (the original pipeline's recorded
# answer), so the quantized model's accuracy looks badly degraded.
ov_qa_pipeline_ptq({"question": question, "context": context})["answer"]


'third down, while Carolina was barely'

In [26]:
# Score both pipelines on the held-out examples with the `evaluate` question-answering
# evaluator and show exact-match / F1 side by side.
squad_eval = evaluator("question-answering")

# Keep the metric in step with the `squad_v2_format` flag instead of hard-coding "squad",
# using the same rule as the per-example comparison further down.
squad_metric_name = "squad_v2" if VERSION_2_WITH_NEGATIVE else "squad"

ov_eval_results = squad_eval.compute(
    model_or_pipeline=ov_qa_pipeline_ptq,
    data=validation_examples,
    metric=squad_metric_name,
    squad_v2_format=VERSION_2_WITH_NEGATIVE,
)

hf_eval_results = squad_eval.compute(
    model_or_pipeline=hf_qa_pipeline,
    data=validation_examples,
    metric=squad_metric_name,
    squad_v2_format=VERSION_2_WITH_NEGATIVE,
)
pd.DataFrame.from_records(
    [hf_eval_results, ov_eval_results],
    columns=["exact_match", "f1"],
    index=["FP32", "INT8 PTQ"],
).round(2)

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Unnamed: 0,exact_match,f1
FP32,86.67,89.28
INT8 PTQ,0.95,3.79


In [28]:
def benchmark(qa_pipeline, dataset, num_items=100):
    """
    Benchmark a PyTorch or OpenVINO question-answering pipeline.

    Runs inference one item at a time on up to `num_items` leading items of `dataset`
    and returns the median per-item latency in milliseconds.

    Args:
        qa_pipeline: callable accepting a ``{"question": ..., "context": ...}`` dict.
        dataset: object supporting ``len()`` and ``.select(indices)`` whose items expose
            ``"question"`` and ``"context"`` keys.
        num_items: maximum number of items to time; clamped to ``len(dataset)``.

    Returns:
        Median latency in milliseconds, or ``nan`` when no item was timed.
    """
    # Clamp the request so a dataset shorter than `num_items` is benchmarked in full
    # instead of failing on out-of-range indices.
    num_items = max(0, min(num_items, len(dataset)))

    latencies = []
    for item in dataset.select(range(num_items)):
        start_time = time.perf_counter()
        qa_pipeline({"question": item["question"], "context": item["context"]})
        end_time = time.perf_counter()
        latencies.append(end_time - start_time)

    if not latencies:
        # Nothing was timed; avoid numpy's empty-slice warning and report "no data".
        return float("nan")

    return np.median(latencies) * 1000


# Time both pipelines on the same leading items of the validation dataset.
original_latency = benchmark(hf_qa_pipeline, validation_dataset)
quantized_latency = benchmark(ov_qa_pipeline_ptq, validation_dataset)
# Full device name of the "CPU" device as reported by the OpenVINO runtime.
cpu_device_name = Core().get_property("CPU", "FULL_DEVICE_NAME")

# Report both median latencies and their ratio (original / quantized).
print(cpu_device_name)
print(f"Latency of original FP32 model: {original_latency:.2f} ms")
print(f"Latency of quantized model: {quantized_latency:.2f} ms")
print(f"Speedup: {(original_latency/quantized_latency):.2f}x")

           Intel(R) Xeon(R) CPU @ 2.20GHz
Latency of original FP32 model: 333.42 ms
Latency of quantized model: 287.91 ms
Speedup: 1.16x


In [29]:
# Collect every held-out example on which the FP32 and INT8 answers get different F1 scores.
results = []
metric = evaluate.load("squad_v2" if VERSION_2_WITH_NEGATIVE else "squad")

for item in validation_examples:
    # Read fields by key rather than unpacking `item.values()`: unpacking silently depends
    # on column order, shadows the builtin `id`, and overwrites the notebook-level
    # `context` / `question` variables used by earlier cells.
    example_id = item["id"]
    item_question = item["question"]
    item_context = item["context"]
    item_answers = item["answers"]

    fp32_answer = hf_qa_pipeline(question=item_question, context=item_context)["answer"]
    int8_answer = ov_qa_pipeline_ptq(question=item_question, context=item_context)["answer"]

    references = [{"id": example_id, "answers": item_answers}]
    fp32_predictions = [{"id": example_id, "prediction_text": fp32_answer}]
    int8_predictions = [{"id": example_id, "prediction_text": int8_answer}]

    # Score each prediction on its own so the two models can be compared per example.
    fp32_score = round(metric.compute(references=references, predictions=fp32_predictions)["f1"], 2)
    int8_score = round(metric.compute(references=references, predictions=int8_predictions)["f1"], 2)

    if int8_score != fp32_score:
        results.append(
            (item_question, item_answers["text"], fp32_answer, fp32_score, int8_answer, int8_score)
        )

pd.set_option("display.max_colwidth", None)
pd.DataFrame(
    results,
    columns=["Question", "Answer", "FP32 answer", "FP32 F1", "INT8 answer", "INT8 F1"],
)

KeyboardInterrupt: ignored