# Compiling and Deploying HuggingFace Pretrained BERT



### Introduction

In this tutorial we will compile and deploy the BERT-base version of HuggingFace BERT for Inferentia. The full list of HuggingFace's pretrained BERT models can be found in the BERT section of this page: https://huggingface.co/transformers/pretrained_models.html.

This Jupyter notebook should be run on an inf1.6xlarge instance or larger. Only the compilation step of this tutorial requires an inf1.6xlarge; the inference itself does not. For simplicity we will run the whole tutorial on an inf1.6xlarge, but in a real-life scenario the compilation should be done on a compute instance and the deployment on an inf1 instance to save costs.

Before running the following cells, verify that this Jupyter notebook is running the “conda_aws_neuron_pytorch_p36” kernel. You can select the kernel from the “Kernel -> Change Kernel” option at the top of this Jupyter notebook page.

### Compile the model into an AWS Neuron optimized TorchScript


In [2]:
import tensorflow  # to workaround a protobuf version conflict issue
import torch
import torch.neuron
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import transformers

# Load the tokenizer and the MRPC fine-tuned BERT classifier from the same checkpoint.
# return_dict=False so the forward pass yields a tuple (indexed with [0] below).
checkpoint = "bert-base-cased-finetuned-mrpc"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, return_dict=False)

# Example sentences: sequence_0/sequence_2 form the paraphrase pair,
# sequence_0/sequence_1 the non-paraphrase pair
sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

# Both pairs are encoded with identical options so they share one padded length
max_length = 128
encode_options = dict(
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_tensors="pt",
)
paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, **encode_options)
not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, **encode_options)

# Run the original PyTorch model on the compilation example
paraphrase_classification_logits = model(**paraphrase)[0]

# Tracing takes positional tensors, so turn each encoding into an ordered tuple
input_keys = ('input_ids', 'attention_mask', 'token_type_ids')
example_inputs_paraphrase = tuple(paraphrase[key] for key in input_keys)
example_inputs_not_paraphrase = tuple(not_paraphrase[key] for key in input_keys)

# Trace with torch.neuron.trace to generate a TorchScript that is optimized by AWS Neuron
model_neuron = torch.neuron.trace(model, example_inputs_paraphrase)

# Run the traced model on both example inputs to check that it executes
paraphrase_classification_logits_neuron = model_neuron(*example_inputs_paraphrase)
not_paraphrase_classification_logits_neuron = model_neuron(*example_inputs_not_paraphrase)

# Persist the TorchScript so later cells can reload it
model_neuron.save('bert_neuron.pt')

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=433297515.0), HTML(value='')))




  position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
  input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors
INFO:Neuron:There are 3 ops of 1 different types in the TorchScript that are not compiled by neuron-cc: aten::embedding, (For more information see https://github.com/aws/aws-neuron-sdk/blob/master/release-notes/neuron-cc-ops/neuron-cc-ops-pytorch.md)
INFO:Neuron:Number of arithmetic operators (pre-compilation) before = 711, fused = 694, percent fused = 97.61%
INFO:Neuron:compiling function _NeuronGraph$661 with neuron-cc
INFO:Neuron:Compiling with command line: '/opt/conda/bin/neuron-cc compile /tmp/tmpepbk8pfy/graph_def.pb --framework TENSORFLOW --pipeline compile SaveTemps --output /tmp/tmpepbk8pfy/graph_def.neff --io-config {"inputs": {"0:0": [[1, 128, 768], "float32"], "1:0": [[1, 1, 1, 128], "float32"]}, "outputs": ["Add_133:0"]} --verbose 35'
Tensor output are ** NOT CALCULATED ** during CPU execution

You may inspect `model_neuron.graph` to see which part is running on CPU versus running on the accelerator. All native `aten` operators in the graph will be running on CPU.

In [None]:
# Print the traced graph of the compiled model for inspection
print(model_neuron.graph)

# Compare the Neuron TorchScript with the vanilla PyTorch model on a single example

In [134]:
import time
import torch
from datasets import load_dataset

# Human-readable names for the output logits, indexed by the predicted class id
classes = ['not paraphrase', 'paraphrase']

# Sentence pair used for the single-example comparison
sequence = [
    "The company HuggingFace is based in New York City",
    "HuggingFace's headquarters are situated in Manhattan",
]

# Load the original PyTorch model and the saved Neuron TorchScript
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=False)
model_neuron = torch.jit.load('bert_neuron.pt')

def tokenize(sequence, max_length=128):
    """Encode a sentence or a sentence pair into the tensors the models take as input.

    Args:
        sequence: Either a single input accepted by ``tokenizer.encode_plus``, or a
            list/tuple of exactly two strings, which is encoded as a
            (text, text_pair) sentence pair.
        max_length: Length every encoding is padded or truncated to.

    Returns:
        A tuple ``(input_ids, attention_mask, token_type_ids)`` taken from the
        encoding, returned as PyTorch tensors (``return_tensors="pt"``).
    """
    is_pair = (
        isinstance(sequence, (list, tuple))
        and len(sequence) == 2
        and all(isinstance(part, str) for part in sequence)
    )
    # Pass the two sentences explicitly as text / text_pair. Handing the list
    # itself to encode_plus as `text` is ambiguous: depending on the tokenizer
    # implementation a list of strings may be read as one pre-tokenized input
    # instead of a sentence pair.
    text, text_pair = sequence if is_pair else (sequence, None)
    tokenized_seq = tokenizer.encode_plus(
        text,
        text_pair,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    return tokenized_seq['input_ids'], tokenized_seq['attention_mask'], tokenized_seq['token_type_ids']

    
def run_benchmark(model, inputs, device=None):
    """Time a single forward pass of ``model`` and print the latency and predicted class.

    Args:
        model: Callable model. Instances of ``torch.jit.ScriptModule`` are reported
            as "TorchScript Neuron"; anything else is reported as "PyTorch".
        inputs: Sequence of positional arguments unpacked into the model call.
        device: Unused; kept so existing callers that pass it keep working.
    """
    if isinstance(model, torch.jit.ScriptModule):
        kind = "TorchScript Neuron"
    else:
        kind = "PyTorch"

    # Only the forward pass is timed; gradients are not needed for inference.
    with torch.no_grad():
        model_start = time.perf_counter()
        outputs = model(*inputs)
        model_stop = time.perf_counter()

    pred = classes[outputs[0][0].argmax().item()]
    # Scale to milliseconds before rounding: rounding the seconds first and then
    # multiplying by 1000 reintroduces float noise (e.g. 2283.7999999999997).
    elapsed_ms = round((model_stop - model_start) * 1000, 1)
    print(f"{kind} inference took {elapsed_ms}ms and class was {pred}")
                            
# Encode the example sentence pair once and reuse it for both models
tokenized_seq = tokenize(sequence)
# NOTE: a stray `print(label)` was removed here; `label` is never defined in this
# notebook, so a fresh "Restart & Run All" failed with a NameError at this point.
# Neuron-compiled TorchScript model
run_benchmark(model_neuron, tokenized_seq)
# Original PyTorch model
run_benchmark(model, tokenized_seq)



0
TorchScript Neuron inference took 5.8ms and class was paraphrase
PyTorch inference took 150.5ms and class was not paraphrase


# Compare complete MRPC Test dataset

using `batch_size=1`

In [1]:
from datasets import load_dataset,load_metric
import tensorflow  # to workaround a protobuf version conflict issue
import torch
import torch.neuron
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time

In [2]:


# GLUE task and the split that is evaluated
task = "mrpc"
split = "validation"

# Tokenizer settings read by preprocess_function
max_length = 128
padding = 'max_length'
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")

# Metric and data for the chosen GLUE task
metric = load_metric("glue", task)
all_datasets = load_dataset("glue", task)
dataset = all_datasets[split]

def preprocess_function(examples):
    """Tokenize the ``sentence1``/``sentence2`` pair and attach the label as ``labels``.

    Uses the module-level ``tokenizer``, ``padding`` and ``max_length`` settings and
    returns the tokenizer output with an extra ``"labels"`` entry copied from
    ``examples["label"]``.
    """
    encoded = tokenizer(
        examples['sentence1'],
        examples['sentence2'],
        padding=padding,
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )
    encoded["labels"] = examples["label"]
    return encoded


def do_test(raw_dataset, model, model_type, max_samples=1000):
    """Evaluate ``model`` on ``raw_dataset`` one example at a time and report metrics and timing.

    Args:
        raw_dataset: Dataset supporting ``len``, ``select`` and ``map`` whose rows
            ``preprocess_function`` can tokenize.
        model: Callable invoked as ``model(input_ids, attention_mask, token_type_ids)``.
        model_type: Label stored under ``'model_type'`` in the result.
        max_samples: Upper bound on the number of evaluated examples. It is clamped
            to the dataset size; ``None`` evaluates every example.

    Returns:
        A dict with ``'model_type'``, the entries returned by ``metric.compute()``,
        and ``'total_time'`` / ``'average_time'`` formatted as strings in milliseconds.
    """
    # Clamp the subset size: selecting more rows than the dataset holds fails.
    num_samples = len(raw_dataset)
    if max_samples is not None:
        num_samples = min(num_samples, max_samples)

    # Subset first so that only the rows that are actually evaluated get tokenized.
    processed_dataset = raw_dataset.select(range(num_samples)).map(preprocess_function)

    model_start = time.perf_counter()
    with torch.no_grad():
        for batch in processed_dataset:
            input_ids = torch.tensor(batch['input_ids'])
            attention_mask = torch.tensor(batch['attention_mask'])
            token_type_ids = torch.tensor(batch['token_type_ids'])
            outputs = model(input_ids, attention_mask, token_type_ids)
            prediction = outputs[0][0].argmax().item()
            metric.add_batch(predictions=[prediction], references=[batch["labels"]])
    # Stop the clock before aggregating the metric so it is not counted as model time.
    model_stop = time.perf_counter()

    eval_metric = metric.compute()
    # Scale to milliseconds before rounding to avoid values like 246881.30000000002.
    total_time = round((model_stop - model_start) * 1000, 1)
    average_time = round(total_time / num_samples, 4) if num_samples else 0.0
    return {
        'model_type': model_type,
        **eval_metric,
        'total_time': f"{total_time}ms",
        'average_time': f"{average_time}ms",
    }





In [3]:
# Load the original PyTorch model and the saved Neuron TorchScript
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=False)
model_neuron = torch.jit.load('bert_neuron.pt')

# Evaluate the PyTorch model first, then the Neuron model, on the same dataset
model_res = do_test(dataset, model, 'pytorch')
model_neuron_res = do_test(dataset, model_neuron, 'neuron')

# Show both result dicts, PyTorch first
for result in (model_res, model_neuron_res):
    print(result)

Tensor output are ** NOT CALCULATED ** during CPU execution and only indicate tensor shape (Triggered internally at  /opt/workspace/KaenaPyTorchRuntime/neuron_op/neuron_op_impl.cpp:38.)
  result = self.forward(*input, **kwargs)


{'model_type': 'pytorch', 'accuracy': 0.852, 'f1': 0.8987688098495212, 'total_time': '246881.30000000002ms', 'average_time': '246.8813ms'}
{'model_type': 'neuron', 'accuracy': 0.685, 'f1': 0.8130563798219584, 'total_time': '2283.7999999999997ms', 'average_time': '2.2838ms'}


# Compile the model into a non-AWS-optimized TorchScript


In [160]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import transformers

# Build tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=False)

# Sentence pair used as the tracing example
sequence = ["The company HuggingFace is based in New York City",
            "HuggingFace's headquarters are situated in Manhattan"]

max_length = 128
# Unpack the pair so the sentences are passed explicitly as text / text_pair;
# a list handed over as the single `text` argument may be read as one
# pre-tokenized input by some tokenizer implementations.
paraphrase = tokenizer.encode_plus(*sequence, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")

# Tracing takes positional tensors, so convert the encoding into an ordered tuple
example_inputs_paraphrase = paraphrase['input_ids'], paraphrase['attention_mask'], paraphrase['token_type_ids']

# Run torch.jit.trace to generate a plain TorchScript (no Neuron compilation)
model_torch = torch.jit.trace(model, example_inputs_paraphrase)

# Verify the freshly traced TorchScript runs on the example input.
# (This previously called model_neuron, so the new trace was never exercised.)
paraphrase_classification_logits_torch = model_torch(*example_inputs_paraphrase)

# Save the TorchScript for later use
model_torch.save('bert_torch.pt')

  position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
  input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors


In [4]:
# Reload the plain TorchScript that was saved to disk
model_torch = torch.jit.load('bert_torch.pt')

# Evaluate it with the same routine used for the other models
model_torch_res = do_test(raw_dataset=dataset, model=model_torch, model_type='torchscript')
# Bare expression so the notebook displays the result dict
model_torch_res



{'model_type': 'torchscript',
 'accuracy': 0.852,
 'f1': 0.8987688098495212,
 'total_time': '277930.2ms',
 'average_time': '277.9302ms'}

# Result comparison

In [6]:
import pandas as pd
# One row per evaluated model, built from the result dicts returned by do_test
df = pd.DataFrame([model_res,model_neuron_res,model_torch_res])

In [7]:
# Display the comparison table
df.head()

Unnamed: 0,model_type,accuracy,f1,total_time,average_time
0,pytorch,0.852,0.898769,246881.30000000002ms,246.8813ms
1,neuron,0.685,0.813056,2283.7999999999997ms,2.2838ms
2,torchscript,0.852,0.898769,277930.2ms,277.9302ms
