# BERT inference using Habana Gaudi and Hugging Face Transformers

In [1]:
import torch
import habana_frameworks.torch as ht
import habana_frameworks.torch.core as htcore

print(f"device available:{ht.hpu.is_available()}")
print(f"device_count:{ht.hpu.device_count()}")

device available:True
device_count:8


## BERT example

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# bert-base
# model_id="nlptown/bert-base-multilingual-uncased-sentiment"
# bert-large
model_id="Farshid/bert-large-uncased-financial-phrasebank-allagree2"

# set device and load model
device = torch.device('hpu')
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id).to(device)



Downloading tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

In [3]:
# define payload
payload = "i like you. I love you."

# tokenize input
enc = tokenizer(payload, return_tensors="pt")

# place encodings on hpu
enc = {key: value.to(device) for key,value in enc.items()}
assert enc["input_ids"].device.type == "hpu", "Inputs not on correct device"

In [4]:
# run forward pass
with torch.no_grad():
    output = model(**enc)
    # in lazy mode execution, :code:`mark_step()` must be added after model inference
    htcore.mark_step()
    score = output.logits.softmax(dim=-1)[0]
    
# created nice output
pred = {"label": model.config.id2label[score.argmax().cpu().item()], "score": score.max().cpu().item()}    

print(f'model predicted for input:\n"{payload}"\n"{pred}"')

model predicted for input:
"i like you. I love you."
"{'label': 'LABEL_1', 'score': 0.9986135363578796}"


In [5]:
def hpu_txt_pipeline(inputs,model,tokenizer):
    # tokenize input
    enc = tokenizer(inputs, return_tensors="pt")
    # place encodings on hpu
    enc = {key: value.to(device) for key,value in enc.items()}
    # run forward pass
    with torch.no_grad():
        output = model(**enc)
        # in lazy mode execution, :code:`mark_step()` must be added after model inference
        htcore.mark_step()
        score = output.logits.softmax(dim=-1)[0]
    return {"label": model.config.id2label[score.argmax().cpu().item()], "score": score.max().cpu().item()}    

In [8]:
%timeit hpu_txt_pipeline("I like you",model,tokenizer)
positive_sentiment = hpu_txt_pipeline("I like you",model,tokenizer)
print(positive_sentiment)

%timeit hpu_txt_pipeline("The movie was okay",model,tokenizer)
neutral_sentiment = hpu_txt_pipeline("The movie was okay",model,tokenizer)
print(neutral_sentiment)

%timeit hpu_txt_pipeline("The fish was horrible, i got sick",model,tokenizer)
negative_sentiment = hpu_txt_pipeline("The fish was horrible, i got sick",model,tokenizer)
print(negative_sentiment)


23 ms ± 38.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
{'label': 'LABEL_1', 'score': 0.9652733206748962}
23.4 ms ± 1e+03 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
{'label': 'LABEL_1', 'score': 0.9984086155891418}
23.7 ms ± 958 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
{'label': 'LABEL_0', 'score': 0.9787769317626953}


## Test performance

In [10]:
from time import perf_counter
import numpy as np

payload="Hello my name is Philipp. I am getting in touch with you because i didn't get a response from you. What do I need to do to get my new card which I have requested 2 weeks ago? Please help me and answer this email in the next 7 days. Best regards and have a nice weekend "*2
payload=payload*4
print(f'Payload sequence length is: {len(tokenizer(payload)["input_ids"])}')


def prep_payload(payload):
    enc = tokenizer(payload, return_tensors="pt")
    return {key: value.to(device) for key,value in enc.items()}


def measure_latency(model,payload):
    enc = prep_payload(payload)
    latencies = []
    # warm up
    for _ in range(10):
        with torch.no_grad():
            _ = model(**enc)
            htcore.mark_step()
    # Timed run
    for _ in range(300):
        start_time = perf_counter()
        with torch.no_grad():
            _ = model(**enc)
            htcore.mark_step()
        latency = perf_counter() - start_time
        latencies.append(latency)
    # Compute run statistics
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    time_p95_ms = 1000 * np.percentile(latencies,95)
    return f"P95 latency (ms) - {time_p95_ms}; Average latency (ms) - {time_avg_ms:.2f} +\- {time_std_ms:.2f};", time_p95_ms

hpu_run=measure_latency(model,payload)
print(f"model: {hpu_run[0]}")



Payload sequence length is: 506
model: P95 latency (ms) - 18.704093350032736; Average latency (ms) - 18.73 +\- 1.23;


### Results

**BERT-base**

```bash
Payload sequence length is: 128
model: P95 latency (ms) - 9.79173000002902; Average latency (ms) - 9.76 +\- 0.02;
Payload sequence length is: 506
model: P95 latency (ms) - 9.774564700057908; Average latency (ms) - 9.74 +\- 0.11;
```

**BERT-large**

```bash
Payload sequence length is: 128
model: P95 latency (ms) - 18.61147640003651; Average latency (ms) - 18.53 +\- 0.07;
Payload sequence length is: 506
model: P95 latency (ms) - 18.704093350032736; Average latency (ms) - 18.73 +\- 1.23;
```