# Accelerate Hugging Face Transformers inference

## Import a pre-trained Hugging Face model

We selected GPT2 as the pre-trained model we want to optimize. Let's download both the pre-trained model and its tokenizer from the Hugging Face model hub.

In [None]:
from transformers import GPT2Tokenizer, GPT2Model
# Load the pre-trained GPT2 tokenizer and model weights by checkpoint name.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
# Short sample sentence used as the benchmark input in the cells below.
text = "Replace me by any text you'd like."

Let's run the model 100 times to compute the average latency.

In [None]:
import time
import torch

In [None]:
# Tokenize the short sample sentence into PyTorch tensors.
encoded_input = tokenizer(text, return_tensors='pt')
# Time 100 forward passes of the original model with gradient tracking
# disabled. time.perf_counter() is used instead of time.time() because it is
# a monotonic, high-resolution clock meant for measuring short durations,
# whereas time.time() can jump when the system clock is adjusted.
times = []
for _ in range(100):
    st = time.perf_counter()
    with torch.no_grad():
        output = model(**encoded_input)
    times.append(time.perf_counter() - st)
# Mean latency per call, converted from seconds to milliseconds.
vanilla_short_token_time = sum(times)/len(times)*1000
print(f"Model run 100 times with average latency {vanilla_short_token_time}")

In [None]:
# Build a much longer input by repeating the sample sentence 100 times,
# separated by single spaces.
long_text = " ".join(text for _ in range(100))
# Tokenize into PyTorch tensors with truncation enabled, so the over-long
# text is cut down (presumably to the model's maximum sequence length).
long_encoded_input = tokenizer(
    long_text,
    truncation=True,
    return_tensors='pt',
)

In [None]:
# Time 100 forward passes of the original model on the long input.
# time.perf_counter() replaces time.time(): it is a monotonic,
# high-resolution clock meant for measuring short durations.
times = []
for _ in range(100):
    st = time.perf_counter()
    with torch.no_grad():
        new_out = model(**long_encoded_input)
    times.append(time.perf_counter() - st)
# Mean latency per call, converted from seconds to milliseconds.
vanilla_long_token_time = sum(times)/len(times)*1000
print(f"Model run 100 times with average latency {vanilla_long_token_time}")

## Nebullvm optimization

It's now time to improve the inference speed a bit. Let's use `nebullvm`!

In [None]:
from nebullvm.api.frontend.huggingface import optimize_huggingface_model

Using nebullvm is easy and straightforward! Just call the `optimize_huggingface_model` function and pass it the model, the tokenizer, an example text for the model input, the batch size, the maximum size of each input (excluding the already defined batch size) and the directory where you want to save the optimized model.

Furthermore, the function takes some context information about the model as input. In this case, for instance, we need to specify that the attention values can only be 0 or 1 (via the `extra_input_info` argument, a list with one dictionary per model input).

In [None]:
# Optimize the GPT2 model with nebullvm; the result is called below with the
# same keyword inputs as the original model.
optimized_model = optimize_huggingface_model(
    model=model,
    tokenizer=tokenizer,
    # Example text used as the model input.
    input_texts=[text],
    batch_size=1,
    # Maximum size of each input, taken from the long tokenized sample with
    # the leading (batch) dimension dropped.
    max_input_sizes=[tuple(value.size()[1:]) for value in long_encoded_input.values()],
    # Directory where the optimized model is saved (here the current directory).
    save_dir=".",
    # One dict of context information per model input; the second entry
    # restricts that input's values to the range 0..1 (the attention values).
    extra_input_info=[{}, {"max_value": 1, "min_value": 0}],
    use_torch_api=False,
    # NOTE(review): presumably forwarded to the tokenizer when nebullvm
    # encodes input_texts - confirm against the nebullvm API.
    tokenizer_args={"truncation": True},
    # NOTE(review): presumably the accepted threshold on the optimized
    # model's precision loss - confirm against the nebullvm API.
    perf_loss_ths=3,
)

In [None]:
# Time 100 calls of the nebullvm-optimized model on the short input.
# time.perf_counter() replaces time.time(): it is a monotonic,
# high-resolution clock meant for measuring short durations.
times = []
for _ in range(100):
    st = time.perf_counter()
    with torch.no_grad():
        final_out = optimized_model(**encoded_input)
    times.append(time.perf_counter() - st)
# Mean latency per call, converted from seconds to milliseconds.
optimized_short_token_time = sum(times)/len(times)*1000
print(f"Model run 100 times with average latency {optimized_short_token_time}")

In [None]:
# Time 100 calls of the nebullvm-optimized model on the long input.
# time.perf_counter() replaces time.time(): it is a monotonic,
# high-resolution clock meant for measuring short durations.
times = []
for _ in range(100):
    st = time.perf_counter()
    with torch.no_grad():
        final_new_out = optimized_model(**long_encoded_input)
    times.append(time.perf_counter() - st)
# Mean latency per call, converted from seconds to milliseconds.
optimized_long_token_time = sum(times)/len(times)*1000
print(f"Model run 100 times with average latency {optimized_long_token_time}")

In [None]:
# Put your username here; it is inserted into the report messages built below.
your_username = "DiegoFiori"

In [None]:
# Uncomment the following line to install gputil (if you are running on an NVIDIA GPU)
#!pip install gputil

In [None]:
import cpuinfo
import torch

# Collect a description of the hardware the benchmark ran on.
device = "cuda" if torch.cuda.is_available() else "cpu"
cpu_info = cpuinfo.get_cpu_info()['brand_raw']
# Label used in the report when no GPU can be identified.
gpu_info = "no"
if torch.cuda.is_available():
    # Imported here so that GPUtil is only required on CUDA machines.
    import GPUtil
    gpus = list(GPUtil.getGPUs())
    # GPUtil may return no devices even when CUDA is available (e.g. if it
    # cannot query the driver); keep the default label instead of raising
    # IndexError on an empty list.
    if gpus:
        gpu_info = gpus[0].name

In [None]:
# Build and print the report for the short-input GPT2 benchmark: username,
# hardware, number of input tokens, vanilla and optimized latency rounded to
# two decimals, and the speed-up ratio (vanilla / optimized) rounded to one.
message = f"""
Hello, I'm {your_username}!
I've tested nebullvm on the following setup:
Hardware: {cpu_info} CPU and {gpu_info} GPU.
Model: GPT2 - HuggingFace - tokens {encoded_input['input_ids'].shape[1]}
Vanilla performance: {round(vanilla_short_token_time, 2)}ms
Optimized performance: {round(optimized_short_token_time, 2)}ms
Acceleration: {round(vanilla_short_token_time/optimized_short_token_time, 1)}x
"""
print(message)

In [None]:
# Build and print the report for the long-input GPT2 benchmark: username,
# hardware, number of input tokens, vanilla and optimized latency rounded to
# two decimals, and the speed-up ratio (vanilla / optimized) rounded to one.
message = f"""
Hello, I'm {your_username}!
I've tested nebullvm on the following setup:
Hardware: {cpu_info} CPU and {gpu_info} GPU.
Model: GPT2 - HuggingFace - tokens {long_encoded_input['input_ids'].shape[1]}
Vanilla performance: {round(vanilla_long_token_time, 2)}ms
Optimized performance: {round(optimized_long_token_time, 2)}ms
Acceleration: {round(vanilla_long_token_time/optimized_long_token_time, 1)}x
"""
print(message)

# BERT

Let's see how nebullvm performs on another model. Let's optimize the popular BERT.

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
# Load the pre-trained BERT tokenizer and model. This rebinds `tokenizer` and
# `model`, replacing the GPT2 objects used in the first part of the notebook.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

In [None]:
# Short sample sentence for the BERT benchmark (rebinds `text`), tokenized
# into PyTorch tensors.
text = "Hello, my dog is cute"
inputs = tokenizer(text, return_tensors="pt")

In [None]:
# Time 100 forward passes of the original BERT model on the short input.
# time.perf_counter() replaces time.time(): it is a monotonic,
# high-resolution clock meant for measuring short durations.
times = []
for _ in range(100):
    st = time.perf_counter()
    with torch.no_grad():
        outputs = model(**inputs)
    times.append(time.perf_counter() - st)
# Mean latency per call, converted from seconds to milliseconds.
vanilla_bert_short = sum(times)/len(times)*1000
print(f"Model run 100 times with average latency {vanilla_bert_short}")

In [None]:
# Build a long input: the sample sentence repeated 100 times, joined by ". ".
long_text = ". ".join("Hello, my dog is cute" for _ in range(100))
# Tokenize into PyTorch tensors with padding and truncation enabled.
new_inputs = tokenizer(
    long_text,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

In [None]:
# Time 100 forward passes of the original BERT model on the long input.
# time.perf_counter() replaces time.time(): it is a monotonic,
# high-resolution clock meant for measuring short durations.
times = []
for _ in range(100):
    st = time.perf_counter()
    with torch.no_grad():
        new_outputs = model(**new_inputs)
    times.append(time.perf_counter() - st)
# Mean latency per call, converted from seconds to milliseconds.
vanilla_bert_long = sum(times)/len(times)*1000
print(f"Model run 100 times with average latency {vanilla_bert_long}")

In [None]:
# Optimize the BERT model with nebullvm. This rebinds `optimized_model`,
# replacing the optimized GPT2 model created earlier.
optimized_model = optimize_huggingface_model(
    model=model,
    tokenizer=tokenizer,
    # Example text used as the model input.
    input_texts=[text],
    batch_size=1,
    # Maximum size of each input, taken from the long tokenized sample with
    # the leading (batch) dimension dropped.
    max_input_sizes=[tuple(value.size()[1:]) for value in new_inputs.values()],
    # Directory where the optimized model is saved (here the current directory).
    save_dir=".",
    # One dict of context information per model input; the second and third
    # entries restrict those inputs' values to the range 0..1.
    # NOTE(review): presumably these are token_type_ids and attention_mask -
    # confirm against the tokenizer's output order.
    extra_input_info=[{}, {"max_value": 1, "min_value": 0}, {"max_value": 1, "min_value": 0}],
    use_torch_api=False,
    # NOTE(review): presumably forwarded to the tokenizer when nebullvm
    # encodes input_texts - confirm against the nebullvm API.
    tokenizer_args={"truncation": True},
    # NOTE(review): presumably the accepted threshold on the optimized
    # model's precision loss - confirm against the nebullvm API.
    perf_loss_ths=3,
)

In [None]:
# Time 100 calls of the nebullvm-optimized BERT model on the short input.
# time.perf_counter() replaces time.time(): it is a monotonic,
# high-resolution clock meant for measuring short durations.
times = []
for _ in range(100):
    st = time.perf_counter()
    with torch.no_grad():
        outputs = optimized_model(**inputs)
    times.append(time.perf_counter() - st)
# Mean latency per call, converted from seconds to milliseconds.
optimized_bert_short = sum(times)/len(times)*1000
print(f"Model run 100 times with average latency {optimized_bert_short}")

In [None]:
# Time 100 calls of the nebullvm-optimized BERT model on the long input.
# time.perf_counter() replaces time.time(): it is a monotonic,
# high-resolution clock meant for measuring short durations.
times = []
for _ in range(100):
    st = time.perf_counter()
    with torch.no_grad():
        outputs = optimized_model(**new_inputs)
    times.append(time.perf_counter() - st)
# Mean latency per call, converted from seconds to milliseconds.
optimized_bert_long = sum(times)/len(times)*1000
print(f"Model run 100 times with average latency {optimized_bert_long}")

Copy and paste the message into the main discussion :)

In [None]:
# Build and print the report for the short-input BERT benchmark: username,
# hardware, number of input tokens, vanilla and optimized latency rounded to
# two decimals, and the speed-up ratio (vanilla / optimized) rounded to one.
message = f"""
Hello, I'm {your_username}!
I've tested nebullvm on the following setup:
Hardware: {cpu_info} CPU and {gpu_info} GPU.
Model: BERT - HuggingFace - tokens {inputs['input_ids'].shape[1]}
Vanilla performance: {round(vanilla_bert_short, 2)}ms
Optimized performance: {round(optimized_bert_short, 2)}ms
Acceleration: {round(vanilla_bert_short/optimized_bert_short, 1)}x
"""
print(message)

In [None]:
# Build and print the report for the long-input BERT benchmark: username,
# hardware, number of input tokens, vanilla and optimized latency rounded to
# two decimals, and the speed-up ratio (vanilla / optimized) rounded to one.
message = f"""
Hello, I'm {your_username}!
I've tested nebullvm on the following setup:
Hardware: {cpu_info} CPU and {gpu_info} GPU.
Model: BERT - HuggingFace - tokens {new_inputs['input_ids'].shape[1]}
Vanilla performance: {round(vanilla_bert_long, 2)}ms
Optimized performance: {round(optimized_bert_long, 2)}ms
Acceleration: {round(vanilla_bert_long/optimized_bert_long, 1)}x
"""
print(message)