In [5]:
# Optional: uncomment to upgrade transformers before running the rest of the notebook.
# NOTE(review): prefer `%pip` over `!pip` so the install targets this kernel's environment.
# !pip install transformers --upgrade

In [6]:
# Local checkpoint directory; both the model and the tokenizer are loaded from it below.
model_dir = "../models/bloom-3b"

In [7]:
import torch

In [8]:
from transformers import BloomForCausalLM

# Load the causal-LM weights from `model_dir`, requesting float16 (half precision) parameters.
model = BloomForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=torch.float16,
)

In [10]:
from transformers import AutoTokenizer, pipeline

# The tokenizer comes from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Text-generation pipeline wrapping the already-loaded model.
# device=0 selects device index 0 (presumably the first CUDA GPU — confirm for your setup).
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0,
)

## Greedy Search

In [11]:
# Prompt used for the greedy-search benchmark.
start_text = "Testing BLOOM-3B without DeepSpeed (greedy)"

# Prompt length in tokens: size of the last axis of the encoded ids (row 0 of the batch).
tokens_start_text = tokenizer(start_text, return_tensors="pt").input_ids.shape[-1]

# Echo the count as the cell output.
tokens_start_text

13

In [12]:
import time

# Number of tokens to generate on top of the prompt.
new_tokens = 1000
# The same value is passed as min_length and max_length below, which is intended to
# force the output to exactly prompt + new_tokens tokens.
gen_length = new_tokens + tokens_start_text

# CUDA work is queued asynchronously: drain anything still pending from earlier cells
# so it is not billed to this measurement.
if torch.cuda.is_available():
    torch.cuda.synchronize()

# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
t0 = time.perf_counter()
gen_text = pipe(start_text, min_length=gen_length, max_length=gen_length)[0]['generated_text']
# Make sure all GPU work belonging to this run has finished before stopping the clock.
if torch.cuda.is_available():
    torch.cuda.synchronize()
t1 = time.perf_counter()

# Token count of the full output (prompt + continuation), obtained by re-tokenizing the
# decoded text. NOTE(review): decode -> encode is not guaranteed to round-trip exactly,
# so this may differ slightly from the number of ids actually generated — confirm if it matters.
tokens_gen_text = len(tokenizer(gen_text, return_tensors="pt").input_ids[0])

In [13]:
# New tokens = tokens in the full generated text minus tokens in the prompt.
total_new_tokens_generated = tokens_gen_text - tokens_start_text

# Tokens per second over the timed generation call.
throughput = total_new_tokens_generated / (t1 - t0)

# "Latency" is the mean time per generated token in milliseconds (1000 / tokens-per-second).
print(f"Tokens generated: {total_new_tokens_generated}; Time: {t1 - t0:.1f} seconds; Tokens per second: {throughput:.1f}; Latency: {1000 / throughput:.0f} ms")

Tokens generated: 1000; Time: 20.4 seconds; Tokens per second: 49.1; Latency: 20 ms


## Sampling

In [14]:
# Prompt used for the sampling benchmark.
start_text = "Testing BLOOM-3B without DeepSpeed (sampling)"

# Prompt length in tokens: size of the last axis of the encoded ids (row 0 of the batch).
tokens_start_text = tokenizer(start_text, return_tensors="pt").input_ids.shape[-1]

# Echo the count as the cell output.
tokens_start_text

13

In [15]:
# Number of tokens to generate on top of the prompt.
new_tokens = 1000
# The same value is passed as min_length and max_length below, which is intended to
# force the output to exactly prompt + new_tokens tokens.
gen_length = new_tokens + tokens_start_text

# Sampling draws from torch's global RNG; seed it so the generated text is reproducible
# across re-runs. The value 0 is arbitrary.
torch.manual_seed(0)

# CUDA work is queued asynchronously: drain anything still pending from earlier cells
# so it is not billed to this measurement.
if torch.cuda.is_available():
    torch.cuda.synchronize()

# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
t0 = time.perf_counter()
gen_text = pipe(start_text, min_length=gen_length, max_length=gen_length, do_sample=True, top_k=50)[0]['generated_text']
# Make sure all GPU work belonging to this run has finished before stopping the clock.
if torch.cuda.is_available():
    torch.cuda.synchronize()
t1 = time.perf_counter()

# Token count of the full output (prompt + continuation), obtained by re-tokenizing the
# decoded text. NOTE(review): decode -> encode is not guaranteed to round-trip exactly,
# so this may differ slightly from the number of ids actually generated — confirm if it matters.
tokens_gen_text = len(tokenizer(gen_text, return_tensors="pt").input_ids[0])

In [16]:
# New tokens = tokens in the full generated text minus tokens in the prompt.
total_new_tokens_generated = tokens_gen_text - tokens_start_text

# Tokens per second over the timed generation call.
throughput = total_new_tokens_generated / (t1 - t0)

# "Latency" is the mean time per generated token in milliseconds (1000 / tokens-per-second).
print(f"Tokens generated: {total_new_tokens_generated}; Time: {t1 - t0:.1f} seconds; Tokens per second: {throughput:.1f}; Latency: {1000 / throughput:.0f} ms")

Tokens generated: 1000; Time: 21.4 seconds; Tokens per second: 46.7; Latency: 21 ms
