In [1]:
import time

from transformers import AutoTokenizer, pipeline, set_seed

# Local checkpoint directory for BLOOM-3B; the tokenizer and the pipeline
# below are both loaded from this same path.
model_dir = "../mii/bloom-3b"

# Standalone tokenizer; the benchmark cells use it to count prompt tokens and
# generated tokens.
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Text-generation pipeline on device index 0.
pipe = pipeline(task="text-generation", model=model_dir, device=0)

## Greedy Search

In [2]:
# Prompt used for the greedy-search benchmark.
start_text = "Testing BLOOM-3B without DeepSpeed (greedy)"

# Prompt length in tokens: the encoding holds a single sequence, so the last
# dimension of `input_ids` is that sequence's token count.
prompt_encoding = tokenizer(start_text, return_tensors="pt")
tokens_start_text = prompt_encoding.input_ids.shape[-1]
tokens_start_text

13

In [3]:
# Greedy-search benchmark. min_length and max_length are both pinned to
# gen_length (prompt tokens + new_tokens) so the run targets a fixed amount
# of generated text.
new_tokens = 1000
gen_length = new_tokens + tokens_start_text

# time.perf_counter() is the clock intended for measuring intervals: it is
# monotonic and high-resolution, whereas time.time() can jump if the system
# clock is adjusted mid-run. `time` comes from the notebook's top import cell.
t0 = time.perf_counter()
gen_text = pipe(start_text, min_length=gen_length, max_length=gen_length)[0]['generated_text']
t1 = time.perf_counter()

# Count tokens by re-tokenizing the returned text.
# NOTE(review): assumes decode -> re-encode yields the same token count as the
# ids the model actually produced — confirm.
tokens_gen_text = len(tokenizer(gen_text, return_tensors="pt").input_ids[0])

In [4]:
# Report throughput for the greedy run: newly generated tokens (total tokens in
# the output minus prompt tokens) divided by the measured wall-clock interval.
total_new_tokens_generated = tokens_gen_text - tokens_start_text
elapsed_seconds = t1 - t0  # computed once, reused for both the rate and the printout
throughput = total_new_tokens_generated / elapsed_seconds
print(f"Tokens generated: {total_new_tokens_generated}; Time: {elapsed_seconds:.1f} seconds; Tokens per second: {throughput:.1f}")

Tokens generated: 1000; Time: 33.1 seconds; Tokens per second: 30.2


## Sampling

In [5]:
# Prompt used for the sampling benchmark (replaces the greedy prompt).
start_text = "Testing BLOOM-3B without DeepSpeed (sampling)"

# Prompt length in tokens: the encoding holds a single sequence, so the last
# dimension of `input_ids` is that sequence's token count.
prompt_encoding = tokenizer(start_text, return_tensors="pt")
tokens_start_text = prompt_encoding.input_ids.shape[-1]
tokens_start_text

12

In [6]:
# Sampling benchmark (top-k sampling, k=50). min_length and max_length are both
# pinned to gen_length (prompt tokens + new_tokens) so the run targets a fixed
# amount of generated text.
new_tokens = 1000
gen_length = new_tokens + tokens_start_text

# Sampling is stochastic: seed the RNGs first so the generated text is
# reproducible across Restart & Run All. Seeding happens outside the timed
# region. `set_seed` and `time` come from the notebook's top import cell.
SAMPLING_SEED = 42
set_seed(SAMPLING_SEED)

# time.perf_counter() is the clock intended for measuring intervals: it is
# monotonic and high-resolution, whereas time.time() can jump if the system
# clock is adjusted mid-run.
t0 = time.perf_counter()
gen_text = pipe(start_text, min_length=gen_length, max_length=gen_length, do_sample=True, top_k=50)[0]['generated_text']
t1 = time.perf_counter()

# Count tokens by re-tokenizing the returned text.
# NOTE(review): assumes decode -> re-encode yields the same token count as the
# ids the model actually produced — confirm.
tokens_gen_text = len(tokenizer(gen_text, return_tensors="pt").input_ids[0])

In [7]:
# Report throughput for the sampling run: newly generated tokens (total tokens
# in the output minus prompt tokens) divided by the measured wall-clock interval.
total_new_tokens_generated = tokens_gen_text - tokens_start_text
elapsed_seconds = t1 - t0  # computed once, reused for both the rate and the printout
throughput = total_new_tokens_generated / elapsed_seconds
print(f"Tokens generated: {total_new_tokens_generated}; Time: {elapsed_seconds:.1f} seconds; Tokens per second: {throughput:.1f}")

Tokens generated: 1000; Time: 34.4 seconds; Tokens per second: 29.1
