In [1]:
# Uncomment to install DeepSpeed-MII (the `mii` package imported below) if it is missing.
# !pip install deepspeed-mii

In [2]:
import nest_asyncio
# Patch asyncio so that an event loop which is already running (as in a Jupyter kernel)
# can be re-entered.
# NOTE(review): presumably needed because the MII client calls below drive their own
# event loop from inside the notebook — confirm against the installed MII version.
nest_asyncio.apply()

In [None]:
import mii

# Directory handed to MII as `model_path`; the tokenizer cell further down loads from it too.
model_dir = "../mii/bloom-3b"

# MII settings for this deployment: tensor_parallel=1 and fp16 dtype.
mii_configs = dict(tensor_parallel=1, dtype="fp16")

# Collect the deployment arguments in one place, then launch. The deployment name given
# here is the key that later cells use to obtain a query handle and to terminate.
deployment_settings = dict(
    task="text-generation",
    model="bigscience/bloom-3b",
    model_path=model_dir,
    deployment_name="bloom3b_deployment",
    mii_config=mii_configs,
)
mii.deploy(**deployment_settings)

[2022-11-12 11:53:36,414] [INFO] [deployment.py:85:deploy] ************* MII is using DeepSpeed Optimizations to accelerate your model *************
[2022-11-12 11:53:36,441] [INFO] [server_client.py:217:_initialize_service] MII using multi-gpu deepspeed launcher:
 ------------------------------------------------------------
 task-name .................... text-generation 
 model ........................ bigscience/bloom-3b 
 model-path ................... ../mii/bloom-3b 
 port ......................... 50050 
 provider ..................... hugging-face 
 ------------------------------------------------------------
[2022-11-12 11:53:37,404] [INFO] [runner.py:507:main] cmd = /home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 --no_python --no_local_rank /home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m mii.launch.multi_gpu_server --task-name text-generation --model 

In [None]:
# Obtain a client handle for the deployment created above, looked up by its deployment name.
# The benchmark cells below send their generation requests through this handle.
generator = mii.mii_query_handle("bloom3b_deployment")

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer from the same local model directory that was passed to the deployment,
# so prompt and generated-text lengths can be counted in tokens.
tokenizer = AutoTokenizer.from_pretrained(model_dir)

## Greedy Search

In [None]:
# Prompt for the greedy-decoding benchmark.
start_text = "Testing BLOOM-3B with DeepSpeed MII (greedy)"

# Encode the prompt once and count the tokens of its (single) sequence; the count is
# used later to size the generation and to subtract the prompt from the output length.
prompt_encoding = tokenizer(start_text, return_tensors="pt")
tokens_start_text = len(prompt_encoding.input_ids[0])
tokens_start_text

In [None]:
import time

# Number of tokens to generate on top of the prompt.
new_tokens = 1000
# Total target length (prompt + new tokens). It is passed as both min_length and
# max_length below — presumably to force exactly `new_tokens` generated tokens; confirm
# against the generation backend's semantics.
gen_length = new_tokens + tokens_start_text

# Use perf_counter() rather than time(): it is monotonic and high-resolution, so the
# measured interval cannot be distorted by system clock adjustments (NTP, DST, manual
# changes). Only the difference t1 - t0 is meaningful.
t0 = time.perf_counter()
result = generator.query({"query": start_text}, min_length=gen_length, max_length=gen_length)
t1 = time.perf_counter()

In [None]:
import re

# Extract the generated text from the reply and count its tokens.
#
# Prefer the reply's `response` field over parsing `str(result)`: the string form is a
# text dump in which newlines, quotes and non-ASCII characters show up as escape
# sequences, so re-tokenizing it miscounts tokens.
# NOTE(review): assumes the reply exposes `.response` (a string, or a repeated field of
# strings with one entry per query) as in the MII examples — confirm for the installed
# MII version. If the attribute is missing or empty we fall back to the original regex.
response = getattr(result, "response", None)
if response and not isinstance(response, str):
    # Repeated field: one query was sent, so take the first (only) generation.
    response = response[0]

if isinstance(response, str) and response:
    gen_text = response
else:
    # Fallback: grab everything between the first and last double quote of the dump.
    text = str(result)
    pattern = '(")(.*)(")'
    g = re.search(pattern, text)
    if g is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"Could not find generated text in reply: {text[:200]!r}")
    gen_text = g.group(2)

tokens_gen_text = len(tokenizer(gen_text, return_tensors="pt").input_ids[0])

In [None]:
# Tokens produced beyond the prompt, and the wall time of the query cell above.
total_new_tokens_generated = tokens_gen_text - tokens_start_text
elapsed_seconds = t1 - t0

# Report the generation rate for the greedy run.
tokens_per_second = total_new_tokens_generated / elapsed_seconds
print(f"Tokens generated: {total_new_tokens_generated}; Time: {elapsed_seconds:.1f} seconds; Tokens per second: {tokens_per_second:.1f}")

## Sampling

In [None]:
# Prompt for the sampling benchmark.
start_text = "Testing BLOOM-3B with DeepSpeed MII (sampling)"

# Encode the prompt once and count the tokens of its (single) sequence; the count is
# used later to size the generation and to subtract the prompt from the output length.
prompt_encoding = tokenizer(start_text, return_tensors="pt")
tokens_start_text = len(prompt_encoding.input_ids[0])
tokens_start_text

In [None]:
import time

# Number of tokens to generate on top of the prompt.
new_tokens = 1000
# Total target length (prompt + new tokens). It is passed as both min_length and
# max_length below — presumably to force exactly `new_tokens` generated tokens; confirm
# against the generation backend's semantics.
gen_length = new_tokens + tokens_start_text

# Use perf_counter() rather than time(): it is monotonic and high-resolution, so the
# measured interval cannot be distorted by system clock adjustments (NTP, DST, manual
# changes). Only the difference t1 - t0 is meaningful.
t0 = time.perf_counter()
# Same request as the greedy run, but with sampling enabled and top_k=50.
result = generator.query({"query": start_text}, min_length=gen_length, max_length=gen_length, do_sample=True, top_k=50)
t1 = time.perf_counter()

In [None]:
import re

# Extract the generated text from the reply and count its tokens.
#
# Prefer the reply's `response` field over parsing `str(result)`: the string form is a
# text dump in which newlines, quotes and non-ASCII characters show up as escape
# sequences, so re-tokenizing it miscounts tokens.
# NOTE(review): assumes the reply exposes `.response` (a string, or a repeated field of
# strings with one entry per query) as in the MII examples — confirm for the installed
# MII version. If the attribute is missing or empty we fall back to the original regex.
response = getattr(result, "response", None)
if response and not isinstance(response, str):
    # Repeated field: one query was sent, so take the first (only) generation.
    response = response[0]

if isinstance(response, str) and response:
    gen_text = response
else:
    # Fallback: grab everything between the first and last double quote of the dump.
    text = str(result)
    pattern = '(")(.*)(")'
    g = re.search(pattern, text)
    if g is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"Could not find generated text in reply: {text[:200]!r}")
    gen_text = g.group(2)

tokens_gen_text = len(tokenizer(gen_text, return_tensors="pt").input_ids[0])

In [None]:
# Tokens produced beyond the prompt, and the wall time of the query cell above.
total_new_tokens_generated = tokens_gen_text - tokens_start_text
elapsed_seconds = t1 - t0

# Report the generation rate for the sampling run.
tokens_per_second = total_new_tokens_generated / elapsed_seconds
print(f"Tokens generated: {total_new_tokens_generated}; Time: {elapsed_seconds:.1f} seconds; Tokens per second: {tokens_per_second:.1f}")

## Cleanup

In [None]:
# Terminate the deployment created at the top of the notebook, identified by its
# deployment name. Run this last: the query handle depends on the deployment.
mii.terminate("bloom3b_deployment")