In [1]:
# Uncomment to install or upgrade DeepSpeed-MII into this kernel's environment:
# %pip install --upgrade deepspeed-mii

In [2]:
import nest_asyncio
# Apply the nest_asyncio patch before any MII call is made.
# NOTE(review): presumably needed because the notebook kernel already runs an
# asyncio event loop and the MII client uses async gRPC (a later saved output
# shows an AioRpcError) — confirm.
nest_asyncio.apply()

In [3]:
import mii
# Local directory passed to MII as the model path; the tokenizer cell further
# down loads from this same location.
model_dir = "../models/bloom-3b"

# Options forwarded to the deployment: tensor_parallel=1 and int8 dtype.
# NOTE(review): the saved output of the greedy query cell ends in a
# server-side tensor shape error; whether the int8 dtype is responsible is not
# established in this notebook — confirm.
mii_configs = dict(tensor_parallel=1, dtype="int8")

# Start the text-generation deployment; later cells address it by its
# deployment name.
mii.deploy(
    task="text-generation",
    model="bigscience/bloom-3b",
    model_path=model_dir,
    deployment_name="bloom3b_int8_deployment",
    mii_config=mii_configs,
)

[2022-11-12 15:32:02,879] [INFO] [deployment.py:85:deploy] ************* MII is using DeepSpeed Optimizations to accelerate your model *************
[2022-11-12 15:32:02,905] [INFO] [server_client.py:217:_initialize_service] MII using multi-gpu deepspeed launcher:
 ------------------------------------------------------------
 task-name .................... text-generation 
 model ........................ bigscience/bloom-3b 
 model-path ................... ../models/bloom-3b 
 port ......................... 50050 
 provider ..................... hugging-face 
 ------------------------------------------------------------
[2022-11-12 15:32:03,882] [INFO] [runner.py:507:main] cmd = /home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 --no_python --no_local_rank /home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m mii.launch.multi_gpu_server --task-name text-generation --mod

In [4]:
# Client-side handle for the deployment created above (looked up by the same
# deployment name); the benchmark cells call `.query(...)` on it.
generator = mii.mii_query_handle("bloom3b_int8_deployment")

In [5]:
from transformers import AutoTokenizer
# Load the tokenizer from the same local directory that was passed to the
# deployment as its model path. In this notebook it is only used to count
# tokens in the prompt and in the generated text.
tokenizer = AutoTokenizer.from_pretrained(model_dir)

## Greedy Search

In [6]:
# Prompt for the greedy-search benchmark.
start_text = "Testing BLOOM-3B with DeepSpeed MII (greedy)"

# Prompt length in tokens: size of the second axis of the encoded ids, i.e.
# the length of the first (only) row.
tokens_start_text = tokenizer(start_text, return_tensors="pt").input_ids.shape[1]
tokens_start_text

15

In [7]:
import time

# Number of tokens requested on top of the prompt.
new_tokens = 1000

# The same value is passed as both min_length and max_length.
# NOTE(review): presumably this forces exactly `new_tokens` generated tokens,
# assuming both limits count the prompt tokens as well — confirm.
gen_length = new_tokens + tokens_start_text

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump when the system clock is adjusted.
# Only the difference t1 - t0 is meaningful.
t0 = time.perf_counter()
result = generator.query({"query": start_text}, min_length=gen_length, max_length=gen_length)
t1 = time.perf_counter()

Free memory : 7228948480 (Bytes)  Total memory: 23836033024 (Bytes)  Setting maximum total tokens (input + output) to 2285 


AioRpcError: <AioRpcError of RPC that terminated with:
	status = StatusCode.UNKNOWN
	details = "Exception calling application: shape '[1, 15, 32, 240]' is invalid for input of size 38400"
	debug_error_string = "UNKNOWN:Error received from peer ipv6:%5B::1%5D:50050 {grpc_message:"Exception calling application: shape \'[1, 15, 32, 240]\' is invalid for input of size 38400", grpc_status:2, created_time:"2022-11-12T15:32:53.987002085+00:00"}"
>

In [None]:
import re

# Work on the string form of the query result.
text = str(result)

# Group 2 captures everything between a double quote and the last double
# quote on the same line ('.*' is greedy and '.' does not match newlines).
pattern = '(")(.*)(")'

g = re.search(pattern, text)
if g is None:
    # re.search returns None when nothing matches; fail with an explicit
    # message instead of the AttributeError that `None.group(2)` would raise.
    raise ValueError(
        f"No double-quoted text found in the query result: {text[:200]!r}"
    )
gen_text = g.group(2)

# Token count of the extracted text (prompt included, if the result echoes it
# — the next cell subtracts the prompt length).
tokens_gen_text = len(tokenizer(gen_text, return_tensors="pt").input_ids[0])

In [None]:
# Tokens produced beyond the prompt, and the wall-clock duration of the query.
total_new_tokens_generated = tokens_gen_text - tokens_start_text
elapsed = t1 - t0
throughput = total_new_tokens_generated / elapsed

# Per-token latency in milliseconds. When no new tokens were counted the
# throughput is zero (or negative), and 1000 / throughput would either raise
# ZeroDivisionError or print a meaningless value, so report "n/a" instead.
latency_ms = f"{1000 / throughput:.0f}" if throughput > 0 else "n/a"
print(f"Tokens generated: {total_new_tokens_generated}; Time: {elapsed:.1f} seconds; Tokens per second: {throughput:.1f}; Latency: {latency_ms} ms")

## Sampling

In [None]:
# Prompt for the sampling benchmark.
start_text = "Testing BLOOM-3B with DeepSpeed MII (sampling)"

# Prompt length in tokens: size of the second axis of the encoded ids, i.e.
# the length of the first (only) row.
tokens_start_text = tokenizer(start_text, return_tensors="pt").input_ids.shape[1]
tokens_start_text

In [None]:
import time

# Number of tokens requested on top of the prompt.
new_tokens = 1000

# The same value is passed as both min_length and max_length.
# NOTE(review): presumably this forces exactly `new_tokens` generated tokens,
# assuming both limits count the prompt tokens as well — confirm.
gen_length = new_tokens + tokens_start_text

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump when the system clock is adjusted.
# Only the difference t1 - t0 is meaningful.
t0 = time.perf_counter()
# Same request as the greedy run, plus do_sample=True and top_k=50.
result = generator.query({"query": start_text}, min_length=gen_length, max_length=gen_length, do_sample=True, top_k=50)
t1 = time.perf_counter()

In [None]:
import re

# Work on the string form of the query result.
text = str(result)

# Group 2 captures everything between a double quote and the last double
# quote on the same line ('.*' is greedy and '.' does not match newlines).
pattern = '(")(.*)(")'

g = re.search(pattern, text)
if g is None:
    # re.search returns None when nothing matches; fail with an explicit
    # message instead of the AttributeError that `None.group(2)` would raise.
    raise ValueError(
        f"No double-quoted text found in the query result: {text[:200]!r}"
    )
gen_text = g.group(2)

# Token count of the extracted text (prompt included, if the result echoes it
# — the next cell subtracts the prompt length).
tokens_gen_text = len(tokenizer(gen_text, return_tensors="pt").input_ids[0])

In [None]:
# Tokens produced beyond the prompt, and the wall-clock duration of the query.
total_new_tokens_generated = tokens_gen_text - tokens_start_text
elapsed = t1 - t0
throughput = total_new_tokens_generated / elapsed

# Per-token latency in milliseconds. When no new tokens were counted the
# throughput is zero (or negative), and 1000 / throughput would either raise
# ZeroDivisionError or print a meaningless value, so report "n/a" instead.
latency_ms = f"{1000 / throughput:.0f}" if throughput > 0 else "n/a"
print(f"Tokens generated: {total_new_tokens_generated}; Time: {elapsed:.1f} seconds; Tokens per second: {throughput:.1f}; Latency: {latency_ms} ms")

## Cleanup

In [None]:
# Terminate the deployment created by `mii.deploy` at the top of the notebook
# (same deployment name). Run this last, once benchmarking is finished.
mii.terminate("bloom3b_int8_deployment")