In [None]:
# !pip install deepspeed-mii --upgrade

In [None]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
import mii
model_dir = "../models/bloom-3b"
mii_configs = {"tensor_parallel": 1, "dtype": "fp16"}
mii.deploy(task="text-generation",
           model="bigscience/bloom-3b",
           model_path=model_dir,
           deployment_name="bloom3b_deployment",
           mii_config=mii_configs)

[2022-11-12 15:29:41,315] [INFO] [deployment.py:85:deploy] ************* MII is using DeepSpeed Optimizations to accelerate your model *************
[2022-11-12 15:29:41,346] [INFO] [server_client.py:217:_initialize_service] MII using multi-gpu deepspeed launcher:
 ------------------------------------------------------------
 task-name .................... text-generation 
 model ........................ bigscience/bloom-3b 
 model-path ................... ../models/bloom-3b 
 port ......................... 50050 
 provider ..................... hugging-face 
 ------------------------------------------------------------
[2022-11-12 15:29:42,308] [INFO] [runner.py:507:main] cmd = /home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 --no_python --no_local_rank /home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m mii.launch.multi_gpu_server --task-name text-generation --mod

In [4]:
generator = mii.mii_query_handle("bloom3b_deployment")

In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_dir)

## Greedy Search

In [6]:
start_text = "Testing BLOOM-3B with DeepSpeed MII (greedy)"
tokens_start_text = len(tokenizer(start_text, return_tensors="pt").input_ids[0])
tokens_start_text

15

In [7]:
import time

new_tokens = 1000
gen_length = new_tokens + tokens_start_text
t0 = time.time()
result = generator.query({"query": start_text}, min_length=gen_length, max_length=gen_length)
t1 = time.time()

Free memory : 8864727040 (Bytes)  Total memory: 23836033024 (Bytes)  Setting maximum total tokens (input + output) to 2843 


In [8]:
import re

text=str(result)
pattern='(")(.*)(")'

g=re.search(pattern,text)
gen_text = g.group(2)
tokens_gen_text = len(tokenizer(gen_text, return_tensors="pt").input_ids[0])

In [9]:
total_new_tokens_generated = tokens_gen_text - tokens_start_text
throughput = (total_new_tokens_generated) / (t1 - t0)
print(f"Tokens generated: {total_new_tokens_generated}; Time: {t1 - t0:.1f} seconds; Tokens per second: {throughput:.1f}; Latency: {1000 / throughput:.0f} ms")

Tokens generated: 913; Time: 16.2 seconds; Tokens per second: 56.5; Latency: 18 ms


## Sampling

In [10]:
start_text = "Testing BLOOM-3B with DeepSpeed MII (sampling)"
tokens_start_text = len(tokenizer(start_text, return_tensors="pt").input_ids[0])
tokens_start_text

15

In [11]:
import time

new_tokens = 1000
gen_length = new_tokens + tokens_start_text
t0 = time.time()
result = generator.query({"query": start_text}, min_length=gen_length, max_length=gen_length, do_sample=True, top_k=50)
t1 = time.time()

In [12]:
import re

text=str(result)
pattern='(")(.*)(")'

g=re.search(pattern,text)
gen_text = g.group(2)
tokens_gen_text = len(tokenizer(gen_text, return_tensors="pt").input_ids[0])

In [13]:
total_new_tokens_generated = tokens_gen_text - tokens_start_text
throughput = (total_new_tokens_generated) / (t1 - t0)
print(f"Tokens generated: {total_new_tokens_generated}; Time: {t1 - t0:.1f} seconds; Tokens per second: {throughput:.1f}; Latency: {1000 / throughput:.0f} ms")

Tokens generated: 1015; Time: 17.2 seconds; Tokens per second: 59.1; Latency: 17 ms


## Cleanup

In [14]:
mii.terminate("bloom3b_deployment")

[2022-11-12 15:31:00,281] [INFO] [terminate.py:8:terminate] Terminating server for bloom3b_deployment
[2022-11-12 15:31:01,356] [INFO] [launch.py:286:sigkill_handler] Killing subprocess 15182
[2022-11-12 15:31:01,356] [ERROR] [launch.py:292:sigkill_handler] ['/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python', '-m', 'mii.launch.multi_gpu_server', '--task-name', 'text-generation', '--model', 'bigscience/bloom-3b', '--model-path', '../models/bloom-3b', '--port', '50050', '--ds-optimize', '--provider', 'hugging-face', '--config', 'eyJ0ZW5zb3JfcGFyYWxsZWwiOiAxLCAicG9ydF9udW1iZXIiOiA1MDA1MCwgImR0eXBlIjogImZwMTYiLCAiZW5hYmxlX2N1ZGFfZ3JhcGgiOiBmYWxzZSwgImNoZWNrcG9pbnRfZGljdCI6IG51bGwsICJkZXBsb3lfcmFuayI6IFswXSwgInRvcmNoX2Rpc3RfcG9ydCI6IDI5NTAwLCAiaGZfYXV0aF90b2tlbiI6IG51bGwsICJyZXBsYWNlX3dpdGhfa2VybmVsX2luamVjdCI6IHRydWUsICJwcm9maWxlX21vZGVsX3RpbWUiOiBmYWxzZX0='] exits with return code = -15
