In [1]:
# Uncomment to install/upgrade the dependencies in the kernel's environment:
# %pip install transformers deepspeed-mii --upgrade

In [6]:
import nest_asyncio
import mii
import time
from transformers import pipeline, AutoTokenizer

# Patch asyncio so event loops can be nested (presumably needed because the
# notebook kernel already runs its own loop). Without this call we get an
# error message, see https://github.com/microsoft/DeepSpeed-MII/issues/100
nest_asyncio.apply()

In [3]:
# MII settings for this deployment: tensor-parallel degree 1 and fp16 weights.
mii_configs = dict(tensor_parallel=1, dtype="fp16")

# Start a text-generation deployment of BLOOM-560M under a fixed name; the
# same name is used to obtain query handles for it.
mii.deploy(
    task="text-generation",
    model="bigscience/bloom-560m",
    deployment_name="bloom560m_deployment",
    mii_config=mii_configs,
)

# Client-side handle used to send queries to the deployment started above.
generator = mii.mii_query_handle("bloom560m_deployment")

[2022-11-17 08:18:31,188] [INFO] [deployment.py:87:deploy] ************* MII is using DeepSpeed Optimizations to accelerate your model *************
[2022-11-17 08:18:31,373] [INFO] [server_client.py:219:_initialize_service] MII using multi-gpu deepspeed launcher:
 ------------------------------------------------------------
 task-name .................... text-generation 
 model ........................ bigscience/bloom-560m 
 model-path ................... /tmp/mii_models 
 port ......................... 50050 
 provider ..................... hugging-face 
 ------------------------------------------------------------
[2022-11-17 08:18:32,627] [INFO] [runner.py:508:main] cmd = /home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 --no_python --no_local_rank /home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m mii.launch.multi_gpu_server --task-name text-generation --mode

In [4]:
# Get a query handle for the "bloom560m_deployment" deployment. This repeats
# the assignment at the end of the deployment cell.
generator = mii.mii_query_handle("bloom560m_deployment")

In [7]:
# Tokenizer for the same checkpoint that was deployed; it is only used here
# to count tokens.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

# Prompt sent to the model.
start_text = "Testing BLOOM-560M with DeepSpeed MII"

# Number of tokens in the prompt, taken from the first (only) row of input_ids.
prompt_encoding = tokenizer(start_text, return_tensors="pt")
tokens_start_text = len(prompt_encoding.input_ids[0])

In [8]:
# Number of tokens the model is asked to generate.
new_tokens = 500

# Time the whole query round trip. perf_counter() is a monotonic,
# high-resolution clock intended for measuring intervals; time.time() is a
# wall clock that can jump if the system time is adjusted mid-measurement.
# NOTE(review): this looks like the first query against the deployment, so the
# measurement may include one-time setup cost — consider a warm-up query.
t0 = time.perf_counter()
result = generator.query({"query": start_text}, max_new_tokens=new_tokens)
t1 = time.perf_counter()

------------------------------------------------------
Free memory : 10.848145 (GigaBytes)  
Total memory: 14.560913 (GigaBytes)  
Requested memory: 0.093750 (GigaBytes) 
Setting maximum total tokens (input + output) to 1024 
------------------------------------------------------


In [9]:
# Display the raw reply object returned by generator.query().
result

response: "Testing BLOOM-560M with DeepSpeed MII. The BLOOM-560M is a high-performance, low-power, and low-temperature, high-resolution, and high-sensitivity, high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and high-resolution, and high-sensitivity, and hi

In [10]:
# Check the reply's Python type before extracting the generated text from it.
type(result)

modelresponse_pb2.MultiStringReply

In [11]:
import re

# `result` is a protobuf MultiStringReply. Read the generated text from its
# repeated `response` field instead of regex-parsing str(result): the string
# form is the protobuf text dump, which escapes newlines, quotes and non-ASCII
# characters (and includes any other fields), so text captured from it differs
# from what the model produced and skews the token count computed from it.
# A single query string was sent, so the reply of interest is the first entry.
gen_text = result.response[0]

In [12]:
# Token count of the full reply text (prompt + continuation).
generated_encoding = tokenizer(gen_text, return_tensors="pt")
tokens_gen_text = len(generated_encoding.input_ids[0])

# The reply starts with the prompt, so subtract the prompt's tokens to get
# only the newly generated ones.
total_new_tokens_generated = tokens_gen_text - tokens_start_text

# New tokens per second over the measured query duration.
elapsed_seconds = t1 - t0
throughput = total_new_tokens_generated / elapsed_seconds

In [13]:
# Summarize the benchmark. "Latency" is the average time per generated token
# in milliseconds (1000 / tokens-per-second).
elapsed = t1 - t0

# If no new tokens were generated the throughput is 0 and the per-token
# latency is undefined; report infinity instead of raising ZeroDivisionError.
latency_ms = 1000 / throughput if throughput > 0 else float("inf")

print(f"""Tokens generated: {total_new_tokens_generated}
Time: {elapsed:.1f} seconds
Tokens per second: {throughput:.1f}
Latency: {latency_ms:.1f} ms""")

Tokens generated: 500
Time: 7.2 seconds
Tokens per second: 69.1
Latency: 14.5 ms


In [None]:
# Shut down the deployment started earlier. The name must match the
# deployment_name passed to mii.deploy() ("bloom560m_deployment"); the
# previous value "bloom-560m-mii" did not refer to any deployment created in
# this notebook, so the server was left running.
mii.terminate("bloom560m_deployment")