# 1. Environment Setup

## Install DashInfer

In [None]:
# install dashinfer
!pip install dashinfer

# uninstall tensorflow (if present) to avoid conflict
!pip uninstall tensorflow -y

## Prepare Qwen Model Dependencies

In [None]:
# install Qwen dependencies
!pip install sentencepiece accelerate transformers_stream_generator tiktoken

# 2. Model Download and Chat-Template Preparation

## 2.1 Model Download

In [None]:
## download model from modelscope
!pip install modelscope
from modelscope import snapshot_download
torch_model_dir = snapshot_download("qwen/Qwen-1_8B-Chat", revision="v1.0.0")

## download model from huggingface
# !pip install huggingface_hub einops
# from huggingface_hub import snapshot_download
# torch_model_dir = snapshot_download(repo_id="Qwen/Qwen-1_8B-Chat")

## 2.2 Chat Template Composition

Use a Jinja template to build the ChatML-format prompt for the Qwen model.

In [4]:
import os
import sys
import copy
import time
import random
from jinja2 import Template


def apply_chatml_template(inputs, default_gen_cfg=None):
    """Wrap one user message in a single-turn ChatML prompt.

    The prompt consists of a fixed system turn, the user turn carrying
    ``inputs``, and an opened assistant turn for the model to complete.

    Args:
        inputs: The user's message text, used as the content of the user turn.
        default_gen_cfg: Optional generation-config dict. When given, it is
            deep-copied (the caller's dict is left untouched) and the copy
            receives a random ``seed`` in ``[0, 10000]``.

    Returns:
        A ``(prompt_list, gen_cfg_list)`` tuple. ``prompt_list`` contains the
        single rendered prompt. ``gen_cfg_list`` contains the single seeded
        config copy, or is empty when ``default_gen_cfg`` is ``None``.
    """
    start_text = "<|im_start|>"
    end_text = "<|im_end|>"
    system_msg = {"role": "system", "content": "You are a helpful assistant."}
    user_msg = {"role": "user", "content": inputs}
    assistant_msg = {"role": "assistant", "content": ""}

    # Jinja drops a single trailing newline from the template source by
    # default. keep_trailing_newline=True preserves the "\n" that follows the
    # assistant role, so the prompt ends with "<|im_start|>assistant\n".
    prompt_template = Template(
        "{{start_text}}" + "{{system_role}}\n" + "{{system_content}}" + "{{end_text}}\n" +
        "{{start_text}}" + "{{user_role}}\n" + "{{user_content}}" + "{{end_text}}\n" +
        "{{start_text}}" + "{{assistant_role}}\n",
        keep_trailing_newline=True)

    prompt = prompt_template.render(start_text=start_text, end_text=end_text,
                                    system_role=system_msg["role"], system_content=system_msg["content"],
                                    user_role=user_msg["role"], user_content=user_msg["content"],
                                    assistant_role=assistant_msg["role"])

    gen_cfg_list = []
    if default_gen_cfg is not None:
        # Work on a copy so the shared default config is never mutated, and
        # give every request its own sampling seed.
        gen_cfg = copy.deepcopy(default_gen_cfg)
        gen_cfg["seed"] = random.randint(0, 10000)
        gen_cfg_list.append(gen_cfg)

    return [prompt], gen_cfg_list

# 3. DashInfer Setup: Engine Initialization and Model Preparation

- Convert the downloaded model into the DashInfer format so that it can be loaded by the engine.

- Warm-Up: DashInfer warms up the engine with random tokens, up to the `max_length` specified in configuration. This allocates necessary resources to facilitate fast inference, for request(s) of length up to the `max_length`.



In [5]:
from IPython.display import display, clear_output

from dashinfer.helper import EngineHelper

# Settings for the Qwen-1.8B chat model: its name and type, the directory
# that holds the converted DashInfer model, and the default generation
# parameters used for requests.
config = {
    "model_name": "Qwen-1_8B-Chat",
    "model_type": "Qwen_v10",
    "model_path": "./dashinfer_models/",
    "generation_config": {
        "temperature": 1.0,
        "early_stopping": True,
        "top_k": 1024,
        "top_p": 0.8,
        "repetition_penalty": 1.1,
        "presence_penalty": 0.0,
        "max_length": 2048,
        "eos_token_id": 151643,
        "stop_words_ids": [[151643], [151644], [151645]],
    },
}

# Create the helper with verbose output enabled and load the tokenizer from
# the downloaded checkpoint directory.
engine_helper = EngineHelper(config)
engine_helper.verbose = True
engine_helper.init_tokenizer(torch_model_dir)

# Run the conversion to the DashInfer format only when the helper reports
# that no converted model exists yet.
if not engine_helper.check_model_exist():
    engine_helper.convert_model(torch_model_dir)

# Bring up the engine (this is where the warm-up described above happens).
engine_helper.init_engine()

I20240513 10:46:20.667255   732 thread_pool.h:46] ThreadPool created with: 1
I20240513 10:46:20.667313   732 as_engine.cpp:232] AllSpark Init with Version: 1.0.4/(GitSha1:0549ab25-dirty)


### convert_config: {'do_dynamic_quantize_convert': False}
### engine_config: {'engine_max_length': 2048, 'engine_max_batch': 8, 'do_profiling': False, 'num_threads': 0, 'matmul_precision': 'medium'}

No such file or directory: ./dashinfer_models/Qwen-1_8B-Chat_cpu_single_float32.asgraph



E20240513 10:46:21.152429   732 as_engine.cpp:942] workers is empty
Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.51s/it]


trans model from huggingface model: /mnt/workspace/.cache/modelscope/qwen/Qwen-1_8B-Chat
Dashinfer model will save to  ./dashinfer_models/
### model_config: {'vocab_size': 151936, 'hidden_size': 2048, 'intermediate_size': 11008, 'num_hidden_layers': 24, 'num_attention_heads': 16, 'emb_dropout_prob': 0.0, 'attn_dropout_prob': 0.0, 'layer_norm_epsilon': 1e-06, 'initializer_range': 0.02, 'scale_attn_weights': True, 'use_cache': True, 'max_position_embeddings': 8192, 'bf16': False, 'fp16': False, 'fp32': True, 'kv_channels': 128, 'rotary_pct': 1.0, 'rotary_emb_base': 10000, 'use_dynamic_ntk': True, 'use_logn_attn': True, 'use_flash_attn': False, 'no_bias': True, 'use_cache_quantization': False, 'use_cache_kernel': False, 'softmax_in_fp32': False, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward':

I20240513 10:46:39.666299   732 as_engine.cpp:384] Build model use following config:
AsModelConfig :
	model_name: Qwen-1_8B-Chat_cpu_single_float32
	model_path: ./dashinfer_models/Qwen-1_8B-Chat_cpu_single_float32.asgraph
	weights_path: ./dashinfer_models/Qwen-1_8B-Chat_cpu_single_float32.asparam
	compute_unit: CPU:0
	num_threads: 32
	matmul_precision: medium
	prefill_mode: AsPrefillDefault
	cache_mode: AsCacheDefault
	engine_max_length = 2048
	engine_max_batch = 8

I20240513 10:46:39.666332   732 as_engine.cpp:388] Load model from : ./dashinfer_models/Qwen-1_8B-Chat_cpu_single_float32.asgraph
I20240513 10:46:39.666359   732 as_engine.cpp:303] SetDeviceIds: DeviceIDs.size() 1
I20240513 10:46:39.666363   732 as_engine.cpp:310] Start create 1 Device: CPU workers.
I20240513 10:46:39.666476   988 cpu_context.cpp:114] CPUContext::InitMCCL() rank: 0 nRanks: 1
I20240513 10:46:39.719002   732 as_param_check.hpp:342] AsParamGuard check level = CHECKER_NORMAL. Engine version = 1.0 . Weight versi

build model over, build time is 3.6590442657470703


success. 
I20240513 10:46:39.721506   990 weight_manager.cpp:107] Weight file header parse success...195 weight tensors are going to load. 
I20240513 10:46:43.313182   990 weight_manager.cpp:257] finish weight load for model RankInfo[0/1] time  spend: 3.592 seconds.
I20240513 10:46:43.314674   990 as_engine.cpp:525] Finish Build model for rank: 0
I20240513 10:46:43.315203   732 as_engine.cpp:680] StartModel: warming up...
I20240513 10:46:43.315230   993 as_engine.cpp:1641] | AllsparkStat | Req: Running: 0 Pending: 0 	 Prompt: 0 T/s  Gen: 0 T/s 
I20240513 10:47:19.548533   902 model.cpp:431] RunDecoderContext() Success ID: 0000000000000000000000000000000
I20240513 10:47:19.847383   902 model.cpp:483] Stop request with request id: 0000000000000000000000000000000
I20240513 10:47:19.847442   993 as_engine.cpp:1606] [Qwen-1_8B-Chat_cpu_single_float32] request finished with uuid: 0000000000000000000000000000000
I20240513 10:47:19.847461   993 as_engine.cpp:1641] | AllsparkStat | Req: Running

# 4. Model-Inference via DashInfer

- Inference information is available in the log, such as `Prompt: ?? T/s`, `Gen: ?? T/s` (generation speed), and `Running: ??` (concurrent running requests).

- **Exit Command:**  Type `exit` to exit.


In [6]:
# Interactive chat loop: read a prompt, stream the model's answer, repeat
# until the user types `exit` or interrupts the kernel.
try:
    while True:
        # Strip surrounding whitespace so that inputs such as " exit" or
        # "exit " are still recognised as the exit command.
        input_value = input("Type in your prompt: ").strip()
        if input_value.lower() == 'exit':
            print("Exiting the program.")
            break
        if not input_value:
            # Nothing to send to the model; ask again.
            continue

        # Wrap the text in the chat template and attach a freshly seeded copy
        # of the default generation config.
        prompt_list, gen_cfg_list = apply_chatml_template(
            input_value, engine_helper.default_gen_cfg)
        request_list = engine_helper.create_request(prompt_list, gen_cfg_list)
        request = request_list[0]

        # Redraw the output area for every streamed update so that only the
        # latest text is shown.
        gen = engine_helper.process_one_request_stream(request)
        for part in gen:
            clear_output(wait=True)
            print(f"Input: {input_value}")
            print(f"Response:\n{part}")
            sys.stdout.flush()
        print()
        time.sleep(1)

except KeyboardInterrupt:
    # Interrupting the kernel simply ends the chat loop. sys.exit() is not
    # called here: inside a notebook it raises SystemExit in the kernel, and
    # the engine still has to be released by the un-initialization cell.
    sys.stdout.write("\nProgram interrupted. Exiting...\n")

Input: who are you
Response:
I am QianWen, a pre-trained language model developed by Alibaba Cloud. I was trained on a massive amount of text data to perform various tasks such as answering questions, generating text, and translating languages. How can I assist you today?



Type in your prompt:  exit


Exiting the program.


# 5. Engine Un-initialization


In [7]:
# Shut down the DashInfer engine and release all resources allocated to it.
# Run this cell once you are finished with the chat loop above.
engine_helper.uninit_engine()

I20240513 10:48:36.460870   993 as_engine.cpp:1641] | AllsparkStat | Req: Running: 0 Pending: 0 	 Prompt: 0 T/s  Gen: 0.249401 T/s 
I20240513 10:48:36.460969   732 as_engine.cpp:859] [Qwen-1_8B-Chat_cpu_single_float32] waiting to join loop thread
I20240513 10:48:36.460994   732 as_engine.cpp:862] [Qwen-1_8B-Chat_cpu_single_float32] loop thread joined
