In [1]:
!export LANG=C && export LC_ALL=C

In [2]:
# The tensorflow in the environment contains some CUDA-related content that can cause conflicts
!pip uninstall -y tensorflow tensorflow-estimator tensorflow-io-gcs-filesystem

Found existing installation: tensorflow 2.14.0
Uninstalling tensorflow-2.14.0:
  Successfully uninstalled tensorflow-2.14.0
Found existing installation: tensorflow-estimator 2.14.0
Uninstalling tensorflow-estimator-2.14.0:
  Successfully uninstalled tensorflow-estimator-2.14.0
Found existing installation: tensorflow-io-gcs-filesystem 0.35.0
Uninstalling tensorflow-io-gcs-filesystem-0.35.0:
  Successfully uninstalled tensorflow-io-gcs-filesystem-0.35.0
[0m

Install DashInfer and the Qwen model dependencies.

In [3]:
# install dashinfer
!pip install dashinfer

# install model dependencies
!pip install sentencepiece accelerate transformers_stream_generator tiktoken

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Collecting dashinfer
  Downloading https://mirrors.aliyun.com/pypi/packages/80/ed/836e8fd62fa3f02551055e450bdd52878842153110729f7d64cac2ed02cc/dashinfer-1.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.1/25.1 MB[0m [31m491.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hCollecting protobuf==3.18 (from dashinfer)
  Downloading https://mirrors.aliyun.com/pypi/packages/74/4e/9f3cb458266ef5cdeaa1e72a90b9eda100e3d1803cbd7ec02f0846da83c3/protobuf-3.18.0-py2.py3-none-any.whl (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.2/174.2 kB[0m [31m483.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Reason for being yanked: This version claims to support Python 2 but does not[0m[33m
[0m[33mDEPRECATION: pytorch-lightning 1.7.7 has a non-standard dependency specifier torch>=1.9.*. pip 24.1 will enforce this beha

Define some functions to download models, organize prompts and print output text.

In [4]:
#
# Copyright (c) Alibaba, Inc. and its affiliates.
# @file    basic_example_qwen_v10_io.py
#
import os
import sys
import copy
import time
import queue
import random
import subprocess
from concurrent.futures import ThreadPoolExecutor
from IPython.display import display, clear_output

# os.environ['GLOG_minloglevel'] = '2' # disable LOG(INFO) logging
from dashinfer.helper import EngineHelper


def download_model(model_id, revision, source="modelscope"):
    """Download a model snapshot from a model hub and return its local path.

    Args:
        model_id: Repository identifier of the model on the selected hub.
        revision: Revision (branch, tag or commit) to download. For
            ``source="huggingface"`` an empty string or ``None`` selects the
            hub's default revision; for ``source="modelscope"`` the value is
            forwarded unchanged.
        source: Hub to download from, ``"modelscope"`` (default) or
            ``"huggingface"``.

    Returns:
        Whatever the hub's ``snapshot_download`` returns (the local model
        directory).

    Raises:
        ValueError: If ``source`` is neither ``"modelscope"`` nor
            ``"huggingface"``.
    """
    print(f"Downloading model {model_id} (revision: {revision}) from {source}")
    if source == "modelscope":
        # Imported lazily so only the hub that is actually used must be installed.
        from modelscope import snapshot_download
        model_dir = snapshot_download(model_id, revision=revision)
    elif source == "huggingface":
        from huggingface_hub import snapshot_download
        # Forward the requested revision instead of silently ignoring it.
        # "" (used as "no revision" in this notebook) is mapped to None so the
        # hub falls back to its default revision.
        model_dir = snapshot_download(repo_id=model_id,
                                      revision=revision or None)
    else:
        raise ValueError("Unknown source")

    print(f"Save model to path {model_dir}")

    return model_dir


def create_test_prompt(inputs, default_gen_cfg=None):
    """Wrap user text in the chat template and prepare its generation config.

    Args:
        inputs: The user's message as a string.
        default_gen_cfg: Optional mapping of generation settings. It is never
            modified; a deep copy with a freshly drawn ``"seed"`` is returned
            instead.

    Returns:
        A tuple ``(prompts, gen_cfg_list)``. ``prompts`` is a one-element list
        holding ``inputs`` surrounded by the ``<|im_start|>``/``<|im_end|>``
        markers and a fixed system message. ``gen_cfg_list`` holds the copied
        config, or is empty when ``default_gen_cfg`` is ``None``.
    """
    # Strings are immutable, so the input can be used directly without copying.
    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
              "<|im_start|>user\n"
              + inputs + "<|im_end|>\n<|im_start|>assistant\n")

    gen_cfg_list = []
    if default_gen_cfg is not None:
        # Deep-copy so the per-request seed does not leak into the shared defaults.
        gen_cfg = copy.deepcopy(default_gen_cfg)
        gen_cfg["seed"] = random.randint(0, 10000)
        gen_cfg_list.append(gen_cfg)

    return [prompt], gen_cfg_list


def print_in_place(generator, user_input):
    """Render a streamed answer in place, redrawing the cell output per update.

    Args:
        generator: Iterable yielding the answer text to show at each step.
        user_input: The user's message, echoed above every redraw.
    """
    for partial_answer in generator:
        # Clear the current output and wait for the new output.
        clear_output(wait=True)
        print(f"User: {user_input}",
              f"Answer:\n{partial_answer}",
              sep="\n",
              flush=True)
    # Finish with an empty line once the stream is exhausted.
    print()

  from .autonotebook import tqdm as notebook_tqdm


Use a dict to hold the inference parameters.

In [5]:
# Inference parameters: model identity, target device, and the nested
# conversion / engine / generation settings.
config = dict(
    model_name="Qwen-1_8B-Chat",
    model_type="Qwen_v10",
    model_path="./dashinfer_models/",
    data_type="float32",
    device_type="CPU",
    device_ids=[0],
    multinode_mode=False,
    convert_config=dict(
        do_dynamic_quantize_convert=False,
    ),
    engine_config=dict(
        engine_max_length=2048,
        engine_max_batch=8,
        do_profiling=False,
        num_threads=0,
        matmul_precision="medium",
    ),
    generation_config=dict(
        temperature=1.0,
        early_stopping=True,
        top_k=1024,
        top_p=0.8,
        repetition_penalty=1.1,
        presence_penalty=0.0,
        min_length=0,
        max_length=2048,
        no_repeat_ngram_size=0,
        eos_token_id=151643,
        seed=1234,
        stop_words_ids=[[151643], [151644], [151645]],
    ),
    quantization_config=None,
)

Set the environment variables used for inference.

In [6]:
# Ask pip, run with the kernel's own interpreter, where dashinfer is installed.
# The output is parsed in Python rather than with `grep | cut`, so install
# paths containing spaces are kept intact and only the exact "Location:" field
# is used.
pip_show = subprocess.run([sys.executable, "-m", "pip", "show", "dashinfer"],
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE,
                          text=True)
package_location = next(
    (line.split(":", 1)[1].strip()
     for line in pip_show.stdout.splitlines()
     if line.startswith("Location:")),
    "")
if not package_location:
    # Fail here with a clear message instead of exporting a bogus
    # "/dashinfer/allspark/bin" path.
    raise RuntimeError(
        "Could not determine where dashinfer is installed "
        f"(pip exit code {pip_show.returncode}): {pip_show.stderr.strip()}")

# Directory of the binaries shipped inside the dashinfer package.
os.environ["AS_DAEMON_PATH"] = os.path.join(package_location,
                                            "dashinfer", "allspark", "bin")
# One entry per configured device id; the offset is the first device id.
os.environ["AS_NUMA_NUM"] = str(len(config["device_ids"]))
os.environ["AS_NUMA_OFFSET"] = str(config["device_ids"][0])

Download the model from ModelScope or Hugging Face.

In [7]:
## Download the original model.
## Alternative: fetch it from huggingface instead
# original_model = {
#     "source": "huggingface",
#     "model_id": "Qwen/Qwen-1_8B-Chat",
#     "revision": "",
#     "model_path": ""
# }

## Default: fetch it from modelscope
original_model = dict(
    source="modelscope",
    model_id="qwen/Qwen-1_8B-Chat",
    revision="v1.0.0",
    model_path="",
)
## Record where the snapshot was stored locally.
original_model["model_path"] = download_model(
    model_id=original_model["model_id"],
    revision=original_model["revision"],
    source=original_model["source"],
)

2024-04-28 13:38:06,294 - modelscope - INFO - PyTorch version 2.1.2+cpu Found.
2024-04-28 13:38:06,296 - modelscope - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer
2024-04-28 13:38:06,296 - modelscope - INFO - No valid ast index found from /mnt/workspace/.cache/modelscope/ast_indexer, generating ast index from prebuilt!
2024-04-28 13:38:06,343 - modelscope - INFO - Loading done! Current index file version is 1.14.0, with md5 aacbf9e8ebe525a5896d4c89570c0097 and a total number of 976 components indexed


Downloading model qwen/Qwen-1_8B-Chat (revision: v1.0.0) from modelscope


    ImportError: cannot import name 'builder' from 'google.protobuf.internal' (/opt/conda/lib/python3.10/site-packages/google/protobuf/internal/__init__.py)
  warn(message, cls)
    ImportError: cannot import name 'builder' from 'google.protobuf.internal' (/opt/conda/lib/python3.10/site-packages/google/protobuf/internal/__init__.py)
  warn(message, cls)
    ImportError: cannot import name 'builder' from 'google.protobuf.internal' (/opt/conda/lib/python3.10/site-packages/google/protobuf/internal/__init__.py)
  warn(message, cls)
2024-04-28 13:38:07,505 - modelscope - INFO - Use user-specified model revision: v1.0.0
Downloading: 100%|██████████| 8.21k/8.21k [00:00<00:00, 30.9MB/s]
Downloading: 100%|██████████| 50.8k/50.8k [00:00<00:00, 14.5MB/s]
Downloading: 100%|██████████| 244k/244k [00:00<00:00, 13.8MB/s]
Downloading: 100%|██████████| 135k/135k [00:00<00:00, 10.6MB/s]
Downloading: 100%|██████████| 910/910 [00:00<00:00, 7.30MB/s]
Downloading: 100%|██████████| 77.0/77.0 [00:00<00:00, 62

Save model to path /mnt/workspace/.cache/modelscope/qwen/Qwen-1_8B-Chat





Initialize the DashInfer engine.
- Hugging Face models are converted to DashInfer models on the first run.
- In init_engine(), DashInfer warms up with random numbers, which takes some time.


In [8]:
## init EngineHelper class from the inference parameters
engine_helper = EngineHelper(config)
## presumably turns on extra logging in the helper -- TODO confirm
engine_helper.verbose = True
## both the tokenizer and the torch model are loaded from the downloaded snapshot
engine_helper.init_tokenizer(original_model["model_path"])
engine_helper.init_torch_model(original_model["model_path"])

## convert huggingface model to dashinfer model
## only one conversion is required: it is skipped when check_model_exist()
## reports that the converted model is already present
## NOTE(review): `== False` is only true for a False/0 return value; confirm
## that check_model_exist() never returns None
if engine_helper.check_model_exist() == False:
    engine_helper.convert_model(original_model["model_path"])

## init engine (must come after the conversion step above)
engine_helper.init_engine()

I20240428 13:38:47.209024   588 as_engine.cpp:226] AllSpark Init with Version: 1.0.2/(GitSha1:3a5cfb7a-dirty)


### convert_config: {'do_dynamic_quantize_convert': False}
### engine_config: {'engine_max_length': 2048, 'engine_max_batch': 8, 'do_profiling': False, 'num_threads': 0, 'matmul_precision': 'medium'}


Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.59s/it]
E20240428 13:38:57.298009   588 as_engine.cpp:924] workers is empty



No such file or directory: ./dashinfer_models/Qwen-1_8B-Chat_cpu_single_float32.asgraph

trans model from huggingface model: /mnt/workspace/.cache/modelscope/qwen/Qwen-1_8B-Chat
Dashinfer model will save to  ./dashinfer_models/
### model_config: {'vocab_size': 151936, 'hidden_size': 2048, 'intermediate_size': 11008, 'num_hidden_layers': 24, 'num_attention_heads': 16, 'emb_dropout_prob': 0.0, 'attn_dropout_prob': 0.0, 'layer_norm_epsilon': 1e-06, 'initializer_range': 0.02, 'scale_attn_weights': True, 'use_cache': True, 'max_position_embeddings': 8192, 'bf16': False, 'fp16': False, 'fp32': True, 'kv_channels': 128, 'rotary_pct': 1.0, 'rotary_emb_base': 10000, 'use_dynamic_ntk': True, 'use_logn_attn': True, 'use_flash_attn': False, 'no_bias': True, 'use_cache_quantization': False, 'use_cache_kernel': False, 'softmax_in_fp32': False, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_

I20240428 13:39:06.234314   588 as_engine.cpp:378] Build model use following config:
AsModelConfig :
	model_name: Qwen-1_8B-Chat_cpu_single_float32
	model_path: ./dashinfer_models/Qwen-1_8B-Chat_cpu_single_float32.asgraph
	weights_path: ./dashinfer_models/Qwen-1_8B-Chat_cpu_single_float32.asparam
	compute_unit: CPU:0
	num_threads: 0
	matmul_precision: medium
	prefill_mode: AsPrefillDefault
	cache_mode: AsCacheDefault
	engine_max_length = 2048
	engine_max_batch = 8

I20240428 13:39:06.234341   588 as_engine.cpp:382] Load model from : ./dashinfer_models/Qwen-1_8B-Chat_cpu_single_float32.asgraph
I20240428 13:39:06.234362   588 as_engine.cpp:297] SetDeviceIds: DeviceIDs.size() 1
I20240428 13:39:06.234364   588 as_engine.cpp:304] Start create 1 Device: CPU workers.
I20240428 13:39:06.234476   886 cpu_context.cpp:114] CPUContext::InitMCCL() rank: 0 nRanks: 1
I20240428 13:39:06.287034   588 as_param_check.hpp:342] AsParamGuard check level = CHECKER_NORMAL. Engine version = 1.0 . Weight versio

Read user input from the input prompt and call DashInfer to generate results.
- In this example, users can interact with the model multiple times, but each request is independent: earlier turns are not passed to the model as history.
- Enter `exit` to leave the loop.

In [9]:
# Interactive loop: read one message, stream the model's answer, repeat until
# the user types `exit` or interrupts the kernel.
try:
    while True:
        input_value = input("Please enter your inputs: ")
        command = input_value.strip()
        # Accept the exit command regardless of case or surrounding whitespace.
        if command.lower() == 'exit':
            print("Exiting program.")
            break
        # Nothing was typed: ask again instead of sending an empty prompt.
        if not command:
            continue

        # Build a single-prompt request with a per-request random seed.
        prompt_list, gen_cfg_list = create_test_prompt(
            input_value, engine_helper.default_gen_cfg)
        request_list = engine_helper.create_request(prompt_list, gen_cfg_list)
        request = request_list[0]

        # Stream the answer and redraw it in place as it grows.
        gen = engine_helper.process_one_request_stream(request)
        print_in_place(gen, input_value)
        # NOTE(review): presumably a short pause so the final output is
        # rendered before the next prompt appears -- TODO confirm.
        time.sleep(1)

except KeyboardInterrupt:
    # Just leave the loop. Calling sys.exit() here would raise SystemExit
    # inside the kernel, which is reported as a failed cell and keeps the
    # cell that stops the engine from running in a "Run All".
    sys.stdout.write("\nProgram interrupted. Exiting...\n")

User: who are you
Answer:
I am an artificial intelligence language model designed to assist with a wide range of tasks, from answering questions and providing information to generating text and even performing creative tasks such as writing stories and songs. My purpose is to provide useful and informative responses to your inquiries to the best of my abilities based on my training data. How may I assist you today?



Please enter your inputs:  exit


Exiting program.


Stop the engine and release resources.

In [10]:
# uninit engine: stop the engine that init_engine() started, once inference is done
engine_helper.uninit_engine()

I20240428 13:41:47.392686   890 as_engine.cpp:1616] | AllsparkStat | Req: Running: 0 Pending: 0 	 Prompt: 0 T/s  Gen: 1.96627 T/s 
I20240428 13:41:47.392802   588 as_engine.cpp:841] [Qwen-1_8B-Chat_cpu_single_float32] waiting to join loop thread
I20240428 13:41:47.392835   588 as_engine.cpp:844] [Qwen-1_8B-Chat_cpu_single_float32] loop thread joined
