Select a T4 GPU runtime before running this GPU example.

The runtime can be changed as follows:
1. Menu: Runtime (shown as "Code Execution" in some localized interfaces) -> Change runtime type
2. Choose Runtime type: `Python 3`, Hardware accelerator: `T4`


In [None]:
# Confirm that a GPU is attached to this runtime and show its driver / CUDA versions.
!nvidia-smi


Tue Jan 14 07:34:48 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Show the Python version; the DashInfer wheel installed in the next cell is a cp310 (CPython 3.10) build.
!python --version


Python 3.10.12


In [None]:
!pip install https://github.com/modelscope/dash-infer/releases/download/v2.0.0-rc3/dashinfer-2.0.0rc3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

# check dashinfer release url in : https://github.com/modelscope/dash-infer/releases/tag/v2.0.0-rc3

Collecting dashinfer==2.0.0rc3
  Downloading https://github.com/modelscope/dash-infer/releases/download/v2.0.0-rc3/dashinfer-2.0.0rc3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (687.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m687.2/687.2 MB[0m [31m834.0 kB/s[0m eta [36m0:00:00[0m
Collecting ruamel.yaml (from dashinfer==2.0.0rc3)
  Downloading ruamel.yaml-0.18.10-py3-none-any.whl.metadata (23 kB)
Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml->dashinfer==2.0.0rc3)
  Downloading ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Downloading ruamel.yaml-0.18.10-py3-none-any.whl (117 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.7/117.7 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (722 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m722.2/722.2 k

In [None]:
!pip install modelscope

Collecting modelscope
  Downloading modelscope-1.22.0-py3-none-any.whl.metadata (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Downloading modelscope-1.22.0-py3-none-any.whl (5.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m91.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: modelscope
Successfully installed modelscope-1.22.0


In [None]:
import os

from dashinfer import allspark
from dashinfer.allspark import *
from dashinfer.allspark.engine import *
from dashinfer.allspark.prompt_utils import PromptTemplate

# --- Example settings --------------------------------------------------------
model_name = "qwen/Qwen2.5-1.5B-Instruct"
use_modelscope = False        # True: fetch the model snapshot through ModelScope
output_base_folder = "model_output"
user_data_type = "float16"    # float16 is supported by most devices
device_list = [0]             # one GPU by default; for four GPUs use [0, 1, 2, 3]
in_memory = False             # forwarded to the loader as `in_memory_serialize`

# --- Resolve where the model is loaded from ----------------------------------
if not use_modelscope:
    # The model name is handed to the loader unchanged.
    model_local_path = model_name
else:
    # Download a snapshot with ModelScope and load from the returned path.
    import modelscope
    from modelscope.utils.constant import DEFAULT_MODEL_REVISION
    model_local_path = modelscope.snapshot_download(model_name, DEFAULT_MODEL_REVISION)

# "/" in the model name is replaced by "_" so the name is usable as a single
# path component and as the engine-side model identifier.
safe_model_name = "_".join(model_name.split("/"))
model_convert_folder = os.path.join(output_base_folder, safe_model_name)

# Create the Hugging Face model loader and the AllSpark engine.
model_loader = allspark.HuggingFaceModel(
    model_local_path,
    safe_model_name,
    in_memory_serialize=in_memory,
    user_set_data_type=user_data_type,
)
engine = allspark.Engine()

# Load the model, serialize it for the engine into `output_base_folder`,
# then free the loaded model.
(
    model_loader
    .load_model()
    .serialize(engine, model_output_dir=output_base_folder)
    .free_model()
)

# Build a CUDA runtime configuration (batch of up to 8, length limit 2048),
# then install and start the model on the engine.
runtime_cfg = (
    model_loader
    .create_reference_runtime_config_builder(
        safe_model_name, TargetDevice.CUDA, device_list, max_batch=8)
    .max_length(2048)
    .build()
)
engine.install_model(runtime_cfg)
engine.start_model(safe_model_name)

# Only relevant when the model was serialized in memory.
if in_memory:
    model_loader.free_memory_serialize_file()

# Prepare input
input_str = "How to protect our planet and build a green future?"
# The raw question is used as the user message: the tokenizer's chat template
# (applied below) already adds the role markers and the generation prompt.
# NOTE(review): the previous code also wrapped the text with
# PromptTemplate.apply_chatml_template(), which presumably emits ChatML markup
# itself and therefore templated the prompt twice -- confirm against the helper.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": input_str},
]

# Build the tokenizer once and reuse it for both templating and decoding.
tokenizer = model_loader.init_tokenizer().get_tokenizer()
templated_input_str = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True)

# Configure generation settings (start from the reference config, force top_k=1)
gen_cfg = model_loader.create_reference_generation_config_builder(runtime_cfg)
gen_cfg.update({"top_k": 1})

# Generate response
status, handle, queue = engine.start_request_text(
    safe_model_name, model_loader, templated_input_str, gen_cfg)
generated_ids = []

try:
    # Drain the result queue until the request reports a terminal status.
    while True:
        elements = queue.Get()
        if elements:
            generated_ids += elements.ids_from_generate
        status = queue.GenerateStatus()
        if status in (GenerateRequestStatus.GenerateFinished,
                      GenerateRequestStatus.GenerateInterrupted):
            break

    # Decode and print output
    output_text = tokenizer.decode(generated_ids)
    print(f"Model: {model_name}\nInput: {input_str}\nOutput: {output_text}")
finally:
    # Clean up even if generation or decoding failed or was interrupted, so
    # the request and the model do not stay alive on the GPU.
    engine.release_request(safe_model_name, handle)
    engine.stop_model(safe_model_name)
    print(f"Model: {model_name} have been released.")

AllSpark python package start init.
[Info] No Multi-NUMA support on CUDA Version.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

model config:
{'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 1536, 'intermediate_size': 8960, 'num_hidden_layers': 28, 'num_attention_heads': 12, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 21, 'num_key_value_heads': 2, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': True, 'rope_theta': 1000000.0, 'rope_scaling': None, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': torch.bfloat16, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperatu

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

                type(BPE), output may be abnormal if using JSON Mode
rotary base:  1000000.0
serialize_model_from_torch: quant config:None
save asgraph to  model_output/qwen_Qwen2.5-1.5B-Instruct.asgraph
save asparam to  model_output/qwen_Qwen2.5-1.5B-Instruct.asparam
parse weight time:  143.47967290878296
current allspark version full: 2.0.0/(GitSha1:163850f7) major[ 2 ] minor[ 0 ] patch[ 0 ] commit =  163850f7
calculate md5 of asgraph =  e9b1457b6bf53984b8aa639ae3812f79
torch build meta: 	 only_convert_lora 	:  False
build_model_from_torch: save model = true, time :  143.89367246627808
Model qwen_Qwen2.5-1.5B-Instruct serialize finished, consume 143.96698021888733 seconds...
stop word ids: [[151645], [151643]]
Start Request with Generate Config:
{'top_k': 1, 'top_p': 0.8, 'do_sample': True, 'early_stopping': True, 'repetition_penalty': 1.1, 'presence_penalty': 0.0, 'length_penalty': 1.0, 'temperature': 0.7, 'min_length': 0, 'no_repeat_ngram_size': 0, 'eos_token_id': 151645, 'seed': 7