In [None]:
%%bash

# Create the Python virtual environment.
# -y answers the confirmation prompt up front; a %%bash cell has no interactive stdin to answer it.
conda create -y -n vllm python=3.10

# `conda activate` relies on conda's shell hook, which a non-interactive bash
# subprocess does not load on its own, so source it explicitly first.
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate vllm

# All later steps are meant to run inside this virtual environment.
# NOTE: every %%bash cell is a separate shell process, so the activation above does not
# carry over to other cells; start Jupyter from the activated environment (or re-activate per cell).

In [None]:
%%bash

# With unrestricted network access, use Hugging Face
# (see https://huggingface.co/docs/huggingface_hub/cn/quick-start)
pip install --upgrade huggingface_hub

# Verify the installation
python -c "from huggingface_hub import model_info; print(model_info('gpt2'))"

# Log in.
# The token is quoted and checked: if HUGGINGFACE_TOKEN is unset, an unquoted expansion
# would vanish and --token would consume the next flag as its value.
huggingface-cli login --token "${HUGGINGFACE_TOKEN:?HUGGINGFACE_TOKEN is not set}" --add-to-git-credential

# Download: done from Python in the next cell (hf_hub_download).


In [None]:
import os

from dotenv import load_dotenv
from huggingface_hub import hf_hub_download, login

# Load variables from a local .env file into the process environment.
load_dotenv()

# Read the Hugging Face access token from the environment.
token = os.environ.get('HUGGINGFACE_TOKEN')

if token:
    # Pass the token explicitly: a bare login() does not read HUGGINGFACE_TOKEN
    # and falls back to an interactive prompt instead.
    login(token=token)
    print("登录成功")
else:
    print("未找到 HUGGINGFACE_TOKEN 环境变量，请先设置该环境变量。")

# Download the quantized GGUF model file into the local Hugging Face cache.
# This runs whether or not the login branch above was taken.
hf_hub_download(
    repo_id="bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF",
    filename="DeepSeek-R1-Distill-Qwen-32B-Q4_K_M.gguf"
)

In [None]:
%%bash

# Without unrestricted network access, use ModelScope
# (see https://www.modelscope.cn/docs/intro/quickstart)
pip install modelscope --upgrade

# Verify the installation
python -c "from modelscope.pipelines import pipeline;print(pipeline('word-segmentation')('今天天气不错，适合 出去游玩'))"

# Download the model; ModelScope does not host a quantized build of the DeepSeek distill model
#modelscope download --model 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B'
# Note: this download is a split (multi-part) GGUF model that must be merged before use
modelscope download --model 'Qwen/QwQ-32B-GGUF' --include 'qwq-32b-q4_k_m-*.gguf'

# Build llama.cpp to get its tools. Prebuilt binaries are also available on GitHub,
# but they may not cover every operating system.
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp

# Build with CUDA support, see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#cuda
cmake -B build -DGGML_CUDA=ON
# If nvcc cannot detect the hardware architecture, set it explicitly.
# Architecture codes: https://developer.nvidia.com/cuda-gpus (e.g. the 2080 Ti is 75)
#cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="75"
cmake --build build --config Release

# Merge the downloaded GGUF parts into a single file.
# The working directory is still llama.cpp, and the built llama-gguf-split tool is placed
# under build/bin, so it must be invoked through that path.
./build/bin/llama-gguf-split --merge ~/.cache/modelscope/hub/models/Qwen/QwQ-32B-GGUF/qwq-32b-q4_k_m-00001-of-00005.gguf ~/qwq-32b-q4_k_m.gguf


In [None]:
%%bash

# Install vLLM from the official PyPI index; installing through the Aliyun mirror
# caused problems, so the index is set back to the official one.
pip install --index-url https://pypi.org/simple vllm

In [None]:
%%bash

# Serve a llama.cpp-quantized (GGUF) model with vLLM,
# see https://docs.vllm.ai/en/latest/features/quantization/gguf.html
#
# Flag notes (kept here because bash does not allow a comment after a line-continuation
# backslash: `\ # ...` ends the command at that line and the remaining flags run as
# separate, failing commands):
#   --gpu-memory-utilization   fraction of GPU memory vLLM may use
#   --tensor-parallel-size     number of GPUs used for tensor parallelism
#   --quantization             quantization format
#   --max-model-len            maximum context length
#   --max-num-batched-tokens   maximum number of tokens per inference batch
#   --max-num-seqs             maximum number of sequences per inference step
#   --enable-chunked-prefill   enable chunked prefill (original note: reduces GPU memory fragmentation)
#   --cpu-offload-gb           amount of data, in GB, offloaded to CPU memory
vllm serve ./DeepSeek-R1-Distill-Qwen-32B-Q4_K_M.gguf \
    --tokenizer deepseek-ai/DeepSeek-R1-Distill-Qwen-32B \
    --hf-config-path deepseek-ai/DeepSeek-R1-Distill-Qwen-32B \
    --gpu-memory-utilization 0.95 \
    --tensor-parallel-size 1 \
    --quantization gguf \
    --max-model-len 1024 \
    --max-num-batched-tokens 1024 \
    --max-num-seqs 1024 \
    --enable-chunked-prefill \
    --cpu-offload-gb 4

In [None]:
%%bash

# Serve the GGUF model with a 1024-token context and 4 GB of CPU offload.
# The original cell contained this exact command twice; `vllm serve` runs in the
# foreground, so the second copy could only start after the first server exited.
vllm serve ./DeepSeek-R1-Distill-Qwen-32B-Q4_K_M.gguf \
    --tokenizer deepseek-ai/DeepSeek-R1-Distill-Qwen-32B \
    --gpu-memory-utilization 0.9 \
    --quantization gguf \
    --max-model-len 1024 \
    --max-num-batched-tokens 1024 \
    --enable-chunked-prefill \
    --max-num-seqs 1024 \
    --cpu-offload-gb 4

In [None]:
%%bash

# Run the merged GGUF model with llama-cli, requesting up to 80 layers on the GPU.
merged_model=~/qwq-32b-q4_k_m.gguf
./llama-cli -m "$merged_model" --n-gpu-layers 80

```shell
(vllm) sam@LIYUAN-PC:/mnt/f/ai/llama.cpp/build/bin$ ./llama-cli -m /mnt/f/ai/qwq-32b-q4_k_m.gguf --n-gpu-layers 80
ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 2080 Ti, compute capability 7.5, VMM: yes
build: 4866 (89b2b56e) with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: llama backend init
main: load the model and apply lora adapter, if any
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 2080 Ti) - 21310 MiB free
llama_model_loader: loaded meta data with 27 key-value pairs and 771 tensors from /mnt/f/ai/qwq-32b-q4_k_m.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Tmp_Quanted_Model AWInt4 Groupsize128
llama_model_loader: - kv   3:                         general.size_label str              = 33B
llama_model_loader: - kv   4:                          qwen2.block_count u32              = 64
llama_model_loader: - kv   5:                       qwen2.context_length u32              = 131072
llama_model_loader: - kv   6:                     qwen2.embedding_length u32              = 5120
llama_model_loader: - kv   7:                  qwen2.feed_forward_length u32              = 27648
llama_model_loader: - kv   8:                 qwen2.attention.head_count u32              = 40
llama_model_loader: - kv   9:              qwen2.attention.head_count_kv u32              = 8
llama_model_loader: - kv  10:                       qwen2.rope.freq_base f32              = 1000000.000000
llama_model_loader: - kv  11:     qwen2.attention.layer_norm_rms_epsilon f32              = 0.000010
llama_model_loader: - kv  12:                          general.file_type u32              = 15
llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = qwen2
llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,152064]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,152064]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv  18:                tokenizer.ggml.eos_token_id u32              = 151645
llama_model_loader: - kv  19:            tokenizer.ggml.padding_token_id u32              = 151643
llama_model_loader: - kv  20:                tokenizer.ggml.bos_token_id u32              = 151643
llama_model_loader: - kv  21:               tokenizer.ggml.add_bos_token bool             = false
llama_model_loader: - kv  22:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
llama_model_loader: - kv  23:               general.quantization_version u32              = 2
llama_model_loader: - kv  24:                                   split.no u16              = 0
llama_model_loader: - kv  25:                        split.tensors.count i32              = 771
llama_model_loader: - kv  26:                                split.count u16              = 0
llama_model_loader: - type  f32:  321 tensors
llama_model_loader: - type q4_K:  385 tensors
llama_model_loader: - type q6_K:   65 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type   = Q4_K - Medium
print_info: file size   = 18.48 GiB (4.85 BPW)
load: special tokens cache size = 26
load: token to piece cache size = 0.9311 MB
print_info: arch             = qwen2
print_info: vocab_only       = 0
print_info: n_ctx_train      = 131072
print_info: n_embd           = 5120
print_info: n_layer          = 64
print_info: n_head           = 40
print_info: n_head_kv        = 8
print_info: n_rot            = 128
print_info: n_swa            = 0
print_info: n_embd_head_k    = 128
print_info: n_embd_head_v    = 128
print_info: n_gqa            = 5
print_info: n_embd_k_gqa     = 1024
print_info: n_embd_v_gqa     = 1024
print_info: f_norm_eps       = 0.0e+00
print_info: f_norm_rms_eps   = 1.0e-05
print_info: f_clamp_kqv      = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale    = 0.0e+00
print_info: n_ff             = 27648
print_info: n_expert         = 0
print_info: n_expert_used    = 0
print_info: causal attn      = 1
print_info: pooling type     = 0
print_info: rope type        = 2
print_info: rope scaling     = linear
print_info: freq_base_train  = 1000000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn  = 131072
print_info: rope_finetuned   = unknown
print_info: ssm_d_conv       = 0
print_info: ssm_d_inner      = 0
print_info: ssm_d_state      = 0
print_info: ssm_dt_rank      = 0
print_info: ssm_dt_b_c_rms   = 0
print_info: model type       = 32B
print_info: model params     = 32.76 B
print_info: general.name     = Tmp_Quanted_Model AWInt4 Groupsize128
print_info: vocab type       = BPE
print_info: n_vocab          = 152064
print_info: n_merges         = 151387
print_info: BOS token        = 151643 '<|endoftext|>'
print_info: EOS token        = 151645 '<|im_end|>'
print_info: EOT token        = 151645 '<|im_end|>'
print_info: PAD token        = 151643 '<|endoftext|>'
print_info: LF token         = 198 'Ċ'
print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
print_info: FIM MID token    = 151660 '<|fim_middle|>'
print_info: FIM PAD token    = 151662 '<|fim_pad|>'
print_info: FIM REP token    = 151663 '<|repo_name|>'
print_info: FIM SEP token    = 151664 '<|file_sep|>'
print_info: EOG token        = 151643 '<|endoftext|>'
print_info: EOG token        = 151645 '<|im_end|>'
print_info: EOG token        = 151662 '<|fim_pad|>'
print_info: EOG token        = 151663 '<|repo_name|>'
print_info: EOG token        = 151664 '<|file_sep|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = true)

```

模型文件放在 USB 3.0 接口的移动硬盘上，实测读取速度约 21 MB/s，因此约 20 GB 的模型文件理论上需要约 16 分钟才能加载完成。

```shell
load_tensors: offloading 64 repeating layers to GPU
load_tensors: offloading output layer to GPU
load_tensors: offloaded 65/65 layers to GPU
load_tensors:        CUDA0 model buffer size = 18508.35 MiB
load_tensors:   CPU_Mapped model buffer size =   417.66 MiB
................................................................................................
llama_init_from_model: n_seq_max     = 1
llama_init_from_model: n_ctx         = 4096
llama_init_from_model: n_ctx_per_seq = 4096
llama_init_from_model: n_batch       = 2048
llama_init_from_model: n_ubatch      = 512
llama_init_from_model: flash_attn    = 0
llama_init_from_model: freq_base     = 1000000.0
llama_init_from_model: freq_scale    = 1
llama_init_from_model: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_init: kv_size = 4096, offload = 1, type_k = 'f16', type_v = 'f16', n_layer = 64, can_shift = 1
llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
llama_init_from_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
llama_init_from_model:  CUDA_Host  output buffer size =     0.58 MiB
llama_init_from_model:      CUDA0 compute buffer size =   368.00 MiB
llama_init_from_model:  CUDA_Host compute buffer size =    18.01 MiB
llama_init_from_model: graph nodes  = 2246
llama_init_from_model: graph splits = 2
common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
main: llama threadpool init, n_threads = 6
main: chat template is available, enabling conversation mode (disable it with -no-cnv)
main: chat template example:
<|im_start|>system
You are a helpful assistant<|im_end|>
<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there<|im_end|>
<|im_start|>user
How are you?<|im_end|>
<|im_start|>assistant


system_info: n_threads = 6 (n_threads_batch = 6) / 12 | CUDA : ARCHS = 500,610,700,750,800 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | LLAMAFILE = 1 | OPENMP = 1 | AARCH64_REPACK = 1 |

main: interactive mode on.
sampler seed: 1219659126
sampler params:
        repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
        dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
        top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
        mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
sampler chain: logits -> logit-bias -> penalties -> dry -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist
generate: n_ctx = 4096, n_batch = 2048, n_predict = -1, n_keep = 0

== Running in interactive mode. ==
 - Press Ctrl+C to interject at any time.
 - Press Return to return control to the AI.
 - To return control without starting a new line, end your input with '/'.
 - If you want to submit another line, end your input with '\'.
 - Not using system message. To change it, set a different value via -sys PROMPT


>
```

In [None]:
%%bash

# Local AWQ checkpoint directory shared by the three launch variants below.
awq_model_dir="/mnt/f/ai/QwQ-32B-AWQ"

# Variant 1: cap GPU memory utilization at 0.8 and listen on port 8000.
vllm serve "$awq_model_dir" --load-format "safetensors" \
    --gpu-memory-utilization 0.8 \
    --port 8000

# Variant 2: offload 4 GB to CPU memory.
vllm serve "$awq_model_dir" --load-format "safetensors" \
    --cpu-offload-gb 4

# Variant 3: CPU offload plus 1024 limits on context length, batched tokens and sequences.
vllm serve "$awq_model_dir" --load-format "safetensors" \
    --cpu-offload-gb 4 \
    --max-model-len 1024 \
    --max-num-batched-tokens 1024 \
    --max-num-seqs 1024

```shell
(vllm) sam@LIYUAN-PC:/mnt/f/ai$ vllm serve "/mnt/f/ai/QwQ-32B-AWQ" --load-format "safetensors" --port 8000
INFO 03-19 16:33:04 __init__.py:207] Automatically detected platform cuda.
INFO 03-19 16:33:04 api_server.py:912] vLLM API server version 0.7.3
INFO 03-19 16:33:04 api_server.py:913] args: Namespace(subparser='serve', model_tag='/mnt/f/ai/QwQ-32B-AWQ', config='', host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_request_id_headers=False, enable_auto_tool_choice=False, enable_reasoning=False, reasoning_parser=None, tool_call_parser=None, tool_parser_plugin='', model='/mnt/f/ai/QwQ-32B-AWQ', task='auto', tokenizer=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=False, allowed_local_media_path=None, download_dir=None, load_format='safetensors', config_format=<ConfigFormat.AUTO: 'auto'>, dtype='auto', kv_cache_dtype='auto', max_model_len=None, guided_decoding_backend='xgrammar', logits_processor_pattern=None, model_impl='auto', distributed_executor_backend=None, pipeline_parallel_size=1, tensor_parallel_size=1, max_parallel_loading_workers=None, ray_workers_use_nsight=False, block_size=None, enable_prefix_caching=None, disable_sliding_window=False, use_v2_block_manager=True, num_lookahead_slots=0, seed=0, swap_space=4, cpu_offload_gb=0, gpu_memory_utilization=0.9, num_gpu_blocks_override=None, max_num_batched_tokens=None, max_num_partial_prefills=1, max_long_partial_prefills=1, long_prefill_token_threshold=0, max_num_seqs=None, max_logprobs=20, disable_log_stats=False, quantization=None, rope_scaling=None, rope_theta=None, hf_overrides=None, enforce_eager=False, max_seq_len_to_capture=8192, disable_custom_all_reduce=False, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config=None, 
limit_mm_per_prompt=None, mm_processor_kwargs=None, disable_mm_preprocessor_cache=False, enable_lora=False, enable_lora_bias=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', long_lora_scaling_factors=None, max_cpu_loras=None, fully_sharded_loras=False, enable_prompt_adapter=False, max_prompt_adapters=1, max_prompt_adapter_token=0, device='auto', num_scheduler_steps=1, multi_step_stream_outputs=True, scheduler_delay_factor=0.0, enable_chunked_prefill=None, speculative_model=None, speculative_model_quantization=None, num_speculative_tokens=None, speculative_disable_mqa_scorer=False, speculative_draft_tensor_parallel_size=None, speculative_max_model_len=None, speculative_disable_by_batch_size=None, ngram_prompt_lookup_max=None, ngram_prompt_lookup_min=None, spec_decoding_acceptance_method='rejection_sampler', typical_acceptance_sampler_posterior_threshold=None, typical_acceptance_sampler_posterior_alpha=None, disable_logprobs_during_spec_decoding=None, model_loader_extra_config=None, ignore_patterns=[], preemption_mode=None, served_model_name=None, qlora_adapter_name_or_path=None, otlp_traces_endpoint=None, collect_detailed_traces=None, disable_async_output_proc=False, scheduling_policy='fcfs', scheduler_cls='vllm.core.scheduler.Scheduler', override_neuron_config=None, override_pooler_config=None, compilation_config=None, kv_transfer_config=None, worker_cls='auto', generation_config=None, override_generation_config=None, enable_sleep_mode=False, calculate_kv_scales=False, additional_config=None, disable_log_requests=False, max_log_len=None, disable_fastapi_docs=False, enable_prompt_tokens_details=False, dispatch_function=<function ServeSubcommand.cmd at 0x7fe60f266e60>)
INFO 03-19 16:33:04 api_server.py:209] Started engine process with PID 1409
INFO 03-19 16:33:08 __init__.py:207] Automatically detected platform cuda.
INFO 03-19 16:33:09 config.py:549] This model supports multiple tasks: {'score', 'classify', 'reward', 'embed', 'generate'}. Defaulting to 'generate'.
WARNING 03-19 16:33:10 config.py:628] awq quantization is not fully optimized yet. The speed can be slower than non-quantized models.
WARNING 03-19 16:33:10 arg_utils.py:1187] Chunked prefill is enabled by default for models with max_model_len > 32K. Currently, chunked prefill might not work with some features or models. If you encounter any issues, please disable chunked prefill by setting --enable-chunked-prefill=False.
INFO 03-19 16:33:10 config.py:1555] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 03-19 16:33:13 config.py:549] This model supports multiple tasks: {'generate', 'embed', 'classify', 'score', 'reward'}. Defaulting to 'generate'.
WARNING 03-19 16:33:13 config.py:628] awq quantization is not fully optimized yet. The speed can be slower than non-quantized models.
WARNING 03-19 16:33:13 arg_utils.py:1187] Chunked prefill is enabled by default for models with max_model_len > 32K. Currently, chunked prefill might not work with some features or models. If you encounter any issues, please disable chunked prefill by setting --enable-chunked-prefill=False.
INFO 03-19 16:33:13 config.py:1555] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 03-19 16:33:13 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/mnt/f/ai/QwQ-32B-AWQ', speculative_config=None, tokenizer='/mnt/f/ai/QwQ-32B-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=40960, download_dir=None, load_format=safetensors, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/mnt/f/ai/QwQ-32B-AWQ, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=True, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=True,
WARNING 03-19 16:33:14 interface.py:304] Using 'pin_memory=False' as WSL is detected. This may slow down the performance.
INFO 03-19 16:33:14 cuda.py:178] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 03-19 16:33:14 cuda.py:226] Using XFormers backend.
INFO 03-19 16:33:15 model_runner.py:1110] Starting to load model /mnt/f/ai/QwQ-32B-AWQ...
Loading safetensors checkpoint shards:   0% Completed | 0/5 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  20% Completed | 1/5 [03:07<12:30, 187.72s/it]
Loading safetensors checkpoint shards:  40% Completed | 2/5 [06:25<09:41, 193.89s/it]
Loading safetensors checkpoint shards:  60% Completed | 3/5 [09:36<06:24, 192.46s/it]
Loading safetensors checkpoint shards:  80% Completed | 4/5 [12:48<03:12, 192.12s/it]
Loading safetensors checkpoint shards: 100% Completed | 5/5 [15:36<00:00, 183.53s/it]
Loading safetensors checkpoint shards: 100% Completed | 5/5 [15:36<00:00, 187.32s/it]

INFO 03-19 16:48:52 model_runner.py:1115] Loading model weights took 18.1467 GB
INFO 03-19 16:48:56 worker.py:267] Memory profiling takes 3.78 seconds
INFO 03-19 16:48:56 worker.py:267] the current vLLM instance can use total_gpu_memory (22.00GiB) x gpu_memory_utilization (0.90) = 19.80GiB
INFO 03-19 16:48:56 worker.py:267] model weights take 18.15GiB; non_torch_memory takes 0.34GiB; PyTorch activation peak memory takes 1.41GiB; the rest of the memory reserved for KV Cache is -0.10GiB.
INFO 03-19 16:48:57 executor_base.py:111] # cuda blocks: 0, # CPU blocks: 1024
INFO 03-19 16:48:57 executor_base.py:116] Maximum concurrency for 40960 tokens per request: 0.00x
ERROR 03-19 16:48:57 engine.py:400] No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine.
ERROR 03-19 16:48:57 engine.py:400] Traceback (most recent call last):
ERROR 03-19 16:48:57 engine.py:400]   File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 391, in run_mp_engine
ERROR 03-19 16:48:57 engine.py:400]     engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
ERROR 03-19 16:48:57 engine.py:400]   File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 124, in from_engine_args
ERROR 03-19 16:48:57 engine.py:400]     return cls(ipc_path=ipc_path,
ERROR 03-19 16:48:57 engine.py:400]   File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 76, in __init__
ERROR 03-19 16:48:57 engine.py:400]     self.engine = LLMEngine(*args, **kwargs)
ERROR 03-19 16:48:57 engine.py:400]   File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 276, in __init__
ERROR 03-19 16:48:57 engine.py:400]     self._initialize_kv_caches()
ERROR 03-19 16:48:57 engine.py:400]   File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 434, in _initialize_kv_caches
ERROR 03-19 16:48:57 engine.py:400]     self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
ERROR 03-19 16:48:57 engine.py:400]   File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 122, in initialize_cache
ERROR 03-19 16:48:57 engine.py:400]     self.collective_rpc("initialize_cache",
ERROR 03-19 16:48:57 engine.py:400]   File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
ERROR 03-19 16:48:57 engine.py:400]     answer = run_method(self.driver_worker, method, args, kwargs)
ERROR 03-19 16:48:57 engine.py:400]   File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/utils.py", line 2196, in run_method
ERROR 03-19 16:48:57 engine.py:400]     return func(*args, **kwargs)
ERROR 03-19 16:48:57 engine.py:400]   File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/worker/worker.py", line 291, in initialize_cache
ERROR 03-19 16:48:57 engine.py:400]     raise_if_cache_size_invalid(num_gpu_blocks,
ERROR 03-19 16:48:57 engine.py:400]   File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/worker/worker.py", line 539, in raise_if_cache_size_invalid
ERROR 03-19 16:48:57 engine.py:400]     raise ValueError("No available memory for the cache blocks. "
ERROR 03-19 16:48:57 engine.py:400] ValueError: No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine.
Process SpawnProcess-1:
Traceback (most recent call last):
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 402, in run_mp_engine
    raise e
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 391, in run_mp_engine
    engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 124, in from_engine_args
    return cls(ipc_path=ipc_path,
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/engine/multiprocessing/engine.py", line 76, in __init__
    self.engine = LLMEngine(*args, **kwargs)
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 276, in __init__
    self._initialize_kv_caches()
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 434, in _initialize_kv_caches
    self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 122, in initialize_cache
    self.collective_rpc("initialize_cache",
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
    answer = run_method(self.driver_worker, method, args, kwargs)
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/utils.py", line 2196, in run_method
    return func(*args, **kwargs)
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/worker/worker.py", line 291, in initialize_cache
    raise_if_cache_size_invalid(num_gpu_blocks,
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/worker/worker.py", line 539, in raise_if_cache_size_invalid
    raise ValueError("No available memory for the cache blocks. "
ValueError: No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine.
[rank0]:[W319 16:48:57.220161321 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present,  but this warning has only been added since PyTorch 2.4 (function operator())
Traceback (most recent call last):
  File "/home/sam/miniconda3/envs/vllm/bin/vllm", line 8, in <module>
    sys.exit(main())
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/entrypoints/cli/main.py", line 73, in main
    args.dispatch_function(args)
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/entrypoints/cli/serve.py", line 34, in cmd
    uvloop.run(run_server(args))
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/uvloop/__init__.py", line 82, in run
    return loop.run_until_complete(wrapper())
  File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/uvloop/__init__.py", line 61, in wrapper
    return await main
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 947, in run_server
    async with build_async_engine_client(args) as engine_client:
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/contextlib.py", line 199, in __aenter__
    return await anext(self.gen)
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 139, in build_async_engine_client
    async with build_async_engine_client_from_engine_args(
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/contextlib.py", line 199, in __aenter__
    return await anext(self.gen)
  File "/home/sam/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 233, in build_async_engine_client_from_engine_args
    raise RuntimeError(
RuntimeError: Engine process failed to start. See stack trace for the root cause.
```

```shell
# NOTE(review): "model" must match the name the running server registered
# (the startup log above shows served_model_name=/mnt/f/ai/QwQ-32B-AWQ) - confirm before use.

# Text completion: /v1/completions takes a "prompt".
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "Qwen/Qwen2.5-1.5B-Instruct",
        "prompt": "San Francisco is a",
        "max_tokens": 7,
        "temperature": 0
    }'

# Chat completion: the chat-style "messages" list belongs to /v1/chat/completions,
# not to /v1/completions where the original request had placed it next to "prompt".
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "Qwen/Qwen2.5-1.5B-Instruct",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Who won the world series in 2020?"}
        ],
        "max_tokens": 7,
        "temperature": 0
    }'

curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "Qwen/Qwen2.5-1.5B-Instruct",
        "prompt": "介绍一下你自己"
    }'
```