## Run

In [None]:
# Run inference.py for the python and java language sets against localhost,
# passing ports 8001-8005, and write results to outdir/results.jsonl.
# NOTE(review): presumably each port is a separate model server instance — confirm against inference.py.
!python inference.py \
  --output outdir/results.jsonl \
  --langs python,java \
  --cache_dir humaneval_cache \
  --host localhost \
  --model /models/qwen2 \
  --max_tokens 512 \
  --temperature 0.2 \
  --timeout 2000 \
  --ports 8001,8002,8003,8004,8005

In [None]:
# Run inference-multi.py for the python and java language sets.
# The leading `!` hands the command to the shell and the trailing backslashes
# join the flags into one invocation; without them the cell is parsed as
# Python and fails with a syntax error.
!python inference-multi.py \
  --output outdir/results.jsonl \
  --langs python,java \
  --cache_dir humaneval_cache \
  --cpu-cores 60 \
  --instances 20 \
  --base-port 8010 \
  --concurrency 1 \
  --model /models/qwen2 \
  --pass-k 2 \
  --repeat 2 \
  --timeout 4000 \
  --resume True

## Evaluation

In [None]:
# Run the rishubi/codegeex image with ./CodeGeeX and ./outdir bind-mounted into
# the container, then call scripts/evaluate_humaneval_x.sh on
# results/results_python.jsonl with the arguments "python" and "4".
# NOTE(review): the backtick line continuations look like PowerShell syntax, so
# this is presumably meant to be pasted into a PowerShell terminal rather than
# executed as a notebook cell — confirm.
docker run --rm -it --gpus all `
  --mount type=bind,source="$(pwd)/CodeGeeX",target="/workspace/CodeGeeX" `
  --mount type=bind,source="$(pwd)/outdir",target="/workspace/CodeGeeX/results" `
  rishubi/codegeex:latest `
  bash -c "cd /workspace/CodeGeeX && `
    bash scripts/evaluate_humaneval_x.sh results/results_python.jsonl python 4"

In [None]:
from huggingface_hub import snapshot_download

# Download the whole model repository into the local directory
# `./Qwen2.5-Coder-0.5B-Instruct/`.
download_options = dict(
    repo_id="Qwen/Qwen2.5-Coder-0.5B-Instruct",
    # State explicitly that this is a model repository.
    repo_type="model",
    # Optional: custom cache directory.
    cache_dir="./models_cache",
    # Optional: may be omitted to rely on the Hugging Face cache instead.
    local_dir="./Qwen2.5-Coder-0.5B-Instruct",
    # Resume an interrupted download instead of starting over.
    resume_download=True,
)
local_dir = snapshot_download(**download_options)

print("Downloaded to:", local_dir)

In [None]:
#!/usr/bin/env python3
# filename: benchmark_qwen2_auto_cleanup.py

import time
import docker
import requests
import concurrent.futures
from docker.errors import NotFound, APIError

# —— Configuration ——
# NOTE(review): the output recorded below this cell shows 8 containers with
# 2 threads each, which does not match INSTANCE_COUNT = 4 here — presumably the
# value was edited after the run; confirm before comparing numbers.
IMAGE                = "konstantinvernermaif/vllm-cpu:latest"
HOST_MODEL_PATH      = r"D:\subway\project\qwen-serve\Qwen2.5-Coder-0.5B-Instruct"
CONTAINER_MODEL_PATH = "/models/qwen2"

TOTAL_CPU_CORES      = 16     # total number of CPU cores on the host
INSTANCE_COUNT       = 4      # number of Docker instances
BASE_PORT            = 8010   # host port mapped to the first instance; each next instance uses +1

TOTAL_REQUESTS       = 10     # total number of concurrent requests
PROMPT               = "San Francisco is a"


def start_containers():
    """Start INSTANCE_COUNT containers from IMAGE and wait for them to boot.

    Each container is named ``vllm_cpu_<host port>``, gets the model directory
    mounted read-only at CONTAINER_MODEL_PATH, and has its port 8000 published
    on ``BASE_PORT + i``. An existing container with the same name is
    force-removed first. After all containers are started the function sleeps
    for a fixed 120 seconds to give the servers time to come up.

    Raises:
        SystemExit: with code 1 if Docker reports an APIError while starting
            a container.
    """
    # Split the host cores evenly; never go below 1 because a thread count of
    # 0 is not a valid value for OMP_NUM_THREADS / MKL_NUM_THREADS.
    threads_per_instance = max(1, TOTAL_CPU_CORES // INSTANCE_COUNT)
    env_threads = str(threads_per_instance)
    client = docker.from_env()

    print(f"→ 每个实例分配 {threads_per_instance} 线程")

    for i in range(INSTANCE_COUNT):
        port = BASE_PORT + i
        name = f"vllm_cpu_{port}"

        # Remove a stale container left over from a previous run, if any.
        try:
            old = client.containers.get(name)
            print(f"[!] 移除已存在容器 {name}")
            old.remove(force=True)
        except NotFound:
            pass

        print(f"[*] 启动容器 {name}，映射端口 {port} → 8000 …")
        try:
            client.containers.run(
                image=IMAGE,
                name=name,
                detach=True,
                environment={
                    "OMP_NUM_THREADS": env_threads,
                    "MKL_NUM_THREADS": env_threads
                },
                restart_policy={"Name": "unless-stopped"},
                ipc_mode="host",
                volumes={HOST_MODEL_PATH: {"bind": CONTAINER_MODEL_PATH, "mode": "ro"}},
                ports={"8000/tcp": port},
                command=[
                    "--model", CONTAINER_MODEL_PATH,
                    "--host", "0.0.0.0",
                    "--port", "8000"
                ]
            )
            print(f"[✓] {name} 启动成功！")
        except APIError as e:
            print(f"[✗] 启动 {name} 失败：{e.explanation}")
            # Raise SystemExit directly: the `exit` helper is injected by
            # `site` (and replaced by IPython inside a notebook kernel), so it
            # is not guaranteed to abort this function.
            raise SystemExit(1) from e

    print("[*] 等待 120 秒，确保所有服务启动完毕…")
    time.sleep(120)


def call_instance(port: int, prompt: str, timeout: float = 600.0):
    """Send one completion request to the instance on ``localhost:port``.

    Args:
        port: Host port of the target instance.
        prompt: Prompt text placed in the request payload.
        timeout: Value passed to ``requests.post`` as ``timeout`` (seconds).
            Without a timeout a hung server would block the calling worker
            thread forever; pass ``None`` to restore the unbounded wait.

    Returns:
        The ``text`` field of the first entry in the response's ``choices``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    url = f"http://localhost:{port}/v1/completions"
    payload = {
        "model": CONTAINER_MODEL_PATH,
        "prompt": prompt,
        "max_tokens": 100,
        "temperature": 0.0
    }
    resp = requests.post(
        url,
        json=payload,
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )
    resp.raise_for_status()
    return resp.json()["choices"][0]["text"]


def benchmark_all():
    """Submit TOTAL_REQUESTS completion requests at once and print the timing.

    The requests are spread across the INSTANCE_COUNT ports starting at
    BASE_PORT, all using PROMPT. Failed requests are reported on stdout and do
    not stop the run. Prints the request distribution, the total wall time and
    the total time divided by TOTAL_REQUESTS.
    """
    # Distribute the requests as evenly as possible: the first `leftover`
    # instances each take one request more than the rest.
    share, leftover = divmod(TOTAL_REQUESTS, INSTANCE_COUNT)
    concurrencies = [
        share + 1 if slot < leftover else share
        for slot in range(INSTANCE_COUNT)
    ]
    print(f"→ 并发请求分配: {concurrencies}")

    # One (port, prompt) pair per request, grouped by instance.
    tasks = [
        (BASE_PORT + slot, PROMPT)
        for slot, count in enumerate(concurrencies)
        for _ in range(count)
    ]

    # Submit everything in one go and wait for all requests to finish.
    total_start = time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=TOTAL_REQUESTS) as pool:
        pending = [pool.submit(call_instance, *task) for task in tasks]
        for done in concurrent.futures.as_completed(pending):
            try:
                done.result()
            except Exception as e:
                print("[✗] 请求出错：", e)
    total_elapsed = time.perf_counter() - total_start

    average = total_elapsed / TOTAL_REQUESTS
    print(f"\n🚀 并发 {TOTAL_REQUESTS} 次，总耗时 {total_elapsed:.2f}s，"
          f"平均 {average:.2f}s/次")


def stop_containers():
    """Force-remove every ``vllm_cpu_<port>`` container of this benchmark.

    Covers the INSTANCE_COUNT ports starting at BASE_PORT. A missing container
    is reported and skipped. A Docker APIError on one container is reported
    and the loop moves on, so one failed removal neither leaves the remaining
    containers running nor replaces the exception that triggered the cleanup.
    """
    client = docker.from_env()
    for i in range(INSTANCE_COUNT):
        name = f"vllm_cpu_{BASE_PORT + i}"
        try:
            cont = client.containers.get(name)
            cont.remove(force=True)
            print(f"[✓] 已移除容器 {name}")
        except NotFound:
            print(f"[!] 容器 {name} 不存在或已被移除")
        except APIError as e:
            # Best-effort cleanup: keep going with the other containers.
            print(f"[✗] 移除容器 {name} 失败：{e.explanation}")


# Entry point: start the containers, run the benchmark, then clean up.
if __name__ == "__main__":
    try:
        start_containers()
        benchmark_all()
    finally:
        # Runs on success, on an exception and on SystemExit, so the
        # containers are removed regardless of how the run ends.
        print("\n[*] 清理：停止并移除所有容器…")
        stop_containers()


→ 每个实例分配 2 线程
[!] 移除已存在容器 vllm_cpu_8010
[*] 启动容器 vllm_cpu_8010，映射端口 8010 → 8000 …
[✓] vllm_cpu_8010 启动成功！
[!] 移除已存在容器 vllm_cpu_8011
[*] 启动容器 vllm_cpu_8011，映射端口 8011 → 8000 …
[✓] vllm_cpu_8011 启动成功！
[!] 移除已存在容器 vllm_cpu_8012
[*] 启动容器 vllm_cpu_8012，映射端口 8012 → 8000 …
[✓] vllm_cpu_8012 启动成功！
[!] 移除已存在容器 vllm_cpu_8013
[*] 启动容器 vllm_cpu_8013，映射端口 8013 → 8000 …
[✓] vllm_cpu_8013 启动成功！
[*] 启动容器 vllm_cpu_8014，映射端口 8014 → 8000 …
[✓] vllm_cpu_8014 启动成功！
[*] 启动容器 vllm_cpu_8015，映射端口 8015 → 8000 …
[✓] vllm_cpu_8015 启动成功！
[*] 启动容器 vllm_cpu_8016，映射端口 8016 → 8000 …
[✓] vllm_cpu_8016 启动成功！
[*] 启动容器 vllm_cpu_8017，映射端口 8017 → 8000 …
[✓] vllm_cpu_8017 启动成功！
[*] 等待 120 秒，确保所有服务启动完毕…
→ 并发请求分配: [2, 2, 1, 1, 1, 1, 1, 1]

🚀 并发 10 次，总耗时 192.49s，平均 19.25s/次


In [13]:
def extract_indented_body(code: str) -> str:
    """Extract the body portion of a Python or Java code snippet.

    Three strategies are tried in order:

    1. Python: if some line starts with ``def `` at column 0, return every
       line after the first such line.
    2. Java: if some line, once stripped, starts with an access modifier
       (``public``, ``private`` or ``protected`` followed by a space), return
       the lines strictly between the first line containing ``{`` and the
       last line containing ``}``, provided that range is not empty.
    3. Fallback: return only the lines that start with four spaces or a tab.

    Trailing whitespace is stripped from the result in every case.
    """
    rows = code.splitlines()

    # Python: keep everything below the first top-level `def` line.
    def_at = next(
        (pos for pos, row in enumerate(rows) if row.startswith("def ")),
        None,
    )
    if def_at is not None:
        return "\n".join(rows[def_at + 1:]).rstrip()

    # Java: keep what lies between the first '{' line and the last '}' line.
    modifiers = ("public ", "private ", "protected ")
    if any(row.strip().startswith(modifiers) for row in rows):
        open_at = next(
            (pos for pos, row in enumerate(rows) if "{" in row),
            None,
        )
        close_at = next(
            (pos for pos in reversed(range(len(rows))) if "}" in rows[pos]),
            None,
        )
        has_both = open_at is not None and close_at is not None
        if has_both and open_at + 1 < close_at:
            return "\n".join(rows[open_at + 1:close_at]).rstrip()

    # Fallback: keep only the indented lines.
    indented = [row for row in rows if row.startswith(("    ", "\t"))]
    return "\n".join(indented).rstrip()
# ------------------------ Tests for extract_indented_body ------------------------ #
def test_extract_java():
    """Run extract_indented_body on a single indented Java statement and print the result."""
    snippet = "        return number % 1.0;"
    extracted = extract_indented_body(snippet)
    # A leftover ") {" followed by a newline presumably means a method
    # signature is still present, so the extraction is applied once more.
    still_has_signature = ") {\n" in extracted
    if still_has_signature:
        print("Java code detected, extracting method body...")
        extracted = extract_indented_body(extracted)
    print(extracted)
test_extract_java()

        return number % 1.0;
