# Phase 6.2: Deploy with vLLM

Deploy the quantized Korean MedGemma model using vLLM for high-performance inference.

## Contents
0. Setup and vLLM Configuration
1. Create Deployment Script
2. Start vLLM Server
3. Test API
4. Python Client Example
5. Docker Deployment (Optional)
6. Summary

In [None]:
# Setup: imports shared by the cells below.
import sys
import os
# Add the parent directory to the module search path.
sys.path.append("..")

import json
import requests  # HTTP client used to call the vLLM chat-completions endpoint

print("vLLM Deployment Setup")

In [None]:
# Model selection
# Default checkpoint: the AWQ-quantized model.
MODEL_DIR = "../models/korean_medgemma_awq"

# Other checkpoints that can be served instead (uncomment exactly one):
#   - full-precision instruction-tuned model
# MODEL_DIR = "../models/instruction_tuned"
#   - expanded model, served directly
# MODEL_DIR = "../models/final/korean_medgemma_expanded"

# Server settings; the cells below interpolate these into the vLLM command line.
VLLM_CONFIG = dict(
    host="0.0.0.0",
    port=8000,
    max_model_len=4096,
    gpu_memory_utilization=0.9,
    quantization="awq",  # use None when serving a full-precision model
    dtype="half",
)

# Echo the effective configuration, one indented "name: value" line per setting.
print(f"Model: {MODEL_DIR}")
print("\nvLLM Configuration:")
print("\n".join(f"  {name}: {setting}" for name, setting in VLLM_CONFIG.items()))

---
## 1. Create Deployment Script

In [None]:
# Create vLLM deployment script
import shlex  # cell-local import: shell-quotes the model path for bash

# Absolute path so the script works regardless of the directory it is run from.
model_path = os.path.abspath(MODEL_DIR)

# One CLI argument per entry. Shell variables are double-quoted so a model
# path containing spaces is passed to vLLM as a single argument.
server_args = [
    '--model "$MODEL_PATH"',
    '--host "$HOST"',
    '--port "$PORT"',
    f"--max-model-len {VLLM_CONFIG['max_model_len']}",
    f"--gpu-memory-utilization {VLLM_CONFIG['gpu_memory_utilization']}",
    f"--dtype {VLLM_CONFIG['dtype']}",
]
# Only emit --quantization when a method is configured; this avoids leaving a
# blank continuation line in the script for full-precision models.
if VLLM_CONFIG.get("quantization"):
    server_args.append(f"--quantization {VLLM_CONFIG['quantization']}")
server_args.append("--trust-remote-code")

# Join with backslash-newline so the script stays readable, one flag per line.
server_command = " \\\n    ".join(
    ["python -m vllm.entrypoints.openai.api_server", *server_args]
)

deploy_script = f'''#!/bin/bash
# Korean MedGemma vLLM Deployment Script

MODEL_PATH={shlex.quote(model_path)}
HOST="{VLLM_CONFIG['host']}"
PORT="{VLLM_CONFIG['port']}"

echo "Starting Korean MedGemma vLLM Server..."
echo "Model: $MODEL_PATH"
echo "Server: http://$HOST:$PORT"

{server_command}
'''

script_path = "../scripts/deploy_vllm.sh"
os.makedirs(os.path.dirname(script_path), exist_ok=True)

# Force LF line endings and UTF-8: CRLF would break the bash shebang, and the
# platform default encoding may not be able to represent the model path.
with open(script_path, "w", encoding="utf-8", newline="\n") as f:
    f.write(deploy_script)

# Make executable
os.chmod(script_path, 0o755)

print(f"Deployment script created: {script_path}")
print("\nScript contents:")
print(deploy_script)

---
## 2. Start vLLM Server (Run in Terminal)

To start the server, run the following command in a terminal:

```bash
bash ../scripts/deploy_vllm.sh
```

Or directly with vLLM:

In [None]:
# Print the vLLM command
import shlex  # cell-local import: shell-quotes the model path

# One flag per entry; the model path is shell-quoted so the printed command
# can be pasted into a terminal even if the path contains spaces.
command_parts = [
    "python -m vllm.entrypoints.openai.api_server",
    f"--model {shlex.quote(os.path.abspath(MODEL_DIR))}",
    f"--host {VLLM_CONFIG['host']}",
    f"--port {VLLM_CONFIG['port']}",
    f"--max-model-len {VLLM_CONFIG['max_model_len']}",
    f"--gpu-memory-utilization {VLLM_CONFIG['gpu_memory_utilization']}",
    f"--dtype {VLLM_CONFIG['dtype']}",
]
# Skip --quantization entirely for full-precision models instead of leaving
# an empty continuation line in the printed command.
if VLLM_CONFIG.get("quantization"):
    command_parts.append(f"--quantization {VLLM_CONFIG['quantization']}")
command_parts.append("--trust-remote-code")

# Leading/trailing newline keeps the command visually separated when printed.
vllm_command = "\n" + " \\\n    ".join(command_parts) + "\n"

print("Run this command in a terminal to start the server:")
print(vllm_command)

---
## 3. Test API (After Server is Running)

In [None]:
# API endpoint
API_URL = f"http://localhost:{VLLM_CONFIG['port']}/v1/chat/completions"


def chat_with_model(messages, max_tokens=256, temperature=0.7):
    """Send a chat-completion request to the vLLM server.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.

    Returns:
        The parsed JSON response on success, otherwise a dict with a single
        ``"error"`` key describing the failure. This function never raises.
    """
    payload = {
        # The deployment script starts the server with
        # --model <absolute path of MODEL_DIR>, so the request must name the
        # model with that same absolute path rather than the relative one.
        "model": os.path.abspath(MODEL_DIR),
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }

    try:
        response = requests.post(API_URL, json=payload, timeout=60)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.ConnectionError:
        return {"error": "Server not running. Start the vLLM server first."}
    except requests.exceptions.HTTPError as e:
        # Keep the server's response body: it carries the actual reason
        # (e.g. unknown model name, context length exceeded).
        detail = e.response.text if e.response is not None else ""
        return {"error": f"{e}: {detail}" if detail else str(e)}
    except Exception as e:
        # Deliberately broad: callers rely on always getting a dict back.
        return {"error": str(e)}

print(f"API endpoint: {API_URL}")

In [None]:
# Smoke test: send one Korean medical question to the running server.
print("Testing Korean medical question...")

messages = [
    dict(role="system", content="당신은 한국어 의료 전문 AI 어시스턴트입니다."),
    dict(role="user", content="고혈압의 주요 증상과 치료법에 대해 설명해주세요."),
]
response = chat_with_model(messages)

if "error" not in response:
    # Success: show the text of the first returned choice.
    print("\nResponse:")
    print(response["choices"][0]["message"]["content"])
else:
    # Failure: chat_with_model reports problems through the "error" key.
    print(f"Error: {response['error']}")
    print("\nMake sure the vLLM server is running.")

In [None]:
# Smoke test: send one English medical question to the running server.
print("Testing English medical question...")

messages = [
    dict(role="system", content="You are a medical AI assistant."),
    dict(
        role="user",
        content="What are the symptoms and treatment options for Type 2 Diabetes?",
    ),
]
response = chat_with_model(messages)

if "error" not in response:
    # Success: show the text of the first returned choice.
    print("\nResponse:")
    print(response["choices"][0]["message"]["content"])
else:
    # Failure: chat_with_model reports problems through the "error" key.
    print(f"Error: {response['error']}")

---
## 4. Python Client Example

In [None]:
# Create Python client example
# The deployment script starts vLLM with --model <absolute path of MODEL_DIR>,
# so the generated client must send that same absolute path as the model name.
served_model_name = os.path.abspath(MODEL_DIR)

# Template notes:
#   - The shebang must be the very first line of the generated file, so the
#     template starts immediately after the opening quotes.
#   - "\\n" is written with a doubled backslash so the generated file contains
#     the two-character escape \n; a single backslash would be expanded here
#     and split the generated string literal across lines (a SyntaxError).
#   - {served_model_name!r} emits a valid Python string literal even if the
#     path contains quotes or backslashes.
client_code = f'''#!/usr/bin/env python3
"""
Korean MedGemma Python Client Example

Usage:
    python korean_medgemma_client.py "고혈압의 증상은 무엇인가요?"
"""

import requests
import sys

API_URL = "http://localhost:{VLLM_CONFIG['port']}/v1/chat/completions"
MODEL_NAME = {served_model_name!r}
REQUEST_TIMEOUT = 120  # seconds


def ask_medical_question(question, language="ko"):
    """
    Ask a medical question to Korean MedGemma.

    Args:
        question: The medical question
        language: "ko" for Korean, "en" for English

    Returns:
        The model's response
    """

    if language == "ko":
        system_prompt = "당신은 한국어 의료 전문 AI 어시스턴트입니다. 정확하고 도움이 되는 의료 정보를 제공하세요."
    else:
        system_prompt = "You are a medical AI assistant. Provide accurate and helpful medical information."

    payload = {{
        "model": MODEL_NAME,
        "messages": [
            {{"role": "system", "content": system_prompt}},
            {{"role": "user", "content": question}}
        ],
        "max_tokens": 512,
        "temperature": 0.7,
    }}

    response = requests.post(API_URL, json=payload, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    return response.json()["choices"][0]["message"]["content"]


if __name__ == "__main__":
    if len(sys.argv) > 1:
        question = " ".join(sys.argv[1:])
    else:
        question = "고혈압의 증상과 치료법은 무엇인가요?"

    print(f"Question: {{question}}")
    print("\\nAnswer:")

    try:
        answer = ask_medical_question(question)
        print(answer)
    except Exception as e:
        print(f"Error: {{e}}")
        print("Make sure the vLLM server is running.")
'''

client_path = "../scripts/korean_medgemma_client.py"
# Do not rely on an earlier cell having created the scripts directory.
os.makedirs(os.path.dirname(client_path), exist_ok=True)

# The client contains Korean text; write it as UTF-8 explicitly because the
# platform default encoding may not be able to encode it.
with open(client_path, "w", encoding="utf-8") as f:
    f.write(client_code)

# Executable bit so the shebang line is usable.
os.chmod(client_path, 0o755)

print(f"Python client created: {client_path}")

---
## 5. Docker Deployment (Optional)

In [None]:
# Create Dockerfile for deployment
# Build the server command from VLLM_CONFIG so the container runs with the
# same model settings as the shell deployment script. Host and port stay
# fixed inside the container: it must listen on all interfaces, and the host
# port is chosen by the port mapping used when the container is run.
server_cmd = [
    "python3", "-m", "vllm.entrypoints.openai.api_server",
    "--model", "/app/model",
    "--host", "0.0.0.0",
    "--port", "8000",
    "--max-model-len", str(VLLM_CONFIG["max_model_len"]),
    "--gpu-memory-utilization", str(VLLM_CONFIG["gpu_memory_utilization"]),
    "--dtype", str(VLLM_CONFIG["dtype"]),
]
# Only request quantization when one is configured (None = full precision).
if VLLM_CONFIG.get("quantization"):
    server_cmd += ["--quantization", str(VLLM_CONFIG["quantization"])]
server_cmd.append("--trust-remote-code")

# json.dumps renders the list as a JSON array of double-quoted strings, which
# is the syntax of Docker's exec-form CMD.
dockerfile = f'''
# Korean MedGemma Docker Deployment
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04

# Install Python and dependencies
RUN apt-get update && apt-get install -y \\
    python3 python3-pip git && \\
    rm -rf /var/lib/apt/lists/*

# Install vLLM
RUN pip3 install vllm autoawq

# Copy model (or mount as volume)
# COPY models/korean_medgemma_awq /app/model

WORKDIR /app

# Expose port
EXPOSE 8000

# Run vLLM server
CMD {json.dumps(server_cmd)}
'''

dockerfile_path = "../Dockerfile"
with open(dockerfile_path, "w", encoding="utf-8") as f:
    f.write(dockerfile)

print(f"Dockerfile created: {dockerfile_path}")

In [None]:
# Create docker-compose file
# The YAML is assembled line by line; the empty first and last entries
# reproduce the leading and trailing newline of the document.
compose_lines = [
    "",
    'version: "3.8"',
    "",
    "services:",
    "  korean-medgemma:",
    "    build: .",
    "    ports:",
    '      - "8000:8000"',
    "    volumes:",
    "      - ./models/korean_medgemma_awq:/app/model:ro",
    "    deploy:",
    "      resources:",
    "        reservations:",
    "          devices:",
    "            - driver: nvidia",
    "              count: 1",
    "              capabilities: [gpu]",
    "    environment:",
    "      - CUDA_VISIBLE_DEVICES=0",
    "    restart: unless-stopped",
    "",
]
docker_compose = "\n".join(compose_lines)

# Written next to the Dockerfile, one directory above the notebook.
compose_path = "../docker-compose.yml"
with open(compose_path, "w") as compose_file:
    compose_file.write(docker_compose)

print(f"Docker Compose created: {compose_path}")

---
## 6. Summary

In [None]:
# Final summary of generated artifacts and how to use them.
# The endpoint and model name are derived from the configuration instead of
# being hard-coded: the port follows VLLM_CONFIG, and the model name is the
# absolute path because the deployment script starts the server with
# --model <absolute path of MODEL_DIR>.
summary_api_url = f"http://localhost:{VLLM_CONFIG['port']}/v1/chat/completions"
# json.dumps yields a correctly quoted/escaped JSON string for the curl body.
summary_model_json = json.dumps(os.path.abspath(MODEL_DIR))

print("\n" + "=" * 60)
print("Korean MedGemma Deployment Complete!")
print("=" * 60)

print(f"""
Deployment files created:
  - Deployment script: {script_path}
  - Python client: {client_path}
  - Dockerfile: {dockerfile_path}
  - Docker Compose: {compose_path}

To start the server:
  bash {script_path}

Or with Docker:
  docker-compose up

API endpoint: {summary_api_url}

Example curl command:
  curl -X POST {summary_api_url} \\
    -H "Content-Type: application/json" \\
    -d '{{
      "model": {summary_model_json},
      "messages": [
        {{"role": "system", "content": "당신은 의료 AI 어시스턴트입니다."}},
        {{"role": "user", "content": "고혈압이란 무엇인가요?"}}
      ]
    }}'

Note: when running with Docker, the server listens on port 8000 and the
model name to send is "/app/model".

Project complete! You now have a Korean-adapted MedGemma model
ready for production deployment.
""")