diff --git a/config/rhaiis/defaults.yaml b/config/rhaiis/defaults.yaml new file mode 100644 index 0000000..6327038 --- /dev/null +++ b/config/rhaiis/defaults.yaml @@ -0,0 +1,44 @@ +# Global Defaults & Accelerator-Specific Settings +# These are merged with model configs at runtime using inheritance: +# defaults → accelerator → model → model.accelerator_overrides → scenario + +# Global defaults applied to all deployments +defaults: + deploy: + namespace: forge + replicas: 1 + cpu_request: "4" + memory_request: "16Gi" + storage_source: hf + storage_path: model-pvc-2 + + vllm_args: + gpu-memory-utilization: 0.9 + trust-remote-code: true + disable-log-requests: true + uvicorn-log-level: debug + tensor-parallel-size: 1 # Also determines num_gpus for deployment + + guidellm: + rate_type: concurrent + max_seconds: 300 + +# Accelerator-specific overrides +# Selected via --accelerator flag or auto-detected from cluster +accelerators: + nvidia: + image: quay.io/aipcc/rhaiis/cuda-ubi9:3.4.0-ea.2-1773886296 + vllm_args: {} + env_vars: {} + + amd: + image: quay.io/aipcc/rhaiis/rocm-ubi9:3.2.5-1766067105 + vllm_args: + num-scheduler-steps: 8 + env_vars: + VLLM_ROCM_USE_AITER: "1" + + # Future accelerators + # gaudi: + # image: ... 
+ # vllm_args: {} diff --git a/config/rhaiis/models.yaml b/config/rhaiis/models.yaml new file mode 100644 index 0000000..29bd2d7 --- /dev/null +++ b/config/rhaiis/models.yaml @@ -0,0 +1,315 @@ +# Model Registry +# Models only specify what's DIFFERENT from defaults.yaml +# Accelerator-specific settings go in accelerator_overrides section +# +# Resolution order: +# defaults.yaml → accelerators[accel] → models[model] → models[model].accelerator_overrides[accel] + +models: + # === Small Test Models === + qwen-0.6b: + name: "Qwen3-0.6B" + hf_model_id: "Qwen/Qwen3-0.6B" + supported_workloads: [balanced, short, long-prompt] + + # === Llama 3.3 Family === + llama-3.3-70b: + name: "Llama-3.3-70B-Instruct" + hf_model_id: "meta-llama/Llama-3.3-70B-Instruct" + vllm_args: + tensor-parallel-size: 4 + supported_workloads: [balanced, short] + + llama-3.3-70b-fp8: + name: "Llama-3.3-70B-Instruct-FP8" + hf_model_id: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" + vllm_args: + tensor-parallel-size: 4 + kv-cache-dtype: fp8 + supported_workloads: [balanced, short, long-prompt] + + llama-3.3-70b-w8a8: + name: "Llama-3.3-70B-Instruct-W8A8" + hf_model_id: "RedHatAI/Llama-3.3-70B-Instruct-quantized.w8a8" + vllm_args: + tensor-parallel-size: 4 + max-model-len: 32768 + supported_workloads: [balanced, short] + + # === Llama 3.1 Family === + llama-3.1-8b: + name: "Llama-3.1-8B-Instruct" + hf_model_id: "meta-llama/Llama-3.1-8B-Instruct" + vllm_args: + max-model-len: 16384 + supported_workloads: [balanced, short, long-prompt] + + llama-3.1-8b-fp8: + name: "Llama-3.1-8B-Instruct-FP8" + hf_model_id: "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic" + vllm_args: + max-model-len: 16384 + supported_workloads: [balanced, short, long-prompt] + + llama-3.1-405b-fp8: + name: "Llama-3.1-405B-Instruct-FP8" + hf_model_id: "RedHatAI/Meta-Llama-3.1-405B-Instruct-FP8-dynamic" + vllm_args: + tensor-parallel-size: 8 + kv-cache-dtype: fp8 + supported_workloads: [balanced] + + # === Llama 4 Family === + 
llama-4-scout-fp8: + name: "Llama-4-Scout-17B-16E-FP8" + hf_model_id: "RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic" + vllm_args: + tensor-parallel-size: 2 + kv-cache-dtype: fp8 + supported_workloads: [balanced, short] + + llama-4-maverick-fp8: + name: "Llama-4-Maverick-17B-128E-FP8" + hf_model_id: "RedHatAI/Llama-4-Maverick-17B-128E-Instruct-FP8" + vllm_args: + tensor-parallel-size: 8 + kv-cache-dtype: fp8 + supported_workloads: [balanced] + + llama-4-maverick-w4a16: + name: "Llama-4-Maverick-17B-128E-W4A16" + hf_model_id: "RedHatAI/Llama-4-Maverick-17B-128E-Instruct-quantized.w4a16" + vllm_args: + tensor-parallel-size: 8 + enable-expert-parallel: true + supported_workloads: [balanced] + + llama-4-scout-w4a16: + name: "Llama-4-Scout-17B-16E-W4A16" + hf_model_id: "RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16" + vllm_args: + tensor-parallel-size: 2 + supported_workloads: [balanced, short] + + # === Qwen Family === + qwen-235b-fp8: + name: "Qwen3-235B-A22B-FP8" + hf_model_id: "RedHatAI/Qwen3-235B-A22B-FP8-dynamic" + aliases: [qwen-235b, qwen3-moe] + vllm_args: + tensor-parallel-size: 4 + max-model-len: 16384 + gpu-memory-utilization: 0.95 + enable-expert-parallel: true + supported_workloads: [balanced, short] + + qwen-235b-instruct: + name: "Qwen3-235B-A22B-Instruct" + hf_model_id: "Qwen/Qwen3-235B-A22B-Instruct-2507" + vllm_args: + tensor-parallel-size: 4 + max-model-len: 16384 + gpu-memory-utilization: 0.95 + # AMD needs AITER disabled for this model + accelerator_overrides: + amd: + env_vars: + VLLM_ROCM_USE_AITER: "0" + supported_workloads: [balanced] + + qwen-30b-a3b: + name: "Qwen3-30B-A3B-Instruct" + hf_model_id: "Qwen/Qwen3-30B-A3B-Instruct-2507" + vllm_args: + max-model-len: 16384 + supported_workloads: [balanced, short] + + qwen-next-80b-a3b: + name: "Qwen3-Next-80B-A3B-Instruct" + hf_model_id: "Qwen/Qwen3-Next-80B-A3B-Instruct" + vllm_args: + tensor-parallel-size: 4 + supported_workloads: [balanced] + + qwen-vl-30b-a3b: + name: 
"Qwen3-VL-30B-A3B-Instruct" + hf_model_id: "Qwen/Qwen3-VL-30B-A3B-Instruct" + vllm_args: + tensor-parallel-size: 4 + supported_workloads: [balanced] + + qwen-25-7b: + name: "Qwen2.5-7B-Instruct" + hf_model_id: "Qwen/Qwen2.5-7B-Instruct" + vllm_args: + max-model-len: 16384 + supported_workloads: [balanced, short, long-prompt] + + # === DeepSeek === + deepseek-r1: + name: "DeepSeek-R1-0528" + hf_model_id: "deepseek-ai/DeepSeek-R1-0528" + aliases: [deepseek, r1] + vllm_args: + tensor-parallel-size: 8 + max-model-len: 16384 + gpu-memory-utilization: 0.95 + # AMD needs different AITER settings + accelerator_overrides: + amd: + env_vars: + VLLM_ROCM_USE_AITER: "0" + supported_workloads: [balanced] + + deepseek-r1-w4a16: + name: "DeepSeek-R1-0528-W4A16" + hf_model_id: "RedHatAI/DeepSeek-R1-0528-quantized.w4a16" + vllm_args: + tensor-parallel-size: 8 + max-model-len: 16384 + supported_workloads: [balanced] + + # === GPT-OSS === + gpt-oss-120b: + name: "GPT-OSS-120B" + hf_model_id: "openai/gpt-oss-120b" + vllm_args: + max-model-len: 16384 + gpu-memory-utilization: 0.95 + supported_workloads: [balanced, short] + + gpt-oss-120b-fp8: + name: "GPT-OSS-120B-FP8" + hf_model_id: "RedHatAI/gpt-oss-120b-FP8-dynamic" + vllm_args: + max-model-len: 16384 + gpu-memory-utilization: 0.95 + supported_workloads: [balanced, short] + + gpt-oss-20b: + name: "GPT-OSS-20B" + hf_model_id: "openai/gpt-oss-20b" + vllm_args: + max-model-len: 16384 + supported_workloads: [balanced, short] + + # === Mistral Family === + mistral-small-24b: + name: "Mistral-Small-3.1-24B" + hf_model_id: "mistralai/Mistral-Small-3.1-24B-Instruct-2503" + vllm_args: + max-model-len: 16384 + supported_workloads: [balanced, short] + + mistral-small-24b-fp8: + name: "Mistral-Small-3.1-24B-FP8" + hf_model_id: "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic" + vllm_args: + max-model-len: 16384 + supported_workloads: [balanced, short] + + mixtral-8x22b: + name: "Mixtral-8x22B-Instruct" + hf_model_id: 
"mistralai/Mixtral-8x22B-Instruct-v0.1" + aliases: [mixtral] + vllm_args: + tensor-parallel-size: 4 + max-model-len: 16384 + gpu-memory-utilization: 0.95 + supported_workloads: [balanced, short] + + mixtral-8x7b: + name: "Mixtral-8x7B-Instruct" + hf_model_id: "mistralai/Mixtral-8x7B-Instruct-v0.1" + vllm_args: + max-model-len: 16384 + supported_workloads: [balanced, short] + + mistral-7b: + name: "Mistral-7B-Instruct" + hf_model_id: "mistralai/Mistral-7B-Instruct-v0.3" + vllm_args: + max-model-len: 16384 + supported_workloads: [balanced, short] + + ministral-14b: + name: "Ministral-3-14B-Instruct" + hf_model_id: "mistralai/Ministral-3-14B-Instruct-2512" + vllm_args: + tensor-parallel-size: 4 + supported_workloads: [balanced, short] + + ministral-14b-fp8: + name: "Ministral-3-14B-Instruct-FP8" + hf_model_id: "RedHatAI/Ministral-3-14B-Instruct-2512" + vllm_args: + tensor-parallel-size: 4 + supported_workloads: [balanced, short] + + # === Granite Family === + granite-3.1-8b: + name: "Granite-3.1-8B-Instruct" + hf_model_id: "ibm-granite/granite-3.1-8b-instruct" + vllm_args: + max-model-len: 16384 + supported_workloads: [balanced, short, long-prompt] + + granite-3.1-8b-fp8: + name: "Granite-3.1-8B-Instruct-FP8" + hf_model_id: "RedHatAI/granite-3.1-8b-instruct-fp8-dynamic" + vllm_args: + max-model-len: 16384 + supported_workloads: [balanced, short, long-prompt] + + # === Phi Family === + phi-4: + name: "Phi-4" + hf_model_id: "microsoft/phi-4" + vllm_args: + max-model-len: 16384 + supported_workloads: [balanced, short] + + phi-4-fp8: + name: "Phi-4-FP8" + hf_model_id: "RedHatAI/phi-4-FP8-dynamic" + vllm_args: + max-model-len: 16384 + supported_workloads: [balanced, short] + + # === Gemma Family === + gemma-2-9b: + name: "Gemma-2-9B-IT" + hf_model_id: "google/gemma-2-9b-it" + vllm_args: + max-model-len: 8192 + supported_workloads: [balanced, short] + + gemma-2-9b-fp8: + name: "Gemma-2-9B-IT-FP8" + hf_model_id: "RedHatAI/gemma-2-9b-it-FP8" + vllm_args: + max-model-len: 8192 
+ supported_workloads: [balanced, short] + + # === Nemotron Family === + nemotron-70b: + name: "Nemotron-70B-Instruct" + hf_model_id: "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF" + vllm_args: + tensor-parallel-size: 2 + max-model-len: 16384 + supported_workloads: [balanced] + + nemotron-70b-fp8: + name: "Nemotron-70B-Instruct-FP8" + hf_model_id: "RedHatAI/Llama-3.1-Nemotron-70B-Instruct-HF-FP8-dynamic" + vllm_args: + max-model-len: 16384 + supported_workloads: [balanced] + + nemotron-nano-30b-fp8: + name: "Nemotron-3-Nano-30B-FP8" + hf_model_id: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8" + vllm_args: + tensor-parallel-size: 4 + supported_workloads: [balanced, short] diff --git a/config/rhaiis/workloads.yaml b/config/rhaiis/workloads.yaml new file mode 100644 index 0000000..aa7bd0b --- /dev/null +++ b/config/rhaiis/workloads.yaml @@ -0,0 +1,64 @@ +# GuideLLM Workload Profiles +# These define benchmark parameters - can be changed WITHOUT restarting vLLM +# Only GuideLLM needs to restart with new workload config + +workloads: + # === Standard Workloads === + balanced: + name: "Balanced" + description: "Balanced prompt and output tokens (1k/1k)" + guidellm: + data: "prompt_tokens=1000,output_tokens=1000" + rates: [1, 50, 100, 200] + max_seconds: 180 + + short: + name: "Short" + description: "Short prompt and output (256/256)" + guidellm: + data: "prompt_tokens=256,output_tokens=256" + rates: [1, 50, 100, 200] + max_seconds: 120 + + long-prompt: + name: "Long Prompt" + description: "Long prompt, standard output (8k/1k)" + guidellm: + data: "prompt_tokens=8000,output_tokens=1000" + rate_type: "concurrent" + rates: [1, 25, 50, 100] + max_seconds: 300 + # Requires separate deployment with larger context + vllm_args: + max-model-len: 10000 + + very-long-prompt: + name: "Very Long Prompt" + description: "Very long prompt (16k/1k)" + guidellm: + data: "prompt_tokens=16000,output_tokens=1000" + rate_type: "concurrent" + rates: [1, 10, 25, 50] + max_seconds: 600 + # Requires 
separate deployment with larger context + vllm_args: + max-model-len: 20000 + + # === Advanced Workloads === + heterogeneous: + name: "Heterogeneous" + description: "Mixed token distributions simulating real traffic" + guidellm: + data: "ADD ME" + rate_type: "concurrent" + rates: [1, 25, 50] + max_seconds: 300 + + multi-turn: + name: "Multi-Turn" + description: "Multi-turn conversation with context reuse" + guidellm: + data: "multi_turn" + rate_type: "concurrent" + rates: [1, 10, 25] + max_seconds: 600 \ No newline at end of file diff --git a/projects/core/ci_entrypoint/run_ci.py b/projects/core/ci_entrypoint/run_ci.py index f55bf24..3b569f4 100755 --- a/projects/core/ci_entrypoint/run_ci.py +++ b/projects/core/ci_entrypoint/run_ci.py @@ -496,10 +496,13 @@ def execute_project_operation(project: str, operation: str, args: tuple, verbose sys.exit(1) -@click.command() +@click.command(context_settings=dict( + ignore_unknown_options=True, + allow_interspersed_args=False, +)) @click.argument('project', required=False) @click.argument('operation', required=False) -@click.argument('args', nargs=-1) +@click.argument('args', nargs=-1, type=click.UNPROCESSED) @click.option('--verbose', '-v', is_flag=True, help='Enable verbose output', default=True) @click.option('--dry-run', is_flag=True, help='Show what would be executed without running it') def main(project, operation, args, verbose, dry_run): diff --git a/projects/core/scenarios/__init__.py b/projects/core/scenarios/__init__.py new file mode 100644 index 0000000..240864a --- /dev/null +++ b/projects/core/scenarios/__init__.py @@ -0,0 +1,34 @@ +"""Declarative scenario generation with config inheritance. + +Generate benchmark scenarios from YAML configuration using matrix expansion. +Supports accelerator-specific settings via inheritance chain. 
+ +Example: + from projects.core.scenarios import ConfigLoader, ScenarioGenerator + + # Load config with accelerator-specific inheritance + loader = ConfigLoader("config/", accelerator="nvidia") + model = loader.load_model("llama-3.3-70b-fp8") + + # Generate scenarios from matrix + gen = ScenarioGenerator("config/projects/rhaiis.yaml", config_loader=loader) + gen.load() + + for scenario in gen.expand(): + print(scenario.scenario_id) # e.g., llama-70b-fp8_balanced_direct_tp4 +""" + +from .config import ScenarioConfig +from .config_loader import ConfigLoader, ResolvedModelConfig, ResolvedWorkloadConfig +from .generator import DeploymentGroup, ExpandedScenario, ParsedConfig, ScenarioGenerator + +__all__ = [ + "ConfigLoader", + "DeploymentGroup", + "ExpandedScenario", + "ParsedConfig", + "ResolvedModelConfig", + "ResolvedWorkloadConfig", + "ScenarioConfig", + "ScenarioGenerator", +] diff --git a/projects/core/scenarios/config.py b/projects/core/scenarios/config.py new file mode 100644 index 0000000..3039574 --- /dev/null +++ b/projects/core/scenarios/config.py @@ -0,0 +1,110 @@ +"""Scenario configuration dataclasses.""" + +import re +from dataclasses import dataclass, field +from typing import Any + + +@dataclass +class ScenarioConfig: + """ + Single expanded scenario ready for execution. + + Generated by ScenarioGenerator from matrix expansion of scenarios.yaml. + Contains all configuration needed for a single benchmark run. 
+ """ + + # Identity + scenario_id: str # qwen3-0-6b_balanced_direct_tp1 + model_id: str # Qwen/Qwen3-0.6B + model_short: str # qwen3-0-6b + + # Matrix dimensions + workload: str # balanced, short, long-context + routing: str # direct, prefix-estimation + tensor_parallel: int # 1, 2, 4, 8 + + # Deployment config + deployment_name: str # K8s resource name (sanitized) + namespace: str = "forge" + replicas: int = 1 + + # vLLM runtime args (merged from common + model + workload) + runtime_args: dict[str, Any] = field(default_factory=dict) + + # Workload config (from workloads section) + workload_config: dict[str, Any] = field(default_factory=dict) + + # Routing config (from routing section) + routing_config: dict[str, Any] = field(default_factory=dict) + + # Model-specific env_vars + env_vars: dict[str, str] = field(default_factory=dict) + + # Metadata + description: str | None = None + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary for JSON serialization.""" + return { + "scenario_id": self.scenario_id, + "model_id": self.model_id, + "model_short": self.model_short, + "workload": self.workload, + "routing": self.routing, + "tensor_parallel": self.tensor_parallel, + "deployment_name": self.deployment_name, + "namespace": self.namespace, + "replicas": self.replicas, + "runtime_args": self.runtime_args, + "config": { + "workload_config": self.workload_config, + "routing_config": self.routing_config, + }, + "env_vars": self.env_vars, + "description": self.description, + } + + @staticmethod + def sanitize_name(name: str, max_len: int = 42) -> str: + """ + Sanitize name for K8s resource naming. + + Rules: + - Lowercase + - Replace / and _ with - + - Remove dots + - Truncate to max_len + """ + return ( + name.lower().replace("/", "-").replace("_", "-").replace(".", "") + )[:max_len] + + @staticmethod + def shorten_model_name(model_id: str) -> str: + """ + Create short model name for scenario_id. 
+ + Examples: + - Qwen/Qwen3-0.6B -> qwen3-0-6b + - openai/gpt-oss-120b -> gpt-oss-120b + - RedHatAI/gpt-oss-120b-FP8-dynamic -> gpt-oss-120b-fp8 + """ + # Take last part after / + name = model_id.split("/")[-1] + + # Lowercase + name = name.lower() + + # Remove common suffixes + for suffix in ["-instruct", "-dynamic", "-chat"]: + if name.endswith(suffix): + name = name[: -len(suffix)] + + # Replace dots with dashes + name = name.replace(".", "-") + + # Truncate version numbers like -2507 + name = re.sub(r"-\d{4}$", "", name) + + return name diff --git a/projects/core/scenarios/config_loader.py b/projects/core/scenarios/config_loader.py new file mode 100644 index 0000000..f7d9d8d --- /dev/null +++ b/projects/core/scenarios/config_loader.py @@ -0,0 +1,312 @@ +"""Config loading with inheritance and accelerator support. + +Resolution order: + defaults.yaml (base) + ↓ merge + defaults.yaml.accelerators[accelerator] + ↓ merge + models.yaml[model] + ↓ merge + models.yaml[model].accelerator_overrides[accelerator] + ↓ merge + scenarios/*.yaml.defaults + ↓ merge + scenarios/*.yaml.runs[].overrides +""" + +from copy import deepcopy +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +import yaml + + +def deep_merge(base: dict, override: dict) -> dict: + """ + Deep merge two dictionaries. + + Values in override take precedence. Nested dicts are merged recursively. + Lists are replaced (not merged). 
+ """ + result = deepcopy(base) + for key, value in override.items(): + if key in result and isinstance(result[key], dict) and isinstance(value, dict): + result[key] = deep_merge(result[key], value) + else: + result[key] = deepcopy(value) + return result + + +@dataclass +class ResolvedModelConfig: + """Fully resolved model configuration after inheritance.""" + + key: str + name: str + hf_model_id: str + deploy: dict[str, Any] = field(default_factory=dict) + vllm_args: dict[str, Any] = field(default_factory=dict) + env_vars: dict[str, str] = field(default_factory=dict) + supported_workloads: list[str] = field(default_factory=list) + aliases: list[str] = field(default_factory=list) + + @property + def num_gpus(self) -> int: + """Number of GPUs from deploy config or tensor_parallel.""" + return self.deploy.get("num_gpus", self.vllm_args.get("tensor-parallel-size", 1)) + + @property + def tensor_parallel(self) -> int: + """Tensor parallel size from vllm_args.""" + return self.vllm_args.get("tensor-parallel-size", 1) + + +@dataclass +class ResolvedWorkloadConfig: + """Resolved workload configuration.""" + + key: str + name: str + description: str = "" + guidellm: dict[str, Any] = field(default_factory=dict) + max_seconds: int = 300 + vllm_args: dict[str, Any] = field(default_factory=dict) # Workload-specific overrides + + +class ConfigLoader: + """ + Load and resolve configurations with inheritance. + + Usage: + loader = ConfigLoader('config/', accelerator='nvidia') + model_config = loader.load_model('llama-3.3-70b-fp8') + workload_config = loader.load_workload('balanced') + """ + + def __init__( + self, + config_dir: str | Path, + accelerator: str = "nvidia", + ): + """ + Initialize config loader. 
+ + Args: + config_dir: Directory containing defaults.yaml, models.yaml, workloads.yaml + accelerator: Accelerator type ('nvidia', 'amd') + """ + self.config_dir = Path(config_dir) + self.accelerator = accelerator + + # Cache loaded configs + self._defaults: dict[str, Any] | None = None + self._models: dict[str, Any] | None = None + self._workloads: dict[str, Any] | None = None + + @property + def defaults(self) -> dict[str, Any]: + """Load and cache defaults.yaml.""" + if self._defaults is None: + defaults_path = self.config_dir / "defaults.yaml" + if defaults_path.exists(): + with open(defaults_path) as f: + self._defaults = yaml.safe_load(f) or {} + else: + self._defaults = {} + return self._defaults + + @property + def models(self) -> dict[str, Any]: + """Load and cache models.yaml.""" + if self._models is None: + models_path = self.config_dir / "models.yaml" + if models_path.exists(): + with open(models_path) as f: + data = yaml.safe_load(f) or {} + self._models = data.get("models", {}) + else: + self._models = {} + return self._models + + @property + def workloads(self) -> dict[str, Any]: + """Load and cache workloads.yaml.""" + if self._workloads is None: + workloads_path = self.config_dir / "workloads.yaml" + if workloads_path.exists(): + with open(workloads_path) as f: + data = yaml.safe_load(f) or {} + self._workloads = data.get("workloads", {}) + else: + self._workloads = {} + return self._workloads + + def get_accelerator_defaults(self) -> dict[str, Any]: + """Get accelerator-specific defaults.""" + accelerators = self.defaults.get("accelerators", {}) + return accelerators.get(self.accelerator, {}) + + def get_global_defaults(self) -> dict[str, Any]: + """Get global defaults (deploy, vllm_args, guidellm).""" + return self.defaults.get("defaults", {}) + + def load_model(self, model_key: str) -> ResolvedModelConfig: + """ + Load and resolve a model configuration. 
+ + Applies inheritance: + defaults → accelerator_defaults → model → model.accelerator_overrides + + Args: + model_key: Model key from models.yaml, alias, or HuggingFace ID + + Returns: + Fully resolved model configuration + + Raises: + KeyError: If model not found + """ + model_data = self._find_model(model_key) + if model_data is None: + raise KeyError(f"Model '{model_key}' not found in registry") + + actual_key, raw_config = model_data + + # Start with global defaults + global_defaults = self.get_global_defaults() + base_deploy = global_defaults.get("deploy", {}) + base_vllm_args = global_defaults.get("vllm_args", {}) + + # Merge accelerator defaults + accel_defaults = self.get_accelerator_defaults() + accel_vllm_args = accel_defaults.get("vllm_args", {}) + accel_env_vars = accel_defaults.get("env_vars", {}) + + # Merge model config + model_deploy = raw_config.get("deploy", {}) + model_vllm_args = raw_config.get("vllm_args", {}) + model_env_vars = raw_config.get("env_vars", {}) + + # Merge accelerator overrides from model + accel_overrides = raw_config.get("accelerator_overrides", {}).get(self.accelerator, {}) + override_vllm_args = accel_overrides.get("vllm_args", {}) + override_env_vars = accel_overrides.get("env_vars", {}) + + # Build final config through inheritance chain + final_deploy = deep_merge(base_deploy, model_deploy) + final_vllm_args = deep_merge( + deep_merge(deep_merge(base_vllm_args, accel_vllm_args), model_vllm_args), + override_vllm_args, + ) + final_env_vars = deep_merge( + deep_merge(accel_env_vars, model_env_vars), + override_env_vars, + ) + + return ResolvedModelConfig( + key=actual_key, + name=raw_config.get("name", actual_key), + hf_model_id=raw_config.get("hf_model_id", actual_key), + deploy=final_deploy, + vllm_args=final_vllm_args, + env_vars=final_env_vars, + supported_workloads=raw_config.get("supported_workloads", []), + aliases=raw_config.get("aliases", []), + ) + + def load_workload(self, workload_key: str) -> 
ResolvedWorkloadConfig: + """ + Load and resolve a workload configuration. + + Args: + workload_key: Workload key from workloads.yaml + + Returns: + Resolved workload configuration + + Raises: + KeyError: If workload not found + """ + if workload_key not in self.workloads: + raise KeyError(f"Workload '{workload_key}' not found") + + raw_config = self.workloads[workload_key] + + # Merge with guidellm defaults + global_defaults = self.get_global_defaults() + base_guidellm = global_defaults.get("guidellm", {}) + workload_guidellm = raw_config.get("guidellm", {}) + final_guidellm = deep_merge(base_guidellm, workload_guidellm) + + return ResolvedWorkloadConfig( + key=workload_key, + name=raw_config.get("name", workload_key), + description=raw_config.get("description", ""), + guidellm=final_guidellm, + max_seconds=raw_config.get("max_seconds", base_guidellm.get("max_seconds", 300)), + vllm_args=raw_config.get("vllm_args", {}), + ) + + def load_scenario(self, scenario_path: str | Path) -> dict[str, Any]: + """ + Load a scenario file and resolve its defaults. + + Args: + scenario_path: Path to scenario YAML file + + Returns: + Parsed scenario data with resolved defaults + """ + scenario_path = Path(scenario_path) + + with open(scenario_path) as f: + data = yaml.safe_load(f) or {} + + # Merge scenario defaults with global defaults + global_defaults = self.get_global_defaults() + scenario_defaults = data.get("defaults", {}) + data["_resolved_defaults"] = deep_merge(global_defaults, scenario_defaults) + + # Add accelerator info + data["_accelerator"] = self.accelerator + data["_accelerator_config"] = self.get_accelerator_defaults() + + return data + + def _find_model(self, model_key: str) -> tuple[str, dict[str, Any]] | None: + """ + Find model by key, alias, or HuggingFace ID. 
+ + Returns: + Tuple of (actual_key, config) or None if not found + """ + # Try exact key match + if model_key in self.models: + return (model_key, self.models[model_key]) + + # Try alias match + for key, config in self.models.items(): + aliases = config.get("aliases", []) + if model_key in aliases: + return (key, config) + + # Try HuggingFace ID match + for key, config in self.models.items(): + if config.get("hf_model_id") == model_key: + return (key, config) + + return None + + def list_models(self) -> list[str]: + """List all model keys.""" + return list(self.models.keys()) + + def list_workloads(self) -> list[str]: + """List all workload keys.""" + return list(self.workloads.keys()) + + def get_image(self) -> str: + """Get container image for current accelerator.""" + accel_config = self.get_accelerator_defaults() + return accel_config.get("image", "") diff --git a/projects/core/scenarios/generator.py b/projects/core/scenarios/generator.py new file mode 100644 index 0000000..9199cf6 --- /dev/null +++ b/projects/core/scenarios/generator.py @@ -0,0 +1,676 @@ +"""Declarative scenario generation with split configuration and inheritance. + +NOTE: This module is not currently wired to any CLI. The RHAIIS CLI uses +ConfigLoader directly with --model and --workloads flags. This generator +is available for future batch/matrix scenario execution if needed. + +Supports: +1. Split config files: defaults.yaml, models.yaml, workloads.yaml +2. Deploy-once pattern: Deploy vLLM once, run multiple workloads +3. Config inheritance: defaults → accelerator → model → scenario +4. 
Matrix expansion: model × workloads × tensor_parallel + +Example scenario format (if wired to CLI): +```yaml +scenarios: + - model: qwen-0.6b # Key from models.yaml + workloads: [balanced, short] # Keys from workloads.yaml + tensor_parallel: [1] +``` +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from itertools import product +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import yaml + +from .config import ScenarioConfig + +if TYPE_CHECKING: + from .config_loader import ConfigLoader + + +@dataclass +class ModelConfig: + """Model configuration from models.yaml.""" + + key: str # qwen-0.6b + name: str # Qwen3-0.6B + hf_model_id: str # Qwen/Qwen3-0.6B + aliases: list[str] = field(default_factory=list) + vllm_args: dict[str, Any] = field(default_factory=dict) + env_vars: dict[str, str] = field(default_factory=dict) + supported_workloads: list[str] = field(default_factory=list) + + @classmethod + def from_dict(cls, key: str, data: dict[str, Any]) -> "ModelConfig": + return cls( + key=key, + name=data.get("name", key), + hf_model_id=data.get("hf_model_id", key), + aliases=data.get("aliases", []), + vllm_args=data.get("vllm_args", {}), + env_vars=data.get("env_vars", {}), + supported_workloads=data.get("supported_workloads", []), + ) + + +@dataclass +class WorkloadConfig: + """Workload configuration from workloads.yaml.""" + + key: str # balanced + name: str # Balanced + description: str = "" + guidellm: dict[str, Any] = field(default_factory=dict) + max_seconds: int = 300 + vllm_args: dict[str, Any] = field(default_factory=dict) # Workload-specific overrides + + @classmethod + def from_dict(cls, key: str, data: dict[str, Any]) -> "WorkloadConfig": + return cls( + key=key, + name=data.get("name", key), + description=data.get("description", ""), + guidellm=data.get("guidellm", {}), + max_seconds=data.get("max_seconds", 300), + vllm_args=data.get("vllm_args", {}), + ) + + +@dataclass +class 
DeploymentGroup: + """ + A group of workloads to run on a single vLLM deployment. + + Deploy vLLM once -> Run all workloads -> Cleanup + + Workloads with different vllm_args get separate deployment groups. + """ + + model: ModelConfig + tensor_parallel: int + routing: str + workloads: list[WorkloadConfig] + routing_config: dict[str, Any] = field(default_factory=dict) + namespace: str = "forge" + vllm_args_override: dict[str, Any] = field(default_factory=dict) # From workload + + @property + def deployment_id(self) -> str: + """Unique ID for this deployment.""" + base = f"{self.model.key}_{self.routing}_tp{self.tensor_parallel}" + if self.vllm_args_override: + # Add hash suffix for workload-specific vllm_args + override_hash = hash(frozenset(self.vllm_args_override.items())) % 10000 + return f"{base}_wl{override_hash}" + return base + + @property + def deployment_name(self) -> str: + """K8s resource name.""" + return ScenarioConfig.sanitize_name(self.model.key) + + @property + def merged_vllm_args(self) -> dict[str, Any]: + """Model vllm_args merged with workload overrides.""" + merged = dict(self.model.vllm_args) + merged.update(self.vllm_args_override) + return merged + + +@dataclass +class ExpandedScenario: + """A single expanded scenario from matrix.""" + + model_id: str # HuggingFace ID + model_key: str # Key from models.yaml + model_short: str # Short name for display + workload: str # balanced, short, etc. + routing: str # direct, prefix-estimation, etc. 
+ tensor_parallel: int # TP size + runtime_args: dict[str, Any] # Merged runtime args + workload_config: dict[str, Any] # Workload settings + routing_config: dict[str, Any] # Routing settings + deploy_config: dict[str, Any] # Deployment settings + + @property + def scenario_id(self) -> str: + """Generate deterministic scenario ID.""" + return f"{self.model_short}_{self.workload}_{self.routing}_tp{self.tensor_parallel}" + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary for JSON serialization.""" + return { + "model_id": self.model_id, + "model_key": self.model_key, + "model_short": self.model_short, + "workload": self.workload, + "routing": self.routing, + "tensor_parallel": self.tensor_parallel, + "scenario_id": self.scenario_id, + "runtime_args": self.runtime_args, + "workload_config": self.workload_config, + "routing_config": self.routing_config, + "deploy_config": self.deploy_config, + } + + def to_scenario_config(self, namespace: str = "forge") -> ScenarioConfig: + """Convert to ScenarioConfig for workflow execution.""" + return ScenarioConfig( + scenario_id=self.scenario_id, + model_id=self.model_id, + model_short=self.model_short, + workload=self.workload, + routing=self.routing, + tensor_parallel=self.tensor_parallel, + deployment_name=ScenarioConfig.sanitize_name(self.model_short), + namespace=namespace, + replicas=self.deploy_config.get("replicas", 1), + runtime_args=self.runtime_args, + workload_config=self.workload_config, + routing_config=self.routing_config, + ) + + +@dataclass +class ParsedConfig: + """Parsed scenario YAML configuration.""" + + name: str + description: str + target_cluster: str = "" + # Common defaults + common: dict[str, Any] = field(default_factory=dict) + # Workload definitions (inline or from workloads.yaml) + workloads: dict[str, WorkloadConfig] = field(default_factory=dict) + # Routing definitions + routing: dict[str, dict[str, Any]] = field(default_factory=dict) + # Model registry (from models.yaml) + models: 
dict[str, ModelConfig] = field(default_factory=dict) + # New: Scenario list (references model keys) + scenarios: list[dict[str, Any]] = field(default_factory=list) + # Legacy: Explicit run list + runs: list[dict[str, Any]] = field(default_factory=list) + + +class ScenarioGenerator: + """ + Generate scenarios from declarative configuration. + + Supports: + - Split config: defaults.yaml + models.yaml + workloads.yaml + scenarios/*.yaml + - Deploy-once pattern: Group workloads under single deployment + - Config inheritance via ConfigLoader: defaults → accelerator → model → scenario + - Legacy inline: All config in single file + """ + + def __init__( + self, + scenarios_path: str | Path | None = None, + models_path: str | Path | None = None, + workloads_path: str | Path | None = None, + config_dir: str | Path | None = None, + config_loader: ConfigLoader | None = None, + accelerator: str = "nvidia", + ): + """ + Initialize generator. + + Args: + scenarios_path: Path to scenarios/*.yaml file + models_path: Path to models.yaml (optional, auto-detected from config_dir) + workloads_path: Path to workloads.yaml (optional, auto-detected) + config_dir: Config directory (auto-detects models.yaml, workloads.yaml) + config_loader: Optional ConfigLoader for inheritance-based resolution + accelerator: Accelerator type ('nvidia', 'amd') for inheritance + """ + self.scenarios_path = Path(scenarios_path) if scenarios_path else None + self.models_path = Path(models_path) if models_path else None + self.workloads_path = Path(workloads_path) if workloads_path else None + self.accelerator = accelerator + + # Auto-detect config paths from config_dir or scenarios_path parent + if config_dir: + config_dir = Path(config_dir) + elif scenarios_path: + # Handle scenarios in subdirectory: config/projects/rhaiis.yaml -> config/ + scenarios_parent = Path(scenarios_path).parent + if scenarios_parent.name == "projects": + config_dir = scenarios_parent.parent + else: + config_dir = scenarios_parent + 
+ self.config_dir = config_dir + + if config_dir: + if not self.models_path and (config_dir / "models.yaml").exists(): + self.models_path = config_dir / "models.yaml" + if not self.workloads_path and (config_dir / "workloads.yaml").exists(): + self.workloads_path = config_dir / "workloads.yaml" + + # Use provided ConfigLoader or create one if config_dir is available + self.config_loader = config_loader + if not self.config_loader and config_dir: + from .config_loader import ConfigLoader + self.config_loader = ConfigLoader(config_dir, accelerator=accelerator) + + self.config: ParsedConfig | None = None + + def load(self, path: str | Path | None = None) -> ParsedConfig: + """ + Load and parse scenario configuration. + + Loads from: + 1. models.yaml (if exists) -> model registry + 2. workloads.yaml (if exists) -> workload profiles + 3. scenarios-*.yaml -> scenario definitions + + Args: + path: Optional path override for scenarios file + + Returns: + Parsed configuration + """ + scenarios_path = Path(path) if path else self.scenarios_path + if not scenarios_path: + raise ValueError("No scenarios config path provided") + + # Load models registry + models: dict[str, ModelConfig] = {} + if self.models_path and self.models_path.exists(): + with open(self.models_path) as f: + models_data = yaml.safe_load(f) + for key, data in models_data.get("models", {}).items(): + models[key] = ModelConfig.from_dict(key, data) + + # Load workloads + workloads: dict[str, WorkloadConfig] = {} + if self.workloads_path and self.workloads_path.exists(): + with open(self.workloads_path) as f: + workloads_data = yaml.safe_load(f) + for key, data in workloads_data.get("workloads", {}).items(): + workloads[key] = WorkloadConfig.from_dict(key, data) + + # Load scenarios + with open(scenarios_path) as f: + data = yaml.safe_load(f) + + # Merge inline workloads (if any) with loaded workloads + for key, wl_data in data.get("workloads", {}).items(): + if key not in workloads: + workloads[key] = 
WorkloadConfig.from_dict(key, wl_data) + + # Merge inline models with loaded models + # Supports both new format (hf_model_id, vllm_args) and legacy (runtime_args) + for model_key, model_data in data.get("models", {}).items(): + if model_key not in models: + # Check for new format fields + hf_model_id = model_data.get("hf_model_id", model_key) + vllm_args = model_data.get("vllm_args") or model_data.get("runtime_args", {}) + name = model_data.get("name") or model_data.get("deploy", {}).get("name", model_key) + + models[model_key] = ModelConfig.from_dict( + key=model_key, + data={ + "hf_model_id": hf_model_id, + "name": name, + "vllm_args": vllm_args, + "env_vars": model_data.get("env_vars", {}), + }, + ) + + self.config = ParsedConfig( + name=data.get("name", scenarios_path.stem), + description=data.get("description", ""), + target_cluster=data.get("target_cluster", ""), + common=data.get("common", {}), + workloads=workloads, + routing=data.get("routing", {}), + models=models, + scenarios=data.get("scenarios", []), + runs=data.get("runs", []), + ) + + return self.config + + def expand(self) -> list[ExpandedScenario]: + """ + Expand all scenarios into individual benchmark runs. 
+ + Returns: + List of ExpandedScenario objects + """ + if not self.config: + raise RuntimeError("Must call load() first") + + expanded = [] + + # New format: scenarios list with model key references + for scenario_def in self.config.scenarios: + scenarios = self._expand_scenario_def(scenario_def) + expanded.extend(scenarios) + + # Legacy format: models with inline matrix + for model_id, model_config in self.config.models.items(): + if isinstance(model_config, ModelConfig): + continue # Skip, already processed via scenarios + # Legacy dict format + matrix = model_config.get("matrix", {}) if isinstance(model_config, dict) else {} + if matrix: + scenarios = self._expand_legacy_model_matrix(model_id, model_config) + expanded.extend(scenarios) + + # Explicit runs (no matrix expansion) + for run in self.config.runs: + scenario = self._create_from_run(run) + if scenario: + expanded.append(scenario) + + return expanded + + def expand_grouped(self) -> list[DeploymentGroup]: + """ + Expand scenarios grouped by deployment. + + Returns groups where each group shares a single vLLM deployment. + Deploy once -> Run all workloads in group -> Cleanup + + Uses ConfigLoader when available for full inheritance chain. 
+ + Returns: + List of DeploymentGroup objects + """ + if not self.config: + raise RuntimeError("Must call load() first") + + groups: dict[str, DeploymentGroup] = {} + + for scenario_def in self.config.scenarios: + model_key = scenario_def.get("model") + if not model_key or model_key not in self.config.models: + continue + + workload_keys = scenario_def.get("workloads", ["balanced"]) + routings = scenario_def.get("routing", ["direct"]) + tp_values = scenario_def.get("tensor_parallel", [1]) + namespace = self.config.common.get("namespace", "forge") + + # Use ConfigLoader for resolved model config if available + if self.config_loader: + try: + resolved_model = self.config_loader.load_model(model_key) + # Create a ModelConfig-compatible object with resolved values + model = ModelConfig( + key=resolved_model.key, + name=resolved_model.name, + hf_model_id=resolved_model.hf_model_id, + aliases=resolved_model.aliases, + vllm_args=resolved_model.vllm_args, + env_vars=resolved_model.env_vars, + supported_workloads=resolved_model.supported_workloads, + ) + except KeyError: + model = self.config.models[model_key] + else: + model = self.config.models[model_key] + + # Create groups for each (model, routing, tp, vllm_args) combination + # Workloads with different vllm_args get separate deployment groups + for routing, tp in product(routings, tp_values): + # Group workloads by their vllm_args + workloads_by_vllm_args: dict[tuple, list[WorkloadConfig]] = {} + + for wl_key in workload_keys: + if wl_key not in self.config.workloads: + continue + wl = self.config.workloads[wl_key] + # Create hashable key from vllm_args + vllm_args_key = tuple(sorted(wl.vllm_args.items())) if wl.vllm_args else () + if vllm_args_key not in workloads_by_vllm_args: + workloads_by_vllm_args[vllm_args_key] = [] + workloads_by_vllm_args[vllm_args_key].append(wl) + + # Create a deployment group for each unique vllm_args + for vllm_args_key, workloads in workloads_by_vllm_args.items(): + vllm_args_override = 
dict(vllm_args_key) if vllm_args_key else {} + + # Include vllm_args hash in group_id for uniqueness + if vllm_args_override: + override_hash = hash(vllm_args_key) % 10000 + group_id = f"{model_key}_{routing}_tp{tp}_wl{override_hash}" + else: + group_id = f"{model_key}_{routing}_tp{tp}" + + if group_id not in groups: + groups[group_id] = DeploymentGroup( + model=model, + tensor_parallel=tp, + routing=routing, + workloads=workloads, + routing_config=self.config.routing.get(routing, {}), + namespace=namespace, + vllm_args_override=vllm_args_override, + ) + else: + # Add more workloads to existing group + for wl in workloads: + if wl not in groups[group_id].workloads: + groups[group_id].workloads.append(wl) + + return list(groups.values()) + + def _expand_scenario_def(self, scenario_def: dict[str, Any]) -> list[ExpandedScenario]: + """Expand a scenario definition from the new format. + + Uses ConfigLoader when available for full inheritance chain: + defaults → accelerator → model → model.accelerator_overrides → scenario + """ + model_key = scenario_def.get("model") + if not model_key or model_key not in self.config.models: + return [] + + workload_keys = scenario_def.get("workloads", ["balanced"]) + routings = scenario_def.get("routing", ["direct"]) + tp_values = scenario_def.get("tensor_parallel", [1]) + vllm_args_override = scenario_def.get("vllm_args_override", {}) + + scenarios = [] + namespace = self.config.common.get("namespace", "forge") + + # Use ConfigLoader for full inheritance if available + if self.config_loader: + try: + resolved_model = self.config_loader.load_model(model_key) + model_id = resolved_model.hf_model_id + base_vllm_args = dict(resolved_model.vllm_args) + env_vars = dict(resolved_model.env_vars) + deploy_config_base = dict(resolved_model.deploy) + except KeyError: + # Fall back to basic model config + model = self.config.models[model_key] + model_id = model.hf_model_id + base_vllm_args = dict(model.vllm_args) + env_vars = dict(model.env_vars) 
+ deploy_config_base = {} + else: + model = self.config.models[model_key] + model_id = model.hf_model_id + base_vllm_args = dict(model.vllm_args) + env_vars = dict(model.env_vars) + deploy_config_base = {} + + for workload_key, routing, tp in product(workload_keys, routings, tp_values): + # Resolve workload config + if self.config_loader: + try: + resolved_workload = self.config_loader.load_workload(workload_key) + workload_guidellm = resolved_workload.guidellm + except KeyError: + workload_config = self.config.workloads.get(workload_key) + if not workload_config: + continue + workload_guidellm = workload_config.guidellm + else: + workload_config = self.config.workloads.get(workload_key) + if not workload_config: + continue + workload_guidellm = workload_config.guidellm + + # Build runtime args with inheritance + runtime_args = dict(base_vllm_args) + runtime_args["tensor-parallel-size"] = tp + runtime_args.update(vllm_args_override) + + scenario = ExpandedScenario( + model_id=model_id, + model_key=model_key, + model_short=self._shorten_model_name(model_key), + workload=workload_key, + routing=routing, + tensor_parallel=tp, + runtime_args=runtime_args, + workload_config=workload_guidellm, + routing_config=self.config.routing.get(routing, {}), + deploy_config={ + "namespace": namespace, + "replicas": self.config.common.get("replicas", 1), + "num_gpus": deploy_config_base.get("num_gpus", tp), + "env_vars": env_vars, + }, + ) + scenarios.append(scenario) + + return scenarios + + def _expand_legacy_model_matrix( + self, + model_id: str, + model_config: dict[str, Any], + ) -> list[ExpandedScenario]: + """Expand a model's matrix (legacy inline format).""" + matrix = model_config.get("matrix", {}) + deploy_config = model_config.get("deploy", {}) + + workloads = matrix.get("workloads", ["balanced"]) + routings = matrix.get("routing", ["direct"]) + tp_values = matrix.get("tensor-parallel-size", [1]) + + common_runtime = self.config.common.get("runtime_args", {}) + 
model_runtime = model_config.get("runtime_args", {}) + + scenarios = [] + + for workload, routing, tp in product(workloads, routings, tp_values): + runtime_args = dict(common_runtime) + runtime_args.update(model_runtime) + runtime_args["tensor-parallel-size"] = tp + + workload_config = self.config.workloads.get(workload) + wl_dict = workload_config.guidellm if workload_config else {} + + routing_config = self.config.routing.get(routing, {}) + + model_short = self._shorten_model_name(model_id) + + scenario_deploy_config = dict(deploy_config) + scenario_deploy_config["num_gpus"] = tp + + scenario = ExpandedScenario( + model_id=model_id, + model_key=model_id, + model_short=model_short, + workload=workload, + routing=routing, + tensor_parallel=tp, + runtime_args=runtime_args, + workload_config=wl_dict, + routing_config=routing_config, + deploy_config=scenario_deploy_config, + ) + scenarios.append(scenario) + + return scenarios + + def _create_from_run(self, run: dict[str, Any]) -> ExpandedScenario | None: + """Create scenario from explicit run definition.""" + model_key = run.get("model") + if not model_key: + return None + + model = self.config.models.get(model_key) + if not model: + return None + + workload = run.get("workload", "balanced") + routing = run.get("routing", "direct") + tp = run.get("tensor_parallel", 1 if isinstance(model, ModelConfig) else 1) + + runtime_args = dict(model.vllm_args) if isinstance(model, ModelConfig) else {} + runtime_args.update(run.get("runtime_args_override", {})) + runtime_args["tensor-parallel-size"] = tp + + workload_config = self.config.workloads.get(workload) + + return ExpandedScenario( + model_id=model.hf_model_id if isinstance(model, ModelConfig) else model_key, + model_key=model_key, + model_short=self._shorten_model_name(model_key), + workload=workload, + routing=routing, + tensor_parallel=tp, + runtime_args=runtime_args, + workload_config=workload_config.guidellm if workload_config else {}, + 
routing_config=self.config.routing.get(routing, {}), + deploy_config={ + "namespace": self.config.common.get("namespace", "forge"), + "replicas": 1, + }, + ) + + @staticmethod + def _shorten_model_name(model_id: str) -> str: + """Create short model name from model key or HuggingFace ID.""" + name = model_id.split("/")[-1].lower() + name = re.sub(r"-instruct.*", "", name) + name = re.sub(r"-dynamic$", "", name) + name = re.sub(r"-a\d+b", "", name) + name = re.sub(r"[^a-z0-9]+", "-", name) + name = name.strip("-") + if len(name) > 40: + name = name[:40].rstrip("-") + return name + + def summary(self) -> str: + """Generate summary of scenarios.""" + if not self.config: + return "No config loaded" + + expanded = self.expand() + groups = self.expand_grouped() + + lines = [ + f"Scenario Config: {self.config.name}", + f"Description: {self.config.description}", + f"Target Cluster: {self.config.target_cluster or '(not set)'}", + f"Models: {len(self.config.models)}", + f"Workloads: {len(self.config.workloads)}", + f"Deployment Groups: {len(groups)}", + f"Total Benchmark Runs: {len(expanded)}", + "", + ] + + # Show deployment groups + lines.append("Deployment Groups (deploy once, run N workloads):") + for group in groups: + wl_names = ", ".join(wl.key for wl in group.workloads) + lines.append(f" {group.deployment_id}:") + lines.append(f" Model: {group.model.hf_model_id}") + lines.append(f" Workloads: [{wl_names}]") + + return "\n".join(lines) diff --git a/projects/core/steps/__init__.py b/projects/core/steps/__init__.py new file mode 100644 index 0000000..ea3cf43 --- /dev/null +++ b/projects/core/steps/__init__.py @@ -0,0 +1,14 @@ +"""Shared workflow steps for all projects. 
+ +These steps can be imported and used by any project: + from projects.core.steps import RunGuideLLMStep, CollectArtifactsStep +""" + +from .artifacts import CleanupDeploymentStep, CollectArtifactsStep +from .guidellm import RunGuideLLMStep + +__all__ = [ + "CleanupDeploymentStep", + "CollectArtifactsStep", + "RunGuideLLMStep", +] diff --git a/projects/core/steps/artifacts.py b/projects/core/steps/artifacts.py new file mode 100644 index 0000000..9f1f5f2 --- /dev/null +++ b/projects/core/steps/artifacts.py @@ -0,0 +1,250 @@ +"""Artifact collection step - shared by all projects.""" + +import logging +import subprocess +from pathlib import Path +from typing import TYPE_CHECKING + +from projects.core.workflow import StepResult, WorkflowStep + +if TYPE_CHECKING: + from projects.core.workflow import WorkflowContext + +logger = logging.getLogger(__name__) + + +class CollectArtifactsStep(WorkflowStep): + """ + Collect logs, events, and pod status for debugging. + + Always runs as a finally step to capture artifacts regardless + of success or failure. Does not fail the workflow if collection + fails - just logs warnings. + + Can be customized per project: + - rhaiis: app_label="vllm" + - llm_d: app_label="epp" + """ + + def __init__( + self, + app_label: str = "vllm", + namespace: str | None = None, + collect_events: bool = True, + collect_pod_logs: bool = True, + collect_pod_describe: bool = True, + name: str | None = None, + ): + """ + Initialize artifact collection step. 
+ + Args: + app_label: Kubernetes app label to filter pods (e.g., "vllm", "epp") + namespace: Kubernetes namespace (uses current context if None) + collect_events: Whether to collect namespace events + collect_pod_logs: Whether to collect pod logs + collect_pod_describe: Whether to collect pod descriptions + name: Optional step name + """ + super().__init__(name=name or "collect_artifacts") + self.app_label = app_label + self.namespace = namespace + self.collect_events = collect_events + self.collect_pod_logs = collect_pod_logs + self.collect_pod_describe = collect_pod_describe + + def execute(self, ctx: "WorkflowContext") -> StepResult: + """Collect artifacts from cluster.""" + step_dir = ctx.artifact_dir / f"{ctx.step_number:03d}__{ctx.current_step_name}" + step_dir.mkdir(parents=True, exist_ok=True) + collected_files: list[str] = [] + warnings: list[str] = [] + + ns_args = ["-n", self.namespace] if self.namespace else [] + + # Collect pod logs + if self.collect_pod_logs: + log_file = step_dir / "app_logs.txt" + result = self._run_oc( + ["logs", "-l", f"app={self.app_label}", "--tail=1000", *ns_args], + log_file, + ) + if result: + collected_files.append(str(log_file)) + else: + warnings.append(f"Failed to collect logs for app={self.app_label}") + + # Collect pod descriptions + if self.collect_pod_describe: + describe_file = step_dir / "pod_describe.txt" + result = self._run_oc( + ["describe", "pods", "-l", f"app={self.app_label}", *ns_args], + describe_file, + ) + if result: + collected_files.append(str(describe_file)) + else: + warnings.append(f"Failed to describe pods for app={self.app_label}") + + # Collect events + if self.collect_events: + events_file = step_dir / "events.txt" + result = self._run_oc( + ["get", "events", "--sort-by=.lastTimestamp", *ns_args], + events_file, + ) + if result: + collected_files.append(str(events_file)) + else: + warnings.append("Failed to collect events") + + # Collect pod status + status_file = step_dir / "pod_status.txt" + 
result = self._run_oc( + ["get", "pods", "-l", f"app={self.app_label}", "-o", "wide", *ns_args], + status_file, + ) + if result: + collected_files.append(str(status_file)) + + message = f"Collected {len(collected_files)} artifacts" + if warnings: + message += f" ({len(warnings)} warnings)" + for w in warnings: + logger.warning(w) + + return StepResult( + success=True, # Never fail - this is a finally step + message=message, + artifacts=collected_files, + ) + + def _run_oc(self, args: list[str], output_file: Path) -> bool: + """ + Run oc command and write output to file. + + Returns True if successful, False otherwise. + """ + try: + cmd = ["oc", *args] + logger.debug(f"Running: {' '.join(cmd)}") + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=60, + ) + + # Write output regardless of exit code + with open(output_file, "w") as f: + f.write(f"# Command: oc {' '.join(args)}\n") + f.write(f"# Exit code: {result.returncode}\n\n") + if result.stdout: + f.write(result.stdout) + if result.stderr: + f.write(f"\n# STDERR:\n{result.stderr}") + + return result.returncode == 0 + + except subprocess.TimeoutExpired: + logger.warning(f"Command timed out: oc {' '.join(args)}") + return False + except FileNotFoundError: + logger.warning("oc command not found") + return False + except Exception as e: + logger.warning(f"Error running oc: {e}") + return False + + +class CleanupDeploymentStep(WorkflowStep): + """ + Clean up Kubernetes/KServe deployment resources. + + Runs as a finally step to ensure resources are cleaned up + even on failure. Handles both standard K8s deployments and + KServe InferenceService/ServingRuntime resources. + """ + + def __init__( + self, + deployment_name: str, + namespace: str | None = None, + delete_service: bool = True, + delete_route: bool = True, + use_kserve: bool = True, + name: str | None = None, + ): + """ + Initialize cleanup step. 
+ + Args: + deployment_name: Name of the deployment/InferenceService to delete + namespace: Kubernetes namespace (uses current context if None) + delete_service: Also delete the associated service + delete_route: Also delete the associated route + use_kserve: Delete KServe resources (InferenceService, ServingRuntime) + name: Optional step name + """ + super().__init__(name=name or "cleanup") + self.deployment_name = deployment_name + self.namespace = namespace + self.delete_service = delete_service + self.delete_route = delete_route + self.use_kserve = use_kserve + + def execute(self, ctx: "WorkflowContext") -> StepResult: + """Delete deployment and related resources.""" + ns_args = ["-n", self.namespace] if self.namespace else [] + deleted: list[str] = [] + errors: list[str] = [] + + # Delete KServe resources first (they manage the underlying deployments) + if self.use_kserve: + if self._delete_resource("inferenceservice", self.deployment_name, ns_args): + deleted.append(f"inferenceservice/{self.deployment_name}") + if self._delete_resource("servingruntime", self.deployment_name, ns_args): + deleted.append(f"servingruntime/{self.deployment_name}") + + # Delete standard deployment (if not using KServe or as fallback) + if self._delete_resource("deployment", self.deployment_name, ns_args): + deleted.append(f"deployment/{self.deployment_name}") + + # Delete service + if self.delete_service: + if self._delete_resource("service", self.deployment_name, ns_args): + deleted.append(f"service/{self.deployment_name}") + + # Delete route + if self.delete_route: + if self._delete_resource("route", self.deployment_name, ns_args): + deleted.append(f"route/{self.deployment_name}") + + message = f"Deleted: {', '.join(deleted)}" if deleted else "Nothing deleted" + if errors: + message += f" (errors: {len(errors)})" + + return StepResult( + success=True, # Never fail - this is a finally step + message=message, + data={"deleted": deleted, "errors": errors}, + ) + + def 
_delete_resource(self, kind: str, name: str, ns_args: list[str]) -> bool: + """Delete a Kubernetes resource. Returns True if successful.""" + try: + cmd = ["oc", "delete", kind, name, "--ignore-not-found", *ns_args] + logger.debug(f"Running: {' '.join(cmd)}") + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=60, + ) + return result.returncode == 0 + + except Exception as e: + logger.warning(f"Error deleting {kind}/{name}: {e}") + return False diff --git a/projects/core/steps/guidellm.py b/projects/core/steps/guidellm.py new file mode 100644 index 0000000..2688f10 --- /dev/null +++ b/projects/core/steps/guidellm.py @@ -0,0 +1,343 @@ +"""GuideLLM benchmark step - runs as a pod on the cluster.""" + +import json +import logging +import subprocess +import time +import uuid +from typing import TYPE_CHECKING + +from projects.core.workflow import StepResult, WorkflowStep + +if TYPE_CHECKING: + from projects.core.workflow import WorkflowContext + +logger = logging.getLogger(__name__) + +# Default GuideLLM image from llm-d-bench +DEFAULT_GUIDELLM_IMAGE = "ghcr.io/openshift-psap/llm-d-bench/guidellm:latest" + + +class RunGuideLLMStep(WorkflowStep): + """ + Run GuideLLM benchmark as a pod on the cluster. + + Deploys a GuideLLM pod in the same namespace as the inference service, + waits for completion, and collects results. + """ + + def __init__( + self, + endpoint: str, + model: str, + namespace: str = "forge", + workload: str = "balanced", + max_requests: int | None = None, + max_seconds: int = 120, + rate: str = "1,50,100", + rate_type: str = "concurrent", + guidellm_image: str | None = None, + output_file: str = "guidellm_results.json", + name: str | None = None, + ): + """ + Initialize GuideLLM step. 
+ + Args: + endpoint: Inference endpoint URL (e.g., http://vllm-svc:8080/v1) + model: Model name as deployed + namespace: Kubernetes namespace where to run the benchmark pod + workload: GuideLLM workload type (balanced, heterogeneous, multiturn) + or explicit format: "prompt_tokens=1000,output_tokens=1000" + max_requests: Maximum number of requests to send + max_seconds: Maximum benchmark duration in seconds per rate + rate: Comma-separated rates to test (e.g., "1,50,100") + rate_type: Rate type - "concurrent" or "synchronous" + guidellm_image: GuideLLM container image + output_file: Name of output file in artifact directory + name: Optional step name + """ + super().__init__(name=name or "benchmark") + self.endpoint = endpoint + self.model = model + self.namespace = namespace + self.workload = workload + self.max_requests = max_requests + self.max_seconds = max_seconds + self.rate = rate + self.rate_type = rate_type + self.guidellm_image = guidellm_image or DEFAULT_GUIDELLM_IMAGE + self.output_file = output_file + # Use model name in pod name for easier correlation with inference pods + model_short = model.split("/")[-1].lower().replace(".", "-").replace("_", "-")[:20] + self.pod_name = f"guidellm-{model_short}-{uuid.uuid4().hex[:6]}" + + def execute(self, ctx: "WorkflowContext") -> StepResult: + """Run GuideLLM benchmark as a pod.""" + step_dir = ctx.artifact_dir / f"{ctx.step_number:03d}__{ctx.current_step_name}" + step_dir.mkdir(parents=True, exist_ok=True) + + # Convert workload to GuideLLM data format + data = self._workload_to_data(self.workload) + + # Generate pod YAML + pod_yaml = self._generate_pod_yaml(data) + yaml_path = step_dir / "guidellm-pod.yaml" + yaml_path.write_text(pod_yaml) + + logger.info(f"Creating GuideLLM pod: {self.pod_name}") + print(f"Creating GuideLLM pod: {self.pod_name} in namespace {self.namespace}") + + # Create the pod + try: + result = subprocess.run( + ["oc", "apply", "-f", str(yaml_path)], + capture_output=True, + text=True, + 
timeout=60, + ) + if result.returncode != 0: + return StepResult.fail( + f"Failed to create GuideLLM pod: {result.stderr}", + error=RuntimeError(result.stderr), + ) + except Exception as e: + return StepResult.fail(f"Failed to create pod: {e}", error=e) + + # Wait for pod to complete + # Calculate generous timeout: max_seconds per rate, plus 30min overhead for startup/warmup + num_rates = len(self.rate.split(",")) + timeout = (self.max_seconds * num_rates) + 1800 # 30min overhead + wait_result = self._wait_for_pod_completion(timeout) + + # Collect logs regardless of outcome + self._collect_pod_logs(step_dir) + + # Cleanup pod + self._delete_pod() + + if not wait_result["success"]: + return StepResult.fail( + f"GuideLLM pod failed: {wait_result['message']}", + error=RuntimeError(wait_result["message"]), + ) + + return StepResult.ok( + f"GuideLLM completed in {wait_result.get('duration', 0):.1f}s", + pod_name=self.pod_name, + ) + + def _workload_to_data(self, workload: str) -> str: + """Convert workload name to GuideLLM data format.""" + workload_map = { + "balanced": "prompt_tokens=1000,output_tokens=1000", + "short": "prompt_tokens=256,output_tokens=256", + "long-prompt": "prompt_tokens=8000,output_tokens=1000", + "very-long-prompt": "prompt_tokens=16000,output_tokens=1000", + "heterogeneous": "emulated", + "multi-turn": "multi_turn", + } + return workload_map.get(workload, workload) + + def _generate_pod_yaml(self, data: str) -> str: + """Generate GuideLLM pod YAML.""" + # Build guidellm args + args = [ + "--target", self.endpoint, + "--model", self.model, + "--rate", self.rate, + "--rate-type", self.rate_type, + "--data", data, + "--max-seconds", str(self.max_seconds), + "--backend-type", "openai_http", + ] + + if self.max_requests: + args.extend(["--max-requests", str(self.max_requests)]) + + # Build command as shell script: run benchmark, signal completion, sleep for rsync + guidellm_cmd = f"python3 -m benchmark.main {' '.join(args)}" + + return f"""apiVersion: 
v1 +kind: Pod +metadata: + name: {self.pod_name} + namespace: {self.namespace} + labels: + app: guidellm-benchmark + forge-run: "true" +spec: + restartPolicy: Never + containers: + - name: guidellm + image: {self.guidellm_image} + imagePullPolicy: Always + command: + - /bin/sh + - -c + - | + {guidellm_cmd} + echo "BENCHMARK_COMPLETE" + echo "Sleeping 30s for artifact collection..." + sleep 30 + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: storage-config + key: HF_TOKEN + optional: true + - name: GUIDELLM__REQUEST_TIMEOUT + value: "6000" + - name: GUIDELLM__LOGGING__CONSOLE_LOG_LEVEL + value: "INFO" + - name: HF_HOME + value: /tmp/.huggingface + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "4" + memory: "8Gi" + volumeMounts: + - name: results-volume + mountPath: /benchmark-results + volumes: + - name: results-volume + emptyDir: {{}} + # Avoid GPU nodes - run on infra/worker nodes + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: nvidia.com/gpu + operator: DoesNotExist +""" + + def _wait_for_pod_completion(self, timeout: int) -> dict: + """Wait for benchmark to complete (watching for BENCHMARK_COMPLETE marker in logs). + + The pod runs benchmark, prints BENCHMARK_COMPLETE, then sleeps for 30s. + We detect completion via the marker while pod is still running, + allowing rsync to work before the pod exits. 
+ """ + start_time = time.monotonic() + poll_interval = 10 + + print(f"Waiting for GuideLLM benchmark to complete (timeout: {timeout}s)...") + + while time.monotonic() - start_time < timeout: + try: + # First check pod phase + phase_result = subprocess.run( + [ + "oc", "get", "pod", self.pod_name, + "-n", self.namespace, + "-o", "jsonpath={.status.phase}", + ], + capture_output=True, + text=True, + timeout=15, + ) + phase = phase_result.stdout.strip() + + if phase == "Failed": + return {"success": False, "message": "Pod failed"} + + if phase == "Error": + return {"success": False, "message": "Pod error"} + + # Check logs for BENCHMARK_COMPLETE marker + if phase in ("Running", "Succeeded"): + log_result = subprocess.run( + ["oc", "logs", self.pod_name, "-n", self.namespace, "--tail=50"], + capture_output=True, + text=True, + timeout=15, + ) + if "BENCHMARK_COMPLETE" in log_result.stdout: + duration = time.monotonic() - start_time + print(f"GuideLLM benchmark completed in {duration:.1f}s") + return {"success": True, "duration": duration} + + # Also handle case where pod already Succeeded (marker might have been missed) + if phase == "Succeeded": + duration = time.monotonic() - start_time + print(f"GuideLLM pod completed in {duration:.1f}s") + return {"success": True, "duration": duration} + + # Still running, no marker yet + elapsed = int(time.monotonic() - start_time) + if elapsed % 60 == 0: # Print every minute + print(f" GuideLLM running... 
({elapsed}s elapsed, phase={phase})") + + except subprocess.TimeoutExpired: + pass + except Exception as e: + logger.warning(f"Error checking pod status: {e}") + + time.sleep(poll_interval) + + return {"success": False, "message": f"Timeout after {timeout}s"} + + def _collect_pod_logs(self, step_dir): + """Collect logs and results from the GuideLLM pod.""" + # Collect logs + try: + result = subprocess.run( + ["oc", "logs", self.pod_name, "-n", self.namespace], + capture_output=True, + text=True, + timeout=60, + ) + (step_dir / "guidellm_logs.txt").write_text(result.stdout) + if result.stderr: + (step_dir / "guidellm_stderr.txt").write_text(result.stderr) + + print(f"GuideLLM logs saved to {step_dir}/guidellm_logs.txt") + + except Exception as e: + logger.warning(f"Failed to collect pod logs: {e}") + + # Copy results from pod before it's deleted (use rsync for large files) + try: + results_dir = step_dir / "results" + results_dir.mkdir(exist_ok=True) + + # Use oc rsync for efficient transfer of large files (up to 300MB) + result = subprocess.run( + [ + "oc", "rsync", + f"{self.pod_name}:/benchmark-results/", + str(results_dir), + "-n", self.namespace, + "--progress", + ], + capture_output=True, + text=True, + timeout=600, # 10 min timeout for large files + ) + if result.returncode == 0: + print(f"GuideLLM results synced to {results_dir}/") + else: + logger.warning(f"Failed to rsync results: {result.stderr}") + + except subprocess.TimeoutExpired: + logger.warning("Timeout copying results (>10 min)") + except Exception as e: + logger.warning(f"Failed to copy results from pod: {e}") + + def _delete_pod(self): + """Delete the GuideLLM pod.""" + try: + subprocess.run( + ["oc", "delete", "pod", self.pod_name, "-n", self.namespace, "--ignore-not-found"], + capture_output=True, + timeout=30, + ) + print(f"Cleaned up GuideLLM pod: {self.pod_name}") + except Exception as e: + logger.warning(f"Failed to delete pod: {e}") diff --git a/projects/core/utils/__init__.py 
b/projects/core/utils/__init__.py new file mode 100644 index 0000000..5390081 --- /dev/null +++ b/projects/core/utils/__init__.py @@ -0,0 +1,16 @@ +"""Core utilities for workflow steps. + +Reusable utilities that can be imported by any project (rhaiis, llm-d, etc.). + +Example: + from projects.core.utils import OC, RetryConfig + + oc = OC(namespace="forge") + result = oc.get("pods", "-l", "app=vllm") + if result.success: + print(result.stdout) +""" + +from .oc import OC, OCResult, RetryConfig + +__all__ = ["OC", "OCResult", "RetryConfig"] diff --git a/projects/core/utils/oc.py b/projects/core/utils/oc.py new file mode 100644 index 0000000..19b9d0f --- /dev/null +++ b/projects/core/utils/oc.py @@ -0,0 +1,529 @@ +"""OpenShift CLI wrapper with built-in retry for transient failures. + +Provides a clean, method-based API for oc commands with automatic retry +on transient network errors, API server unavailability, etc. + +Example: + from projects.core.utils import OC, RetryConfig + + # Basic usage + oc = OC(namespace="forge") + result = oc.get("pods") + if result.success: + print(result.stdout) + + # With custom retry config + oc = OC(namespace="forge", retry=RetryConfig(max_retries=5)) + result = oc.apply("-f", "manifest.yaml") + + # Without namespace (uses current context) + oc = OC() + result = oc.get("namespaces") +""" + +import logging +import subprocess +import time +from dataclasses import dataclass, field +from typing import Any + +logger = logging.getLogger(__name__) + +# Error patterns that indicate transient failures worth retrying +TRANSIENT_ERROR_PATTERNS = [ + "connection refused", + "connection reset", + "connection timed out", + "unable to connect", + "no route to host", + "temporary failure", + "service unavailable", + "server is currently unable", + "etcdserver: request timed out", + "context deadline exceeded", + "the server was unable to return a response", + "unexpected eof", + "i/o timeout", + "tls handshake timeout", + "net/http: request canceled", 
+ "client rate limiter", + "too many requests", + "throttling", + "apiserver not ready", +] + + +@dataclass +class RetryConfig: + """Configuration for retry behavior. + + Attributes: + max_retries: Maximum number of retry attempts (default: 3) + initial_delay: Initial delay between retries in seconds (default: 1.0) + max_delay: Maximum delay between retries in seconds (default: 30.0) + backoff_multiplier: Multiplier for exponential backoff (default: 2.0) + retry_on_timeout: Whether to retry on subprocess timeout (default: True) + """ + + max_retries: int = 3 + initial_delay: float = 1.0 + max_delay: float = 30.0 + backoff_multiplier: float = 2.0 + retry_on_timeout: bool = True + + +@dataclass +class OCResult: + """Result of an oc command execution. + + Attributes: + success: Whether the command succeeded (returncode == 0) + returncode: Command exit code + stdout: Standard output as string + stderr: Standard error as string + command: The command that was executed + attempts: Number of attempts made (1 = no retries needed) + duration: Total execution time including retries in seconds + """ + + success: bool + returncode: int + stdout: str = "" + stderr: str = "" + command: list[str] = field(default_factory=list) + attempts: int = 1 + duration: float = 0.0 + + @classmethod + def from_completed_process( + cls, + result: subprocess.CompletedProcess, + command: list[str], + attempts: int = 1, + duration: float = 0.0, + ) -> "OCResult": + """Create OCResult from subprocess.CompletedProcess.""" + return cls( + success=result.returncode == 0, + returncode=result.returncode, + stdout=result.stdout if result.stdout else "", + stderr=result.stderr if result.stderr else "", + command=command, + attempts=attempts, + duration=duration, + ) + + @classmethod + def from_error( + cls, + error: Exception, + command: list[str], + attempts: int = 1, + duration: float = 0.0, + ) -> "OCResult": + """Create failed OCResult from exception.""" + return cls( + success=False, + returncode=-1, 
+ stdout="", + stderr=str(error), + command=command, + attempts=attempts, + duration=duration, + ) + + +def _is_transient_error(stderr: str, returncode: int) -> bool: + """Check if error is likely transient and worth retrying.""" + if returncode == 0: + return False + stderr_lower = stderr.lower() + return any(pattern in stderr_lower for pattern in TRANSIENT_ERROR_PATTERNS) + + +class OC: + """OpenShift CLI wrapper with built-in retry. + + Provides a clean, method-based API for common oc operations. + All methods automatically retry on transient failures. + + Args: + namespace: Default namespace for commands (optional) + retry: Retry configuration (uses defaults if None) + timeout: Default command timeout in seconds (default: 60) + + Example: + oc = OC(namespace="forge") + + # Get pods + result = oc.get("pods") + result = oc.get("pods", "-l", "app=vllm") + result = oc.get("pod", "my-pod", "-o", "yaml") + + # Apply manifests + result = oc.apply("-f", "manifest.yaml") + result = oc.apply("-f", "-", input=yaml_content) + + # Delete resources + result = oc.delete("pod", "my-pod") + result = oc.delete("pod", "my-pod", "--ignore-not-found") + + # Logs + result = oc.logs("my-pod") + result = oc.logs("my-pod", "-c", "container", "--tail=100") + + # Exec + result = oc.exec("my-pod", "--", "curl", "localhost:8080/health") + + # Raw command + result = oc.run("get", "pods", "-A") + """ + + def __init__( + self, + namespace: str | None = None, + retry: RetryConfig | None = None, + timeout: int = 60, + ): + self.namespace = namespace + self.retry = retry or RetryConfig() + self.timeout = timeout + + def _build_cmd(self, args: list[str], namespace: str | None = None) -> list[str]: + """Build full oc command with namespace.""" + cmd = ["oc"] + + # Use provided namespace, fall back to instance default + ns = namespace if namespace is not None else self.namespace + if ns: + cmd.extend(["-n", ns]) + + cmd.extend(args) + return cmd + + def _run_with_retry( + self, + cmd: list[str], + 
timeout: int | None = None, + input: str | None = None, + ) -> OCResult: + """Execute command with retry on transient failures.""" + timeout = timeout if timeout is not None else self.timeout + delay = self.retry.initial_delay + attempts = 0 + start_time = time.monotonic() + + last_result: subprocess.CompletedProcess | None = None + last_error: Exception | None = None + + for attempt in range(self.retry.max_retries + 1): + attempts = attempt + 1 + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=timeout, + input=input, + ) + + # Success + if result.returncode == 0: + duration = time.monotonic() - start_time + if attempts > 1: + logger.info( + f"Command succeeded on attempt {attempts}: {' '.join(cmd[:4])}" + ) + return OCResult.from_completed_process( + result, cmd, attempts, duration + ) + + # Check if transient error + if _is_transient_error(result.stderr, result.returncode): + last_result = result + if attempt < self.retry.max_retries: + logger.warning( + f"Transient error (attempt {attempts}/{self.retry.max_retries + 1}), " + f"retrying in {delay:.1f}s: {' '.join(cmd[:4])}..." + ) + time.sleep(delay) + delay = min(delay * self.retry.backoff_multiplier, self.retry.max_delay) + continue + + # Non-transient error or exhausted retries + duration = time.monotonic() - start_time + return OCResult.from_completed_process(result, cmd, attempts, duration) + + except subprocess.TimeoutExpired as e: + last_error = e + if self.retry.retry_on_timeout and attempt < self.retry.max_retries: + logger.warning( + f"Timeout (attempt {attempts}/{self.retry.max_retries + 1}), " + f"retrying in {delay:.1f}s: {' '.join(cmd[:4])}..." 
+ ) + time.sleep(delay) + delay = min(delay * self.retry.backoff_multiplier, self.retry.max_delay) + continue + + duration = time.monotonic() - start_time + return OCResult.from_error(e, cmd, attempts, duration) + + except FileNotFoundError as e: + # oc command not found - don't retry + duration = time.monotonic() - start_time + return OCResult.from_error( + Exception("oc command not found. Is OpenShift CLI installed?"), + cmd, + attempts, + duration, + ) + + except Exception as e: + # Unexpected error - don't retry + duration = time.monotonic() - start_time + return OCResult.from_error(e, cmd, attempts, duration) + + # Exhausted retries + duration = time.monotonic() - start_time + if last_error: + return OCResult.from_error(last_error, cmd, attempts, duration) + if last_result: + return OCResult.from_completed_process(last_result, cmd, attempts, duration) + + return OCResult.from_error( + Exception("Unexpected state after retries"), + cmd, + attempts, + duration, + ) + + def run(self, *args: str, namespace: str | None = None, timeout: int | None = None, input: str | None = None) -> OCResult: + """Run arbitrary oc command. + + Args: + *args: Command arguments (e.g., "get", "pods", "-o", "yaml") + namespace: Override namespace for this command + timeout: Override timeout for this command + input: Input to pass to stdin + + Returns: + OCResult with command output + """ + cmd = self._build_cmd(list(args), namespace) + return self._run_with_retry(cmd, timeout, input) + + def get(self, resource: str, *args: str, namespace: str | None = None, timeout: int | None = None) -> OCResult: + """Get Kubernetes resources. 
+ + Args: + resource: Resource type (e.g., "pods", "deployments") + *args: Additional arguments (name, selectors, output format) + namespace: Override namespace + timeout: Override timeout + + Returns: + OCResult with resource data + + Examples: + oc.get("pods") + oc.get("pods", "-l", "app=vllm") + oc.get("pod", "my-pod", "-o", "yaml") + oc.get("pods", "-o", "jsonpath={.items[*].metadata.name}") + """ + return self.run("get", resource, *args, namespace=namespace, timeout=timeout) + + def apply(self, *args: str, namespace: str | None = None, timeout: int | None = None, input: str | None = None) -> OCResult: + """Apply configuration to resources. + + Args: + *args: Apply arguments (e.g., "-f", "manifest.yaml") + namespace: Override namespace + timeout: Override timeout + input: YAML content to apply via stdin (use with "-f", "-") + + Returns: + OCResult + + Examples: + oc.apply("-f", "manifest.yaml") + oc.apply("-f", "-", input=yaml_content) + """ + return self.run("apply", *args, namespace=namespace, timeout=timeout, input=input) + + def delete(self, resource: str, name: str = "", *args: str, namespace: str | None = None, timeout: int | None = None) -> OCResult: + """Delete resources. + + Args: + resource: Resource type + name: Resource name (optional for label selectors) + *args: Additional arguments (--ignore-not-found, etc.) + namespace: Override namespace + timeout: Override timeout + + Returns: + OCResult + + Examples: + oc.delete("pod", "my-pod") + oc.delete("pod", "my-pod", "--ignore-not-found") + oc.delete("pods", "-l", "app=test") + """ + if name: + return self.run("delete", resource, name, *args, namespace=namespace, timeout=timeout) + return self.run("delete", resource, *args, namespace=namespace, timeout=timeout) + + def logs(self, pod: str, *args: str, namespace: str | None = None, timeout: int | None = None) -> OCResult: + """Get pod logs. + + Args: + pod: Pod name + *args: Additional arguments (-c container, --tail, --since, etc.) 
+ namespace: Override namespace + timeout: Override timeout + + Returns: + OCResult with logs in stdout + + Examples: + oc.logs("my-pod") + oc.logs("my-pod", "-c", "sidecar") + oc.logs("my-pod", "--tail=100") + """ + return self.run("logs", pod, *args, namespace=namespace, timeout=timeout) + + def exec(self, pod: str, *args: str, namespace: str | None = None, timeout: int | None = None) -> OCResult: + """Execute command in pod. + + Args: + pod: Pod name + *args: Command to execute (use "--" separator) + namespace: Override namespace + timeout: Override timeout + + Returns: + OCResult with command output + + Examples: + oc.exec("my-pod", "--", "curl", "localhost:8080/health") + oc.exec("my-pod", "-c", "container", "--", "cat", "/etc/config") + """ + return self.run("exec", pod, *args, namespace=namespace, timeout=timeout) + + def describe(self, resource: str, name: str = "", *args: str, namespace: str | None = None, timeout: int | None = None) -> OCResult: + """Describe resources. + + Args: + resource: Resource type + name: Resource name (optional) + *args: Additional arguments + namespace: Override namespace + timeout: Override timeout + + Returns: + OCResult with description + """ + if name: + return self.run("describe", resource, name, *args, namespace=namespace, timeout=timeout) + return self.run("describe", resource, *args, namespace=namespace, timeout=timeout) + + def wait( + self, + resource: str, + name: str, + condition: str, + timeout_seconds: int = 300, + namespace: str | None = None, + ) -> OCResult: + """Wait for resource condition. 
+ + Args: + resource: Resource type + name: Resource name + condition: Condition to wait for (e.g., "condition=Ready") + timeout_seconds: Wait timeout in seconds + namespace: Override namespace + + Returns: + OCResult + + Example: + oc.wait("pod", "my-pod", "condition=Ready", timeout_seconds=120) + """ + return self.run( + "wait", + f"{resource}/{name}", + f"--for={condition}", + f"--timeout={timeout_seconds}s", + namespace=namespace, + timeout=timeout_seconds + 10, # Give subprocess a bit more time + ) + + def rollout_status( + self, + resource: str, + name: str, + timeout_seconds: int = 300, + namespace: str | None = None, + ) -> OCResult: + """Check rollout status. + + Args: + resource: Resource type (deployment, statefulset, etc.) + name: Resource name + timeout_seconds: Timeout for rollout + namespace: Override namespace + + Returns: + OCResult + """ + return self.run( + "rollout", + "status", + f"{resource}/{name}", + f"--timeout={timeout_seconds}s", + namespace=namespace, + timeout=timeout_seconds + 10, + ) + + def rsync( + self, + source: str, + dest: str, + *args: str, + namespace: str | None = None, + timeout: int | None = None, + ) -> OCResult: + """Rsync files to/from pod. + + Args: + source: Source path (pod:path or local path) + dest: Destination path + *args: Additional rsync arguments + namespace: Override namespace + timeout: Override timeout + + Returns: + OCResult + + Example: + oc.rsync("my-pod:/data/", "./local/") + oc.rsync("./local/", "my-pod:/data/", "--progress") + """ + return self.run("rsync", source, dest, *args, namespace=namespace, timeout=timeout) + + def create_namespace(self, name: str) -> OCResult: + """Create namespace if it doesn't exist. 
+
+        Args:
+            name: Namespace name
+
+        Returns:
+            OCResult
+        """
+        # Plain `oc apply` is already idempotent: it creates the Namespace on
+        # first run and is a no-op afterwards. (Previous comment claimed a
+        # --dry-run was used here; it is not.)
+        # NOTE(review): namespace=None falls back to self.namespace inside
+        # _build_cmd, so `-n` may still be passed. Harmless for the
+        # cluster-scoped Namespace kind, but confirm the intent.
+        yaml_content = f"""apiVersion: v1
+kind: Namespace
+metadata:
+  name: {name}
+"""
+        return self.apply("-f", "-", input=yaml_content, namespace=None)
diff --git a/projects/core/workflow/__init__.py b/projects/core/workflow/__init__.py
new file mode 100644
index 0000000..757ab19
--- /dev/null
+++ b/projects/core/workflow/__init__.py
@@ -0,0 +1,37 @@
+"""Forge workflow engine.
+
+A simple, testable workflow engine for sequential step execution
+with finally/cleanup blocks. Integrates with the existing DSL patterns.
+
+Example usage:
+    from projects.core.workflow import Workflow, WorkflowContext, WorkflowStep, StepResult
+
+    class MyStep(WorkflowStep):
+        def execute(self, ctx: WorkflowContext) -> StepResult:
+            # Do work...
+            return StepResult.ok("Done")
+
+    class MyWorkflow(Workflow):
+        def define_steps(self):
+            self.add_step(MyStep())
+            self.add_finally(CleanupStep())
+
+    ctx = WorkflowContext.from_environment()
+    workflow = MyWorkflow(ctx)
+    result = workflow.execute()
+"""
+
+from .context import WorkflowContext
+from .executor import SequentialExecutor
+from .step import StepResult, StepStatus, WorkflowStep
+from .workflow import Workflow, WorkflowResult
+
+__all__ = [
+    "SequentialExecutor",
+    "StepResult",
+    "StepStatus",
+    "Workflow",
+    "WorkflowContext",
+    "WorkflowResult",
+    "WorkflowStep",
+]
diff --git a/projects/core/workflow/context.py b/projects/core/workflow/context.py
new file mode 100644
index 0000000..418dd78
--- /dev/null
+++ b/projects/core/workflow/context.py
@@ -0,0 +1,158 @@
+"""Workflow execution context."""
+
+import os
+import uuid
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+import projects.core.library.env as env
+
+
+@dataclass
+class WorkflowContext:
+    """
+    Runtime context for workflow execution.
+
+    Holds run-specific state: UUID, artifact directories, config, and env vars.
+    Created once per workflow execution and passed to all steps.
+    Integrates with the existing env.ARTIFACT_DIR system.
+    """
+
+    run_uuid: str
+    artifact_dir: Path
+    config: dict[str, Any] = field(default_factory=dict)
+    env_vars: dict[str, str] = field(default_factory=dict)
+    start_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+
+    # Current step tracking
+    step_number: int = 0
+    current_step_name: str = ""
+
+    @classmethod
+    def from_environment(
+        cls,
+        artifact_base: str | None = None,
+        config: dict[str, Any] | None = None,
+    ) -> "WorkflowContext":
+        """
+        Create context from environment variables.
+
+        Reads FORGE_* environment variables and creates artifact directory.
+        Integrates with env.init() if artifact_base not provided.
+
+        Args:
+            artifact_base: Base path for artifacts (uses env.ARTIFACT_DIR if not set)
+            config: Optional config dict to merge
+
+        Returns:
+            Initialized WorkflowContext
+        """
+        run_uuid = str(uuid.uuid4())
+
+        # Collect FORGE_* env vars
+        env_vars = {k: v for k, v in os.environ.items() if k.startswith("FORGE_")}
+
+        # Resolve the artifact base: explicit argument wins, then the existing
+        # env.ARTIFACT_DIR, then lazily initialize the env system. (Collapses
+        # the three previously duplicated mkdir branches into one path;
+        # behavior is unchanged.)
+        if artifact_base:
+            base = Path(artifact_base)
+        else:
+            if not env.ARTIFACT_DIR:
+                # Initialize env system
+                env.init()
+            base = env.ARTIFACT_DIR
+        artifact_dir = base / run_uuid
+        artifact_dir.mkdir(parents=True, exist_ok=True)
+
+        # Create _meta subdirectory
+        meta_dir = artifact_dir / "_meta"
+        meta_dir.mkdir(exist_ok=True)
+
+        return cls(
+            run_uuid=run_uuid,
+            artifact_dir=artifact_dir,
+            config=config or {},
+            env_vars=env_vars,
+        )
+
+    def get_step_artifact_dir(self, step_name: str) -> Path:
+        """
+        Get artifact directory for a specific step.
+ + Creates numbered directory like: 001__deploy/ + + Args: + step_name: Name of the step + + Returns: + Path to step's artifact directory + """ + self.step_number += 1 + self.current_step_name = step_name + step_dir = self.artifact_dir / f"{self.step_number:03d}__{step_name}" + step_dir.mkdir(exist_ok=True) + return step_dir + + def get_env(self, key: str, default: str | None = None) -> str | None: + """ + Get environment variable with FORGE_ prefix. + + Args: + key: Variable name (with or without FORGE_ prefix) + default: Default value if not found + + Returns: + Environment variable value or default + """ + if not key.startswith("FORGE_"): + key = f"FORGE_{key}" + return self.env_vars.get(key, default) + + def write_metadata(self, args: dict[str, Any] | None = None) -> Path: + """ + Write run metadata to _meta/metadata.yaml. + + Args: + args: CLI arguments to include + + Returns: + Path to metadata file + """ + meta_path = self.artifact_dir / "_meta" / "metadata.yaml" + metadata = { + "run_uuid": self.run_uuid, + "start_time": self.start_time.isoformat(), + "env_vars": self.env_vars, + "config": self.config, + "args": args or {}, + } + with open(meta_path, "w") as f: + yaml.safe_dump(metadata, f, default_flow_style=False) + return meta_path + + def write_restart_script(self, command: str) -> Path: + """ + Write restart script to _meta/restart.sh. 
+ + Args: + command: Full command to replay this run + + Returns: + Path to restart script + """ + restart_path = self.artifact_dir / "_meta" / "restart.sh" + script = f"""#!/bin/bash +# Restart script for run {self.run_uuid} +# Generated at {self.start_time.isoformat()} + +{command} +""" + with open(restart_path, "w") as f: + f.write(script) + restart_path.chmod(0o755) + return restart_path diff --git a/projects/core/workflow/executor.py b/projects/core/workflow/executor.py new file mode 100644 index 0000000..d9cdaab --- /dev/null +++ b/projects/core/workflow/executor.py @@ -0,0 +1,132 @@ +"""Workflow executors.""" + +import logging +import time +from datetime import datetime, timezone +from typing import TYPE_CHECKING + +from .step import StepResult +from .workflow import WorkflowResult + +if TYPE_CHECKING: + from .workflow import Workflow + +logger = logging.getLogger(__name__) + + +class SequentialExecutor: + """ + Execute workflow steps sequentially with finally block support. + + Execution flow: + 1. Run normal steps in order until completion or failure + 2. On failure, skip remaining normal steps + 3. Always run finally steps, even if normal steps failed + 4. Collect all results and return WorkflowResult + """ + + def execute(self, workflow: "Workflow") -> WorkflowResult: + """ + Execute the workflow. 
+
+        Args:
+            workflow: Workflow instance to execute
+
+        Returns:
+            WorkflowResult with all step outcomes
+        """
+        start_time = datetime.now(timezone.utc)
+        step_results: dict[str, StepResult] = {}
+        failed_step: str | None = None
+        original_error: Exception | None = None
+
+        ctx = workflow.ctx
+        logger.info(f"Starting workflow run {ctx.run_uuid}")
+
+        # Run normal steps
+        for step in workflow.steps:
+            step_name = step.name
+            logger.info(f"Running step: {step_name}")
+
+            # Called for its side effects: creates the step's artifact
+            # directory and advances ctx.step_number. Steps fetch their own
+            # directory from ctx, so the return value is not needed here.
+            ctx.get_step_artifact_dir(step_name)
+
+            # Record the wall-clock start *before* executing. (Bug fix: the
+            # previous code stamped result.start_time after execute()
+            # returned, recording the end time as the start.)
+            step_started_at = datetime.now(timezone.utc)
+            step_start = time.monotonic()
+            try:
+                result = step.execute(ctx)
+                result.duration_seconds = time.monotonic() - step_start
+                result.start_time = step_started_at
+                result.end_time = datetime.now(timezone.utc)
+                step_results[step_name] = result
+
+                if not result.success:
+                    logger.error(f"Step {step_name} failed: {result.message}")
+                    failed_step = step_name
+                    original_error = result.error
+                    break
+                logger.info(f"Step {step_name} completed in {result.duration_seconds:.2f}s")
+
+            except Exception as e:
+                duration = time.monotonic() - step_start
+                logger.exception(f"Step {step_name} raised exception")
+                step_results[step_name] = StepResult(
+                    success=False,
+                    message=f"Exception: {e}",
+                    error=e,
+                    duration_seconds=duration,
+                )
+                failed_step = step_name
+                original_error = e
+                break
+
+        # Run finally steps (always)
+        finally_errors: list[Exception] = []
+        for step in workflow.finally_steps:
+            step_name = step.name
+            logger.info(f"Running finally step: {step_name}")
+
+            ctx.get_step_artifact_dir(step_name)
+
+            step_started_at = datetime.now(timezone.utc)
+            step_start = time.monotonic()
+            try:
+                result = step.execute(ctx)
+                result.duration_seconds = time.monotonic() - step_start
+                result.start_time = step_started_at
+                result.end_time = datetime.now(timezone.utc)
+                step_results[step_name] = result
+
+                if not result.success:
+                    logger.warning(f"Finally step {step_name} failed: {result.message}")
+                    # Don't break - continue with other finally steps
+                    if result.error:
+                        finally_errors.append(result.error)
+                else:
+                    logger.info(f"Finally step {step_name} completed in {result.duration_seconds:.2f}s")
+
+            except Exception as e:
+                duration = time.monotonic() - step_start
+                logger.exception(f"Finally step {step_name} raised exception")
+                step_results[step_name] = StepResult(
+                    success=False,
+                    message=f"Exception: {e}",
+                    error=e,
+                    duration_seconds=duration,
+                )
+                finally_errors.append(e)
+                # Continue with other finally steps
+
+        end_time = datetime.now(timezone.utc)
+        duration = (end_time - start_time).total_seconds()
+
+        # Finally-step failures do not flip workflow success (cleanup is
+        # best-effort), but surface them instead of dropping the collected
+        # errors silently.
+        if finally_errors:
+            logger.warning(f"{len(finally_errors)} finally step(s) raised errors")
+
+        workflow_success = failed_step is None
+        logger.info(
+            f"Workflow completed: success={workflow_success}, "
+            f"duration={duration:.2f}s, failed_step={failed_step}"
+        )
+
+        return WorkflowResult(
+            success=workflow_success,
+            step_results=step_results,
+            failed_step=failed_step,
+            duration_seconds=duration,
+            run_uuid=ctx.run_uuid,
+            start_time=start_time,
+            end_time=end_time,
+        )
diff --git a/projects/core/workflow/step.py b/projects/core/workflow/step.py
new file mode 100644
index 0000000..908bba6
--- /dev/null
+++ b/projects/core/workflow/step.py
@@ -0,0 +1,100 @@
+"""Workflow step protocol and result types."""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from .context import WorkflowContext
+
+
+class StepStatus(Enum):
+    """Step execution status."""
+
+    PENDING = "pending"
+    RUNNING = "running"
+    SUCCESS = "success"
+    FAILED = "failed"
+    SKIPPED = "skipped"
+
+
+@dataclass
+class StepResult:
+    """
+    Result of a single step execution.
+
+    Attributes:
+        success: Whether the step completed successfully
+        message: Human-readable status message
+        error: Exception if step failed
+        artifacts: Paths to artifacts produced by this step
+        data: Arbitrary output data for downstream steps
+        duration_seconds: Execution time in seconds
+    """
+
+    success: bool
+    message: str = ""
+    error: Exception | None = None
+    artifacts: list[str] = field(default_factory=list)
+    data: dict[str, Any] = field(default_factory=dict)
+    duration_seconds: float = 0.0
+    start_time: datetime | None = None
+    end_time: datetime | None = None
+
+    @classmethod
+    def ok(cls, message: str = "Success", **data: Any) -> "StepResult":
+        """Create a successful result."""
+        return cls(success=True, message=message, data=data)
+
+    @classmethod
+    def fail(cls, message: str, error: Exception | None = None) -> "StepResult":
+        """Create a failed result."""
+        return cls(success=False, message=message, error=error)
+
+
+class WorkflowStep(ABC):
+    """
+    Abstract base class for workflow steps.
+
+    Implement execute() to define step behavior.
+    Step name defaults to the lowercased class name (minus any 'Step'
+    suffix) if not provided.
+    """
+
+    def __init__(self, name: str | None = None):
+        """
+        Initialize step.
+
+        Args:
+            name: Optional step name (defaults to class name)
+        """
+        self._name = name
+
+    @property
+    def name(self) -> str:
+        """Get step name."""
+        if self._name:
+            return self._name
+        # Default to the class name, lowercased. NOTE: this is NOT a
+        # CamelCase -> snake_case conversion (as a previous comment claimed):
+        # no underscores are inserted, e.g. 'DeployVLLMStep' -> 'deployvllm'.
+        class_name = self.__class__.__name__
+        # Remove 'Step' suffix if present
+        if class_name.endswith("Step"):
+            class_name = class_name[:-4]
+        # Convert to lowercase
+        return class_name.lower()
+
+    @abstractmethod
+    def execute(self, ctx: "WorkflowContext") -> StepResult:
+        """
+        Execute the step.
+ + Args: + ctx: Workflow execution context + + Returns: + StepResult indicating success/failure and any outputs + """ + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(name={self.name!r})" diff --git a/projects/core/workflow/workflow.py b/projects/core/workflow/workflow.py new file mode 100644 index 0000000..0bd3ce7 --- /dev/null +++ b/projects/core/workflow/workflow.py @@ -0,0 +1,129 @@ +"""Base workflow class with step registration.""" + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from datetime import datetime +from typing import TYPE_CHECKING + +from .step import StepResult, WorkflowStep + +if TYPE_CHECKING: + from .context import WorkflowContext + + +@dataclass +class WorkflowResult: + """ + Result of a complete workflow execution. + + Attributes: + success: Whether all steps completed successfully + step_results: Results from each step, keyed by step name + failed_step: Name of first step that failed (if any) + duration_seconds: Total execution time + run_uuid: UUID of this workflow run + """ + + success: bool + step_results: dict[str, StepResult] = field(default_factory=dict) + failed_step: str | None = None + duration_seconds: float = 0.0 + run_uuid: str = "" + start_time: datetime | None = None + end_time: datetime | None = None + + +class Workflow(ABC): + """ + Base class for defining workflows with steps and finally blocks. + + Subclasses implement define_steps() to register steps. + Steps run sequentially; finally steps always run regardless of failure. + + Example: + class BenchmarkWorkflow(Workflow): + def define_steps(self): + self.add_step(DeployVLLMStep(model=..., vllm_image=..., runtime_args=...)) + self.add_step(RunGuideLLMStep(...)) + self.add_finally(CollectArtifactsStep()) + self.add_finally(CleanupDeploymentStep()) + """ + + def __init__(self, ctx: "WorkflowContext"): + """ + Initialize workflow with context. 
+ + Args: + ctx: Workflow execution context + """ + self.ctx = ctx + self._steps: list[WorkflowStep] = [] + self._finally_steps: list[WorkflowStep] = [] + self._defined = False + + def add_step(self, step: WorkflowStep) -> None: + """ + Add a step to the workflow. + + Steps run in order of registration. If a step fails, + remaining steps are skipped and finally steps run. + + Args: + step: WorkflowStep instance to add + """ + self._steps.append(step) + + def add_finally(self, step: WorkflowStep) -> None: + """ + Add a finally step that always runs. + + Finally steps run in order after all normal steps complete + or after a step failure. They run even if previous finally + steps fail. + + Args: + step: WorkflowStep instance to add + """ + self._finally_steps.append(step) + + @abstractmethod + def define_steps(self) -> None: + """ + Define workflow steps. + + Override this method to register steps via add_step() + and add_finally(). + """ + + @property + def steps(self) -> list[WorkflowStep]: + """Get registered steps.""" + self._ensure_defined() + return self._steps + + @property + def finally_steps(self) -> list[WorkflowStep]: + """Get registered finally steps.""" + self._ensure_defined() + return self._finally_steps + + def _ensure_defined(self) -> None: + """Ensure define_steps() has been called.""" + if not self._defined: + self.define_steps() + self._defined = True + + def execute(self) -> WorkflowResult: + """ + Execute the workflow. + + Runs all steps sequentially, then runs finally steps. + Uses SequentialExecutor internally. 
+ + Returns: + WorkflowResult with step outcomes + """ + from .executor import SequentialExecutor + + executor = SequentialExecutor() + return executor.execute(self) diff --git a/projects/rhaiis/IMPLEMENTATION.md b/projects/rhaiis/IMPLEMENTATION.md new file mode 100644 index 0000000..1cae22a --- /dev/null +++ b/projects/rhaiis/IMPLEMENTATION.md @@ -0,0 +1,488 @@ +# RHAIIS Benchmark Implementation + +RHAIIS benchmarking system built on Forge's workflow engine (projects/core/workflow/). + +## Architecture + +``` +CLI/CI Entry Points + │ + ▼ +┌─────────────────────────────────────┐ +│ test_rhaiis.py │ ← Orchestration layer +│ (run_test, run_prepare, run_cleanup)│ +└─────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────┐ +│ ConfigLoader │ ← Config inheritance +│ defaults → accelerator → model │ +└─────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────┐ +│ BenchmarkWorkflow │ ← Workflow definition +│ (deploy → wait → benchmark) │ +└─────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────┐ +│ WorkflowStep implementations │ ← Step execution +│ DeployVLLM, WaitForReady, │ +│ RunGuideLLM, CollectArtifacts │ +└─────────────────────────────────────┘ +``` + +## Entry Points + +### CLI (`cli.py`) +```bash +# Single model + workload +PYTHONPATH=. 
python3 projects/rhaiis/orchestration/cli.py test \ + --model llama-3.3-70b-fp8 --workload balanced --accelerator nvidia + +# Deploy-once: multiple workloads without restarting vLLM +cli.py test --model qwen-0.6b --workloads balanced,short,long-prompt +``` + +### CI (`ci.py`) +```bash +# Env var driven (for FOURNOS jobs) +FORGE_MODEL=qwen-0.6b FORGE_WORKLOADS=balanced,short \ + python3 projects/rhaiis/orchestration/ci.py test +``` + +## Config Structure + +Project-specific configs allow different projects (rhaiis, llm-d) to have their own settings: + +``` +config/ +├── rhaiis/ +│ ├── defaults.yaml # Global defaults + accelerator settings +│ ├── models.yaml # Model registry (HF IDs, vllm_args, env_vars) +│ └── workloads.yaml # GuideLLM profiles (rates, max_seconds) +└── llm-d/ # (future) llm-d specific configs + └── ... +``` + +### Inheritance Chain +``` +defaults.yaml (base vllm_args, deploy settings) + ↓ merge +accelerators[nvidia|amd] (image, vllm_args, env_vars) + ↓ merge +models[model] (hf_model_id, vllm_args, env_vars) + ↓ merge +models[model].accelerator_overrides[accelerator] (vllm_args, env_vars) +``` + +## Environment Variables + +Env vars are passed to the vLLM pod and follow the same inheritance chain: + +### Accelerator-level (`defaults.yaml`) +```yaml +accelerators: + nvidia: + env_vars: + TORCH_CUDA_ARCH_LIST: "9.0" # All NVIDIA models + amd: + env_vars: + VLLM_ROCM_USE_AITER: "1" # All AMD models +``` + +### Model-level (`models.yaml`) +```yaml +models: + my-model: + env_vars: + VLLM_MXFP4_USE_MARLIN: "1" # This model on all accelerators +``` + +### Model + Accelerator specific (`models.yaml`) +```yaml +models: + deepseek-r1: + accelerator_overrides: + amd: + env_vars: + VLLM_ROCM_USE_AITER: "0" # Override AMD default for this model + nvidia: + env_vars: + TORCH_CUDA_ARCH_LIST: "9.0" +``` + +## Core Interfaces - Class Diagram + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ FORGE WORKFLOW ENGINE │ 
+└─────────────────────────────────────────────────────────────────────────────────┘ + +┌──────────────────────────────┐ ┌──────────────────────────────┐ +│ «abstract» WorkflowStep │ │ WorkflowContext │ +├──────────────────────────────┤ ├──────────────────────────────┤ +│ - _name: str | None │ │ + run_uuid: str │ +├──────────────────────────────┤ │ + artifact_dir: Path │ +│ + name: str {property} │ │ + config: dict │ +│ + execute(ctx) → StepResult │◄────────│ + env_vars: dict │ +│ «abstract» │ uses │ + start_time: datetime │ +└──────────────────────────────┘ │ + step_number: int │ + △ │ + current_step_name: str │ + │ inherits ├──────────────────────────────┤ + │ │ + from_environment() → ctx │ + ┌───────┴───────┐ │ + get_step_artifact_dir() │ + │ │ │ + get_env(key) → str │ + ▼ ▼ │ + write_metadata() │ +┌────────────┐ ┌────────────┐ │ + write_restart_script() │ +│ Core Steps │ │ Project │ └──────────────────────────────┘ +│ │ │ Steps │ +├────────────┤ ├────────────┤ +│RunGuideLLM │ │DeployVLLM │ +│Collect │ │WaitForReady│ +│Artifacts │ │DeployHelm │ +│Cleanup │ │ConfigureEPP│ +│Deployment │ │... │ +└────────────┘ └────────────┘ + + +┌──────────────────────────────┐ +│ «abstract» Workflow │ ┌──────────────────────────────┐ +├──────────────────────────────┤ │ StepResult │ +│ + ctx: WorkflowContext │ ├──────────────────────────────┤ +│ - _steps: list[WorkflowStep] │ │ + success: bool │ +│ - _finally_steps: list[...] 
│ │ + message: str │ +│ - _defined: bool │ │ + error: Exception | None │ +├──────────────────────────────┤ │ + artifacts: list[str] │ +│ + add_step(step) │ │ + data: dict │ +│ + add_finally(step) │ │ + duration_seconds: float │ +│ + define_steps() «abstract» │ ├──────────────────────────────┤ +│ + steps: list {property} │ │ + ok(message) → StepResult │ +│ + finally_steps: list {prop} │ │ + fail(message) → StepResult │ +│ + execute() → WorkflowResult │ └──────────────────────────────┘ +└──────────────────────────────┘ △ + │ │ returns + │ uses │ + ▼ ┌─────────────┴────────────────┐ +┌──────────────────────────────┐ │ SequentialExecutor │ +│ WorkflowResult │ ├──────────────────────────────┤ +├──────────────────────────────┤ │ │ +│ + success: bool │◄────────┤ + execute(workflow) │ +│ + step_results: dict │ returns │ → WorkflowResult │ +│ + failed_step: str | None │ │ │ +│ + duration_seconds: float │ │ Execution Flow: │ +│ + run_uuid: str │ │ 1. Run steps sequentially │ +│ + start_time: datetime │ │ 2. Stop on first failure │ +│ + end_time: datetime │ │ 3. Always run finally_steps │ +└──────────────────────────────┘ │ 4. Collect all StepResults │ + └──────────────────────────────┘ +``` + +### Concrete Workflow Implementations + +``` + △ inherits from Workflow + │ + ┌───────┴────────────────────┐ + │ │ + ▼ ▼ +┌────────────────────┐ ┌────────────────────┐ +│ BenchmarkWorkflow │ │ LlmdBenchmark │ +│ (RHAIIS) │ │ Workflow (llm-d) │ +├────────────────────┤ ├────────────────────┤ +│ + model: str │ │ + model: str │ +│ + workload: str │ │ + routing_mode: str│ +│ + vllm_image: str │ │ + helmfile_path │ +│ + namespace: str │ │ + namespace: str │ +├────────────────────┤ ├────────────────────┤ +│ define_steps(): │ │ define_steps(): │ +│ ├─ DeployVLLMStep │ │ ├─ DeployHelmStep │ +│ ├─ WaitForReady │ │ ├─ ConfigureEPP │ +│ ├─ RunGuideLLM │ │ ├─ WaitForGateway │ +│ ├─ [finally] │ │ ├─ RunGuideLLM │ +│ │ CollectArtif. │ │ ├─ [finally] │ +│ └─ CleanupDeploy │ │ │ CollectArtif. 
│ +└────────────────────┘ │ └─ HelmCleanup │ + └────────────────────┘ +``` + +### Execution Sequence Diagram + +``` +┌──────────┐ ┌──────────────┐ ┌────────────────────┐ ┌──────────────┐ +│ Client │ │ Workflow │ │ SequentialExecutor │ │ WorkflowStep │ +└────┬─────┘ └──────┬───────┘ └─────────┬──────────┘ └──────┬───────┘ + │ │ │ │ + │ execute() │ │ │ + │─────────────────>│ │ │ + │ │ │ │ + │ │ execute(self) │ │ + │ │──────────────────────>│ │ + │ │ │ │ + │ │ │ ┌─────────────────┐ │ + │ │ │ │ For each step: │ │ + │ │ │ └────────┬────────┘ │ + │ │ │ │ │ + │ │ │ execute(ctx) │ + │ │ │──────────────────────>│ + │ │ │ │ + │ │ │ StepResult │ + │ │ │<──────────────────────│ + │ │ │ │ │ + │ │ │ ┌────────┴────────┐ │ + │ │ │ │ if !success: │ │ + │ │ │ │ break loop │ │ + │ │ │ └────────┬────────┘ │ + │ │ │ │ │ + │ │ │ ┌────────┴────────┐ │ + │ │ │ │ For each │ │ + │ │ │ │ finally_step: │ │ + │ │ │ └────────┬────────┘ │ + │ │ │ │ │ + │ │ │ execute(ctx) │ + │ │ │──────────────────────>│ + │ │ │ │ + │ │ │ StepResult │ + │ │ │<──────────────────────│ + │ │ │ (continue even │ + │ │ │ if failed) │ + │ │ │ │ + │ │ WorkflowResult │ │ + │ │<──────────────────────│ │ + │ │ │ │ + │ WorkflowResult │ │ │ + │<─────────────────│ │ │ + │ │ │ │ +``` + +### Dependency Graph + +``` + ┌─────────────────┐ + │ WorkflowContext │ + └────────┬────────┘ + │ + created by│from_environment() + │ + ▼ +┌──────────────┐ ┌─────────────────┐ ┌────────────────────┐ +│ WorkflowStep │◄───│ Workflow │───►│ SequentialExecutor │ +│ (ABC) │ │ (ABC) │ │ │ +└──────┬───────┘ └────────┬────────┘ └─────────┬──────────┘ + │ │ │ + │ implements │ implements │ produces + ▼ ▼ ▼ +┌──────────────┐ ┌─────────────────┐ ┌────────────────────┐ +│ Concrete │ │ Concrete │ │ WorkflowResult │ +│ Steps │ │ Workflows │ │ + StepResults │ +│ │ │ │ │ │ +│ DeployVLLM │ │ BenchmarkWF │ │ { │ +│ RunGuideLLM │ │ PrepareWF │ │ success: bool │ +│ CollectArtif │ │ CleanupWF │ │ step_results: {} │ +│ WaitForReady │ │ LlmdBenchmarkWF │ │ 
failed_step: str │ +│ DeployHelm │ │ │ │ } │ +└──────────────┘ └─────────────────┘ └────────────────────┘ +``` + +### Key Relationships + +| Relationship | Type | Description | +|-------------|------|-------------| +| `Workflow` → `WorkflowContext` | composition | Workflow holds a context instance | +| `Workflow` → `WorkflowStep` | aggregation | Workflow contains list of steps | +| `WorkflowStep.execute()` → `WorkflowContext` | dependency | Steps receive context as parameter | +| `WorkflowStep.execute()` → `StepResult` | returns | Steps return result objects | +| `SequentialExecutor.execute()` → `Workflow` | uses | Executor runs a workflow | +| `SequentialExecutor.execute()` → `WorkflowResult` | returns | Executor returns final result | +| Concrete Steps → `WorkflowStep` | inheritance | All steps extend the abstract class | +| Concrete Workflows → `Workflow` | inheritance | All workflows extend the abstract class | + +--- + +## Core Interfaces - Code + +### WorkflowStep +```python +class WorkflowStep(ABC): + """Base class for all workflow steps.""" + + def __init__(self, name: str | None = None): + self._name = name # Defaults to class name if not provided + + @abstractmethod + def execute(self, ctx: WorkflowContext) -> StepResult: + """Execute step, return success/failure with data.""" + +@dataclass +class StepResult: + success: bool + message: str = "" + error: Exception | None = None + artifacts: list[str] = field(default_factory=list) + data: dict[str, Any] = field(default_factory=dict) + duration_seconds: float = 0.0 +``` + +### Workflow +```python +class Workflow(ABC): + def add_step(self, step: WorkflowStep): ... + def add_finally(self, step: WorkflowStep): ... 
# Always runs + + @abstractmethod + def define_steps(self) -> None: + """Register steps via add_step() and add_finally().""" +``` + +### WorkflowContext +```python +@dataclass +class WorkflowContext: + run_uuid: str + artifact_dir: Path + config: dict + env_vars: dict # FORGE_* env vars +``` + +### SequentialExecutor +The executor runs steps with these guarantees: +```python +class SequentialExecutor: + """ + Execution flow: + 1. Run normal steps in order until completion or failure + 2. On failure, skip remaining normal steps + 3. Always run finally steps, even if normal steps failed + 4. Finally steps continue even if previous finally steps fail + 5. Collect all results and return WorkflowResult + """ +``` + +## Reliability and Safety + +### Current Reliability Features + +| Feature | Status | Description | +|---------|--------|-------------| +| Finally steps | ✅ | Cleanup always runs, even on failure | +| Exception handling | ✅ | Unhandled exceptions caught, logged, step marked failed | +| Artifact collection | ✅ | Each step gets its own artifact directory | +| Duration tracking | ✅ | Execution time recorded per step | +| Transient retry | ✅ | OC wrapper retries network errors with backoff | + +### Execution Guarantees + +``` +Step 1 (deploy) ──success──► Step 2 (wait) ──success──► Step 3 (benchmark) + │ │ │ + │ failure │ failure │ failure + ▼ ▼ ▼ + Finally 1 (collect) ─────► Finally 2 (cleanup) ─────► Return Result + (always runs) (always runs) +``` + +### Transient Errors Handled by OC Wrapper + +- Connection refused / reset / timed out +- Service unavailable +- API server not ready +- etcd timeout +- Rate limiting (too many requests) +- TLS handshake timeout + +### Safety Considerations + +| Aspect | Implementation | +|--------|---------------| +| Resource cleanup | Finally steps delete InferenceService/ServingRuntime | +| Namespace isolation | All resources created in specified namespace | +| Resource labeling | Resources labeled with `app={deployment_name}` 
for easy identification | +| Idempotent apply | Uses `oc apply` (not `create`) for idempotency | +| Orphan prevention | Cleanup step uses `--ignore-not-found` | + +### Known Limitations + +| Limitation | Mitigation | +|------------|------------| +| No checkpointing | Re-run from beginning on failure | +| No step timeout enforcement | Use subprocess timeout in OC wrapper | +| No parallel step execution | Use deploy-once pattern to minimize overhead | +| No circuit breaker | Relies on retry exhaustion | + +## RHAIIS Steps + +| Step | Location | Purpose | +|------|----------|---------| +| `DeployVLLMStep` | `rhaiis/workflows/steps/deploy.py` | Create KServe ServingRuntime + InferenceService | +| `WaitForReadyStep` | `rhaiis/workflows/steps/deploy.py` | Wait for ISVC ready + health check | +| `RunGuideLLMStep` | `core/steps/guidellm.py` | Run GuideLLM as pod, collect results | +| `CollectArtifactsStep` | `core/steps/artifacts.py` | Gather logs, events, pod status | +| `CleanupDeploymentStep` | `core/steps/artifacts.py` | Delete ISVC/ServingRuntime | + +## BenchmarkWorkflow + +```python +class BenchmarkWorkflow(Workflow): + def define_steps(self): + self.add_step(DeployVLLMStep(...)) + self.add_step(WaitForReadyStep(...)) + self.add_step(RunGuideLLMStep(...)) + self.add_finally(CollectArtifactsStep(...)) # Always runs + self.add_finally(CleanupDeploymentStep(...)) # Always runs +``` + +## Deploy-Once Pattern + +For multiple workloads with same vLLM config, deploys once and runs GuideLLM multiple times: + +```python +# test_rhaiis.py::_run_multi_workload() +for workload in workloads: + # Group by vllm_args (workloads with different vllm_args get separate deployments) + # Run GuideLLM for each workload without restarting vLLM +``` + +## Artifact Structure + +``` +artifacts/{run_uuid}/ +├── _meta/ +│ └── metadata.yaml +├── 001__deploy/ +│ └── kserve.yaml +├── 002__wait/ +├── 003__benchmark_balanced/ +│ ├── guidellm_logs.txt +│ └── results/ +│ └── benchmark_results.json 
+├── 004__collect_artifacts/
+│   ├── app_logs.txt
+│   ├── pod_describe.txt
+│   └── events.txt
+└── 005__cleanup/
+```
+
+## Running Unit Tests
+
+```bash
+# Activate your virtualenv and run from the forge repository root
+source .venv/bin/activate
+cd <path-to-forge-repo>
+PYTHONPATH=. python -m pytest tests/ --ignore=tests/llm_d -v
+
+# Or run specific test files
+PYTHONPATH=. pytest tests/core/utils/test_oc.py -v  # OC wrapper tests
+PYTHONPATH=. pytest tests/core/scenarios/test_config_loader.py -v  # ConfigLoader tests
+PYTHONPATH=. pytest tests/rhaiis/ -v  # RHAIIS tests
+```
+
+## Key Design Decisions
+
+1. **KServe RawDeployment**: Uses ServingRuntime + InferenceService for RHOAI compatibility
+2. **Pod-based GuideLLM**: Runs benchmark as pod inside cluster (not local)
+3. **Finally steps**: CollectArtifacts and Cleanup always run, even on failure
+4. **Config inheritance**: Minimizes duplication, accelerator-specific overrides where needed
+5. **num_gpus = tensor-parallel-size**: Single source of truth for GPU count
+6. **Project-specific configs**: Each project (rhaiis, llm-d) has its own config directory
+7. **Env vars inheritance**: Supports accelerator → model → model.accelerator_overrides chain
diff --git a/projects/rhaiis/LLM_D_EXTENSIBILITY_REPORT.md b/projects/rhaiis/LLM_D_EXTENSIBILITY_REPORT.md
new file mode 100644
index 0000000..342cdf9
--- /dev/null
+++ b/projects/rhaiis/LLM_D_EXTENSIBILITY_REPORT.md
@@ -0,0 +1,232 @@
+# llm-d Integration Extensibility Report
+
+Analysis of how the current RHAIIS workflow implementation can be extended to support llm-d benchmarking use cases.
+
+## Executive Summary
+
+The current workflow engine (`projects/core/workflow/`) is **highly extensible** for llm-d. The abstract `WorkflowStep` and `Workflow` interfaces allow llm-d to define its own deployment steps while reusing shared steps (GuideLLM, artifact collection). Key differences are in the **deployment layer**, not the workflow engine itself.
+ +## llm-d vs RHAIIS Comparison + +| Aspect | RHAIIS | llm-d | +|--------|--------|-------| +| **Deployment** | KServe (ServingRuntime + InferenceService) | Helm + Helmfile (model-service, gaie-scheduler, infra) | +| **Networking** | KServe service (`{name}-predictor.{ns}`) | Gateway API + HTTPRoute | +| **Routing** | Direct to vLLM pod | EPP router with scheduling strategies | +| **Scheduling** | None (direct inference) | GAIE scheduler (prefix-aware, disaggregated) | +| **Config** | YAML (models.yaml, defaults.yaml) | Helmfile values + routing configs | + +## Reusable Components + +### Fully Reusable (No Changes) +``` +projects/core/ +├── workflow/ +│ ├── step.py # WorkflowStep abstract class +│ ├── workflow.py # Workflow abstract class +│ ├── context.py # WorkflowContext +│ └── executor.py # SequentialExecutor +├── steps/ +│ ├── guidellm.py # RunGuideLLMStep (endpoint agnostic) +│ └── artifacts.py # CollectArtifactsStep, CleanupDeploymentStep +└── scenarios/ + └── config_loader.py # ConfigLoader (model + workload resolution) +``` + +### Requires llm-d-Specific Implementation +``` +projects/llm_d/ +├── workflows/ +│ ├── steps/ +│ │ ├── deploy_helm.py # DeployHelmStep (model-service, gaie, infra) +│ │ ├── configure_epp.py # ConfigureEPPStep (routing strategy) +│ │ └── wait_gateway.py # WaitForGatewayStep +│ ├── benchmark.py # LlmdBenchmarkWorkflow +│ └── prepare.py # LlmdPrepareWorkflow (install operators) +└── orchestration/ + ├── cli.py # llm-d CLI (similar structure to RHAIIS) + └── test_llmd.py # llm-d orchestration (run_test, etc.) +``` + +## Proposed llm-d Step Implementations + +### 1. DeployHelmStep +```python +class DeployHelmStep(WorkflowStep): + """Deploy llm-d components via Helm/Helmfile.""" + + def __init__( + self, + model: str, + routing_mode: str, # direct, prefix-estimation, pd-disaggregation + helmfile_path: str, + namespace: str, + ): + ... 
+ + def execute(self, ctx: WorkflowContext) -> StepResult: + # helmfile apply -f {helmfile_path} --state-values-set model={model} + # Returns: gateway_url, epp_endpoint +``` + +### 2. ConfigureEPPStep +```python +class ConfigureEPPStep(WorkflowStep): + """Configure EPP routing strategy.""" + + def __init__( + self, + routing_mode: str, + epp_namespace: str, + ): + ... + + def execute(self, ctx: WorkflowContext) -> StepResult: + # Patch EPP ConfigMap with routing config + # Wait for EPP pods to reload +``` + +### 3. WaitForGatewayStep +```python +class WaitForGatewayStep(WorkflowStep): + """Wait for K8s Gateway + HTTPRoute to be ready.""" + + def __init__( + self, + gateway_name: str, + namespace: str, + ): + ... + + def execute(self, ctx: WorkflowContext) -> StepResult: + # Check Gateway status + # Verify HTTPRoute attached + # Health check endpoint +``` + +## Proposed LlmdBenchmarkWorkflow + +```python +class LlmdBenchmarkWorkflow(Workflow): + """llm-d benchmark: deploy via Helm → configure EPP → run GuideLLM → cleanup.""" + + def __init__( + self, + ctx: WorkflowContext, + model: str, + routing_mode: str, # direct, prefix-estimation, prefix-precise, pd-disaggregation + workload: str, + namespace: str, + ): + ... 
+ + def define_steps(self): + # Deploy model-service + GAIE scheduler + infra via Helm + self.add_step(DeployHelmStep( + model=self.model, + routing_mode=self.routing_mode, + helmfile_path=self._get_helmfile(), + namespace=self.namespace, + )) + + # Configure EPP routing strategy + self.add_step(ConfigureEPPStep( + routing_mode=self.routing_mode, + epp_namespace=self.namespace, + )) + + # Wait for Gateway API + HTTPRoute + self.add_step(WaitForGatewayStep( + gateway_name=f"{self.model}-gateway", + namespace=self.namespace, + )) + + # Run GuideLLM (reused from core) + gateway_endpoint = f"http://{self.model}-gateway.{self.namespace}.svc:8080/v1" + self.add_step(RunGuideLLMStep( + endpoint=gateway_endpoint, + model=self.model, + namespace=self.namespace, + workload=self.workload, + )) + + # Cleanup (reused from core, modified for Helm) + self.add_finally(CollectArtifactsStep( + app_label="llm-d", + namespace=self.namespace, + )) + self.add_finally(HelmCleanupStep( + namespace=self.namespace, + )) +``` + +## Config Structure for llm-d + +```yaml +# config/llm-d/defaults.yaml +defaults: + deploy: + namespace: llm-d + helmfile_path: deploy/llm-d/helmfile.yaml + + routing: + default_mode: direct + modes: + direct: {} + prefix-estimation: + scheduler: gaie + prefix_cache: redis + pd-disaggregation: + prefill_replicas: 2 + decode_replicas: 4 + +# config/llm-d/models.yaml +models: + llama-3.1-8b: + hf_model_id: meta-llama/Llama-3.1-8B-Instruct + supported_routing: [direct, prefix-estimation] + helm_values: + vllm: + tensor_parallel: 1 +``` + +## Implementation Roadmap + +### Phase 1: Step Implementations (2-3 days) +- [ ] `DeployHelmStep` - Helm/Helmfile deployment +- [ ] `WaitForGatewayStep` - Gateway API readiness +- [ ] `HelmCleanupStep` - Helm uninstall + +### Phase 2: EPP Integration (2-3 days) +- [ ] `ConfigureEPPStep` - Routing strategy configuration +- [ ] EPP config templates (prefix-estimation, pd-disaggregation) + +### Phase 3: Workflow + CLI (1-2 days) +- [ ] 
`LlmdBenchmarkWorkflow` +- [ ] `projects/llm_d/orchestration/cli.py` +- [ ] `projects/llm_d/orchestration/test_llmd.py` + +### Phase 4: Config + Testing (1-2 days) +- [ ] llm-d config files (defaults, models, routing modes) +- [ ] Integration tests + +## Gaps in Current Implementation + +| Gap | Impact | Resolution | +|-----|--------|------------| +| No Helm support | High | Create `DeployHelmStep` | +| No Gateway API support | High | Create `WaitForGatewayStep` | +| No EPP routing config | High | Create `ConfigureEPPStep` | +| RHAIIS-specific in deploy.py | Low | Already isolated in `rhaiis/workflows/steps/` | +| GuideLLM assumes OpenAI endpoint | None | Already generic (`--target` flag) | + +## Conclusion + +The workflow engine architecture is **well-suited** for llm-d extension: + +1. **Clean separation**: Core workflow engine (`projects/core/`) is deployment-agnostic +2. **Step abstraction**: New steps (Helm, Gateway) implement same `WorkflowStep` interface +3. **Reusable components**: GuideLLM, artifact collection work unchanged +4. **Config system**: `ConfigLoader` can be extended with llm-d-specific configs + +Estimated effort: **6-10 developer days** for full llm-d integration with routing modes. 
diff --git a/projects/rhaiis/__init__.py b/projects/rhaiis/__init__.py new file mode 100644 index 0000000..d90b9a4 --- /dev/null +++ b/projects/rhaiis/__init__.py @@ -0,0 +1 @@ +"""RHAIIS benchmarking project.""" diff --git a/projects/rhaiis/orchestration/__init__.py b/projects/rhaiis/orchestration/__init__.py new file mode 100644 index 0000000..29fa342 --- /dev/null +++ b/projects/rhaiis/orchestration/__init__.py @@ -0,0 +1,5 @@ +"""RHAIIS orchestration module.""" + +from .ci import ci + +__all__ = ["ci"] diff --git a/projects/rhaiis/orchestration/ci.py b/projects/rhaiis/orchestration/ci.py new file mode 100644 index 0000000..3a9837b --- /dev/null +++ b/projects/rhaiis/orchestration/ci.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +"""RHAIIS CI Operations - Minimal CLI for FOURNOS jobs. + +Most configuration comes from config/ directory via ConfigLoader. +FOURNOS only needs to set a few key parameters: + FORGE_MODEL - Model key to benchmark + FORGE_WORKLOADS - Comma-separated workloads (optional) + +For interactive use with detailed CLI options, use cli.py instead. +""" + +import os +import sys +import types +from pathlib import Path + +import click + +from projects.core.workflow import WorkflowContext + +from . 
import test_rhaiis + +# Default config directory +DEFAULT_CONFIG_DIR = Path(__file__).parent.parent.parent.parent / "config" / "rhaiis" + + +@click.group() +@click.pass_context +def ci(ctx): + """RHAIIS CI Operations for FOURNOS.""" + ctx.ensure_object(types.SimpleNamespace) + + +@ci.command() +@click.option("--dry-run", is_flag=True, help="Print what would be done without executing") +@click.pass_context +def prepare(ctx, dry_run: bool): + """Prepare phase - Install operators (RHOAI, NFD, GPU).""" + rhoai_version = os.environ.get("FORGE_RHOAI_VERSION", "2.19") + + if dry_run: + click.echo("[DRY-RUN] Prepare phase") + click.echo(f"[DRY-RUN] RHOAI Version: {rhoai_version}") + click.echo("[DRY-RUN] Would install: NFD, GPU Operator, RHOAI") + return + + workflow_ctx = WorkflowContext.from_environment() + workflow_ctx.write_metadata({ + "command": "prepare", + "rhoai_version": rhoai_version, + }) + + exit_code = test_rhaiis.run_prepare(workflow_ctx, rhoai_version) + sys.exit(exit_code) + + +@ci.command() +@click.option("--dry-run", is_flag=True, help="Print what would be done without executing") +@click.pass_context +def test(ctx, dry_run: bool): + """Test phase - Run vLLM benchmark. + + Configuration from config/ directory. 
FOURNOS sets: + FORGE_MODEL - Model key (e.g., qwen-0.6b) + FORGE_WORKLOADS - Comma-separated workloads (optional) + """ + workflow_ctx = WorkflowContext.from_environment() + + # Key parameters from FOURNOS + model = os.environ.get("FORGE_MODEL") + workloads_str = os.environ.get("FORGE_WORKLOADS") + + # Parse workloads + workloads = None + if workloads_str: + workloads = [w.strip() for w in workloads_str.split(",")] + + # Log + click.echo("RHAIIS CI Test") + click.echo(f" Model: {model}") + if workloads: + click.echo(f" Workloads: {workloads}") + + # All other config comes from config/ directory via ConfigLoader + exit_code = test_rhaiis.run_test( + ctx=workflow_ctx, + model=model, + workloads=workloads, + config_dir=DEFAULT_CONFIG_DIR, + dry_run=dry_run, + ) + sys.exit(exit_code) + + +@ci.command() +@click.option("--dry-run", is_flag=True, help="Print what would be done without executing") +@click.pass_context +def cleanup(ctx, dry_run: bool): + """Cleanup phase - Remove deployments and resources.""" + namespace = os.environ.get("FORGE_NAMESPACE", "forge") + + if dry_run: + click.echo("[DRY-RUN] Cleanup phase") + click.echo(f"[DRY-RUN] Namespace: {namespace}") + click.echo("[DRY-RUN] Would delete: InferenceServices, ServingRuntimes") + return + + workflow_ctx = WorkflowContext.from_environment() + workflow_ctx.write_metadata({ + "command": "cleanup", + "namespace": namespace, + }) + + exit_code = test_rhaiis.run_cleanup(workflow_ctx, namespace) + sys.exit(exit_code) + + +if __name__ == "__main__": + ci() diff --git a/projects/rhaiis/orchestration/cli.py b/projects/rhaiis/orchestration/cli.py new file mode 100644 index 0000000..4e62de1 --- /dev/null +++ b/projects/rhaiis/orchestration/cli.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +"""RHAIIS CLI - Detailed CLI for interactive/manual use. + +This CLI provides full control over benchmark parameters via command-line options. +For FOURNOS jobs that read config from files, use ci.py instead. 
+ +Examples: + # Single workload + cli.py test --model qwen-0.6b --workload balanced + + # Multiple workloads (deploy vLLM once) + cli.py test --model qwen-0.6b --workloads balanced,short,heterogeneous + + # AMD accelerator + cli.py test --model qwen-0.6b --accelerator amd +""" + +import sys +from pathlib import Path + +import click + +from projects.core.workflow import WorkflowContext + +from . import test_rhaiis + +# Default config directory +DEFAULT_CONFIG_DIR = Path(__file__).parent.parent.parent.parent / "config" / "rhaiis" + + +@click.group() +def cli(): + """RHAIIS CLI - Interactive benchmark commands.""" + + +@cli.command() +@click.option( + "--rhoai-version", + envvar="FORGE_RHOAI_VERSION", + default="2.19", + help="RHOAI operator version to install", +) +@click.option( + "--dry-run", + is_flag=True, + help="Print what would be done without executing", +) +def prepare(rhoai_version: str, dry_run: bool): + """Install operators (RHOAI, NFD, GPU) on OpenShift.""" + ctx = WorkflowContext.from_environment() + ctx.write_metadata({"command": "prepare", "rhoai_version": rhoai_version}) + + if dry_run: + click.echo(f"[DRY-RUN] Would install RHOAI {rhoai_version}") + click.echo(f"[DRY-RUN] Artifacts would be written to: {ctx.artifact_dir}") + return + + exit_code = test_rhaiis.run_prepare(ctx, rhoai_version) + sys.exit(exit_code) + + +@cli.command() +@click.option( + "--model", + envvar="FORGE_MODEL", + default=None, + help="Model key or HuggingFace ID (e.g., qwen-0.6b or Qwen/Qwen3-0.6B)", +) +@click.option( + "--workload", + envvar="FORGE_WORKLOAD", + default=None, + help="Single workload: balanced, heterogeneous, multiturn, etc.", +) +@click.option( + "--workloads", + envvar="FORGE_WORKLOADS", + default=None, + help="Comma-separated workloads to run WITHOUT restarting vLLM (e.g., balanced,short,heterogeneous)", +) +@click.option( + "--config-dir", + type=click.Path(exists=True, path_type=Path), + default=None, + help="Config directory containing defaults.yaml, 
models.yaml, workloads.yaml", +) +@click.option( + "--accelerator", + envvar="FORGE_ACCELERATOR", + type=click.Choice(["nvidia", "amd"]), + default="nvidia", + help="Accelerator type for config inheritance (nvidia, amd)", +) +@click.option( + "--vllm-image", + envvar="FORGE_VLLM_IMAGE", + help="vLLM container image to use (overrides accelerator default)", +) +@click.option( + "--tensor-parallel", + envvar="FORGE_TENSOR_PARALLEL", + type=int, + default=None, + help="Tensor parallelism override (default: from model config)", +) +@click.option( + "--max-requests", + envvar="FORGE_MAX_REQUESTS", + type=int, + default=None, + help="Maximum requests for GuideLLM benchmark (default: from config)", +) +@click.option( + "--namespace", + envvar="FORGE_NAMESPACE", + default=None, + help="Kubernetes namespace for deployment (default: from config)", +) +@click.option( + "--dry-run", + is_flag=True, + help="Print what would be done without executing", +) +def test( + model: str | None, + workload: str | None, + workloads: str | None, + config_dir: Path | None, + accelerator: str, + vllm_image: str | None, + tensor_parallel: int | None, + max_requests: int | None, + namespace: str | None, + dry_run: bool, +): + """Run benchmark: deploy vLLM -> run GuideLLM -> collect artifacts. + + \b + Config inheritance (defaults -> accelerator -> model): + defaults.yaml provides base settings + accelerator (nvidia/amd) provides accelerator-specific overrides + model config provides model-specific settings + CLI flags override everything + + \b + Modes of operation: + 1. Single workload: --model X --workload balanced + 2. 
Multiple workloads: --model X --workloads balanced,short,heterogeneous + (deploys vLLM once, runs GuideLLM multiple times) + """ + ctx = WorkflowContext.from_environment() + config_dir = config_dir or DEFAULT_CONFIG_DIR + + # Parse workloads list + workload_list = workloads.split(",") if workloads else None + + # Validate inputs (skip for dry-run) + if not dry_run and not model: + click.echo("Error: Must specify --model", err=True) + sys.exit(1) + + exit_code = test_rhaiis.run_test( + ctx=ctx, + model=model, + workload=workload, + workloads=workload_list, + config_dir=config_dir, + accelerator=accelerator, + vllm_image=vllm_image, + tensor_parallel=tensor_parallel, + max_requests=max_requests, + namespace=namespace, + dry_run=dry_run, + ) + sys.exit(exit_code) + + +@cli.command() +@click.option( + "--namespace", + envvar="FORGE_NAMESPACE", + default="forge", + help="Kubernetes namespace to clean up", +) +@click.option( + "--dry-run", + is_flag=True, + help="Print what would be done without executing", +) +def cleanup(namespace: str, dry_run: bool): + """Uninstall operators and cleanup resources.""" + ctx = WorkflowContext.from_environment() + ctx.write_metadata({"command": "cleanup", "namespace": namespace}) + + if dry_run: + click.echo(f"[DRY-RUN] Would clean up namespace: {namespace}") + return + + exit_code = test_rhaiis.run_cleanup(ctx, namespace) + sys.exit(exit_code) + + +if __name__ == "__main__": + cli() diff --git a/projects/rhaiis/orchestration/test_rhaiis.py b/projects/rhaiis/orchestration/test_rhaiis.py new file mode 100644 index 0000000..13ec5b6 --- /dev/null +++ b/projects/rhaiis/orchestration/test_rhaiis.py @@ -0,0 +1,407 @@ +"""RHAIIS Benchmark Implementation. + +Shared logic for running vLLM benchmarks. 
Used by both: +- ci.py (minimal CLI for FOURNOS jobs) +- cli.py (detailed CLI for interactive use) +""" + +import logging +import sys +import time +from pathlib import Path + +import click + +from projects.core.scenarios import ConfigLoader +from projects.core.workflow import WorkflowContext +from projects.rhaiis.workflows import BenchmarkWorkflow, CleanupWorkflow, PrepareWorkflow + +logger = logging.getLogger(__name__) + +# Default config directory +DEFAULT_CONFIG_DIR = Path(__file__).parent.parent.parent.parent / "config" / "rhaiis" + + +def _dry_run_test( + model: str | None, + workload: str | None, + workloads: list[str] | None, + config_loader, + accelerator: str, + vllm_image: str | None, + tensor_parallel: int | None, + max_requests: int | None, + namespace: str | None, + ctx, +) -> int: + """Show what would be executed without running.""" + global_defaults = config_loader.get_global_defaults() + resolved_namespace = namespace or global_defaults.get("deploy", {}).get("namespace", "forge") + resolved_max_requests = max_requests or global_defaults.get("guidellm", {}).get("max_requests", 100) + resolved_image = vllm_image or config_loader.get_image() + + click.echo(f"[DRY-RUN] Model: {model}") + click.echo(f"[DRY-RUN] Accelerator: {accelerator}") + click.echo(f"[DRY-RUN] Namespace: {resolved_namespace}") + click.echo(f"[DRY-RUN] Image: {resolved_image}") + click.echo(f"[DRY-RUN] Max requests: {resolved_max_requests}") + click.echo(f"[DRY-RUN] Artifacts: {ctx.artifact_dir}") + + if model: + try: + resolved = config_loader.load_model(model) + click.echo(f"\n[DRY-RUN] Resolved model config ({model}):") + click.echo(f" HF Model ID: {resolved.hf_model_id}") + click.echo(f" num_gpus: {resolved.num_gpus}") + click.echo(f" tensor_parallel: {tensor_parallel or resolved.tensor_parallel}") + click.echo(f" vllm_args: {resolved.vllm_args}") + if resolved.env_vars: + click.echo(f" env_vars: {resolved.env_vars}") + except KeyError: + click.echo(f"\n[DRY-RUN] Model '{model}' 
not in registry, will use defaults") + + if workloads: + click.echo(f"\n[DRY-RUN] Workloads (deploy-once): {workloads}") + click.echo("[DRY-RUN] vLLM will be deployed ONCE, GuideLLM runs for each workload") + elif workload: + click.echo(f"[DRY-RUN] Workload: {workload}") + else: + click.echo("[DRY-RUN] Workload: balanced (default)") + + return 0 + + +def run_prepare(ctx: WorkflowContext, rhoai_version: str) -> int: + """Run prepare phase - install operators.""" + workflow = PrepareWorkflow(ctx, rhoai_version=rhoai_version) + result = workflow.execute() + + if result.success: + click.echo(f"Prepare completed successfully in {result.duration_seconds:.1f}s") + return 0 + else: + click.echo(f"Prepare failed at step: {result.failed_step}", err=True) + return 1 + + +def run_test( + ctx: WorkflowContext, + model: str | None = None, + workload: str | None = None, + workloads: list[str] | None = None, + config_dir: Path | None = None, + accelerator: str = "nvidia", + vllm_image: str | None = None, + tensor_parallel: int | None = None, + max_requests: int | None = None, + namespace: str | None = None, + dry_run: bool = False, +) -> int: + """Run benchmark test phase. 
def run_test(
    ctx: WorkflowContext,
    model: str | None = None,
    workload: str | None = None,
    workloads: list[str] | None = None,
    config_dir: Path | None = None,
    accelerator: str = "nvidia",
    vllm_image: str | None = None,
    tensor_parallel: int | None = None,
    max_requests: int | None = None,
    namespace: str | None = None,
    dry_run: bool = False,
) -> int:
    """Run benchmark test phase.

    Args:
        ctx: Workflow context
        model: Model key or HuggingFace ID
        workload: Single workload name
        workloads: List of workloads (deploy-once pattern)
        config_dir: Config directory path
        accelerator: Accelerator type (nvidia, amd)
        vllm_image: Container image override
        tensor_parallel: TP override
        max_requests: Max GuideLLM requests
        namespace: K8s namespace
        dry_run: Print what would be done without executing

    Returns:
        Exit code (0 = success)
    """
    config_dir = config_dir or DEFAULT_CONFIG_DIR

    # Initialize ConfigLoader for inheritance-based config resolution
    config_loader = ConfigLoader(config_dir, accelerator=accelerator)

    # Dry-run: show resolved config and exit
    if dry_run:
        return _dry_run_test(
            model=model,
            workload=workload,
            workloads=workloads,
            config_loader=config_loader,
            accelerator=accelerator,
            vllm_image=vllm_image,
            tensor_parallel=tensor_parallel,
            max_requests=max_requests,
            namespace=namespace,
            ctx=ctx,
        )

    # Get defaults from config
    global_defaults = config_loader.get_global_defaults()
    default_namespace = global_defaults.get("deploy", {}).get("namespace", "forge")
    default_max_requests = global_defaults.get("guidellm", {}).get("max_requests", 100)

    # Apply defaults. BUGFIX: max_requests uses an explicit None check so a
    # caller-supplied 0 is not silently replaced by the config default
    # ("or" treats 0 as unset). Namespace keeps "or": "" is never valid.
    namespace = namespace or default_namespace
    if max_requests is None:
        max_requests = default_max_requests

    # Get accelerator-specific image if not overridden
    if not vllm_image:
        vllm_image = config_loader.get_image()

    args = {
        "command": "test",
        "model": model,
        "workload": workload,
        "workloads": workloads,
        "accelerator": accelerator,
        "vllm_image": vllm_image,
        "tensor_parallel": tensor_parallel,
        "max_requests": max_requests,
        "namespace": namespace,
    }
    ctx.write_metadata(args)

    # Mode 1: Multiple workloads (deploy-once pattern)
    if workloads and model:
        click.echo(f"Deploy-once mode: {model} with workloads {workloads}")
        click.echo(f"Accelerator: {accelerator}")
        return _run_multi_workload(
            ctx, model, workloads, vllm_image,
            tensor_parallel, max_requests, namespace, config_loader
        )

    # Mode 2: Single workload
    elif model:
        single_workload = workload or "balanced"

        # Resolve model config for vllm_args and env_vars; fall back to
        # treating `model` as a raw HuggingFace ID if not in the registry.
        resolved_tp = tensor_parallel
        resolved_vllm_args = {}
        resolved_env_vars = {}
        resolved_model_id = model

        try:
            resolved_model = config_loader.load_model(model)
            resolved_model_id = resolved_model.hf_model_id
            resolved_vllm_args = dict(resolved_model.vllm_args)
            resolved_env_vars = dict(resolved_model.env_vars)
            if resolved_tp is None:
                resolved_tp = resolved_model.tensor_parallel
            click.echo(f"Using resolved model config: {resolved_model.key}")
            click.echo(f" HF Model ID: {resolved_model_id}")
            click.echo(f" vLLM args: {resolved_vllm_args}")
            if resolved_env_vars:
                click.echo(f" Env vars: {resolved_env_vars}")
        except KeyError:
            if resolved_tp is None:
                resolved_tp = 1
            click.echo(f"Model not in registry, using defaults for: {model}")

        workflow = BenchmarkWorkflow(
            ctx,
            model=resolved_model_id,
            workload=single_workload,
            vllm_image=vllm_image,
            runtime_args=resolved_vllm_args,
            tensor_parallel=resolved_tp,
            max_requests=max_requests,
            namespace=namespace,
            env_vars=resolved_env_vars,
        )
        result = workflow.execute()

        if result.success:
            click.echo(f"\nBenchmark completed successfully in {result.duration_seconds:.1f}s")
            click.echo(f"Artifacts: {result.run_uuid}")
            return 0
        else:
            click.echo(f"\nBenchmark failed at step: {result.failed_step}", err=True)
            return 1

    else:
        click.echo("Error: Must specify --model", err=True)
        return 1


def run_cleanup(ctx: WorkflowContext, namespace: str) -> int:
    """Run cleanup phase. Returns 0 on success, 1 if the workflow reported errors."""
    workflow = CleanupWorkflow(ctx, namespace=namespace)
    result = workflow.execute()

    if result.success:
        click.echo(f"Cleanup completed in {result.duration_seconds:.1f}s")
        return 0
    else:
        click.echo("Cleanup had errors (check logs)", err=True)
        return 1
def _run_multi_workload(
    ctx: WorkflowContext,
    model: str,
    workload_list: list[str],
    vllm_image: str | None,
    tensor_parallel: int | None,
    max_requests: int,
    namespace: str,
    config_loader: ConfigLoader,
) -> int:
    """Run multiple workloads with deploy-once optimization.

    Groups workloads by their vllm_args - workloads with different vllm_args
    get separate deployment groups (requires vLLM restart). Within a group,
    vLLM is deployed once and GuideLLM runs per workload.

    Returns:
        Exit code (0 = all workloads succeeded, 1 = any failure)
    """
    from projects.core.steps import CollectArtifactsStep, CleanupDeploymentStep, RunGuideLLMStep
    from projects.rhaiis.workflows.steps import DeployVLLMStep, WaitForReadyStep

    # Load model config; fall back to treating `model` as a raw HF model ID.
    try:
        resolved = config_loader.load_model(model)
        hf_model_id = resolved.hf_model_id
        base_vllm_args = dict(resolved.vllm_args)
        env_vars = dict(resolved.env_vars)
        model_tp = resolved.tensor_parallel
        click.echo(f"Using resolved model config: {resolved.key}")
        click.echo(f" HF Model ID: {hf_model_id}")
        click.echo(f" vLLM args: {base_vllm_args}")
        if env_vars:
            click.echo(f" Env vars: {env_vars}")
    except KeyError:
        hf_model_id = model
        base_vllm_args = {}
        env_vars = {}
        model_tp = 1
        click.echo(f"Model not in registry, using defaults for: {model}")

    tensor_parallel = tensor_parallel if tensor_parallel is not None else model_tp
    # NOTE(review): duplicates BenchmarkWorkflow._sanitize_name - keep in sync.
    deployment_name = hf_model_id.split("/")[-1].lower().replace(".", "-").replace("_", "-")[:42]

    # Group workloads by their vllm_args.
    # Workloads with same vllm_args share a deployment, different vllm_args
    # get separate deployments. Unknown workloads fall into the no-override group.
    workload_groups: dict[tuple, list[str]] = {}
    for wl_key in workload_list:
        try:
            wl_config = config_loader.load_workload(wl_key)
            vllm_args_key = tuple(sorted(wl_config.vllm_args.items())) if wl_config.vllm_args else ()
        except KeyError:
            vllm_args_key = ()
        # Idiomatic grouping: setdefault replaces the manual membership check.
        workload_groups.setdefault(vllm_args_key, []).append(wl_key)

    num_groups = len(workload_groups)
    if num_groups > 1:
        click.echo(f"\nWorkloads grouped into {num_groups} deployment groups (different vllm_args)")

    failed = False
    total_workloads = len(workload_list)
    workload_idx = 0
    # Hoisted: the predictor endpoint depends only on deployment_name/namespace,
    # which are invariant across groups (KServe RawDeployment service naming).
    endpoint = f"http://{deployment_name}-predictor.{namespace}.svc.cluster.local:8080/v1"

    for group_idx, (vllm_args_key, group_workloads) in enumerate(workload_groups.items(), 1):
        vllm_args_override = dict(vllm_args_key) if vllm_args_key else {}

        # Merge model vllm_args with workload override (override wins)
        merged_vllm_args = dict(base_vllm_args)
        merged_vllm_args.update(vllm_args_override)

        if num_groups > 1:
            click.echo(f"\n=== Deployment Group {group_idx}/{num_groups} ===")
            click.echo(f"Workloads: {group_workloads}")
            if vllm_args_override:
                click.echo(f"vllm_args override: {vllm_args_override}")

        click.echo(f"Deploying vLLM for {hf_model_id}...")

        # Deploy
        ctx.step_number += 1
        ctx.current_step_name = "deploy"
        deploy_step = DeployVLLMStep(
            model=hf_model_id,
            deployment_name=deployment_name,
            vllm_image=vllm_image,
            tensor_parallel=tensor_parallel,
            namespace=namespace,
            runtime_args=merged_vllm_args,
            env_vars=env_vars,
        )
        deploy_result = deploy_step.execute(ctx)
        if not deploy_result.success:
            click.echo(f"Deployment failed: {deploy_result.message}", err=True)
            return 1

        # Wait
        ctx.step_number += 1
        ctx.current_step_name = "wait"
        wait_step = WaitForReadyStep(
            deployment_name=deployment_name,
            namespace=namespace,
            timeout_seconds=3600,
        )
        wait_result = wait_step.execute(ctx)
        if not wait_result.success:
            click.echo(f"Wait failed: {wait_result.message}", err=True)
            CleanupDeploymentStep(deployment_name, namespace).execute(ctx)
            return 1

        click.echo("vLLM deployed successfully!")

        # Run workloads in this group
        for idx, wl in enumerate(group_workloads, 1):
            workload_idx += 1
            click.echo(f"\n--- Workload {workload_idx}/{total_workloads}: {wl} ---")

            # Load workload config to get max_seconds and rates
            try:
                wl_config = config_loader.load_workload(wl)
                wl_max_seconds = wl_config.guidellm.get("max_seconds", 300)
                wl_rates = wl_config.guidellm.get("rates", [1, 50, 100])
                wl_rate_str = ",".join(str(r) for r in wl_rates)
            except KeyError:
                wl_max_seconds = 300
                wl_rate_str = "1,50,100"

            ctx.step_number += 1
            ctx.current_step_name = f"benchmark_{wl}"
            guidellm_step = RunGuideLLMStep(
                endpoint=endpoint,
                model=hf_model_id,
                namespace=namespace,
                workload=wl,
                max_requests=max_requests,
                max_seconds=wl_max_seconds,
                rate=wl_rate_str,
            )
            result = guidellm_step.execute(ctx)

            if not result.success:
                click.echo(f"Workload {wl} failed: {result.message}", err=True)
                failed = True
                break
            else:
                click.echo(f"Workload {wl} completed successfully")
                # Give vLLM a moment to drain before the next workload hits it
                if idx < len(group_workloads):
                    click.echo("Waiting 5s for in-flight requests to drain...")
                    time.sleep(5)

        # Cleanup this deployment before starting next group
        ctx.step_number += 1
        ctx.current_step_name = "collect_artifacts"
        click.echo("\nCollecting artifacts...")
        CollectArtifactsStep(app_label=deployment_name, namespace=namespace).execute(ctx)

        ctx.step_number += 1
        ctx.current_step_name = "cleanup"
        click.echo("Cleaning up deployment...")
        CleanupDeploymentStep(deployment_name, namespace).execute(ctx)

        if failed:
            break

    if failed:
        return 1
    else:
        click.echo(f"\nAll {total_workloads} workloads completed successfully!")
        return 0
"""RHAIIS benchmark workflow.

Deploy vLLM -> Run GuideLLM -> Collect Artifacts
"""

from projects.core.steps import CleanupDeploymentStep, CollectArtifactsStep, RunGuideLLMStep
from projects.core.workflow import Workflow, WorkflowContext
from projects.rhaiis.workflows.steps import DeployVLLMStep, WaitForReadyStep


class BenchmarkWorkflow(Workflow):
    """
    RHAIIS benchmark workflow: deploy vLLM, run benchmark, cleanup.

    Steps:
    1. deploy: Deploy vLLM serving
    2. wait: Wait for deployment to be ready
    3. benchmark: Run GuideLLM benchmark

    Finally:
    1. collect_artifacts: Collect logs and events
    2. cleanup: Delete deployment
    """

    def __init__(
        self,
        ctx: WorkflowContext,
        model: str,
        workload: str = "balanced",
        vllm_image: str = "",
        runtime_args: dict | None = None,
        tensor_parallel: int = 1,
        max_requests: int = 100,
        namespace: str = "forge",
        env_vars: dict | None = None,
    ):
        """
        Initialize benchmark workflow.

        Args:
            ctx: Workflow context
            model: HuggingFace model ID
            workload: GuideLLM workload type
            vllm_image: vLLM container image (from config)
            runtime_args: vLLM runtime arguments (from config)
            tensor_parallel: Number of GPUs for tensor parallelism
            max_requests: Maximum requests for benchmark
            namespace: Kubernetes namespace
            env_vars: Environment variables for vLLM (from config)
        """
        super().__init__(ctx)
        self.model = model
        self.workload = workload
        self.tensor_parallel = tensor_parallel
        self.max_requests = max_requests
        self.namespace = namespace
        # Fall back to the VLLM_IMAGE env var when no image was configured.
        self.vllm_image = vllm_image or ctx.get_env("VLLM_IMAGE", "")
        self.runtime_args = {} if runtime_args is None else runtime_args
        self.env_vars = {} if env_vars is None else env_vars
        # K8s resource name derived from the model ID
        self.deployment_name = self._sanitize_name(model)

    def define_steps(self):
        """Define workflow steps."""
        deploy = DeployVLLMStep(
            model=self.model,
            deployment_name=self.deployment_name,
            vllm_image=self.vllm_image,
            runtime_args=self.runtime_args,
            tensor_parallel=self.tensor_parallel,
            namespace=self.namespace,
            env_vars=self.env_vars,
        )

        # 3600s = 1 hour, enough for large models to load
        wait = WaitForReadyStep(
            deployment_name=self.deployment_name,
            namespace=self.namespace,
            timeout_seconds=3600,
        )

        # KServe RawDeployment mode creates a service named {name}-predictor
        endpoint = f"http://{self.deployment_name}-predictor.{self.namespace}.svc.cluster.local:8080/v1"
        benchmark = RunGuideLLMStep(
            endpoint=endpoint,
            model=self.model,
            namespace=self.namespace,
            workload=self.workload,
            max_requests=self.max_requests,
        )

        for step in (deploy, wait, benchmark):
            self.add_step(step)

        # Always-run steps: artifact collection, then deployment teardown
        self.add_finally(
            CollectArtifactsStep(
                app_label=self.deployment_name,
                namespace=self.namespace,
            )
        )
        self.add_finally(
            CleanupDeploymentStep(
                deployment_name=self.deployment_name,
                namespace=self.namespace,
            )
        )

    @staticmethod
    def _sanitize_name(name: str) -> str:
        """Sanitize model name for K8s resource naming."""
        return name.split("/")[-1].lower().replace(".", "-").replace("_", "-")[:42]


"""RHAIIS cleanup workflow - remove deployments and optionally operators."""


class CleanupWorkflow(Workflow):
    """
    Cleanup RHAIIS resources.

    Removes deployments from the benchmark namespace.
    Optionally can remove operators (not enabled by default).
    """

    def __init__(
        self,
        ctx: WorkflowContext,
        namespace: str = "forge",
        remove_operators: bool = False,
    ):
        """
        Initialize cleanup workflow.

        Args:
            ctx: Workflow context
            namespace: Namespace to clean up
            remove_operators: Whether to also remove operators
        """
        super().__init__(ctx)
        self.namespace = namespace
        self.remove_operators = remove_operators
+ + Args: + ctx: Workflow context + namespace: Namespace to clean up + remove_operators: Whether to also remove operators + """ + super().__init__(ctx) + self.namespace = namespace + self.remove_operators = remove_operators + + def define_steps(self): + """Define cleanup steps.""" + self.add_step( + CleanupNamespaceStep( + namespace=self.namespace, + delete_namespace=False, # Keep namespace, delete contents + ) + ) diff --git a/projects/rhaiis/workflows/prepare.py b/projects/rhaiis/workflows/prepare.py new file mode 100644 index 0000000..fb7f957 --- /dev/null +++ b/projects/rhaiis/workflows/prepare.py @@ -0,0 +1,36 @@ +"""RHAIIS prepare workflow - install operators.""" + +from projects.core.workflow import Workflow, WorkflowContext +from projects.rhaiis.workflows.steps import ( + InstallGPUOperatorStep, + InstallNFDOperatorStep, + InstallRHOAIOperatorStep, +) + + +class PrepareWorkflow(Workflow): + """ + Prepare cluster for RHAIIS benchmarking. + + Installs required operators: + 1. NFD (Node Feature Discovery) Operator + 2. GPU Operator (NVIDIA or AMD) + 3. RHOAI (Red Hat OpenShift AI) Operator + """ + + def __init__(self, ctx: WorkflowContext, rhoai_version: str = "2.19"): + """ + Initialize prepare workflow. 
+ + Args: + ctx: Workflow context + rhoai_version: RHOAI operator version + """ + super().__init__(ctx) + self.rhoai_version = rhoai_version + + def define_steps(self): + """Define operator installation steps.""" + self.add_step(InstallNFDOperatorStep()) + self.add_step(InstallGPUOperatorStep()) + self.add_step(InstallRHOAIOperatorStep(version=self.rhoai_version)) diff --git a/projects/rhaiis/workflows/steps/__init__.py b/projects/rhaiis/workflows/steps/__init__.py new file mode 100644 index 0000000..9d7c8f4 --- /dev/null +++ b/projects/rhaiis/workflows/steps/__init__.py @@ -0,0 +1,14 @@ +"""RHAIIS-specific workflow steps.""" + +from .cleanup import CleanupNamespaceStep +from .deploy import DeployVLLMStep, WaitForReadyStep +from .operators import InstallGPUOperatorStep, InstallNFDOperatorStep, InstallRHOAIOperatorStep + +__all__ = [ + "CleanupNamespaceStep", + "DeployVLLMStep", + "InstallGPUOperatorStep", + "InstallNFDOperatorStep", + "InstallRHOAIOperatorStep", + "WaitForReadyStep", +] diff --git a/projects/rhaiis/workflows/steps/cleanup.py b/projects/rhaiis/workflows/steps/cleanup.py new file mode 100644 index 0000000..661811b --- /dev/null +++ b/projects/rhaiis/workflows/steps/cleanup.py @@ -0,0 +1,101 @@ +"""Cleanup steps for RHAIIS.""" + +import logging +import subprocess +from typing import TYPE_CHECKING + +from projects.core.workflow import StepResult, WorkflowStep + +if TYPE_CHECKING: + from projects.core.workflow import WorkflowContext + +logger = logging.getLogger(__name__) + + +class CleanupNamespaceStep(WorkflowStep): + """Clean up all resources in a namespace.""" + + def __init__( + self, + namespace: str, + delete_namespace: bool = False, + name: str | None = None, + ): + """ + Initialize cleanup step. 
+ + Args: + namespace: Namespace to clean up + delete_namespace: Whether to delete the namespace itself + name: Optional step name + """ + super().__init__(name=name or "cleanup_namespace") + self.namespace = namespace + self.delete_namespace = delete_namespace + + def execute(self, ctx: "WorkflowContext") -> StepResult: + """Delete all resources in namespace.""" + deleted_resources: list[str] = [] + errors: list[str] = [] + + # Resource types to delete (KServe resources first, then standard K8s) + resource_types = [ + "inferenceservice", + "servingruntime", + "deployment", + "service", + "route", + "configmap", + "secret", + "pod", + ] + + for resource_type in resource_types: + try: + result = subprocess.run( + [ + "oc", "delete", resource_type, + "--all", + "-n", self.namespace, + "--ignore-not-found", + ], + capture_output=True, + text=True, + timeout=120, + ) + + if result.returncode == 0: + deleted_resources.append(resource_type) + else: + errors.append(f"{resource_type}: {result.stderr}") + + except subprocess.TimeoutExpired: + errors.append(f"{resource_type}: timeout") + except Exception as e: + errors.append(f"{resource_type}: {e}") + + # Optionally delete namespace + if self.delete_namespace: + try: + result = subprocess.run( + ["oc", "delete", "namespace", self.namespace, "--ignore-not-found"], + capture_output=True, + text=True, + timeout=300, + ) + if result.returncode == 0: + deleted_resources.append(f"namespace/{self.namespace}") + except Exception as e: + errors.append(f"namespace: {e}") + + message = f"Cleaned up {len(deleted_resources)} resource types" + if errors: + message += f" ({len(errors)} errors)" + for err in errors: + logger.warning(f"Cleanup error: {err}") + + return StepResult( + success=True, # Don't fail on cleanup errors + message=message, + data={"deleted": deleted_resources, "errors": errors}, + ) diff --git a/projects/rhaiis/workflows/steps/deploy.py b/projects/rhaiis/workflows/steps/deploy.py new file mode 100644 index 
"""RHAIIS vLLM deployment steps using KServe (ServingRuntime + InferenceService)."""

import logging
import subprocess
import time
import uuid
from typing import TYPE_CHECKING, Any

from projects.core.workflow import StepResult, WorkflowStep

if TYPE_CHECKING:
    from projects.core.workflow import WorkflowContext

logger = logging.getLogger(__name__)


class DeployVLLMStep(WorkflowStep):
    """
    Deploy vLLM serving on OpenShift using KServe.

    Creates a ServingRuntime and InferenceService for vLLM model serving.
    This is the recommended deployment method for RHAIIS/RHOAI.
    """

    def __init__(
        self,
        model: str,
        deployment_name: str,
        vllm_image: str,
        runtime_args: dict[str, Any],
        namespace: str = "forge",
        tensor_parallel: int | None = None,
        replicas: int = 1,
        accelerator: str = "nvidia",
        storage_source: str = "hf",
        storage_path: str | None = None,
        cpu_request: str = "4",
        memory_request: str = "16Gi",
        env_vars: dict[str, str] | None = None,
        name: str | None = None,
    ):
        """
        Initialize vLLM deployment step.

        Args:
            model: HuggingFace model ID (e.g., Qwen/Qwen3-0.6B)
            deployment_name: Name for K8s resources (ServingRuntime, InferenceService)
            vllm_image: Container image for vLLM (from config)
            runtime_args: vLLM runtime arguments (from config, includes all vllm_args)
            namespace: Kubernetes namespace
            tensor_parallel: Override tensor parallelism (default: from runtime_args)
            replicas: Number of replicas (minReplicas)
            accelerator: GPU accelerator type ("nvidia" or "amd")
            storage_source: Model storage source ("hf" for HuggingFace, "s3", "pvc")
            storage_path: Storage path (PVC name for hf, bucket path for s3)
            cpu_request: CPU request
            memory_request: Memory request
            env_vars: Environment variables (from config)
            name: Optional step name
        """
        super().__init__(name=name or "deploy")
        self.model = model
        self.deployment_name = deployment_name
        self.accelerator = accelerator.lower()
        self.vllm_image = vllm_image
        self.namespace = namespace
        self.replicas = replicas
        self.cpu_request = cpu_request
        self.memory_request = memory_request
        self.storage_source = storage_source
        self.storage_path = storage_path

        # Copy so later mutation of the caller's dict cannot affect this step
        self.runtime_args = dict(runtime_args)

        # tensor_parallel: explicit override wins, else runtime_args, else 1
        self.tensor_parallel = tensor_parallel or self.runtime_args.get("tensor-parallel-size", 1)

        self.env_vars = env_vars or {}
        # Short random tag used as a label to correlate resources with this run
        self.deployment_uuid = str(uuid.uuid4())[:8]

    def execute(self, ctx: "WorkflowContext") -> StepResult:
        """Deploy vLLM to OpenShift using KServe."""
        step_dir = ctx.artifact_dir / f"{ctx.step_number:03d}__{ctx.current_step_name}"
        step_dir.mkdir(parents=True, exist_ok=True)

        # Generate KServe YAML (ServingRuntime + InferenceService); keep a copy
        # in the step's artifact directory for post-mortem debugging.
        kserve_yaml = self._generate_kserve_yaml()
        yaml_path = step_dir / "kserve.yaml"
        yaml_path.write_text(kserve_yaml)

        # Ensure namespace exists (best-effort; `oc apply` is idempotent).
        # BUGFIX: removed a preceding `oc create namespace --dry-run=client -o yaml`
        # call whose output was discarded - it created nothing and had no effect.
        subprocess.run(
            ["oc", "apply", "-f", "-"],
            input=f"apiVersion: v1\nkind: Namespace\nmetadata:\n name: {self.namespace}\n",
            capture_output=True,
            text=True,
        )

        # Apply KServe resources
        try:
            result = subprocess.run(
                ["oc", "apply", "-f", str(yaml_path)],
                capture_output=True,
                text=True,
                timeout=60,
            )

            if result.returncode != 0:
                return StepResult.fail(
                    f"Failed to apply KServe resources: {result.stderr}",
                    error=RuntimeError(result.stderr),
                )

            logger.info(f"Deployed InferenceService {self.deployment_name} to {self.namespace}")
            return StepResult.ok(
                f"Deployed {self.deployment_name}",
                deployment_name=self.deployment_name,
                namespace=self.namespace,
                deployment_uuid=self.deployment_uuid,
            )

        except subprocess.TimeoutExpired as e:
            return StepResult.fail("Deployment timed out", error=e)
        except Exception as e:
            return StepResult.fail(f"Deployment error: {e}", error=e)

    def _generate_kserve_yaml(self) -> str:
        """Generate KServe ServingRuntime and InferenceService YAML."""
        # Build vLLM args
        args_lines = self._build_args_lines()

        # Build env vars
        env_lines = self._build_env_lines()

        # Shared memory volume (always needed for vLLM)
        volume_mounts = """
    volumeMounts:
    - name: shared-memory
      mountPath: /dev/shm"""
        volumes = """
  volumes:
  - name: shared-memory
    emptyDir:
      medium: Memory
      sizeLimit: 8Gi"""

        # GPU resource type based on accelerator
        gpu_resource = "nvidia.com/gpu" if self.accelerator == "nvidia" else "amd.com/gpu"

        # Storage URI based on source
        storage_uri = self._build_storage_uri()

        # NOTE(review): the prometheus.io/port annotation advertises 8000 while
        # the container listens on 8080 - confirm the intended metrics port.
        return f"""---
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
  annotations:
    opendatahub.io/template-display-name: ServingRuntime for vLLM | Forge
  labels:
    opendatahub.io/dashboard: "true"
  name: {self.deployment_name}
  namespace: {self.namespace}
spec:
  builtInAdapter:
    modelLoadingTimeoutMillis: 300000
  imagePullSecrets:
  - name: npalaska-image-pull
  containers:
  - command:
    - python3
    - -m
    - vllm.entrypoints.openai.api_server
    args:
{args_lines}
    env:
{env_lines}
    image: "{self.vllm_image}"
    name: kserve-container
    ports:
    - containerPort: 8080
      protocol: TCP{volume_mounts}
  multiModel: false
  supportedModelFormats:
  - autoSelect: true
    name: pytorch{volumes}
---
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "8000"
    serving.kserve.io/deploymentMode: RawDeployment
    serving.kserve.io/enable-prometheus-scraping: "true"
    storage.kserve.io/readonly: "false"
  labels:
    opendatahub.io/dashboard: "true"
    deployment_uuid: {self.deployment_uuid}
    app: {self.deployment_name}
  name: {self.deployment_name}
  namespace: {self.namespace}
spec:
  predictor:
    minReplicas: {self.replicas}
    model:
      resources:
        limits:
          {gpu_resource}: "{self.tensor_parallel}"
        requests:
          {gpu_resource}: "{self.tensor_parallel}"
          cpu: "{self.cpu_request}"
          memory: "{self.memory_request}"
      runtime: {self.deployment_name}
      modelFormat:
        name: pytorch
      storageUri: {storage_uri}
    serviceAccountName: sa
"""

    def _build_args_lines(self) -> str:
        """Build vLLM command line arguments as YAML list entries."""
        lines = []

        # Model argument depends on storage source: HF pulls by ID, other
        # sources mount the model at /mnt/models and alias the served name.
        if self.storage_source == "hf":
            lines.append(f"    - --model={self.model}")
        else:
            lines.append("    - --model=/mnt/models")
            lines.append(f"    - --served-model-name={self.model}")

        lines.append("    - --port=8080")

        # Add runtime args; booleans become bare flags (only when truthy)
        for key, val in self.runtime_args.items():
            if isinstance(val, bool):
                if val:
                    lines.append(f"    - --{key}")
            else:
                lines.append(f"    - --{key}={val}")

        return "\n".join(lines)

    def _build_env_lines(self) -> str:
        """Build environment variables as YAML list entries."""
        lines = []

        # HuggingFace storage source env vars (caches under the mounted PVC;
        # HF token comes from the storage-config secret)
        if self.storage_source == "hf":
            lines.extend([
                "    - name: HF_HUB_OFFLINE",
                '      value: "0"',
                "    - name: HOME",
                "      value: /mnt/models",
                "    - name: HF_HOME",
                "      value: /mnt/models",
                "    - name: VLLM_CACHE_DIR",
                "      value: /mnt/models/.cache/vllm",
                "    - name: HF_DATASETS_CACHE",
                "      value: /mnt/models/.cache/huggingface/datasets",
                "    - name: HF_TOKEN",
                "      valueFrom:",
                "        secretKeyRef:",
                "          name: storage-config",
                "          key: HF_TOKEN",
            ])

        # Additional env vars from config
        for key, val in self.env_vars.items():
            lines.append(f"    - name: {key}")
            lines.append(f'      value: "{val}"')

        # Empty YAML list when nothing to set
        return "\n".join(lines) if lines else "    []"

    def _build_storage_uri(self) -> str:
        """Build storage URI for InferenceService."""
        if self.storage_source == "hf":
            # Use PVC for HuggingFace models (model-pvc-2 is the default on H200)
            pvc_name = self.storage_path or "model-pvc-2"
            return f"pvc://{pvc_name}"
        elif self.storage_path:
            return f"{self.storage_source}://{self.storage_path}"
        else:
            return f"{self.storage_source}://{self.model}"


class WaitForReadyStep(WorkflowStep):
    """Wait for InferenceService to become ready."""

    def __init__(
        self,
        deployment_name: str,
        namespace: str = "forge",
        timeout_seconds: int = 3600,
        poll_interval: int = 10,
        name: str | None = None,
    ):
        """
        Initialize wait step.

        Args:
            deployment_name: Name of InferenceService to wait for
            namespace: Kubernetes namespace
            timeout_seconds: Maximum wait time
            poll_interval: Seconds between status checks
            name: Optional step name
        """
        super().__init__(name=name or "wait")
        self.deployment_name = deployment_name
        self.namespace = namespace
        self.timeout_seconds = timeout_seconds
        self.poll_interval = poll_interval
+ ) + + start_time = time.monotonic() + last_status_print = 0 + + while time.monotonic() - start_time < self.timeout_seconds: + elapsed = int(time.monotonic() - start_time) + + # Print status every 30 seconds + if elapsed - last_status_print >= 30: + click.echo(f" Still waiting... ({elapsed}s elapsed)") + last_status_print = elapsed + try: + # Check InferenceService status + result = subprocess.run( + [ + "oc", "get", "inferenceservice", + self.deployment_name, + "-n", self.namespace, + "-o", "jsonpath={.status.conditions[?(@.type=='Ready')].status}", + ], + capture_output=True, + text=True, + timeout=15, + ) + + if result.returncode == 0 and result.stdout.strip() == "True": + elapsed = time.monotonic() - start_time + logger.info(f"InferenceService ready in {elapsed:.1f}s") + + # Get the service URL + url_result = subprocess.run( + [ + "oc", "get", "inferenceservice", + self.deployment_name, + "-n", self.namespace, + "-o", "jsonpath={.status.url}", + ], + capture_output=True, + text=True, + ) + + # Health check: verify vLLM endpoint is actually responding + endpoint = f"http://{self.deployment_name}-predictor.{self.namespace}.svc.cluster.local:8080" + health_ok = self._wait_for_health_check(endpoint) + if not health_ok: + return StepResult.fail( + f"InferenceService ready but health check failed after {self.timeout_seconds}s" + ) + + total_elapsed = time.monotonic() - start_time + return StepResult.ok( + f"InferenceService ready and healthy in {total_elapsed:.1f}s", + ready_time_seconds=elapsed, + health_check_time_seconds=total_elapsed - elapsed, + service_url=url_result.stdout.strip() if url_result.returncode == 0 else None, + ) + + # Also check underlying deployment for debugging + deploy_result = subprocess.run( + [ + "oc", "rollout", "status", + f"deployment/{self.deployment_name}-predictor", + "-n", self.namespace, + "--timeout=5s", + ], + capture_output=True, + text=True, + timeout=10, + ) + + if deploy_result.returncode == 0: + logger.debug("Underlying 
deployment is ready, waiting for InferenceService...") + + except subprocess.TimeoutExpired: + pass # Continue waiting + except Exception as e: + logger.warning(f"Error checking status: {e}") + + time.sleep(self.poll_interval) + + # Timeout - collect debug info + self._log_debug_info() + + return StepResult.fail( + f"InferenceService not ready after {self.timeout_seconds}s" + ) + + def _log_debug_info(self): + """Log debug information on timeout.""" + try: + # Get InferenceService status + result = subprocess.run( + ["oc", "get", "inferenceservice", self.deployment_name, "-n", self.namespace, "-o", "yaml"], + capture_output=True, + text=True, + ) + if result.returncode == 0: + logger.error(f"InferenceService status:\n{result.stdout}") + + # Get pod status + result = subprocess.run( + ["oc", "get", "pods", "-l", f"serving.kserve.io/inferenceservice={self.deployment_name}", + "-n", self.namespace], + capture_output=True, + text=True, + ) + if result.returncode == 0: + logger.error(f"Pod status:\n{result.stdout}") + except Exception as e: + logger.warning(f"Failed to collect debug info: {e}") + + def _wait_for_health_check(self, endpoint: str, timeout: int = 120, interval: int = 5) -> bool: + """ + Wait for vLLM health endpoint to respond. + + Uses oc exec to curl the health endpoint from within the cluster. 
+ + Args: + endpoint: vLLM service endpoint URL + timeout: Maximum wait time in seconds + interval: Seconds between health check attempts + + Returns: + True if health check passes, False on timeout + """ + import click + + click.echo(f"Verifying vLLM health check at {endpoint}/health ...") + + start_time = time.monotonic() + + while time.monotonic() - start_time < timeout: + try: + # Get a pod name to exec into for health check + pod_result = subprocess.run( + [ + "oc", "get", "pods", + "-l", f"serving.kserve.io/inferenceservice={self.deployment_name}", + "-n", self.namespace, + "-o", "jsonpath={.items[0].metadata.name}", + ], + capture_output=True, + text=True, + timeout=10, + ) + + if pod_result.returncode != 0 or not pod_result.stdout.strip(): + logger.warning("No pod found for health check") + time.sleep(interval) + continue + + pod_name = pod_result.stdout.strip() + + # Try /health endpoint via localhost (pod-internal) + health_result = subprocess.run( + [ + "oc", "exec", pod_name, + "-n", self.namespace, + "-c", "kserve-container", + "--", + "curl", "-s", "-o", "/dev/null", "-w", "%{http_code}", + "http://localhost:8080/health", + ], + capture_output=True, + text=True, + timeout=15, + ) + + elapsed = int(time.monotonic() - start_time) + + if health_result.returncode == 0 and health_result.stdout.strip() == "200": + click.echo(f" Health check passed ({elapsed}s)") + return True + else: + click.echo(f" Health check pending... 
"""Operator installation steps for RHAIIS."""

import logging
import subprocess
from typing import TYPE_CHECKING

from projects.core.workflow import StepResult, WorkflowStep

if TYPE_CHECKING:
    from projects.core.workflow import WorkflowContext

logger = logging.getLogger(__name__)


def _ensure_namespace(namespace: str) -> None:
    """Best-effort `oc create namespace`; AlreadyExists errors are ignored.

    Shared by all operator-install steps so namespace handling stays
    consistent across them.
    """
    subprocess.run(
        ["oc", "create", "namespace", namespace],
        capture_output=True,
    )


class InstallNFDOperatorStep(WorkflowStep):
    """Install Node Feature Discovery operator."""

    def __init__(self, name: str | None = None):
        super().__init__(name=name or "install_nfd")

    def execute(self, ctx: "WorkflowContext") -> StepResult:
        """Install NFD operator.

        Writes the OLM Subscription manifest into the step's artifact
        directory, ensures the openshift-nfd namespace exists, then
        applies the manifest.
        """
        step_dir = ctx.artifact_dir / f"{ctx.step_number:03d}__{ctx.current_step_name}"

        subscription_yaml = """
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  name: nfd
  namespace: openshift-nfd
spec:
  channel: stable
  name: nfd
  source: redhat-operators
  sourceNamespace: openshift-marketplace
"""
        yaml_path = step_dir / "nfd-subscription.yaml"
        yaml_path.write_text(subscription_yaml)

        # BUG FIX: previous code ran
        #   oc create namespace openshift-nfd --dry-run=client -o yaml
        # which only renders the manifest and never creates the namespace.
        _ensure_namespace("openshift-nfd")

        # BUG FIX: the apply result was discarded, so a failed install
        # still reported success; now failures are surfaced like the
        # GPU/RHOAI steps do.
        if not self._run_oc(["apply", "-f", str(yaml_path)]):
            return StepResult.fail("Failed to install NFD operator")

        return StepResult.ok("NFD operator subscription created")

    def _run_oc(self, args: list[str]) -> bool:
        """Run an oc command; True on exit code 0, False on any failure."""
        try:
            result = subprocess.run(
                ["oc", *args],
                capture_output=True,
                text=True,
                timeout=60,
            )
            return result.returncode == 0
        except Exception as e:
            logger.warning(f"oc command failed: {e}")
            return False


class InstallGPUOperatorStep(WorkflowStep):
    """Install NVIDIA GPU operator."""

    def __init__(self, name: str | None = None):
        super().__init__(name=name or "install_gpu")

    def execute(self, ctx: "WorkflowContext") -> StepResult:
        """Install GPU operator: write Subscription manifest, ensure
        namespace, apply, and fail the step on apply errors."""
        step_dir = ctx.artifact_dir / f"{ctx.step_number:03d}__{ctx.current_step_name}"

        subscription_yaml = """
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  name: gpu-operator-certified
  namespace: nvidia-gpu-operator
spec:
  channel: v24.6
  name: gpu-operator-certified
  source: certified-operators
  sourceNamespace: openshift-marketplace
"""
        yaml_path = step_dir / "gpu-subscription.yaml"
        yaml_path.write_text(subscription_yaml)

        # Create namespace first (best-effort).
        _ensure_namespace("nvidia-gpu-operator")

        result = subprocess.run(
            ["oc", "apply", "-f", str(yaml_path)],
            capture_output=True,
            text=True,
        )

        if result.returncode != 0:
            return StepResult.fail(f"Failed to install GPU operator: {result.stderr}")

        return StepResult.ok("GPU operator subscription created")


class InstallRHOAIOperatorStep(WorkflowStep):
    """Install Red Hat OpenShift AI operator."""

    def __init__(self, version: str = "2.19", name: str | None = None):
        super().__init__(name=name or "install_rhoai")
        self.version = version

    def execute(self, ctx: "WorkflowContext") -> StepResult:
        """Install RHOAI operator at the configured version."""
        step_dir = ctx.artifact_dir / f"{ctx.step_number:03d}__{ctx.current_step_name}"

        # OLM channel naming convention for RHOAI: stable-<major.minor>.
        channel = f"stable-{self.version}"

        subscription_yaml = f"""
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  name: rhods-operator
  namespace: redhat-ods-operator
spec:
  channel: {channel}
  name: rhods-operator
  source: redhat-operators
  sourceNamespace: openshift-marketplace
"""
        yaml_path = step_dir / "rhoai-subscription.yaml"
        yaml_path.write_text(subscription_yaml)

        # Create namespace first (best-effort).
        _ensure_namespace("redhat-ods-operator")

        result = subprocess.run(
            ["oc", "apply", "-f", str(yaml_path)],
            capture_output=True,
            text=True,
        )

        if result.returncode != 0:
            return StepResult.fail(f"Failed to install RHOAI operator: {result.stderr}")

        return StepResult.ok(f"RHOAI operator {self.version} subscription created")
class TestConfigLoader:
    """Tests for ConfigLoader inheritance and resolution."""

    @pytest.fixture
    def config_dir(self, tmp_path):
        """Create a config directory with defaults, models, and workloads."""
        # defaults.yaml — global defaults plus per-accelerator settings;
        # AMD deliberately carries extra vllm_args/env_vars so inheritance
        # differences between accelerators are observable in the tests.
        defaults = {
            "defaults": {
                "deploy": {
                    "namespace": "forge",
                    "replicas": 1,
                    "cpu_request": "4",
                    "memory_request": "16Gi",
                },
                "vllm_args": {
                    "gpu-memory-utilization": 0.9,
                    "trust-remote-code": True,
                    "tensor-parallel-size": 1,  # Also determines num_gpus
                },
                "guidellm": {
                    "max_requests": 100,
                    "rate_type": "concurrent",
                },
            },
            "accelerators": {
                "nvidia": {
                    "image": "quay.io/rhaiis/cuda:latest",
                    "vllm_args": {},
                    "env_vars": {},
                },
                "amd": {
                    "image": "quay.io/rhaiis/rocm:latest",
                    "vllm_args": {
                        "num-scheduler-steps": 8,
                    },
                    "env_vars": {
                        "VLLM_ROCM_USE_AITER": "1",
                    },
                },
            },
        }
        (tmp_path / "defaults.yaml").write_text(yaml.safe_dump(defaults))

        # models.yaml — each model exercises a different resolution feature:
        # plain override, aliases, accelerator_overrides, model-level env_vars.
        models = {
            "models": {
                "qwen-0.6b": {
                    "name": "Qwen3-0.6B",
                    "hf_model_id": "Qwen/Qwen3-0.6B",
                    "vllm_args": {
                        "max-model-len": 8192,
                    },
                    "supported_workloads": ["balanced", "short"],
                },
                "llama-70b-fp8": {
                    "name": "Llama-3.3-70B-FP8",
                    "hf_model_id": "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic",
                    "aliases": ["llama-70b", "llama-fp8"],
                    "vllm_args": {
                        "tensor-parallel-size": 4,
                        "max-model-len": 32768,
                        "kv-cache-dtype": "fp8",
                    },
                    "supported_workloads": ["balanced", "short", "long-prompt"],
                },
                "deepseek-r1": {
                    "name": "DeepSeek-R1",
                    "hf_model_id": "deepseek-ai/DeepSeek-R1-0528",
                    "vllm_args": {
                        "tensor-parallel-size": 8,
                    },
                    "accelerator_overrides": {
                        "amd": {
                            "env_vars": {
                                "VLLM_ROCM_USE_AITER": "0",
                            },
                        },
                    },
                },
                # Model with env_vars that apply to all accelerators
                "model-with-env": {
                    "name": "Model With Env",
                    "hf_model_id": "test/model-with-env",
                    "env_vars": {
                        "VLLM_MXFP4_USE_MARLIN": "1",
                        "CUSTOM_VAR": "model-value",
                    },
                },
                # Model with both model-level and accelerator-specific env_vars
                "model-with-overrides": {
                    "name": "Model With Overrides",
                    "hf_model_id": "test/model-with-overrides",
                    "env_vars": {
                        "SHARED_VAR": "model-default",
                        "MODEL_ONLY_VAR": "from-model",
                    },
                    "accelerator_overrides": {
                        "nvidia": {
                            "env_vars": {
                                "TORCH_CUDA_ARCH_LIST": "9.0",
                                "SHARED_VAR": "nvidia-override",
                            },
                        },
                        "amd": {
                            "env_vars": {
                                "SHARED_VAR": "amd-override",
                            },
                        },
                    },
                },
            },
        }
        (tmp_path / "models.yaml").write_text(yaml.safe_dump(models))

        # workloads.yaml — long-prompt variants carry vllm_args overrides
        # used by the grouping tests elsewhere in this file.
        workloads = {
            "workloads": {
                "balanced": {
                    "name": "Balanced",
                    "description": "Balanced prompt and output (1k/1k)",
                    "guidellm": {
                        "data": "prompt_tokens=1000,output_tokens=1000",
                        "rates": [1, 50, 100],
                    },
                    "max_seconds": 180,
                },
                "short": {
                    "name": "Short",
                    "description": "Short prompt and output (256/256)",
                    "guidellm": {
                        "data": "prompt_tokens=256,output_tokens=256",
                    },
                    "max_seconds": 120,
                },
                "long-prompt": {
                    "name": "Long Prompt",
                    "description": "Long prompt (8k/1k) - requires larger context",
                    "guidellm": {
                        "data": "prompt_tokens=8000,output_tokens=1000",
                    },
                    "max_seconds": 300,
                    "vllm_args": {
                        "max-model-len": 10000,
                    },
                },
                "very-long-prompt": {
                    "name": "Very Long Prompt",
                    "description": "Very long prompt (16k/1k)",
                    "guidellm": {
                        "data": "prompt_tokens=16000,output_tokens=1000",
                    },
                    "max_seconds": 600,
                    "vllm_args": {
                        "max-model-len": 20000,
                    },
                },
            },
        }
        (tmp_path / "workloads.yaml").write_text(yaml.safe_dump(workloads))

        return tmp_path

    def test_load_model_basic(self, config_dir):
        """ConfigLoader loads model with defaults applied."""
        loader = ConfigLoader(config_dir, accelerator="nvidia")
        model = loader.load_model("qwen-0.6b")

        assert isinstance(model, ResolvedModelConfig)
        assert model.key == "qwen-0.6b"
        assert model.name == "Qwen3-0.6B"
        assert model.hf_model_id == "Qwen/Qwen3-0.6B"

    def test_defaults_inheritance(self, config_dir):
        """Model inherits from global defaults."""
        loader = ConfigLoader(config_dir, accelerator="nvidia")
        model = loader.load_model("qwen-0.6b")

        # From defaults
        assert model.vllm_args["gpu-memory-utilization"] == 0.9
        assert model.vllm_args["trust-remote-code"] is True

        # From model config (overrides default)
        assert model.vllm_args["max-model-len"] == 8192

    def test_accelerator_nvidia_defaults(self, config_dir):
        """NVIDIA accelerator uses correct settings."""
        loader = ConfigLoader(config_dir, accelerator="nvidia")
        model = loader.load_model("qwen-0.6b")

        # NVIDIA has no special vllm_args or env_vars
        assert "num-scheduler-steps" not in model.vllm_args
        assert model.env_vars == {}

    def test_accelerator_amd_defaults(self, config_dir):
        """AMD accelerator applies accelerator-specific settings."""
        loader = ConfigLoader(config_dir, accelerator="amd")
        model = loader.load_model("qwen-0.6b")

        # AMD accelerator defaults
        assert model.vllm_args["num-scheduler-steps"] == 8
        assert model.env_vars["VLLM_ROCM_USE_AITER"] == "1"

    def test_accelerator_overrides_in_model(self, config_dir):
        """Model-specific accelerator overrides take precedence."""
        # DeepSeek needs AITER disabled on AMD
        loader = ConfigLoader(config_dir, accelerator="amd")
        model = loader.load_model("deepseek-r1")

        # Model accelerator_override takes precedence over accelerator defaults
        assert model.env_vars["VLLM_ROCM_USE_AITER"] == "0"

    def test_model_level_env_vars(self, config_dir):
        """Model-level env_vars apply to all accelerators."""
        # Test on NVIDIA
        nvidia_loader = ConfigLoader(config_dir, accelerator="nvidia")
        model_nvidia = nvidia_loader.load_model("model-with-env")

        assert model_nvidia.env_vars["VLLM_MXFP4_USE_MARLIN"] == "1"
        assert model_nvidia.env_vars["CUSTOM_VAR"] == "model-value"

        # Test on AMD - same model env_vars plus AMD accelerator defaults
        amd_loader = ConfigLoader(config_dir, accelerator="amd")
        model_amd = amd_loader.load_model("model-with-env")

        assert model_amd.env_vars["VLLM_MXFP4_USE_MARLIN"] == "1"
        assert model_amd.env_vars["CUSTOM_VAR"] == "model-value"
        # Also gets AMD accelerator defaults
        assert model_amd.env_vars["VLLM_ROCM_USE_AITER"] == "1"

    def test_env_vars_inheritance_chain(self, config_dir):
        """Env vars follow inheritance: accelerator → model → model.accelerator_overrides."""
        # NVIDIA: accelerator has no env_vars, model has some, model.accelerator_overrides adds CUDA arch
        nvidia_loader = ConfigLoader(config_dir, accelerator="nvidia")
        model_nvidia = nvidia_loader.load_model("model-with-overrides")

        assert model_nvidia.env_vars["MODEL_ONLY_VAR"] == "from-model"
        assert model_nvidia.env_vars["TORCH_CUDA_ARCH_LIST"] == "9.0"
        # SHARED_VAR: nvidia override wins over model default
        assert model_nvidia.env_vars["SHARED_VAR"] == "nvidia-override"

        # AMD: accelerator has AITER, model has its vars, model.accelerator_overrides overrides SHARED_VAR
        amd_loader = ConfigLoader(config_dir, accelerator="amd")
        model_amd = amd_loader.load_model("model-with-overrides")

        assert model_amd.env_vars["MODEL_ONLY_VAR"] == "from-model"
        assert model_amd.env_vars["VLLM_ROCM_USE_AITER"] == "1"  # From AMD accelerator defaults
        # SHARED_VAR: amd override wins over model default
        assert model_amd.env_vars["SHARED_VAR"] == "amd-override"
        # No CUDA arch on AMD
        assert "TORCH_CUDA_ARCH_LIST" not in model_amd.env_vars

    def test_model_alias_lookup(self, config_dir):
        """ConfigLoader finds model by alias."""
        loader = ConfigLoader(config_dir, accelerator="nvidia")
        model = loader.load_model("llama-70b")

        # Alias resolves back to the canonical key.
        assert model.key == "llama-70b-fp8"
        assert model.hf_model_id == "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"

    def test_model_hf_id_lookup(self, config_dir):
        """ConfigLoader finds model by HuggingFace ID."""
        loader = ConfigLoader(config_dir, accelerator="nvidia")
        model = loader.load_model("Qwen/Qwen3-0.6B")

        assert model.key == "qwen-0.6b"

    def test_model_not_found(self, config_dir):
        """ConfigLoader raises KeyError for unknown model."""
        loader = ConfigLoader(config_dir, accelerator="nvidia")

        with pytest.raises(KeyError, match="not found"):
            loader.load_model("nonexistent-model")

    def test_num_gpus_property(self, config_dir):
        """ResolvedModelConfig.num_gpus returns correct value."""
        loader = ConfigLoader(config_dir, accelerator="nvidia")

        # Small model - 1 GPU
        small = loader.load_model("qwen-0.6b")
        assert small.num_gpus == 1

        # Large model - 4 GPUs
        large = loader.load_model("llama-70b-fp8")
        assert large.num_gpus == 4

    def test_tensor_parallel_property(self, config_dir):
        """ResolvedModelConfig.tensor_parallel returns correct value."""
        loader = ConfigLoader(config_dir, accelerator="nvidia")

        # Default TP=1
        small = loader.load_model("qwen-0.6b")
        assert small.tensor_parallel == 1

        # TP=4 from model config
        large = loader.load_model("llama-70b-fp8")
        assert large.tensor_parallel == 4

    def test_load_workload(self, config_dir):
        """ConfigLoader loads workload with guidellm defaults merged."""
        loader = ConfigLoader(config_dir, accelerator="nvidia")
        workload = loader.load_workload("balanced")

        assert isinstance(workload, ResolvedWorkloadConfig)
        assert workload.key == "balanced"
        assert workload.name == "Balanced"
        assert workload.max_seconds == 180

        # Guidellm config merged with defaults
        assert workload.guidellm["data"] == "prompt_tokens=1000,output_tokens=1000"
        assert workload.guidellm["rate_type"] == "concurrent"  # From defaults
        assert workload.guidellm["rates"] == [1, 50, 100]  # From workload

    def test_load_workload_without_vllm_args(self, config_dir):
        """Workload without vllm_args has empty dict."""
        loader = ConfigLoader(config_dir, accelerator="nvidia")
        workload = loader.load_workload("balanced")

        assert workload.vllm_args == {}

    def test_load_workload_with_vllm_args(self, config_dir):
        """Workload with vllm_args returns the override."""
        loader = ConfigLoader(config_dir, accelerator="nvidia")
        workload = loader.load_workload("long-prompt")

        assert workload.key == "long-prompt"
        assert workload.vllm_args == {"max-model-len": 10000}

    def test_workload_not_found(self, config_dir):
        """ConfigLoader raises KeyError for unknown workload."""
        loader = ConfigLoader(config_dir, accelerator="nvidia")

        with pytest.raises(KeyError, match="not found"):
            loader.load_workload("nonexistent")

    def test_get_image(self, config_dir):
        """ConfigLoader returns correct image for accelerator."""
        nvidia_loader = ConfigLoader(config_dir, accelerator="nvidia")
        assert nvidia_loader.get_image() == "quay.io/rhaiis/cuda:latest"

        amd_loader = ConfigLoader(config_dir, accelerator="amd")
        assert amd_loader.get_image() == "quay.io/rhaiis/rocm:latest"

    def test_list_models(self, config_dir):
        """ConfigLoader.list_models returns all model keys."""
        loader = ConfigLoader(config_dir, accelerator="nvidia")
        models = loader.list_models()

        assert "qwen-0.6b" in models
        assert "llama-70b-fp8" in models
        assert "deepseek-r1" in models
        assert "model-with-env" in models
        assert "model-with-overrides" in models
        assert len(models) == 5

    def test_list_workloads(self, config_dir):
        """ConfigLoader.list_workloads returns all workload keys."""
        loader = ConfigLoader(config_dir, accelerator="nvidia")
        workloads = loader.list_workloads()

        assert "balanced" in workloads
        assert "short" in workloads
        assert "long-prompt" in workloads
        assert "very-long-prompt" in workloads
        assert len(workloads) == 4

    def test_caching(self, config_dir):
        """ConfigLoader caches loaded configs."""
        loader = ConfigLoader(config_dir, accelerator="nvidia")

        # Access defaults twice - should be same object (identity check
        # proves the YAML is parsed once, not re-read per access).
        defaults1 = loader.defaults
        defaults2 = loader.defaults
        assert defaults1 is defaults2

        # Same for models
        models1 = loader.models
        models2 = loader.models
        assert models1 is models2
class TestConfigLoaderScenarios:
    """Tests for ConfigLoader scenario loading."""

    @pytest.fixture
    def full_config_dir(self, tmp_path):
        """Create full config directory with scenarios."""
        # Create base configs (minimal defaults/models/workloads so the
        # scenario file can reference them).
        defaults = {
            "defaults": {
                "deploy": {"namespace": "forge"},
                "vllm_args": {"gpu-memory-utilization": 0.9},
                "guidellm": {"max_requests": 100},
            },
            "accelerators": {
                "nvidia": {"image": "cuda:latest"},
            },
        }
        (tmp_path / "defaults.yaml").write_text(yaml.safe_dump(defaults))

        models = {
            "models": {
                "test-model": {
                    "hf_model_id": "test/model",
                    "vllm_args": {"max-model-len": 4096},
                },
            },
        }
        (tmp_path / "models.yaml").write_text(yaml.safe_dump(models))

        workloads = {
            "workloads": {
                "balanced": {"guidellm": {"data": "1k"}},
            },
        }
        (tmp_path / "workloads.yaml").write_text(yaml.safe_dump(workloads))

        # Create scenarios directory
        scenarios_dir = tmp_path / "scenarios"
        scenarios_dir.mkdir()

        # Scenario-level defaults override the globals (namespace here).
        scenario = {
            "name": "test-scenario",
            "defaults": {
                "deploy": {
                    "namespace": "test-ns",
                },
            },
            "scenarios": [
                {
                    "model": "test-model",
                    "workloads": ["balanced"],
                },
            ],
        }
        (scenarios_dir / "test.yaml").write_text(yaml.safe_dump(scenario))

        return tmp_path

    def test_load_scenario(self, full_config_dir):
        """ConfigLoader loads scenario with resolved defaults."""
        loader = ConfigLoader(full_config_dir, accelerator="nvidia")
        scenario = loader.load_scenario(full_config_dir / "scenarios" / "test.yaml")

        assert scenario["name"] == "test-scenario"
        assert scenario["_accelerator"] == "nvidia"

        # Resolved defaults merge global + scenario
        resolved = scenario["_resolved_defaults"]
        assert resolved["deploy"]["namespace"] == "test-ns"  # From scenario
        assert resolved["vllm_args"]["gpu-memory-utilization"] == 0.9  # From global
class TestDeepMerge:
    """Tests for the deep_merge utility function."""

    def test_basic_merge(self):
        """Flat keys: override wins, new keys are added."""
        from projects.core.scenarios.config_loader import deep_merge

        lhs = {"a": 1, "b": 2}
        rhs = {"b": 3, "c": 4}
        assert deep_merge(lhs, rhs) == {"a": 1, "b": 3, "c": 4}

    def test_nested_merge(self):
        """Nested dictionaries are merged recursively."""
        from projects.core.scenarios.config_loader import deep_merge

        merged = deep_merge(
            {"outer": {"a": 1, "b": 2}},
            {"outer": {"b": 3, "c": 4}},
        )
        assert merged == {"outer": {"a": 1, "b": 3, "c": 4}}

    def test_lists_replaced(self):
        """Lists are replaced wholesale, never merged element-wise."""
        from projects.core.scenarios.config_loader import deep_merge

        merged = deep_merge({"items": [1, 2, 3]}, {"items": [4, 5]})
        assert merged == {"items": [4, 5]}

    def test_no_mutation(self):
        """deep_merge leaves both input dicts untouched."""
        from projects.core.scenarios.config_loader import deep_merge

        lhs = {"a": {"b": 1}}
        rhs = {"a": {"c": 2}}

        merged = deep_merge(lhs, rhs)

        # Inputs unchanged …
        assert lhs == {"a": {"b": 1}}
        assert rhs == {"a": {"c": 2}}
        # … while the result carries both halves.
        assert merged == {"a": {"b": 1, "c": 2}}
class TestWorkloadVllmArgsGrouping:
    """Tests for workload-specific vllm_args and deployment grouping."""

    @pytest.fixture
    def config_dir_with_vllm_args(self, tmp_path):
        """Create config directory with workloads that have vllm_args."""
        # defaults.yaml
        defaults = {
            "defaults": {
                "deploy": {"namespace": "forge"},
                "vllm_args": {"gpu-memory-utilization": 0.9, "max-model-len": 4096},
                "guidellm": {"max_requests": 100},
            },
            "accelerators": {
                "nvidia": {"image": "cuda:latest"},
            },
        }
        (tmp_path / "defaults.yaml").write_text(yaml.safe_dump(defaults))

        # models.yaml
        models = {
            "models": {
                "test-model": {
                    "hf_model_id": "test/model",
                    "vllm_args": {"trust-remote-code": True},
                },
            },
        }
        (tmp_path / "models.yaml").write_text(yaml.safe_dump(models))

        # workloads.yaml - some with vllm_args, some without
        workloads = {
            "workloads": {
                "balanced": {
                    "name": "Balanced",
                    "guidellm": {"data": "1k/1k"},
                },
                "short": {
                    "name": "Short",
                    "guidellm": {"data": "256/256"},
                },
                "long-prompt": {
                    "name": "Long Prompt",
                    "guidellm": {"data": "8k/1k"},
                    "vllm_args": {"max-model-len": 10000},
                },
                "very-long-prompt": {
                    "name": "Very Long Prompt",
                    "guidellm": {"data": "16k/1k"},
                    "vllm_args": {"max-model-len": 20000},
                },
            },
        }
        (tmp_path / "workloads.yaml").write_text(yaml.safe_dump(workloads))

        # scenarios/test.yaml
        scenarios_dir = tmp_path / "scenarios"
        scenarios_dir.mkdir()
        scenario = {
            "name": "test-scenario",
            "scenarios": [
                {
                    "model": "test-model",
                    "workloads": ["balanced", "short", "long-prompt", "very-long-prompt"],
                },
            ],
        }
        (scenarios_dir / "test.yaml").write_text(yaml.safe_dump(scenario))

        return tmp_path

    def test_workload_config_has_vllm_args(self, config_dir_with_vllm_args):
        """WorkloadConfig parses vllm_args from config."""
        from projects.core.scenarios.generator import WorkloadConfig

        wl = WorkloadConfig.from_dict("long-prompt", {
            "name": "Long Prompt",
            "guidellm": {"data": "8k/1k"},
            "vllm_args": {"max-model-len": 10000},
        })

        assert wl.vllm_args == {"max-model-len": 10000}

    def test_workload_config_empty_vllm_args(self, config_dir_with_vllm_args):
        """WorkloadConfig without vllm_args has empty dict."""
        from projects.core.scenarios.generator import WorkloadConfig

        wl = WorkloadConfig.from_dict("balanced", {
            "name": "Balanced",
            "guidellm": {"data": "1k/1k"},
        })

        assert wl.vllm_args == {}

    def test_deployment_group_merged_vllm_args(self, config_dir_with_vllm_args):
        """DeploymentGroup.merged_vllm_args combines model and workload args."""
        from projects.core.scenarios.generator import DeploymentGroup, ModelConfig

        model = ModelConfig(
            key="test-model",
            name="Test Model",
            hf_model_id="test/model",
            vllm_args={"gpu-memory-utilization": 0.9, "max-model-len": 4096},
        )
        group = DeploymentGroup(
            model=model,
            tensor_parallel=1,
            routing="direct",
            workloads=[],
            vllm_args_override={"max-model-len": 10000},
        )

        merged = group.merged_vllm_args

        # Model args preserved
        assert merged["gpu-memory-utilization"] == 0.9
        # Workload override wins
        assert merged["max-model-len"] == 10000

    def test_expand_grouped_separates_by_vllm_args(self, config_dir_with_vllm_args):
        """ScenarioGenerator groups workloads by vllm_args."""
        from projects.core.scenarios.generator import ScenarioGenerator

        gen = ScenarioGenerator(
            scenarios_path=config_dir_with_vllm_args / "scenarios" / "test.yaml",
            config_dir=config_dir_with_vllm_args,
            accelerator="nvidia",
        )
        gen.load()
        groups = gen.expand_grouped()

        # Should have 3 groups:
        # 1. balanced + short (no vllm_args)
        # 2. long-prompt (max-model-len: 10000)
        # 3. very-long-prompt (max-model-len: 20000)
        assert len(groups) == 3

        # Find each group by its vllm_args
        no_override_group = None
        long_prompt_group = None
        very_long_group = None

        for g in groups:
            if not g.vllm_args_override:
                no_override_group = g
            elif g.vllm_args_override.get("max-model-len") == 10000:
                long_prompt_group = g
            elif g.vllm_args_override.get("max-model-len") == 20000:
                very_long_group = g

        # Group without override has balanced + short
        assert no_override_group is not None
        assert len(no_override_group.workloads) == 2
        assert {w.key for w in no_override_group.workloads} == {"balanced", "short"}

        # long-prompt group
        assert long_prompt_group is not None
        assert len(long_prompt_group.workloads) == 1
        assert long_prompt_group.workloads[0].key == "long-prompt"
        assert long_prompt_group.vllm_args_override == {"max-model-len": 10000}

        # very-long-prompt group
        assert very_long_group is not None
        assert len(very_long_group.workloads) == 1
        assert very_long_group.workloads[0].key == "very-long-prompt"
        assert very_long_group.vllm_args_override == {"max-model-len": 20000}

    def test_same_vllm_args_same_group(self, tmp_path):
        """Workloads with identical vllm_args share a deployment group."""
        # Create config where two workloads have same vllm_args
        defaults = {
            "defaults": {"vllm_args": {}},
            "accelerators": {"nvidia": {"image": "cuda:latest"}},
        }
        (tmp_path / "defaults.yaml").write_text(yaml.safe_dump(defaults))

        models = {
            "models": {
                "test-model": {"hf_model_id": "test/model"},
            },
        }
        (tmp_path / "models.yaml").write_text(yaml.safe_dump(models))

        workloads = {
            "workloads": {
                "long-a": {
                    "guidellm": {"data": "a"},
                    "vllm_args": {"max-model-len": 10000},
                },
                "long-b": {
                    "guidellm": {"data": "b"},
                    "vllm_args": {"max-model-len": 10000},  # Same as long-a
                },
            },
        }
        (tmp_path / "workloads.yaml").write_text(yaml.safe_dump(workloads))

        scenarios_dir = tmp_path / "scenarios"
        scenarios_dir.mkdir()
        scenario = {
            "name": "test",
            "scenarios": [{"model": "test-model", "workloads": ["long-a", "long-b"]}],
        }
        (scenarios_dir / "test.yaml").write_text(yaml.safe_dump(scenario))

        from projects.core.scenarios.generator import ScenarioGenerator

        gen = ScenarioGenerator(
            scenarios_path=tmp_path / "scenarios" / "test.yaml",
            config_dir=tmp_path,
        )
        gen.load()
        groups = gen.expand_grouped()

        # Both workloads have same vllm_args -> 1 group
        assert len(groups) == 1
        assert len(groups[0].workloads) == 2
        assert {w.key for w in groups[0].workloads} == {"long-a", "long-b"}
class TestScenarioGenerator:
    """Tests for ScenarioGenerator."""

    @pytest.fixture
    def sample_config_path(self):
        """Create a sample scenarios.yaml file using new format."""
        # NOTE(review): NamedTemporaryFile(delete=False) is never unlinked,
        # so each test run leaves a stray temp file — consider tmp_path.
        config = {
            "name": "test-scenarios",
            "description": "Test scenario configuration",
            "common": {
                "namespace": "forge",
                "runtime_args": {
                    "dtype": "auto",
                    "gpu-memory-utilization": 0.9,
                },
            },
            "workloads": {
                "balanced": {
                    "description": "Balanced workload",
                    "guidellm": {"max_requests": 100},
                },
                "short": {
                    "description": "Short workload",
                    "guidellm": {"max_requests": 50},
                },
            },
            "routing": {
                "direct": {"mode": "direct"},
            },
            # New format: models section with model definitions
            "models": {
                "qwen-0.6b": {
                    "hf_model_id": "Qwen/Qwen3-0.6B",
                    "name": "qwen3-0-6b",
                    "vllm_args": {"max-model-len": 4096},
                },
            },
            # New format: scenarios list references model keys
            "scenarios": [
                {
                    "model": "qwen-0.6b",
                    "workloads": ["balanced", "short"],
                    "routing": ["direct"],
                    "tensor_parallel": [1, 2],
                },
            ],
        }

        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".yaml", delete=False
        ) as f:
            yaml.safe_dump(config, f)
        yield Path(f.name)

    def test_load_config(self, sample_config_path):
        """Generator loads and parses YAML config."""
        gen = ScenarioGenerator(sample_config_path)
        config = gen.load()

        assert config.name == "test-scenarios"
        assert config.description == "Test scenario configuration"
        assert "qwen-0.6b" in config.models

    def test_matrix_expansion(self, sample_config_path):
        """Matrix expansion produces correct number of scenarios."""
        gen = ScenarioGenerator(sample_config_path)
        gen.load()

        scenarios = gen.expand()

        # 2 workloads × 1 routing × 2 TP = 4 scenarios
        assert len(scenarios) == 4

    def test_expanded_scenario_ids(self, sample_config_path):
        """Expanded scenarios have deterministic IDs."""
        gen = ScenarioGenerator(sample_config_path)
        gen.load()

        scenarios = gen.expand()
        scenario_ids = [s.scenario_id for s in scenarios]

        # Check expected scenario IDs (model_short derived from model key)
        assert "qwen-0-6b_balanced_direct_tp1" in scenario_ids
        assert "qwen-0-6b_balanced_direct_tp2" in scenario_ids
        assert "qwen-0-6b_short_direct_tp1" in scenario_ids
        assert "qwen-0-6b_short_direct_tp2" in scenario_ids

    def test_runtime_args_merging(self, sample_config_path):
        """Runtime args come from model vllm_args + tensor_parallel."""
        gen = ScenarioGenerator(sample_config_path)
        gen.load()

        scenarios = gen.expand()

        for s in scenarios:
            # Model-specific vllm_args
            assert s.runtime_args["max-model-len"] == 4096
            # TP from matrix
            assert s.runtime_args["tensor-parallel-size"] == s.tensor_parallel

    def test_workload_config_applied(self, sample_config_path):
        """Workload config is available in workload_config."""
        gen = ScenarioGenerator(sample_config_path)
        gen.load()

        scenarios = gen.expand()

        for s in scenarios:
            # Workload guidellm config is in workload_config
            assert "max_requests" in s.workload_config

    def test_deploy_config_num_gpus(self, sample_config_path):
        """Deploy config num_gpus matches tensor-parallel-size."""
        gen = ScenarioGenerator(sample_config_path)
        gen.load()

        scenarios = gen.expand()

        for s in scenarios:
            assert s.deploy_config["num_gpus"] == s.tensor_parallel

    def test_summary(self, sample_config_path):
        """Summary produces readable output."""
        gen = ScenarioGenerator(sample_config_path)
        gen.load()

        summary = gen.summary()

        assert "test-scenarios" in summary
        assert "qwen-0.6b" in summary  # Model key appears in deployment groups
        assert "Total Benchmark Runs: 4" in summary

    def test_to_scenario_config(self, sample_config_path):
        """ExpandedScenario converts to ScenarioConfig."""
        gen = ScenarioGenerator(sample_config_path)
        gen.load()

        scenarios = gen.expand()
        scenario_config = scenarios[0].to_scenario_config(namespace="test-ns")

        assert isinstance(scenario_config, ScenarioConfig)
        assert scenario_config.namespace == "test-ns"
        assert scenario_config.model_id == scenarios[0].model_id

    def test_to_dict(self, sample_config_path):
        """ExpandedScenario serializes to dict."""
        gen = ScenarioGenerator(sample_config_path)
        gen.load()

        scenarios = gen.expand()
        d = scenarios[0].to_dict()

        assert "model_id" in d
        assert "scenario_id" in d
        assert "runtime_args" in d
        assert d["model_id"] == scenarios[0].model_id
assert "scenario_id" in d + assert "runtime_args" in d + assert d["model_id"] == scenarios[0].model_id + + +class TestScenarioConfig: + """Tests for ScenarioConfig utilities.""" + + def test_sanitize_name(self): + """sanitize_name produces K8s-compatible names.""" + # Dots are removed for K8s compatibility + assert ScenarioConfig.sanitize_name("Qwen/Qwen3-0.6B") == "qwen-qwen3-06b" + assert ScenarioConfig.sanitize_name("test_name") == "test-name" + assert ( + ScenarioConfig.sanitize_name("very-long-name" * 10, max_len=20) + == "very-long-namevery-l" + ) + + def test_shorten_model_name(self): + """shorten_model_name extracts short name.""" + assert ScenarioConfig.shorten_model_name("Qwen/Qwen3-0.6B") == "qwen3-0-6b" + assert ( + ScenarioConfig.shorten_model_name("openai/gpt-oss-120b") == "gpt-oss-120b" + ) + assert ( + ScenarioConfig.shorten_model_name("RedHatAI/model-instruct") + == "model" + ) + assert ( + ScenarioConfig.shorten_model_name("org/model-dynamic") == "model" + ) + + +class TestExplicitRuns: + """Tests for explicit run definitions (no matrix).""" + + @pytest.fixture + def explicit_runs_config(self): + """Config with explicit runs instead of matrix.""" + config = { + "name": "explicit-runs", + "common": {"namespace": "forge"}, + "workloads": {"balanced": {"guidellm": {"max_requests": 100}}}, + "routing": {"direct": {"mode": "direct"}}, + "models": { + "test-model": { + "hf_model_id": "test/model", + "name": "test-model", + "vllm_args": {"extra": "value"}, + }, + }, + "runs": [ + { + "model": "test-model", + "workload": "balanced", + "routing": "direct", + "tensor_parallel": 4, + }, + ], + } + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".yaml", delete=False + ) as f: + yaml.safe_dump(config, f) + yield Path(f.name) + + def test_explicit_run_expansion(self, explicit_runs_config): + """Explicit runs expand without matrix.""" + gen = ScenarioGenerator(explicit_runs_config) + gen.load() + + scenarios = gen.expand() + + assert len(scenarios) == 1 
+ s = scenarios[0] + assert s.model_id == "test/model" + assert s.workload == "balanced" + assert s.tensor_parallel == 4 + assert s.runtime_args["tensor-parallel-size"] == 4 + assert s.runtime_args["extra"] == "value" diff --git a/tests/core/steps/__init__.py b/tests/core/steps/__init__.py new file mode 100644 index 0000000..67b31d7 --- /dev/null +++ b/tests/core/steps/__init__.py @@ -0,0 +1 @@ +"""Tests for shared steps.""" diff --git a/tests/core/steps/test_artifacts.py b/tests/core/steps/test_artifacts.py new file mode 100644 index 0000000..6f4bfeb --- /dev/null +++ b/tests/core/steps/test_artifacts.py @@ -0,0 +1,122 @@ +"""Unit tests for artifact collection steps.""" + +import subprocess +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from projects.core.steps import CleanupDeploymentStep, CollectArtifactsStep +from projects.core.workflow import WorkflowContext + + +class TestCollectArtifactsStep: + """Tests for CollectArtifactsStep.""" + + @pytest.fixture + def temp_artifact_dir(self): + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.fixture + def context(self, temp_artifact_dir): + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + ctx.get_step_artifact_dir("collect_artifacts") + return ctx + + def test_step_name(self): + """Step has correct default name.""" + step = CollectArtifactsStep() + assert step.name == "collect_artifacts" + + def test_custom_app_label(self): + """Step accepts custom app label.""" + step = CollectArtifactsStep(app_label="custom-app") + assert step.app_label == "custom-app" + + @patch("subprocess.run") + def test_collects_logs(self, mock_run, context): + """CollectArtifactsStep collects pod logs.""" + mock_run.return_value = MagicMock( + returncode=0, stdout="log output", stderr="" + ) + + step = CollectArtifactsStep(app_label="test-app") + result = step.execute(context) + + assert result.success + # Verify oc logs was 
called + calls = [str(c) for c in mock_run.call_args_list] + assert any("logs" in str(c) for c in calls) + + @patch("subprocess.run") + def test_never_fails(self, mock_run, context): + """CollectArtifactsStep never fails the workflow.""" + mock_run.return_value = MagicMock( + returncode=1, stdout="", stderr="command failed" + ) + + step = CollectArtifactsStep() + result = step.execute(context) + + # Should succeed even if oc commands fail + assert result.success + + +class TestCleanupDeploymentStep: + """Tests for CleanupDeploymentStep.""" + + @pytest.fixture + def temp_artifact_dir(self): + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.fixture + def context(self, temp_artifact_dir): + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + ctx.get_step_artifact_dir("cleanup") + return ctx + + def test_step_name(self): + """Step has correct default name.""" + step = CleanupDeploymentStep(deployment_name="test") + assert step.name == "cleanup" + + @patch("subprocess.run") + def test_deletes_deployment(self, mock_run, context): + """CleanupDeploymentStep deletes deployment.""" + mock_run.return_value = MagicMock(returncode=0) + + step = CleanupDeploymentStep(deployment_name="test-deploy") + result = step.execute(context) + + assert result.success + # Check deployment was deleted + calls = [str(c) for c in mock_run.call_args_list] + assert any("deployment" in str(c) for c in calls) + + @patch("subprocess.run") + def test_deletes_service_and_route(self, mock_run, context): + """CleanupDeploymentStep deletes associated resources.""" + mock_run.return_value = MagicMock(returncode=0) + + step = CleanupDeploymentStep( + deployment_name="test", + delete_service=True, + delete_route=True, + ) + result = step.execute(context) + + assert result.success + + @patch("subprocess.run") + def test_never_fails(self, mock_run, context): + """CleanupDeploymentStep never fails the workflow.""" + mock_run.return_value = 
MagicMock(returncode=1) + + step = CleanupDeploymentStep(deployment_name="test") + result = step.execute(context) + + # Should succeed even if deletes fail + assert result.success diff --git a/tests/core/steps/test_guidellm.py b/tests/core/steps/test_guidellm.py new file mode 100644 index 0000000..867878c --- /dev/null +++ b/tests/core/steps/test_guidellm.py @@ -0,0 +1,117 @@ +"""Unit tests for GuideLLM step.""" + +import subprocess +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from projects.core.steps import RunGuideLLMStep +from projects.core.workflow import WorkflowContext + + +class TestRunGuideLLMStep: + """Tests for RunGuideLLMStep.""" + + @pytest.fixture + def temp_artifact_dir(self): + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.fixture + def context(self, temp_artifact_dir): + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + ctx.get_step_artifact_dir("benchmark") + return ctx + + def test_step_name(self): + """Step has correct default name.""" + step = RunGuideLLMStep( + endpoint="http://localhost:8080/v1", + model="test-model", + ) + assert step.name == "benchmark" + + def test_custom_step_name(self): + """Step accepts custom name.""" + step = RunGuideLLMStep( + endpoint="http://localhost:8080/v1", + model="test-model", + name="custom-benchmark", + ) + assert step.name == "custom-benchmark" + + @patch("subprocess.run") + def test_execute_success(self, mock_run, context): + """RunGuideLLMStep executes successfully.""" + # Create mock output file + step_dir = context.artifact_dir / f"{context.step_number:03d}__{context.current_step_name}" + step_dir.mkdir(parents=True, exist_ok=True) + output_file = step_dir / "guidellm_results.json" + output_file.write_text('{"results": []}') + + # Mock responses for: oc apply, get phase, get logs (with marker), get logs (collect), delete + mock_run.side_effect = [ + MagicMock(returncode=0, 
stdout="pod created", stderr=""), # oc apply + MagicMock(returncode=0, stdout="Running", stderr=""), # get phase + MagicMock(returncode=0, stdout="BENCHMARK_COMPLETE", stderr=""), # get logs (marker check) + MagicMock(returncode=0, stdout="benchmark logs", stderr=""), # collect logs + MagicMock(returncode=0, stdout="pod deleted", stderr=""), # delete pod + ] + + step = RunGuideLLMStep( + endpoint="http://localhost:8080/v1", + model="test-model", + workload="balanced", + max_requests=10, + ) + result = step.execute(context) + + assert result.success + assert mock_run.called + + @patch("subprocess.run") + def test_execute_failure(self, mock_run, context): + """RunGuideLLMStep handles failure.""" + mock_run.return_value = MagicMock( + returncode=1, stdout="", stderr="benchmark failed" + ) + + step = RunGuideLLMStep( + endpoint="http://localhost:8080/v1", + model="test-model", + ) + result = step.execute(context) + + assert not result.success + assert "failed" in result.message.lower() + + @patch("subprocess.run") + def test_handles_timeout(self, mock_run, context): + """RunGuideLLMStep handles timeout.""" + mock_run.side_effect = subprocess.TimeoutExpired("guidellm", 60) + + step = RunGuideLLMStep( + endpoint="http://localhost:8080/v1", + model="test-model", + max_seconds=60, + ) + result = step.execute(context) + + assert not result.success + assert "timed out" in result.message.lower() + + @patch("subprocess.run") + def test_handles_missing_command(self, mock_run, context): + """RunGuideLLMStep handles missing guidellm command.""" + mock_run.side_effect = FileNotFoundError("guidellm not found") + + step = RunGuideLLMStep( + endpoint="http://localhost:8080/v1", + model="test-model", + ) + result = step.execute(context) + + assert not result.success + assert "not found" in result.message.lower() diff --git a/tests/core/utils/__init__.py b/tests/core/utils/__init__.py new file mode 100644 index 0000000..96c5c33 --- /dev/null +++ b/tests/core/utils/__init__.py @@ -0,0 
+1 @@ +"""Tests for core utilities.""" diff --git a/tests/core/utils/test_oc.py b/tests/core/utils/test_oc.py new file mode 100644 index 0000000..bd24016 --- /dev/null +++ b/tests/core/utils/test_oc.py @@ -0,0 +1,496 @@ +"""Unit tests for OC wrapper with retry logic.""" + +import subprocess +from unittest.mock import MagicMock, patch + +import pytest + +from projects.core.utils import OC, OCResult, RetryConfig + + +class TestRetryConfig: + """Tests for RetryConfig dataclass.""" + + def test_default_values(self): + """RetryConfig has sensible defaults.""" + config = RetryConfig() + + assert config.max_retries == 3 + assert config.initial_delay == 1.0 + assert config.max_delay == 30.0 + assert config.backoff_multiplier == 2.0 + assert config.retry_on_timeout is True + + def test_custom_values(self): + """RetryConfig accepts custom values.""" + config = RetryConfig( + max_retries=5, + initial_delay=0.5, + max_delay=60.0, + backoff_multiplier=3.0, + retry_on_timeout=False, + ) + + assert config.max_retries == 5 + assert config.initial_delay == 0.5 + assert config.max_delay == 60.0 + assert config.backoff_multiplier == 3.0 + assert config.retry_on_timeout is False + + +class TestOCResult: + """Tests for OCResult dataclass.""" + + def test_from_completed_process_success(self): + """OCResult created from successful subprocess.""" + mock_result = MagicMock(spec=subprocess.CompletedProcess) + mock_result.returncode = 0 + mock_result.stdout = "pod/my-pod created" + mock_result.stderr = "" + + result = OCResult.from_completed_process( + mock_result, + command=["oc", "apply", "-f", "test.yaml"], + attempts=1, + duration=0.5, + ) + + assert result.success is True + assert result.returncode == 0 + assert result.stdout == "pod/my-pod created" + assert result.stderr == "" + assert result.attempts == 1 + assert result.duration == 0.5 + + def test_from_completed_process_failure(self): + """OCResult created from failed subprocess.""" + mock_result = 
MagicMock(spec=subprocess.CompletedProcess) + mock_result.returncode = 1 + mock_result.stdout = "" + mock_result.stderr = "error: resource not found" + + result = OCResult.from_completed_process( + mock_result, + command=["oc", "get", "pod", "missing"], + attempts=3, + duration=5.0, + ) + + assert result.success is False + assert result.returncode == 1 + assert result.stderr == "error: resource not found" + assert result.attempts == 3 + + def test_from_error(self): + """OCResult created from exception.""" + error = subprocess.TimeoutExpired(cmd=["oc", "get", "pods"], timeout=30) + + result = OCResult.from_error( + error, + command=["oc", "get", "pods"], + attempts=4, + duration=120.0, + ) + + assert result.success is False + assert result.returncode == -1 + assert "timed out" in result.stderr.lower() or "timeout" in result.stderr.lower() + assert result.attempts == 4 + + +class TestOC: + """Tests for OC wrapper class.""" + + def test_init_defaults(self): + """OC initializes with sensible defaults.""" + oc = OC() + + assert oc.namespace is None + assert oc.timeout == 60 + assert isinstance(oc.retry, RetryConfig) + + def test_init_with_namespace(self): + """OC accepts namespace.""" + oc = OC(namespace="forge") + + assert oc.namespace == "forge" + + def test_init_with_custom_retry(self): + """OC accepts custom retry config.""" + config = RetryConfig(max_retries=10) + oc = OC(retry=config) + + assert oc.retry.max_retries == 10 + + def test_build_cmd_with_namespace(self): + """Commands include namespace when set.""" + oc = OC(namespace="forge") + cmd = oc._build_cmd(["get", "pods"]) + + assert cmd == ["oc", "-n", "forge", "get", "pods"] + + def test_build_cmd_without_namespace(self): + """Commands work without namespace.""" + oc = OC() + cmd = oc._build_cmd(["get", "namespaces"]) + + assert cmd == ["oc", "get", "namespaces"] + + def test_build_cmd_namespace_override(self): + """Namespace can be overridden per command.""" + oc = OC(namespace="default") + cmd = 
oc._build_cmd(["get", "pods"], namespace="other") + + assert cmd == ["oc", "-n", "other", "get", "pods"] + + @patch("subprocess.run") + def test_get_success(self, mock_run): + """get() returns successful result.""" + mock_run.return_value = MagicMock( + returncode=0, + stdout="NAME READY STATUS\nmy-pod 1/1 Running", + stderr="", + ) + + oc = OC(namespace="forge") + result = oc.get("pods") + + assert result.success is True + assert "my-pod" in result.stdout + mock_run.assert_called_once() + call_args = mock_run.call_args + assert "oc" in call_args[0][0] + assert "-n" in call_args[0][0] + assert "forge" in call_args[0][0] + assert "get" in call_args[0][0] + assert "pods" in call_args[0][0] + + @patch("subprocess.run") + def test_get_with_selector(self, mock_run): + """get() passes additional arguments.""" + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + + oc = OC(namespace="forge") + oc.get("pods", "-l", "app=vllm", "-o", "yaml") + + call_args = mock_run.call_args[0][0] + assert "-l" in call_args + assert "app=vllm" in call_args + assert "-o" in call_args + assert "yaml" in call_args + + @patch("subprocess.run") + def test_apply_success(self, mock_run): + """apply() works with file path.""" + mock_run.return_value = MagicMock( + returncode=0, + stdout="pod/my-pod created", + stderr="", + ) + + oc = OC(namespace="forge") + result = oc.apply("-f", "manifest.yaml") + + assert result.success is True + call_args = mock_run.call_args[0][0] + assert "apply" in call_args + assert "-f" in call_args + assert "manifest.yaml" in call_args + + @patch("subprocess.run") + def test_apply_with_stdin(self, mock_run): + """apply() accepts input via stdin.""" + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + + oc = OC(namespace="forge") + yaml_content = "apiVersion: v1\nkind: Pod\n..." 
+ oc.apply("-f", "-", input=yaml_content) + + call_args = mock_run.call_args + assert call_args.kwargs.get("input") == yaml_content + + @patch("subprocess.run") + def test_delete_success(self, mock_run): + """delete() deletes resources.""" + mock_run.return_value = MagicMock( + returncode=0, + stdout='pod "my-pod" deleted', + stderr="", + ) + + oc = OC(namespace="forge") + result = oc.delete("pod", "my-pod") + + assert result.success is True + call_args = mock_run.call_args[0][0] + assert "delete" in call_args + assert "pod" in call_args + assert "my-pod" in call_args + + @patch("subprocess.run") + def test_delete_with_ignore_not_found(self, mock_run): + """delete() passes extra flags.""" + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + + oc = OC(namespace="forge") + oc.delete("pod", "my-pod", "--ignore-not-found") + + call_args = mock_run.call_args[0][0] + assert "--ignore-not-found" in call_args + + @patch("subprocess.run") + def test_logs_success(self, mock_run): + """logs() retrieves pod logs.""" + mock_run.return_value = MagicMock( + returncode=0, + stdout="INFO: Server started\nINFO: Ready", + stderr="", + ) + + oc = OC(namespace="forge") + result = oc.logs("my-pod") + + assert result.success is True + assert "Server started" in result.stdout + call_args = mock_run.call_args[0][0] + assert "logs" in call_args + assert "my-pod" in call_args + + @patch("subprocess.run") + def test_logs_with_container(self, mock_run): + """logs() accepts container flag.""" + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + + oc = OC(namespace="forge") + oc.logs("my-pod", "-c", "sidecar", "--tail=100") + + call_args = mock_run.call_args[0][0] + assert "-c" in call_args + assert "sidecar" in call_args + assert "--tail=100" in call_args + + @patch("subprocess.run") + def test_exec_success(self, mock_run): + """exec() runs command in pod.""" + mock_run.return_value = MagicMock( + returncode=0, + stdout='{"status": "healthy"}', + 
stderr="", + ) + + oc = OC(namespace="forge") + result = oc.exec("my-pod", "--", "curl", "localhost:8080/health") + + assert result.success is True + call_args = mock_run.call_args[0][0] + assert "exec" in call_args + assert "my-pod" in call_args + assert "--" in call_args + assert "curl" in call_args + + +class TestOCRetry: + """Tests for OC retry behavior.""" + + @patch("time.sleep") + @patch("subprocess.run") + def test_retry_on_transient_error(self, mock_run, mock_sleep): + """OC retries on transient network errors.""" + # First call fails with connection error, second succeeds + mock_run.side_effect = [ + MagicMock(returncode=1, stdout="", stderr="connection refused"), + MagicMock(returncode=0, stdout="success", stderr=""), + ] + + oc = OC(namespace="forge", retry=RetryConfig(max_retries=3, initial_delay=0.1)) + result = oc.get("pods") + + assert result.success is True + assert result.attempts == 2 + assert mock_run.call_count == 2 + mock_sleep.assert_called_once() # Slept once between retries + + @patch("time.sleep") + @patch("subprocess.run") + def test_no_retry_on_permanent_error(self, mock_run, mock_sleep): + """OC does not retry on non-transient errors.""" + mock_run.return_value = MagicMock( + returncode=1, + stdout="", + stderr="error: resource not found", + ) + + oc = OC(namespace="forge", retry=RetryConfig(max_retries=3)) + result = oc.get("pod", "nonexistent") + + assert result.success is False + assert result.attempts == 1 + assert mock_run.call_count == 1 + mock_sleep.assert_not_called() # No sleep - didn't retry + + @patch("time.sleep") + @patch("subprocess.run") + def test_retry_exhausted(self, mock_run, mock_sleep): + """OC returns failure after exhausting retries.""" + mock_run.return_value = MagicMock( + returncode=1, + stdout="", + stderr="connection timed out", + ) + + oc = OC(namespace="forge", retry=RetryConfig(max_retries=2, initial_delay=0.1)) + result = oc.get("pods") + + assert result.success is False + assert result.attempts == 3 # 
Initial + 2 retries + assert mock_run.call_count == 3 + assert mock_sleep.call_count == 2 # Slept between each retry + + @patch("time.sleep") + @patch("subprocess.run") + def test_retry_on_timeout(self, mock_run, mock_sleep): + """OC retries on subprocess timeout.""" + mock_run.side_effect = [ + subprocess.TimeoutExpired(cmd=["oc"], timeout=30), + MagicMock(returncode=0, stdout="success", stderr=""), + ] + + oc = OC( + namespace="forge", + retry=RetryConfig(max_retries=3, retry_on_timeout=True, initial_delay=0.1), + ) + result = oc.get("pods") + + assert result.success is True + assert result.attempts == 2 + + @patch("time.sleep") + @patch("subprocess.run") + def test_no_retry_on_timeout_when_disabled(self, mock_run, mock_sleep): + """OC does not retry timeout when disabled.""" + mock_run.side_effect = subprocess.TimeoutExpired(cmd=["oc"], timeout=30) + + oc = OC( + namespace="forge", + retry=RetryConfig(max_retries=3, retry_on_timeout=False), + ) + result = oc.get("pods") + + assert result.success is False + assert result.attempts == 1 + mock_sleep.assert_not_called() + + @patch("time.sleep") + @patch("subprocess.run") + def test_exponential_backoff(self, mock_run, mock_sleep): + """OC uses exponential backoff between retries.""" + mock_run.return_value = MagicMock( + returncode=1, + stdout="", + stderr="service unavailable", + ) + + oc = OC( + namespace="forge", + retry=RetryConfig( + max_retries=3, + initial_delay=1.0, + backoff_multiplier=2.0, + max_delay=10.0, + ), + ) + oc.get("pods") + + # Check backoff delays: 1.0, 2.0, 4.0 + calls = mock_sleep.call_args_list + assert len(calls) == 3 + assert calls[0][0][0] == 1.0 + assert calls[1][0][0] == 2.0 + assert calls[2][0][0] == 4.0 + + @patch("time.sleep") + @patch("subprocess.run") + def test_max_delay_cap(self, mock_run, mock_sleep): + """OC caps delay at max_delay.""" + mock_run.return_value = MagicMock( + returncode=1, + stdout="", + stderr="service unavailable", + ) + + oc = OC( + namespace="forge", + 
retry=RetryConfig( + max_retries=5, + initial_delay=10.0, + backoff_multiplier=2.0, + max_delay=15.0, + ), + ) + oc.get("pods") + + # Delays should be: 10.0, 15.0 (capped), 15.0, 15.0, 15.0 + calls = mock_sleep.call_args_list + assert calls[0][0][0] == 10.0 + assert calls[1][0][0] == 15.0 # Capped + assert calls[2][0][0] == 15.0 + assert calls[3][0][0] == 15.0 + assert calls[4][0][0] == 15.0 + + +class TestOCTransientErrorDetection: + """Tests for transient error detection.""" + + @pytest.mark.parametrize( + "stderr", + [ + "connection refused", + "Connection reset by peer", + "Unable to connect to the server", + "no route to host", + "etcdserver: request timed out", + "context deadline exceeded", + "the server was unable to return a response", + "unexpected EOF", + "i/o timeout", + "TLS handshake timeout", + "Service Unavailable", + "too many requests", + ], + ) + @patch("time.sleep") + @patch("subprocess.run") + def test_transient_error_patterns(self, mock_run, mock_sleep, stderr): + """OC recognizes various transient error patterns.""" + mock_run.side_effect = [ + MagicMock(returncode=1, stdout="", stderr=stderr), + MagicMock(returncode=0, stdout="success", stderr=""), + ] + + oc = OC(retry=RetryConfig(max_retries=1, initial_delay=0.1)) + result = oc.get("pods") + + assert result.success is True + assert mock_run.call_count == 2, f"Should retry for: {stderr}" + + @pytest.mark.parametrize( + "stderr", + [ + "error: resource not found", + "Error: pod not found", + "forbidden: User cannot get resource", + "invalid: spec.containers: Required value", + ], + ) + @patch("time.sleep") + @patch("subprocess.run") + def test_non_transient_error_patterns(self, mock_run, mock_sleep, stderr): + """OC does not retry non-transient errors.""" + mock_run.return_value = MagicMock(returncode=1, stdout="", stderr=stderr) + + oc = OC(retry=RetryConfig(max_retries=3)) + result = oc.get("pods") + + assert result.success is False + assert mock_run.call_count == 1, f"Should not retry 
for: {stderr}" + mock_sleep.assert_not_called() diff --git a/tests/core/workflow/__init__.py b/tests/core/workflow/__init__.py new file mode 100644 index 0000000..a659d00 --- /dev/null +++ b/tests/core/workflow/__init__.py @@ -0,0 +1 @@ +"""Tests for workflow engine.""" diff --git a/tests/core/workflow/test_context.py b/tests/core/workflow/test_context.py new file mode 100644 index 0000000..6940a17 --- /dev/null +++ b/tests/core/workflow/test_context.py @@ -0,0 +1,106 @@ +"""Unit tests for WorkflowContext.""" + +import os +import tempfile +from pathlib import Path + +import pytest + +from projects.core.workflow import WorkflowContext + + +class TestWorkflowContext: + """Tests for WorkflowContext.""" + + @pytest.fixture + def temp_artifact_dir(self): + """Create temporary artifact directory.""" + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + def test_from_environment_creates_uuid(self, temp_artifact_dir): + """Context generates a unique run UUID.""" + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + + assert ctx.run_uuid is not None + assert len(ctx.run_uuid) == 36 # UUID format + + def test_from_environment_creates_artifact_dir(self, temp_artifact_dir): + """Context creates artifact directory.""" + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + + assert ctx.artifact_dir.exists() + assert ctx.artifact_dir.is_dir() + assert (ctx.artifact_dir / "_meta").exists() + + def test_from_environment_captures_forge_vars(self, temp_artifact_dir, monkeypatch): + """Context captures FORGE_* environment variables.""" + monkeypatch.setenv("FORGE_MODEL", "test-model") + monkeypatch.setenv("FORGE_VLLM_IMAGE", "test-image") + monkeypatch.setenv("OTHER_VAR", "should-not-capture") + + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + + assert ctx.env_vars["FORGE_MODEL"] == "test-model" + assert ctx.env_vars["FORGE_VLLM_IMAGE"] == "test-image" + assert "OTHER_VAR" not in 
ctx.env_vars + + def test_get_env_with_prefix(self, temp_artifact_dir, monkeypatch): + """get_env works with FORGE_ prefix.""" + monkeypatch.setenv("FORGE_MODEL", "my-model") + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + + # With prefix + assert ctx.get_env("FORGE_MODEL") == "my-model" + # Without prefix (auto-added) + assert ctx.get_env("MODEL") == "my-model" + + def test_get_env_default(self, temp_artifact_dir): + """get_env returns default for missing vars.""" + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + + assert ctx.get_env("NONEXISTENT") is None + assert ctx.get_env("NONEXISTENT", "default-value") == "default-value" + + def test_get_step_artifact_dir(self, temp_artifact_dir): + """get_step_artifact_dir creates numbered directories.""" + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + + dir1 = ctx.get_step_artifact_dir("deploy") + dir2 = ctx.get_step_artifact_dir("benchmark") + dir3 = ctx.get_step_artifact_dir("cleanup") + + assert dir1.name == "001__deploy" + assert dir2.name == "002__benchmark" + assert dir3.name == "003__cleanup" + + assert dir1.exists() + assert dir2.exists() + assert dir3.exists() + + def test_write_metadata(self, temp_artifact_dir): + """write_metadata creates YAML file.""" + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + ctx.config = {"key": "value"} + + path = ctx.write_metadata(args={"model": "test"}) + + assert path.exists() + content = path.read_text() + assert "run_uuid" in content + assert "model: test" in content + + def test_write_restart_script(self, temp_artifact_dir): + """write_restart_script creates executable script.""" + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + + command = "python run.py --model test" + path = ctx.write_restart_script(command) + + assert path.exists() + assert os.access(path, os.X_OK) # Executable + + content = path.read_text() + assert 
"#!/bin/bash" in content + assert command in content + assert ctx.run_uuid in content diff --git a/tests/core/workflow/test_executor.py b/tests/core/workflow/test_executor.py new file mode 100644 index 0000000..5d4b129 --- /dev/null +++ b/tests/core/workflow/test_executor.py @@ -0,0 +1,220 @@ +"""Unit tests for SequentialExecutor.""" + +import tempfile +from pathlib import Path + +import pytest + +from projects.core.workflow import ( + SequentialExecutor, + StepResult, + Workflow, + WorkflowContext, + WorkflowStep, +) + + +class PassingStep(WorkflowStep): + """A step that always succeeds.""" + + def __init__(self, name: str, record: list[str]): + super().__init__(name=name) + self.record = record + + def execute(self, ctx: WorkflowContext) -> StepResult: + self.record.append(f"executed:{self.name}") + return StepResult.ok(f"Step {self.name} passed") + + +class FailingStep(WorkflowStep): + """A step that always fails.""" + + def __init__(self, name: str, record: list[str]): + super().__init__(name=name) + self.record = record + + def execute(self, ctx: WorkflowContext) -> StepResult: + self.record.append(f"executed:{self.name}") + return StepResult.fail(f"Step {self.name} failed") + + +class ExceptionStep(WorkflowStep): + """A step that raises an exception.""" + + def __init__(self, name: str, record: list[str]): + super().__init__(name=name) + self.record = record + + def execute(self, ctx: WorkflowContext) -> StepResult: + self.record.append(f"executed:{self.name}") + raise RuntimeError(f"Step {self.name} exploded") + + +class TestSequentialExecutor: + """Tests for SequentialExecutor.""" + + @pytest.fixture + def temp_artifact_dir(self): + """Create temporary artifact directory.""" + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.fixture + def context(self, temp_artifact_dir): + """Create workflow context with temp artifact dir.""" + return WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + + def 
test_steps_run_in_order(self, context): + """Steps execute in registration order.""" + record: list[str] = [] + + class TestWorkflow(Workflow): + def define_steps(self): + self.add_step(PassingStep("first", record)) + self.add_step(PassingStep("second", record)) + self.add_step(PassingStep("third", record)) + + workflow = TestWorkflow(context) + result = workflow.execute() + + assert result.success + assert record == ["executed:first", "executed:second", "executed:third"] + assert "first" in result.step_results + assert "second" in result.step_results + assert "third" in result.step_results + + def test_finally_runs_on_success(self, context): + """Finally steps run after successful completion.""" + record: list[str] = [] + + class TestWorkflow(Workflow): + def define_steps(self): + self.add_step(PassingStep("main", record)) + self.add_finally(PassingStep("cleanup", record)) + + workflow = TestWorkflow(context) + result = workflow.execute() + + assert result.success + assert record == ["executed:main", "executed:cleanup"] + + def test_finally_runs_on_failure(self, context): + """Finally steps execute even when normal steps fail.""" + record: list[str] = [] + + class TestWorkflow(Workflow): + def define_steps(self): + self.add_step(PassingStep("first", record)) + self.add_step(FailingStep("failing", record)) + self.add_step(PassingStep("skipped", record)) + self.add_finally(PassingStep("cleanup1", record)) + self.add_finally(PassingStep("cleanup2", record)) + + workflow = TestWorkflow(context) + result = workflow.execute() + + assert not result.success + assert result.failed_step == "failing" + # "skipped" should NOT be in the record + assert record == [ + "executed:first", + "executed:failing", + "executed:cleanup1", + "executed:cleanup2", + ] + + def test_finally_runs_on_exception(self, context): + """Finally steps run even when a step raises an exception.""" + record: list[str] = [] + + class TestWorkflow(Workflow): + def define_steps(self): + 
self.add_step(PassingStep("first", record)) + self.add_step(ExceptionStep("exploding", record)) + self.add_step(PassingStep("skipped", record)) + self.add_finally(PassingStep("cleanup", record)) + + workflow = TestWorkflow(context) + result = workflow.execute() + + assert not result.success + assert result.failed_step == "exploding" + assert "exploding" in result.step_results + assert result.step_results["exploding"].error is not None + assert record == ["executed:first", "executed:exploding", "executed:cleanup"] + + def test_all_finally_steps_run_even_if_one_fails(self, context): + """All finally steps run even if one fails.""" + record: list[str] = [] + + class TestWorkflow(Workflow): + def define_steps(self): + self.add_step(PassingStep("main", record)) + self.add_finally(FailingStep("cleanup1", record)) + self.add_finally(PassingStep("cleanup2", record)) + self.add_finally(ExceptionStep("cleanup3", record)) + self.add_finally(PassingStep("cleanup4", record)) + + workflow = TestWorkflow(context) + result = workflow.execute() + + # Main workflow succeeded, finally failures don't affect overall success + assert result.success + assert record == [ + "executed:main", + "executed:cleanup1", + "executed:cleanup2", + "executed:cleanup3", + "executed:cleanup4", + ] + + def test_empty_workflow(self, context): + """Empty workflow completes successfully.""" + + class TestWorkflow(Workflow): + def define_steps(self): + pass + + workflow = TestWorkflow(context) + result = workflow.execute() + + assert result.success + assert len(result.step_results) == 0 + + def test_duration_tracking(self, context): + """Workflow tracks total duration.""" + record: list[str] = [] + + class TestWorkflow(Workflow): + def define_steps(self): + self.add_step(PassingStep("step1", record)) + + workflow = TestWorkflow(context) + result = workflow.execute() + + assert result.duration_seconds >= 0 + assert result.start_time is not None + assert result.end_time is not None + assert result.run_uuid == 
context.run_uuid + + +class TestStepResult: + """Tests for StepResult helper methods.""" + + def test_ok_result(self): + """StepResult.ok creates successful result.""" + result = StepResult.ok("All good", foo="bar") + + assert result.success + assert result.message == "All good" + assert result.data == {"foo": "bar"} + assert result.error is None + + def test_fail_result(self): + """StepResult.fail creates failed result.""" + error = ValueError("bad input") + result = StepResult.fail("Something went wrong", error=error) + + assert not result.success + assert result.message == "Something went wrong" + assert result.error is error diff --git a/tests/rhaiis/__init__.py b/tests/rhaiis/__init__.py new file mode 100644 index 0000000..0197469 --- /dev/null +++ b/tests/rhaiis/__init__.py @@ -0,0 +1 @@ +"""Tests for RHAIIS project.""" diff --git a/tests/rhaiis/test_ci.py b/tests/rhaiis/test_ci.py new file mode 100644 index 0000000..cf7275f --- /dev/null +++ b/tests/rhaiis/test_ci.py @@ -0,0 +1,82 @@ +"""Unit tests for RHAIIS CI CLI.""" + +import os +from pathlib import Path +from unittest.mock import patch + +import pytest +from click.testing import CliRunner + +from projects.rhaiis.orchestration.ci import ci + + +class TestCiPrepare: + """Tests for ci prepare command.""" + + @pytest.fixture + def runner(self): + return CliRunner() + + def test_prepare_dry_run(self, runner): + """prepare --dry-run shows what would be done.""" + result = runner.invoke(ci, ["prepare", "--dry-run"]) + + assert result.exit_code == 0 + assert "DRY-RUN" in result.output + assert "RHOAI" in result.output + + +class TestCiTest: + """Tests for ci test command.""" + + @pytest.fixture + def runner(self): + return CliRunner(env={"FORGE_ARTIFACT_DIR": "/tmp/artifacts"}) + + def test_test_dry_run(self, runner): + """test --dry-run shows model from env.""" + result = runner.invoke( + ci, ["test", "--dry-run"], + env={"FORGE_MODEL": "test/model", "FORGE_ARTIFACT_DIR": "/tmp/artifacts"} + ) + + assert 
result.exit_code == 0 + assert "test/model" in result.output + + def test_test_dry_run_with_workloads(self, runner): + """test --dry-run shows workloads from env.""" + result = runner.invoke( + ci, ["test", "--dry-run"], + env={ + "FORGE_MODEL": "test/model", + "FORGE_WORKLOADS": "balanced,heterogeneous", + "FORGE_ARTIFACT_DIR": "/tmp/artifacts", + } + ) + + assert result.exit_code == 0 + assert "balanced" in result.output or "heterogeneous" in result.output + +class TestCiCleanup: + """Tests for ci cleanup command.""" + + @pytest.fixture + def runner(self): + return CliRunner() + + def test_cleanup_dry_run(self, runner): + """cleanup --dry-run shows what would be done.""" + result = runner.invoke(ci, ["cleanup", "--dry-run"]) + + assert result.exit_code == 0 + assert "DRY-RUN" in result.output + + def test_cleanup_with_namespace(self, runner): + """cleanup accepts custom namespace via env.""" + result = runner.invoke( + ci, ["cleanup", "--dry-run"], + env={"FORGE_NAMESPACE": "custom-ns"} + ) + + assert result.exit_code == 0 + assert "custom-ns" in result.output diff --git a/tests/rhaiis/test_operators.py b/tests/rhaiis/test_operators.py new file mode 100644 index 0000000..16d575e --- /dev/null +++ b/tests/rhaiis/test_operators.py @@ -0,0 +1,178 @@ +"""Unit tests for RHAIIS operator installation steps.""" + +import subprocess +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from projects.core.workflow import WorkflowContext +from projects.rhaiis.workflows.steps import ( + InstallGPUOperatorStep, + InstallNFDOperatorStep, + InstallRHOAIOperatorStep, +) + + +class TestInstallNFDOperatorStep: + """Tests for InstallNFDOperatorStep.""" + + @pytest.fixture + def temp_artifact_dir(self): + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.fixture + def context(self, temp_artifact_dir): + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + 
ctx.get_step_artifact_dir("install_nfd") + return ctx + + def test_step_name(self): + """Step has correct default name.""" + step = InstallNFDOperatorStep() + assert step.name == "install_nfd" + + @patch("subprocess.run") + def test_execute_success(self, mock_run, context): + """InstallNFDOperatorStep executes successfully.""" + mock_run.return_value = MagicMock(returncode=0, stdout="created", stderr="") + + step = InstallNFDOperatorStep() + result = step.execute(context) + + assert result.success + assert "NFD" in result.message + + @patch("subprocess.run") + def test_creates_subscription_yaml(self, mock_run, context): + """InstallNFDOperatorStep creates subscription YAML.""" + mock_run.return_value = MagicMock(returncode=0) + + step = InstallNFDOperatorStep() + result = step.execute(context) + + # Check YAML file was created + step_dir = context.artifact_dir / f"{context.step_number:03d}__{context.current_step_name}" + yaml_file = step_dir / "nfd-subscription.yaml" + assert yaml_file.exists() + content = yaml_file.read_text() + assert "openshift-nfd" in content + + +class TestInstallGPUOperatorStep: + """Tests for InstallGPUOperatorStep.""" + + @pytest.fixture + def temp_artifact_dir(self): + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.fixture + def context(self, temp_artifact_dir): + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + ctx.get_step_artifact_dir("install_gpu") + return ctx + + def test_step_name(self): + """Step has correct default name.""" + step = InstallGPUOperatorStep() + assert step.name == "install_gpu" + + @patch("subprocess.run") + def test_execute_success(self, mock_run, context): + """InstallGPUOperatorStep executes successfully.""" + mock_run.return_value = MagicMock(returncode=0, stdout="created", stderr="") + + step = InstallGPUOperatorStep() + result = step.execute(context) + + assert result.success + assert "GPU" in result.message + + @patch("subprocess.run") + def 
test_execute_failure(self, mock_run, context): + """InstallGPUOperatorStep handles failure.""" + mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="error") + + step = InstallGPUOperatorStep() + result = step.execute(context) + + assert not result.success + + @patch("subprocess.run") + def test_creates_subscription_yaml(self, mock_run, context): + """InstallGPUOperatorStep creates subscription YAML.""" + mock_run.return_value = MagicMock(returncode=0) + + step = InstallGPUOperatorStep() + step.execute(context) + + step_dir = context.artifact_dir / f"{context.step_number:03d}__{context.current_step_name}" + yaml_file = step_dir / "gpu-subscription.yaml" + assert yaml_file.exists() + content = yaml_file.read_text() + assert "gpu-operator-certified" in content + + +class TestInstallRHOAIOperatorStep: + """Tests for InstallRHOAIOperatorStep.""" + + @pytest.fixture + def temp_artifact_dir(self): + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.fixture + def context(self, temp_artifact_dir): + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + ctx.get_step_artifact_dir("install_rhoai") + return ctx + + def test_step_name(self): + """Step has correct default name.""" + step = InstallRHOAIOperatorStep() + assert step.name == "install_rhoai" + + def test_custom_version(self): + """Step accepts custom RHOAI version.""" + step = InstallRHOAIOperatorStep(version="2.20") + assert step.version == "2.20" + + @patch("subprocess.run") + def test_execute_success(self, mock_run, context): + """InstallRHOAIOperatorStep executes successfully.""" + mock_run.return_value = MagicMock(returncode=0, stdout="created", stderr="") + + step = InstallRHOAIOperatorStep(version="2.19") + result = step.execute(context) + + assert result.success + assert "RHOAI" in result.message + assert "2.19" in result.message + + @patch("subprocess.run") + def test_execute_failure(self, mock_run, context): + """InstallRHOAIOperatorStep 
handles failure.""" + mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="error") + + step = InstallRHOAIOperatorStep() + result = step.execute(context) + + assert not result.success + + @patch("subprocess.run") + def test_creates_subscription_yaml_with_channel(self, mock_run, context): + """InstallRHOAIOperatorStep creates subscription with correct channel.""" + mock_run.return_value = MagicMock(returncode=0) + + step = InstallRHOAIOperatorStep(version="2.19") + step.execute(context) + + step_dir = context.artifact_dir / f"{context.step_number:03d}__{context.current_step_name}" + yaml_file = step_dir / "rhoai-subscription.yaml" + assert yaml_file.exists() + content = yaml_file.read_text() + assert "stable-2.19" in content + assert "rhods-operator" in content diff --git a/tests/rhaiis/test_steps.py b/tests/rhaiis/test_steps.py new file mode 100644 index 0000000..2623db1 --- /dev/null +++ b/tests/rhaiis/test_steps.py @@ -0,0 +1,265 @@ +"""Unit tests for RHAIIS workflow steps.""" + +import subprocess +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from projects.core.workflow import WorkflowContext +from projects.rhaiis.workflows.steps import ( + CleanupNamespaceStep, + DeployVLLMStep, + WaitForReadyStep, +) + + +class TestDeployVLLMStep: + """Tests for DeployVLLMStep (KServe-based).""" + + @pytest.fixture + def temp_artifact_dir(self): + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.fixture + def context(self, temp_artifact_dir): + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + # Pre-increment step number to simulate workflow execution + ctx.get_step_artifact_dir("deploy") + return ctx + + @pytest.fixture + def default_runtime_args(self): + """Default runtime args for tests.""" + return { + "gpu-memory-utilization": 0.9, + "max-model-len": 4096, + "tensor-parallel-size": 1, + } + + def test_generates_kserve_yaml(self, context, 
default_runtime_args): + """DeployVLLMStep generates valid KServe YAML.""" + step = DeployVLLMStep( + model="test/model", + deployment_name="test-deploy", + vllm_image="test/image:v1", + runtime_args={**default_runtime_args, "tensor-parallel-size": 2}, + tensor_parallel=2, + namespace="test-ns", + ) + + yaml_content = step._generate_kserve_yaml() + + assert "apiVersion: serving.kserve.io/v1alpha1" in yaml_content + assert "kind: ServingRuntime" in yaml_content + assert "apiVersion: serving.kserve.io/v1beta1" in yaml_content + assert "kind: InferenceService" in yaml_content + assert "name: test-deploy" in yaml_content + assert "namespace: test-ns" in yaml_content + assert 'nvidia.com/gpu: "2"' in yaml_content + + def test_generates_shared_memory_for_tp(self, context, default_runtime_args): + """DeployVLLMStep includes shared memory volume for tensor parallel > 1.""" + step = DeployVLLMStep( + model="test/model", + deployment_name="test-deploy", + vllm_image="test/image:v1", + runtime_args={**default_runtime_args, "tensor-parallel-size": 4}, + tensor_parallel=4, + ) + + yaml_content = step._generate_kserve_yaml() + + assert "shared-memory" in yaml_content + assert "/dev/shm" in yaml_content + assert "sizeLimit: 8Gi" in yaml_content + + def test_shared_memory_always_present(self, context, default_runtime_args): + """DeployVLLMStep includes shared memory even for tensor parallel = 1.""" + step = DeployVLLMStep( + model="test/model", + deployment_name="test-deploy", + vllm_image="test/image:v1", + runtime_args=default_runtime_args, + tensor_parallel=1, + ) + + yaml_content = step._generate_kserve_yaml() + + # Shared memory is always required for vLLM + assert "shared-memory" in yaml_content + + def test_amd_accelerator(self, context, default_runtime_args): + """DeployVLLMStep uses AMD GPU resources.""" + step = DeployVLLMStep( + model="test/model", + deployment_name="test-deploy", + vllm_image="test/image:v1", + runtime_args=default_runtime_args, + accelerator="amd", + 
tensor_parallel=1, + ) + + yaml_content = step._generate_kserve_yaml() + + assert "amd.com/gpu" in yaml_content + + def test_hf_storage_source(self, context, default_runtime_args): + """DeployVLLMStep configures HuggingFace storage.""" + step = DeployVLLMStep( + model="test/model", + deployment_name="test-deploy", + vllm_image="test/image:v1", + runtime_args=default_runtime_args, + storage_source="hf", + storage_path="models-pvc", + ) + + yaml_content = step._generate_kserve_yaml() + + assert "HF_TOKEN" in yaml_content + assert "HF_HOME" in yaml_content + assert "pvc://models-pvc" in yaml_content + + def test_custom_runtime_args(self, context): + """DeployVLLMStep includes custom runtime args.""" + step = DeployVLLMStep( + model="test/model", + deployment_name="test-deploy", + vllm_image="test/image:v1", + runtime_args={"enable-prefix-caching": True, "max-num-seqs": 256}, + ) + + yaml_content = step._generate_kserve_yaml() + + assert "--enable-prefix-caching" in yaml_content + assert "--max-num-seqs=256" in yaml_content + + @patch("subprocess.run") + def test_execute_success(self, mock_run, context, default_runtime_args): + """DeployVLLMStep executes successfully.""" + mock_run.return_value = MagicMock(returncode=0, stdout="created", stderr="") + + step = DeployVLLMStep( + model="test/model", + deployment_name="test-deploy", + vllm_image="test/image:v1", + runtime_args=default_runtime_args, + ) + result = step.execute(context) + + assert result.success + assert mock_run.called + + @patch("subprocess.run") + def test_execute_failure(self, mock_run, context, default_runtime_args): + """DeployVLLMStep handles apply failure.""" + # First two calls (namespace creation) succeed, third (apply) fails + mock_run.side_effect = [ + MagicMock(returncode=0), # namespace dry-run + MagicMock(returncode=0), # namespace apply + MagicMock(returncode=1, stdout="", stderr="error applying"), # kserve apply + ] + + step = DeployVLLMStep( + model="test/model", + 
deployment_name="test-deploy", + vllm_image="test/image:v1", + runtime_args=default_runtime_args, + ) + result = step.execute(context) + + assert not result.success + assert "error applying" in result.message + + +class TestWaitForReadyStep: + """Tests for WaitForReadyStep (InferenceService).""" + + @pytest.fixture + def temp_artifact_dir(self): + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.fixture + def context(self, temp_artifact_dir): + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + ctx.get_step_artifact_dir("wait") + return ctx + + @patch("subprocess.run") + def test_wait_success_immediate(self, mock_run, context): + """WaitForReadyStep succeeds when InferenceService is ready.""" + # Order: status check -> URL -> get pod name -> health check + mock_run.side_effect = [ + MagicMock(returncode=0, stdout="True", stderr=""), # status check + MagicMock(returncode=0, stdout="http://test.svc", stderr=""), # URL + MagicMock(returncode=0, stdout="test-pod-abc", stderr=""), # get pod name + MagicMock(returncode=0, stdout="200", stderr=""), # health check curl + ] + + step = WaitForReadyStep( + deployment_name="test", + timeout_seconds=30, + poll_interval=1, + ) + result = step.execute(context) + + assert result.success + assert "ready" in result.message.lower() + assert result.data.get("service_url") == "http://test.svc" + + @patch("subprocess.run") + @patch("time.sleep") + def test_wait_timeout(self, mock_sleep, mock_run, context): + """WaitForReadyStep fails on timeout.""" + mock_run.return_value = MagicMock(returncode=0, stdout="False", stderr="") + + step = WaitForReadyStep( + deployment_name="test", + timeout_seconds=2, + poll_interval=1, + ) + result = step.execute(context) + + assert not result.success + assert "not ready" in result.message.lower() + + +class TestCleanupNamespaceStep: + """Tests for CleanupNamespaceStep.""" + + @pytest.fixture + def temp_artifact_dir(self): + with 
tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.fixture + def context(self, temp_artifact_dir): + ctx = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + ctx.get_step_artifact_dir("cleanup") + return ctx + + @patch("subprocess.run") + def test_cleanup_success(self, mock_run, context): + """CleanupNamespaceStep cleans up resources.""" + mock_run.return_value = MagicMock(returncode=0) + + step = CleanupNamespaceStep(namespace="test-ns") + result = step.execute(context) + + # Cleanup step never fails + assert result.success + + @patch("subprocess.run") + def test_cleanup_with_errors(self, mock_run, context): + """CleanupNamespaceStep handles errors gracefully.""" + mock_run.return_value = MagicMock(returncode=1, stderr="not found") + + step = CleanupNamespaceStep(namespace="test-ns") + result = step.execute(context) + + # Still succeeds - cleanup is best effort + assert result.success diff --git a/tests/rhaiis/test_workflows.py b/tests/rhaiis/test_workflows.py new file mode 100644 index 0000000..92cbbcc --- /dev/null +++ b/tests/rhaiis/test_workflows.py @@ -0,0 +1,118 @@ +"""Unit tests for RHAIIS workflows.""" + +import tempfile +from pathlib import Path + +import pytest + +from projects.core.workflow import WorkflowContext +from projects.rhaiis.workflows import BenchmarkWorkflow, CleanupWorkflow, PrepareWorkflow + + +class TestBenchmarkWorkflow: + """Tests for BenchmarkWorkflow.""" + + @pytest.fixture + def temp_artifact_dir(self): + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.fixture + def context(self, temp_artifact_dir): + return WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + + def test_workflow_defines_steps(self, context): + """BenchmarkWorkflow defines required steps.""" + workflow = BenchmarkWorkflow( + ctx=context, + model="Qwen/Qwen3-0.6B", + workload="balanced", + ) + + # Force step definition + workflow._ensure_defined() + + # Should have deploy, 
wait, benchmark steps + step_names = [s.name for s in workflow.steps] + assert "deploy" in step_names + assert "wait" in step_names + assert "benchmark" in step_names + + # Should have finally steps + finally_names = [s.name for s in workflow.finally_steps] + assert "collect_artifacts" in finally_names + assert "cleanup" in finally_names + + def test_workflow_uses_custom_image(self, context): + """BenchmarkWorkflow uses custom vLLM image.""" + custom_image = "custom/vllm:latest" + workflow = BenchmarkWorkflow( + ctx=context, + model="test/model", + vllm_image=custom_image, + ) + + assert workflow.vllm_image == custom_image + + def test_workflow_sanitizes_deployment_name(self, context): + """BenchmarkWorkflow sanitizes deployment name.""" + workflow = BenchmarkWorkflow( + ctx=context, + model="Qwen/Qwen3-0.6B-Instruct", + ) + + # Should be lowercase, no special chars + assert workflow.deployment_name == "qwen3-0-6b-instruct" + + def test_workflow_uses_env_image(self, temp_artifact_dir, monkeypatch): + """BenchmarkWorkflow uses FORGE_VLLM_IMAGE from env.""" + monkeypatch.setenv("FORGE_VLLM_IMAGE", "env/vllm:test") + context = WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + + workflow = BenchmarkWorkflow(ctx=context, model="test/model") + + assert workflow.vllm_image == "env/vllm:test" + + +class TestPrepareWorkflow: + """Tests for PrepareWorkflow.""" + + @pytest.fixture + def temp_artifact_dir(self): + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.fixture + def context(self, temp_artifact_dir): + return WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + + def test_prepare_defines_operator_steps(self, context): + """PrepareWorkflow defines operator installation steps.""" + workflow = PrepareWorkflow(ctx=context, rhoai_version="2.19") + workflow._ensure_defined() + + step_names = [s.name for s in workflow.steps] + assert "install_nfd" in step_names + assert "install_gpu" in step_names + 
assert "install_rhoai" in step_names + + +class TestCleanupWorkflow: + """Tests for CleanupWorkflow.""" + + @pytest.fixture + def temp_artifact_dir(self): + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.fixture + def context(self, temp_artifact_dir): + return WorkflowContext.from_environment(artifact_base=str(temp_artifact_dir)) + + def test_cleanup_defines_steps(self, context): + """CleanupWorkflow defines cleanup steps.""" + workflow = CleanupWorkflow(ctx=context, namespace="test-ns") + workflow._ensure_defined() + + step_names = [s.name for s in workflow.steps] + assert "cleanup_namespace" in step_names