This notebook demonstrates GPU scheduling and multi-agent orchestration concepts without requiring actual GPUs, by combining simulation, CPU-based execution, and logical resource modeling.

In [None]:
%load_ext autoreload
%autoreload 2
# Optional: run this cell only when working in a local notebook alongside your own code files.
# It enables auto-reloading, so edits to those code files are picked up in the notebook without restarting the kernel.

In [None]:
%pip install  torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# Run this only if you are sure a GPU is available and compatible with CUDA 12.1+. Otherwise, install the CPU build of PyTorch or the build matching the CUDA version on your system.

In [1]:
# -----------------------------
# Environment configuration
# -----------------------------
# Master switch read by the later cells: True = probe for GPUs and schedule on
# what is detected; False = skip GPU detection and run CPU-only.
gpu_available = True   # <-- set to False to force CPU-only scheduling

# What to do with GPU tasks when gpu_available=False:
DOWNGRADE_GPU_TASKS_TO_CPU = True   # True = run them on CPU, False = reject them


In [2]:
import os, platform, sys, shutil, subprocess

def get_cpu_info():
    """Collect basic facts about the host and the running interpreter.

    Returns:
        dict with the keys ``os``, ``python``, ``python_executable`` and
        ``cpu_logical_cores`` (in that order).
    """
    logical_cores = os.cpu_count()
    if not logical_cores:
        # os.cpu_count() can return None; report at least one core.
        logical_cores = 1

    return dict(
        os=platform.platform(),
        python=platform.python_version(),
        python_executable=sys.executable,
        cpu_logical_cores=logical_cores,
    )

def get_gpu_info_nvidia():
    """List NVIDIA GPUs as reported by ``nvidia-smi -L``.

    Returns:
        dict with ``gpu_count`` and ``gpu_names``. When the ``nvidia-smi``
        executable cannot be found, the count is 0, the list is empty and a
        ``note`` key says so.

    Raises:
        Whatever ``subprocess.check_output`` raises when the command itself
        fails; that is not handled here.
    """
    smi_path = shutil.which("nvidia-smi")
    if smi_path is None:
        return {"gpu_count": 0, "gpu_names": [], "note": "nvidia-smi not found"}

    listing = subprocess.check_output(["nvidia-smi", "-L"], stderr=subprocess.STDOUT, text=True)

    names = []
    for raw in listing.splitlines():
        entry = raw.strip()
        if not entry.startswith("GPU"):
            continue
        # Keep the text between the first ':' and the "(UUID" suffix; a line
        # without any ':' is stored unchanged.
        _, colon, rest = entry.partition(":")
        if colon:
            names.append(rest.split("(UUID", 1)[0].strip())
        else:
            names.append(entry)
    return {"gpu_count": len(names), "gpu_names": names}

def get_torch_gpu_info():
    """Report what PyTorch, if importable, says about CUDA.

    Returns:
        On success, a dict with ``torch_installed=True`` plus the torch version,
        the CUDA version torch reports, whether CUDA is available, the device
        count and the device names. When CUDA is not available, the values of
        the ``CUDA_VISIBLE_DEVICES`` and ``LD_LIBRARY_PATH`` environment
        variables are added as diagnostic hints.
        If anything raises (including the import), a dict with
        ``torch_installed=False`` and the error type/message is returned.
    """
    try:
        import torch  # type: ignore

        cuda_ok = torch.cuda.is_available()
        info = {
            "torch_installed": True,
            "torch_version": torch.__version__,
            "torch_cuda_version": getattr(torch.version, "cuda", None),
            "cuda_available": cuda_ok,
        }

        if cuda_ok:
            device_total = torch.cuda.device_count()
            info["cuda_device_count"] = device_total
            info["device_names"] = [torch.cuda.get_device_name(idx) for idx in range(device_total)]
        else:
            info["cuda_device_count"] = 0
            info["device_names"] = []
            # No GPU initialisation is attempted here; only environment hints
            # are recorded to help diagnose the missing CUDA support.
            info["env_CUDA_VISIBLE_DEVICES"] = os.environ.get("CUDA_VISIBLE_DEVICES")
            info["env_LD_LIBRARY_PATH"] = os.environ.get("LD_LIBRARY_PATH")

        return info

    except Exception as exc:
        return dict(
            torch_installed=False,
            python_executable=sys.executable,
            error_type=type(exc).__name__,
            error_message=str(exc),
        )

# ---- Run ----
# The host summary is printed regardless of the GPU switch.
print("=== CPU Info ===")
cpu_info = get_cpu_info()
for k, v in cpu_info.items():
    print(f"{k}: {v}")

# Default device; it becomes "cuda" only if every check below succeeds.
RUNTIME_DEVICE = "cpu"

if gpu_available:
    # 1) Driver view. A failing nvidia-smi call is recorded in the dict and
    #    counted as "no GPU" rather than aborting the cell.
    print("\n=== GPU Info (driver) ===")
    try:
        gpu_info = get_gpu_info_nvidia()
    except Exception as exc:
        gpu_info = dict(gpu_count=0, gpu_names=[], error=f"{type(exc).__name__}: {exc}")
    for k, v in gpu_info.items():
        print(f"{k}: {v}")

    # 2) PyTorch view.
    print("\n=== Torch / CUDA Info ===")
    torch_info = get_torch_gpu_info()
    for k, v in torch_info.items():
        print(f"{k}: {v}")

    # 3) Decide, keeping the first reason that rules the GPU out.
    if gpu_info.get("gpu_count", 0) == 0:
        RUNTIME_DEVICE, reason = "cpu", "No GPU detected by driver (nvidia-smi)."
    elif not torch_info.get("torch_installed", False):
        RUNTIME_DEVICE, reason = "cpu", "Torch is not importable in this Python environment."
    elif not torch_info.get("cuda_available", False):
        RUNTIME_DEVICE, reason = (
            "cpu",
            "Torch installed, but torch.cuda.is_available() is False (CUDA not usable).",
        )
    else:
        RUNTIME_DEVICE, reason = "cuda", "GPU detected and Torch reports CUDA is available."

    print("\n=== Mode ===")
    print(f"gpu_available=True -> selected runtime device: {RUNTIME_DEVICE}")
    print("Reason:", reason)
else:
    # Switch is off: no probing at all.
    print("\n=== Mode ===")
    print("gpu_available=False -> forcing CPU only")


=== CPU Info ===
os: Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.39
python: 3.12.3
python_executable: /mnt/d/SRM_PG/workshop_info/env/bin/python
cpu_logical_cores: 24

=== GPU Info (driver) ===
gpu_count: 1
gpu_names: ['NVIDIA GeForce RTX 4070 Ti SUPER']

=== Torch / CUDA Info ===
torch_installed: True
torch_version: 2.5.1+cu121
torch_cuda_version: 12.1
cuda_available: True
cuda_device_count: 1
device_names: ['NVIDIA GeForce RTX 4070 Ti SUPER']

=== Mode ===
gpu_available=True -> selected runtime device: cuda
Reason: GPU detected and Torch reports CUDA is available.


In [3]:
import asyncio
import os
import shutil
import subprocess
import sys
import time
from dataclasses import dataclass, field
from typing import Dict, List, Optional




In [4]:
# -----------------------------
# Hardware detection (only when gpu_available=True)
# -----------------------------
def detect_gpus_nvidia_smi() -> int:
    """Count the GPUs listed by ``nvidia-smi -L``.

    Returns:
        Number of output lines that start with ``GPU`` (after stripping), or 0
        when ``nvidia-smi`` is not found or anything goes wrong while running
        or parsing it.
    """
    if shutil.which("nvidia-smi") is None:
        return 0

    try:
        listing = subprocess.check_output(["nvidia-smi", "-L"], text=True, stderr=subprocess.STDOUT)
        gpu_rows = 0
        for row in listing.splitlines():
            if row.strip().startswith("GPU"):
                gpu_rows += 1
        return gpu_rows
    except Exception:
        # Best effort: any failure is treated as "no GPUs".
        return 0

def torch_info_best_effort() -> dict:
    """Summarise the PyTorch/CUDA state without ever raising.

    Returns:
        With torch importable: ``torch_installed=True`` plus version, reported
        CUDA version, CUDA availability and device count (0 when CUDA is not
        available). Otherwise ``torch_installed=False`` and an ``error`` string
        of the form ``"<ExceptionType>: <message>"``.
    """
    try:
        import torch  # type: ignore

        summary = {
            "torch_installed": True,
            "torch_version": torch.__version__,
            "torch_cuda_version": getattr(torch.version, "cuda", None),
        }
        cuda_ok = torch.cuda.is_available()
        summary["cuda_available"] = cuda_ok
        summary["cuda_device_count"] = torch.cuda.device_count() if cuda_ok else 0
        return summary
    except Exception as exc:
        return dict(torch_installed=False, error=f"{type(exc).__name__}: {exc}")




In [5]:
# -----------------------------
# Logical resource model
# -----------------------------
@dataclass
class Cluster:
    """Logical pool of resources that tasks compete for.

    ``cpu_cores`` and ``gpus`` are the pool sizes supplied by the caller. The
    two semaphores are not constructor arguments; they are created right after
    init with one permit per core / per GPU.
    """
    cpu_cores: int
    gpus: int
    cpu_sem: asyncio.Semaphore = field(init=False)
    gpu_sem: asyncio.Semaphore = field(init=False)

    def __post_init__(self):
        # CPU semaphore first, then GPU, each sized from its counterpart field.
        for attr, permits in (("cpu_sem", self.cpu_cores), ("gpu_sem", self.gpus)):
            setattr(self, attr, asyncio.Semaphore(permits))


@dataclass
class Task:
    """One unit of work submitted by an agent, with its resource request."""
    # Required: identity, owning agent, and how long the task sleeps when run.
    task_id: str
    agent: str
    duration_s: float
    # Optional resource request and a free-form label shown in the log lines.
    cpu_cores: int = field(default=1)
    gpus: int = field(default=0)
    kind: str = field(default="generic")




In [6]:
# -----------------------------
# Utilities
# -----------------------------
def ts(t0: float) -> str:
    """Return the seconds elapsed since ``t0`` (a ``time.time()`` value) as text.

    The number is right-aligned in six characters with two decimals and
    followed by ``s``, e.g. ``'  1.23s'``.
    """
    elapsed = time.time() - t0
    return format(elapsed, "6.2f") + "s"

async def acquire_n(sem: asyncio.Semaphore, n: int):
    """Acquire ``n`` permits from ``sem``, one at a time.

    Permits are taken sequentially, so while this coroutine waits for a later
    permit it keeps holding the ones it already obtained.

    If the wait is interrupted (for example the surrounding task is cancelled
    or times out), every permit obtained so far is released again before the
    exception is re-raised. The caller therefore ends up holding either all
    ``n`` permits or none.

    Args:
        sem: Semaphore to draw permits from.
        n: Number of permits wanted; zero or a negative value acquires nothing.
    """
    held = 0
    try:
        for _ in range(n):
            await sem.acquire()
            held += 1
    except BaseException:
        # BaseException so that asyncio.CancelledError is covered as well.
        for _ in range(held):
            sem.release()
        raise

def release_n(sem: asyncio.Semaphore, n: int):
    """Give ``n`` permits back to ``sem``; does nothing when ``n`` is zero or negative."""
    outstanding = n
    while outstanding > 0:
        sem.release()
        outstanding -= 1

async def run_task(task: Task, cluster: Cluster, t0: float):
    """Run one task: reserve its resources, simulate the work, release.

    CPU permits are reserved first, then GPU permits (only if the task asks
    for any). The work itself is simulated with ``asyncio.sleep``.

    The reservations are made inside the ``try`` block, and the ``finally``
    block releases exactly what was reserved. A cancellation or error while
    still waiting for GPU permits therefore returns the CPU permits instead
    of leaking them.

    Args:
        task: Task to execute; its ``cpu_cores``/``gpus`` are the request.
        cluster: Provides the ``cpu_sem`` and ``gpu_sem`` semaphores.
        t0: Start time used for the elapsed-time prefix of the log lines.
    """
    place = "GPU" if task.gpus > 0 else "CPU"
    cpu_held = False
    gpu_held = False
    try:
        await acquire_n(cluster.cpu_sem, task.cpu_cores)
        cpu_held = True
        if task.gpus > 0:
            await acquire_n(cluster.gpu_sem, task.gpus)
            gpu_held = True

        print(f"[{ts(t0)}] START {task.task_id:10} agent={task.agent:6} kind={task.kind:6} "
              f"on={place} req(cpu={task.cpu_cores}, gpu={task.gpus})")
        await asyncio.sleep(task.duration_s)
        print(f"[{ts(t0)}] DONE  {task.task_id:10} agent={task.agent:6} on={place}")
    finally:
        # Release in reverse order of acquisition, and only what is held.
        if gpu_held:
            release_n(cluster.gpu_sem, task.gpus)
        if cpu_held:
            release_n(cluster.cpu_sem, task.cpu_cores)




In [7]:
# -----------------------------
# Orchestrator (multi-agent)
# -----------------------------
class Orchestrator:
    """Round-robin dispatcher that launches agents' queued tasks on a cluster.

    Each registered agent gets its own FIFO queue. ``scheduler_loop`` visits
    the agents in round-robin order, rejects tasks that ask for more than the
    cluster has in total, and launches every other task as an asyncio task
    running ``run_task``.

    NOTE: ``scheduler_loop`` returns as soon as all queues are empty and no
    launched task is still running. It has no way to know whether producers
    intend to submit more work, so tasks submitted after that point are not
    scheduled by that loop.
    """

    def __init__(self, cluster: "Cluster"):
        self.cluster = cluster
        self.queues: Dict[str, "asyncio.Queue[Task]"] = {}
        self.agent_names: List[str] = []        # registration order = round-robin order
        self.running: List[asyncio.Task] = []   # launched tasks not yet reaped

    def register_agent(self, name: str):
        """Create a queue for ``name``; registering the same name again is a no-op."""
        if name not in self.queues:
            self.queues[name] = asyncio.Queue()
            self.agent_names.append(name)

    async def submit(self, task: "Task"):
        """Enqueue ``task`` on the queue of ``task.agent``.

        Raises:
            KeyError: if ``task.agent`` has not been registered.
        """
        queue = self.queues.get(task.agent)
        if queue is None:
            raise KeyError(
                f"agent {task.agent!r} is not registered; call register_agent() first"
            )
        await queue.put(task)

    def _cleanup_done(self):
        """Drop finished tasks from ``self.running`` and report the ones that failed.

        Reading the exception of a failed task here makes the failure visible
        in the log right away instead of leaving it unretrieved.
        """
        still_running = []
        for job in self.running:
            if not job.done():
                still_running.append(job)
                continue
            if job.cancelled():
                # exception() would raise CancelledError for these.
                continue
            err = job.exception()
            if err is not None:
                print(f"[scheduler] FAILED {job.get_name()}: {type(err).__name__}: {err}")
        self.running = still_running

    async def scheduler_loop(self, t0: float):
        """Dispatch queued tasks until all queues are empty and nothing is running.

        Args:
            t0: Start time used for the elapsed-time prefix of the log lines.
        """
        rr = 0
        while True:
            self._cleanup_done()

            # stop when everything finished
            if all(q.empty() for q in self.queues.values()) and len(self.running) == 0:
                return

            # round-robin agent pick: first agent, starting at the cursor, with queued work
            picked: Optional[str] = None
            for _ in range(len(self.agent_names)):
                name = self.agent_names[rr % len(self.agent_names)]
                rr += 1
                if not self.queues[name].empty():
                    picked = name
                    break

            if picked is None:
                # Nothing queued but tasks still running: poll again shortly.
                await asyncio.sleep(0.02)
                continue

            task = await self.queues[picked].get()

            # Admission control: reject impossible requests
            if task.cpu_cores > self.cluster.cpu_cores:
                print(f"[{ts(t0)}] REJECT {task.task_id:10} cpu req too large "
                      f"(cpu={task.cpu_cores}/{self.cluster.cpu_cores})")
                continue
            if task.gpus > self.cluster.gpus:
                print(f"[{ts(t0)}] REJECT {task.task_id:10} gpu req too large "
                      f"(gpu={task.gpus}/{self.cluster.gpus})")
                continue

            # Name the asyncio task after the task id so failures can be attributed.
            self.running.append(
                asyncio.create_task(run_task(task, self.cluster, t0), name=str(task.task_id))
            )




In [8]:
# -----------------------------
# Demo workload
# -----------------------------
async def agent_submitter(orch: Orchestrator, t0: float, tasks: List[Task]):
    """Submit ``tasks`` to ``orch`` in order, logging each and pausing 0.03 s after it.

    Args:
        orch: Orchestrator receiving the tasks.
        t0: Start time used for the elapsed-time prefix of the log lines.
        tasks: Tasks to submit, in submission order.
    """
    pause_s = 0.03
    for job in tasks:
        await orch.submit(job)
        request = f"wants(cpu={job.cpu_cores}, gpu={job.gpus})"
        print(f"[{ts(t0)}] SUBMIT {job.task_id:10} agent={job.agent:6} {request}")
        await asyncio.sleep(pause_s)


async def main():
    """Build the logical cluster, create the demo workload and run it.

    Steps:
      1. Size the cluster: logical CPU cores from ``os.cpu_count()``; GPUs from
         ``nvidia-smi`` when ``gpu_available`` is True, otherwise zero.
      2. Register three agents and build their task lists.
      3. Run the three submitters and the scheduler loop concurrently.

    GPU requests follow the GPUs the cluster really has, not just the
    ``gpu_available`` flag. When no GPU can be scheduled (flag off, or flag on
    but nothing detected), ``DOWNGRADE_GPU_TASKS_TO_CPU`` decides whether GPU
    tasks run on CPU or keep their request and get rejected.
    """
    t0 = time.time()

    cpu_cores = os.cpu_count() or 1

    # Respect the flag:
    if not gpu_available:
        real_gpus = 0
        gpus_used = 0
        torch_info = None
    else:
        real_gpus = detect_gpus_nvidia_smi()
        gpus_used = real_gpus  # no simulation here; use what you actually have
        torch_info = torch_info_best_effort()

    print("=== Logical Cluster Resources ===")
    print("CPU cores (logical):", cpu_cores)
    print("gpu_available flag:", gpu_available)
    print("GPUs detected:", real_gpus, "| GPUs used for scheduling:", gpus_used)
    if torch_info is not None:
        print("Torch info:", torch_info)
    print()

    cluster = Cluster(cpu_cores=cpu_cores, gpus=gpus_used)
    orch = Orchestrator(cluster)
    for a in ["agentA", "agentB", "agentC"]:
        orch.register_agent(a)

    # True only if the cluster really has at least one GPU to hand out.
    gpus_schedulable = gpus_used > 0

    # Build tasks (GPU tasks are downgraded/rejected when no GPU can be scheduled)
    def maybe_gpu(n=1):
        if gpus_schedulable:
            return n
        return 0 if DOWNGRADE_GPU_TASKS_TO_CPU else n  # if not downgrading, keep request and get rejected

    tasks_A = [
        Task("A_etl_1", "agentA", duration_s=1.0, cpu_cores=2, gpus=0, kind="etl"),
        Task("A_trn_1", "agentA", duration_s=2.2, cpu_cores=2, gpus=maybe_gpu(1), kind="train"),
        Task("A_inf_1", "agentA", duration_s=1.1, cpu_cores=1, gpus=maybe_gpu(1), kind="infer"),
    ]
    tasks_B = [
        Task("B_etl_1", "agentB", duration_s=1.3, cpu_cores=3, gpus=0, kind="etl"),
        Task("B_trn_1", "agentB", duration_s=2.0, cpu_cores=1, gpus=maybe_gpu(1), kind="train"),
    ]
    tasks_C = [
        Task("C_cpu_1", "agentC", duration_s=1.6, cpu_cores=cpu_cores // 2 or 1, gpus=0, kind="cpujob"),
        Task("C_inf_1", "agentC", duration_s=0.9, cpu_cores=1, gpus=maybe_gpu(1), kind="infer"),
    ]

    # Print a one-liner so it's obvious what happens to GPU tasks when no GPU is schedulable
    if not gpu_available:
        if not DOWNGRADE_GPU_TASKS_TO_CPU:
            print("NOTE: gpu_available=False and DOWNGRADE_GPU_TASKS_TO_CPU=False -> GPU tasks will be rejected.\n")
    elif not gpus_schedulable:
        if DOWNGRADE_GPU_TASKS_TO_CPU:
            print("NOTE: gpu_available=True but no GPUs were detected -> GPU tasks are downgraded to CPU.\n")
        else:
            print("NOTE: gpu_available=True but no GPUs were detected and "
                  "DOWNGRADE_GPU_TASKS_TO_CPU=False -> GPU tasks will be rejected.\n")

    await asyncio.gather(
        agent_submitter(orch, t0, tasks_A),
        agent_submitter(orch, t0, tasks_B),
        agent_submitter(orch, t0, tasks_C),
        orch.scheduler_loop(t0),
    )

    print(f"\n[{ts(t0)}] ALL DONE")


# ---- Run in notebook vs script ----
# Top-level `await` works in IPython/Jupyter, where an event loop is already
# running. In a plain .py script this line is a syntax error; start the demo
# with `asyncio.run(main())` there instead.
await main()



=== Logical Cluster Resources ===
CPU cores (logical): 24
gpu_available flag: True
GPUs detected: 1 | GPUs used for scheduling: 1
Torch info: {'torch_installed': True, 'torch_version': '2.5.1+cu121', 'torch_cuda_version': '12.1', 'cuda_available': True, 'cuda_device_count': 1}

[  0.04s] SUBMIT A_etl_1    agent=agentA wants(cpu=2, gpu=0)
[  0.04s] SUBMIT B_etl_1    agent=agentB wants(cpu=3, gpu=0)
[  0.04s] SUBMIT C_cpu_1    agent=agentC wants(cpu=12, gpu=0)
[  0.04s] START A_etl_1    agent=agentA kind=etl    on=CPU req(cpu=2, gpu=0)
[  0.04s] START B_etl_1    agent=agentB kind=etl    on=CPU req(cpu=3, gpu=0)
[  0.04s] START C_cpu_1    agent=agentC kind=cpujob on=CPU req(cpu=12, gpu=0)
[  0.07s] SUBMIT A_trn_1    agent=agentA wants(cpu=2, gpu=1)
[  0.07s] SUBMIT B_trn_1    agent=agentB wants(cpu=1, gpu=1)
[  0.07s] SUBMIT C_inf_1    agent=agentC wants(cpu=1, gpu=1)
[  0.08s] START A_trn_1    agent=agentA kind=train  on=GPU req(cpu=2, gpu=1)
[  0.11s] SUBMIT A_inf_1    agent=agentA want