In [None]:
import os
from getpass import getpass

# Keep the NGC key out of the notebook source: reuse NGC_API_KEY when the
# environment already provides it (so an exported key is never overwritten),
# otherwise ask for it without echoing the input.
if not os.environ.get("NGC_API_KEY"):
    os.environ["NGC_API_KEY"] = getpass("NGC API key (nvapi-...): ")

In [None]:
%%bash
# Log in to nvcr.io. The username is the literal string $oauthtoken (single
# quotes stop the shell from expanding it); the API key is supplied on stdin
# via --password-stdin rather than as a command-line argument.
docker login nvcr.io --username '$oauthtoken' --password-stdin <<< "${NGC_API_KEY}"

In [None]:
import os, json, subprocess, time

# -------------------------------
# 1. Setup NeMo/NIM cache
# -------------------------------
# Export the cache location first, then make sure the directory exists.
nim_cache_dir = "/ephemeral/cache/nim"
os.environ["LOCAL_NIM_CACHE"] = nim_cache_dir
os.makedirs(nim_cache_dir, exist_ok=True)
print(f"LOCAL_NIM_CACHE set to {nim_cache_dir}")

# -------------------------------
# 2. Setup Docker ephemeral storage
# -------------------------------
# Moves Docker's data-root onto ephemeral storage. Existing daemon.json
# settings are preserved; only "data-root" is added or replaced. The daemon is
# rewritten/restarted only when it is not already using the target directory,
# so re-running this cell does not needlessly restart Docker.
storage_path = "/ephemeral/cache/docker"
os.makedirs(storage_path, exist_ok=True)

daemon_file = "/etc/docker/daemon.json"


def _read_daemon_config(path):
    """Return the parsed daemon.json at ``path`` ({} if it is missing or empty).

    Falls back to ``sudo cat`` when the file is not readable by the current
    user, instead of continuing with an empty config that would wipe the
    existing settings on write. Malformed JSON raises ``json.JSONDecodeError``
    so a broken file is never silently replaced.
    """
    if not os.path.exists(path):
        return {}
    try:
        with open(path) as fh:
            raw = fh.read()
    except PermissionError:
        # Writing below needs sudo anyway, so read through sudo as well.
        raw = subprocess.run(
            ["sudo", "cat", path],
            capture_output=True, text=True, check=True,
        ).stdout
    return json.loads(raw) if raw.strip() else {}


def _docker_root_dir():
    """Return the daemon's current root directory, or "" if it is not answering."""
    result = subprocess.run(
        ["docker", "info", "--format", "{{.DockerRootDir}}"],
        capture_output=True, text=True,
    )
    return result.stdout.strip() if result.returncode == 0 else ""


# Update Docker root
config = _read_daemon_config(daemon_file)
config["data-root"] = storage_path
config_str = json.dumps(config, indent=4)

if _docker_root_dir() != storage_path:
    # Write daemon.json (requires sudo). The JSON is fed to `tee` on stdin with
    # no shell involved, so quotes inside existing settings cannot break it.
    subprocess.run(
        ["sudo", "tee", daemon_file],
        input=config_str + "\n", text=True,
        stdout=subprocess.DEVNULL, check=True,
    )

    # Restart Docker
    subprocess.run(["sudo", "systemctl", "restart", "docker"], check=True)

    # Wait (bounded) until the daemon answers again rather than sleeping a
    # fixed amount of time.
    deadline = time.monotonic() + 60
    while not _docker_root_dir() and time.monotonic() < deadline:
        time.sleep(1)

# Verify new Docker root
docker_root = _docker_root_dir()
print("Docker Root Dir:", docker_root)
if docker_root != storage_path:
    print(f"WARNING: Docker is not using {storage_path}; check {daemon_file} and the daemon logs.")

# -------------------------------
# 3. Setup pip cache
# 4. Setup HuggingFace cache
# 5. Setup tmpdir
# -------------------------------
pip_cache = "/ephemeral/cache/pip"
hf_cache = "/ephemeral/cache/huggingface"
tmp_dir = "/ephemeral/tmp"

# Same recipe for every location: create the directory, export the variable
# that points at it, then report what was set.
for env_var, directory in (
    ("PIP_CACHE_DIR", pip_cache),
    ("HF_HOME", hf_cache),
    ("TMPDIR", tmp_dir),
):
    os.makedirs(directory, exist_ok=True)
    os.environ[env_var] = directory
    print(f"{env_var} set to {directory}")

In [None]:
!docker run --gpus all --name nemo-rl -it \
  -p 9000:9000 \
  -v "$(pwd)":/workspace \
  -w /workspace \
  -d nvcr.io/nvidia/nemo-rl:v0.4.0

In [None]:
container = "nemo-rl"

!docker exec {container} bash -c "git clone https://github.com/NVIDIA-NeMo/RL.git nemo-rl --recursive"
!docker exec {container} bash -c "cd nemo-rl && git submodule update --init --recursive"

# Activate NeMo RL venv
!docker exec {container} bash -c "source /opt/nemo_rl_venv/bin/activate"

# HuggingFace login
!docker exec {container} bash -c "huggingface-cli login --token hf_********"

# WANDB API key
!docker exec {container} bash -c 'export WANDB_API_KEY="*****"'

In [None]:
container = "nemo-rl"

!docker exec -it $container bash -c 'source /opt/nemo_rl_venv/bin/activate && \
uv run python nemo-rl/examples/run_dpo.py \
cluster.gpus_per_node=1 \
dpo.max_num_steps=10 \
policy.model_name=meta-llama/Llama-3.2-1B-Instruct \
policy.tokenizer.name=meta-llama/Llama-3.2-1B-Instruct'

In [None]:
container = "nemo-rl"

!docker exec {container} bash -c "source /opt/nemo_rl_venv/bin/activate && \
    uv run nemo-rl/examples/converters/convert_dcp_to_hf.py \
    --config ./results/dpo/step_10/config.yaml \
    --dcp-ckpt-path ./results/dpo/step_10/policy/weights \
    --hf-ckpt-path ./results/dpo/step_10/hf"

In [None]:
%%writefile inference.py
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

hf_path = "./results/dpo/step_10/hf/"

tokenizer = AutoTokenizer.from_pretrained(hf_path)
model = AutoModelForCausalLM.from_pretrained(hf_path, torch_dtype=torch.bfloat16)
model.eval()

prompt = "How does photosynthesis work in plants?"
inputs = tokenizer(prompt, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(out[0], skip_special_tokens=True))


In [None]:
# Run the inference.py written by the previous cell inside the container.
# The venv is sourced in the same `bash -c` as the python call because each
# `docker exec` starts its own shell.
container = "nemo-rl"
!docker exec {container} bash -c "source /opt/nemo_rl_venv/bin/activate && python inference.py"

In [None]:
%%writefile convert.py
from transformers import AutoModelForCausalLM, AutoTokenizer

src = "./results/dpo/step_10/hf"
dst = "./results/dpo/step_10/hf_st"

model = AutoModelForCausalLM.from_pretrained(src)
model.save_pretrained(dst, safe_serialization=True)

tok = AutoTokenizer.from_pretrained(src)
tok.save_pretrained(dst)

print("Saved to:", dst)


In [None]:
# Run the convert.py written by the previous cell inside the container, with
# the venv sourced in the same shell as the python call.
container = "nemo-rl"
!docker exec {container} bash -c "source /opt/nemo_rl_venv/bin/activate && python convert.py"

In [None]:
# ===============================
#   MultiLLM-NIM Container Launcher
#   (Detached mode)
# ===============================

# Choose container name
CONTAINER_NAME = "MultiLLM-NIM"

# NGC Multi-LLM NIM repo
Repository = "nim/nvidia/llm-nim"
TAG = "latest"
IMG_NAME = f"nvcr.io/{Repository}:{TAG}"

# Path to your local HF DPO model
LOCAL_MODEL_DIR = "./results/dpo/step_10/hf_st"

# Name to expose the served model
NIM_SERVED_MODEL_NAME = "dpo-llm"

# Local NIM cache (you chose ephemeral)
LOCAL_NIM_CACHE = "/ephemeral/cache/nim"

# Create cache directory
!mkdir -p "{LOCAL_NIM_CACHE}"
!chmod -R a+w "{LOCAL_NIM_CACHE}"

print("Starting MultiLLM-NIM container in detached mode...")
print("Container:", CONTAINER_NAME)
print("Image:", IMG_NAME)
print("Model Path:", LOCAL_MODEL_DIR)
print("NIM Cache:", LOCAL_NIM_CACHE)

In [None]:
# -------------------------------
# Run the container DETACHED
# -------------------------------
!docker run -d --rm --name={CONTAINER_NAME} \
  --runtime=nvidia \
  --gpus all \
  --shm-size=16GB \
  -e NIM_MODEL_PROFILE="e2f00b2cbfb168f907c8d6d4d40406f7261111fbab8b3417a485dcd19d10cc98" \
  -e NIM_MODEL_NAME="/opt/models/local_model" \
  -e NIM_SERVED_MODEL_NAME={NIM_SERVED_MODEL_NAME} \
  -v "{LOCAL_MODEL_DIR}:/opt/models/local_model" \
  -v "{LOCAL_NIM_CACHE}:/opt/nim/.cache" \
  -u $(id -u) \
  -p 8000:8000 \
  {IMG_NAME}

In [None]:
import time

import requests

url = 'http://localhost:8000/v1/health/ready' #make sure the LLM NIM port is correct
headers = {'accept': 'application/json'}

POLL_INTERVAL_S = 30      # pause between readiness probes
REQUEST_TIMEOUT_S = 10    # per-request limit so a stalled connection cannot hang the cell
MAX_WAIT_S = 60 * 60      # overall limit; raise it if the model needs longer to load

# Poll the readiness endpoint until it reports ready. Any request failure
# (connection refused, timeout) or an unparsable body counts as "not ready
# yet"; the loop gives up with TimeoutError once MAX_WAIT_S has elapsed so a
# container that never starts does not block the notebook forever.
print("Checking MultiLLM NIM readiness...")
deadline = time.monotonic() + MAX_WAIT_S
while True:
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_S)
        if response.status_code == 200:
            data = response.json()
            if isinstance(data, dict) and data.get("message") == "Service is ready.":
                print("LLM NIM is ready.")
                break
            else:
                print(f"LLM NIM is not ready. Waiting for {POLL_INTERVAL_S} seconds...")
        else:
            print(f"Unexpected status code {response.status_code}. Waiting for {POLL_INTERVAL_S} seconds...")
    except (requests.RequestException, ValueError):
        # ValueError covers a 200 response whose body is not valid JSON.
        print(f"LLM NIM is not ready. Waiting for {POLL_INTERVAL_S} seconds...")
    if time.monotonic() >= deadline:
        raise TimeoutError(f"LLM NIM at {url} was not ready after {MAX_WAIT_S} seconds")
    time.sleep(POLL_INTERVAL_S)

In [None]:
!curl -X POST 'http://localhost:8000/v1/completions' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{"model": "dpo-llm", "prompt": "The sky appears blue because", "max_tokens": 64}'

In [None]:
# Stop the NIM container; it was started with --rm, so stopping also removes it.
!docker stop {CONTAINER_NAME}