In [23]:
import subprocess, json

# OpenShift namespace that the `oc` queries in this notebook target.
NS = "ray-finetune-llm-deepspeed002"


def run(cmd):
    """Execute ``cmd`` (an argv list) and return its stdout, whitespace-stripped.

    Raises ``subprocess.CalledProcessError`` when the command exits non-zero.
    stderr is not captured.
    """
    completed = subprocess.run(cmd, stdout=subprocess.PIPE, check=True, text=True)
    return completed.stdout.strip()

banner = "=" * 70

print(banner)
print("ENVIRONMENT VERIFICATION")
print(banner)

# Step 1: identity of the current `oc` session (not guarded; a failure here
# aborts the cell).
print("\n1. OpenShift User:")
print(f"   {run(['oc', 'whoami'])}")

# Step 2: inspect the RayCluster named "ray" in namespace NS; any failure in
# this step is reported instead of raised.
print("\n2. RayCluster Status:")
try:
    raycluster_json = run(["oc", "get", "raycluster", "ray", "-n", NS, "-o", "json"])
    data = json.loads(raycluster_json)

    status_section = data.get("status", {})
    state = status_section.get("state", "unknown")
    workers = status_section.get("availableWorkerReplicas", 0)
    print(f"   State: {state}")
    print(f"   Workers: {workers}")

    # Only the head group's pod spec is checked for the GPU toleration key.
    head_pod_spec = data["spec"]["headGroupSpec"]["template"]["spec"]
    head_tols = head_pod_spec.get("tolerations", [])
    has_gpu = "nvidia.com/gpu" in (t.get("key") for t in head_tols)
    print(f"   GPU toleration: {has_gpu}")

    if state != "ready":
        print("   ‚ö†Ô∏è  Cluster not ready yet")
except Exception as e:
    print(f"   ‚ùå Error: {e}")

print("\n" + banner)
print("‚úÖ Environment check complete")
print(banner)

ENVIRONMENT VERIFICATION

1. OpenShift User:
   system:serviceaccount:ray-finetune-llm-deepspeed002:notebook

2. RayCluster Status:
   State: ready
   Workers: 6
   GPU toleration: True

‚úÖ Environment check complete


In [24]:
import subprocess
from codeflare_sdk import TokenAuthentication

def _oc_whoami(flag):
    """Return the stripped, decoded stdout of ``oc whoami <flag>``."""
    raw = subprocess.check_output(["oc", "whoami", flag])
    return raw.decode().strip()


# Reuse the token and API server of the current `oc` session.
token = _oc_whoami("-t")
server = _oc_whoami("--show-server=true")

# NOTE(review): skip_tls=True is kept as-is; presumably it turns off TLS
# certificate verification for the API server - confirm this is intended.
auth = TokenAuthentication(token=token, server=server, skip_tls=True)
auth.login()

print(f"‚úÖ Authenticated to: {server}")

‚úÖ Authenticated to: https://172.30.0.1:443


In [44]:
import os
# Local directory expected to contain the facebook/opt-125m model files.
p = "/opt/app-root/src/models/facebook/opt-125m"

model_dir_present = os.path.isdir(p)
print("dir exists:", model_dir_present)

if model_dir_present:
    config_path = os.path.join(p, "config.json")
    print("has config.json:", os.path.exists(config_path))
    # Cap the listing at the first 30 names (sorted) to keep the output short.
    dir_listing = sorted(os.listdir(p))
    print("files:", dir_listing[:30])


dir exists: True
has config.json: True
files: ['.cache', '.gitattributes', 'LICENSE.md', 'README.md', 'config.json', 'flax_model.msgpack', 'generation_config.json', 'merges.txt', 'pytorch_model.bin', 'special_tokens_map.json', 'tf_model.h5', 'tokenizer_config.json', 'vocab.json']


In [45]:
import sys

# Install pinned numpy / pyarrow / datasets versions using the interpreter that
# runs this kernel (sys.executable), so the packages land in the kernel's own
# environment. --no-cache-dir avoids writing pip's download cache.
!{sys.executable} -m pip install --upgrade --quiet --no-cache-dir \
    "numpy==1.26.4" \
    "pyarrow==15.0.2" \
    "datasets==2.18.0"

# NOTE(review): this message is printed unconditionally - an IPython `!` shell
# escape does not raise when the command exits non-zero, so check the pip
# output above before trusting it.
print("‚úÖ Dependencies installed")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
‚úÖ Dependencies installed


In [None]:
import os

# =============================================================================
# MODEL CONFIGURATION
# =============================================================================

# WORKING_DIR = "/opt/app-root/src/MLforEng"

# Option 1: Use local uploaded model (RECOMMENDED - no download needed)
# LLM_MODEL_ID = "/opt/app-root/src/models/llama-3.2-1b-instruct"

# LLM_MODEL_ID = "/opt/app-root/src/models/facebook/opt-125m"

# LLM_MODEL_ID = "facebook/opt-125m"

# LLM_MODEL_ID = "gpt2"
LLM_MODEL_ID = "Qwen/Qwen2.5-0.5B"

# Option 2: Use HuggingFace model (requires network + token)
# LLM_MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
# LLM_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# HuggingFace token (only needed for gated models like Llama).
# Taken from the environment only. The fallback is an empty string rather than
# a placeholder, so that a fake credential is never forwarded to the job when
# no token is configured.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# =============================================================================
# DATASET CONFIGURATION
# =============================================================================

# Option 1: Use your custom JSONL dataset
# Both paths are absolute.
# NOTE(review): they must also be readable from wherever the training script
# runs - confirm the Ray pods mount the same storage.
# NOTE(review): EVAL_JSONL points at the same file as TRAIN_JSONL, so the eval
# loss is computed on training data.
DATASET_TYPE = "jsonl"
TRAIN_JSONL = "/opt/app-root/src/MLforEng/artifacts/datasets/commscom_llama_prompts.jsonl"
EVAL_JSONL = "/opt/app-root/src/MLforEng/artifacts/datasets/commscom_llama_prompts.jsonl"

# Option 2: Use GSM8K demo dataset (math problems)
# DATASET_TYPE = "gsm8k"
# TRAIN_JSONL = ""  # Not used for gsm8k
# EVAL_JSONL = ""

# =============================================================================
# TRAINING CONFIGURATION
# =============================================================================

OUTPUT_DIR = "/opt/app-root/src/models/llama-finetuned"
STORAGE_PATH = "/opt/app-root/src"

# Training hyperparameters
MAX_STEPS = 30  # Small for demo; set to 0 to use NUM_TRAIN_EPOCHS
NUM_TRAIN_EPOCHS = 1
TRAIN_BATCH_SIZE = 2  # Can use 2 for 1B models, 1 for 8B
EVAL_BATCH_SIZE = 2
MAX_SEQ_LENGTH = 512
GRAD_ACCUM_STEPS = 4
LEARNING_RATE = 2e-5
WARMUP_RATIO = 0.03
SAVE_STEPS = 30
EVAL_STEPS = 30
SAVE_TOTAL_LIMIT = 2

# Precision (use BF16 for modern GPUs like L4, A100)
# Enable at most one of the two flags; with both off the run is reported as FP32.
USE_BF16 = True
USE_FP16 = False

# =============================================================================
# DISPLAY CONFIGURATION
# =============================================================================

def _report_jsonl_file(path, base_dir="/opt/app-root/src/MLforEng"):
    """Print whether a JSONL dataset file exists and how many examples it has.

    ``path`` may be absolute or relative: ``os.path.join`` returns an absolute
    ``path`` unchanged and resolves a relative one against ``base_dir``.
    Blank lines are not counted as examples. The file is read in binary mode
    so that counting never fails on text decoding.
    """
    full_path = os.path.join(base_dir, path)
    if os.path.exists(full_path):
        with open(full_path, "rb") as f:
            num_lines = sum(1 for raw_line in f if raw_line.strip())
        print(f"   ‚úÖ File found: {num_lines} examples")
    else:
        print(f"   ‚ùå File not found: {full_path}")


print("=" * 70)
print("TRAINING CONFIGURATION")
print("=" * 70)
print(f"\nüì¶ Model:")
print(f"   ID: {LLM_MODEL_ID}")
print(f"   Type: {'Local' if LLM_MODEL_ID.startswith('/') else 'HuggingFace'}")

print(f"\nüìä Dataset:")
print(f"   Type: {DATASET_TYPE}")
if DATASET_TYPE == "jsonl":
    print(f"   Train: {TRAIN_JSONL}")
    print(f"   Eval: {EVAL_JSONL}")

    # Verify every distinct dataset file (train first, then eval) rather than
    # only the train file; a shared path is checked once.
    for dataset_path in dict.fromkeys([TRAIN_JSONL, EVAL_JSONL]):
        _report_jsonl_file(dataset_path)

    if TRAIN_JSONL == EVAL_JSONL:
        print("   NOTE: eval file is the same as the train file; "
              "eval loss will not measure generalization")
else:
    print(f"   Using GSM8K (will download automatically)")

print(f"\n‚öôÔ∏è  Training:")
print(f"   Max steps: {MAX_STEPS}")
print(f"   Epochs: {NUM_TRAIN_EPOCHS}")
print(f"   Batch size: {TRAIN_BATCH_SIZE}")
print(f"   Max sequence: {MAX_SEQ_LENGTH}")
print(f"   Learning rate: {LEARNING_RATE}")
# BF16 takes precedence over FP16 when both flags are set.
print(f"   Precision: {'BF16' if USE_BF16 else 'FP16' if USE_FP16 else 'FP32'}")

print(f"\nüíæ Output:")
print(f"   {OUTPUT_DIR}")

print("\n" + "=" * 70)

TRAINING CONFIGURATION

üì¶ Model:
   ID: Qwen/Qwen2.5-0.5B
   Type: HuggingFace

üìä Dataset:
   Type: jsonl
   Train: /opt/app-root/src/MLforEng/artifacts/datasets/commscom_llama_prompts.jsonl
   Eval: /opt/app-root/src/MLforEng/artifacts/datasets/commscom_llama_prompts.jsonl
   ‚úÖ File found: 500 examples

‚öôÔ∏è  Training:
   Max steps: 30
   Epochs: 1
   Batch size: 2
   Max sequence: 512
   Learning rate: 2e-05
   Precision: BF16

üíæ Output:
   /opt/app-root/src/models/llama-finetuned



In [52]:
from ray.job_submission import JobSubmissionClient

# Namespace of the Ray head service; the dashboard URL below uses its
# in-cluster DNS name on port 8265.
NS = "ray-finetune-llm-deepspeed002"
ray_dashboard_url = f"http://ray-head-svc.{NS}.svc.cluster.local:8265"

client = JobSubmissionClient(ray_dashboard_url)

# Listing jobs doubles as a connectivity check: it fails if the dashboard
# cannot be reached.
jobs = client.list_jobs()
existing_job_count = len(jobs)

print(f"‚úÖ Connected to Ray: {ray_dashboard_url}")
print(f"   Existing jobs: {existing_job_count}")

‚úÖ Connected to Ray: http://ray-head-svc.ray-finetune-llm-deepspeed002.svc.cluster.local:8265
   Existing jobs: 56


In [53]:
print("Submitting training job...")
print("=" * 70)

# Numeric hyperparameters are forwarded to the job as strings.
hyperparameter_env = {
    env_name: str(env_value)
    for env_name, env_value in (
        ("MAX_STEPS", MAX_STEPS),
        ("NUM_TRAIN_EPOCHS", NUM_TRAIN_EPOCHS),
        ("TRAIN_BATCH_SIZE", TRAIN_BATCH_SIZE),
        ("EVAL_BATCH_SIZE", EVAL_BATCH_SIZE),
        ("MAX_SEQ_LENGTH", MAX_SEQ_LENGTH),
        ("GRAD_ACCUM_STEPS", GRAD_ACCUM_STEPS),
        ("LEARNING_RATE", LEARNING_RATE),
        ("WARMUP_RATIO", WARMUP_RATIO),
        ("SAVE_STEPS", SAVE_STEPS),
        ("EVAL_STEPS", EVAL_STEPS),
        ("SAVE_TOTAL_LIMIT", SAVE_TOTAL_LIMIT),
    )
}

# Precision switches are encoded as "1" (on) / "0" (off).
precision_env = {
    "USE_BF16": "1" if USE_BF16 else "0",
    "USE_FP16": "1" if USE_FP16 else "0",
}

# Cache locations live under the shared storage path.
cache_root = f"{STORAGE_PATH}/.cache"

# Runtime environment: the current directory is uploaded as the job's working
# dir (minus the excluded patterns) and requirements.txt is used for pip.
runtime_env = {
    "working_dir": "./",
    "pip": "requirements.txt",
    "excludes": ["/docs/", "*.ipynb", "*.md", ".git/"],
    "env_vars": {
        # Model (the same token is exposed under both variable names)
        "LLM_MODEL_ID": LLM_MODEL_ID,
        "HF_TOKEN": HF_TOKEN,
        "HUGGING_FACE_HUB_TOKEN": HF_TOKEN,
        # Dataset
        "DATASET_TYPE": DATASET_TYPE,
        "TRAIN_JSONL": TRAIN_JSONL,
        "EVAL_JSONL": EVAL_JSONL,
        # Training
        "OUTPUT_DIR": OUTPUT_DIR,
        **hyperparameter_env,
        **precision_env,
        # Cache
        "HF_HOME": cache_root,
        "TRANSFORMERS_CACHE": f"{cache_root}/transformers",
    },
}

# The job simply runs the training script from the uploaded working dir.
entrypoint = "python ray_finetune_simple.py"

try:
    submission_id = client.submit_job(entrypoint=entrypoint, runtime_env=runtime_env)

    print(f"‚úÖ Job submitted successfully!")
    print(f"\nüìã Job ID: {submission_id}")
    print(f"‚è±Ô∏è  Status: {client.get_job_status(submission_id)}")
    print(f"\nüí° Monitor in next cell")

except Exception as e:
    # Report the failure, then re-raise so the cell is marked as failed.
    print(f"‚ùå Submission failed: {e}")
    raise

print("=" * 70)

2025-12-13 15:25:46,932	INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_597d216c3cbfce9c.zip.
2025-12-13 15:25:46,933	INFO packaging.py:576 -- Creating a file package for local module './'.


Submitting training job...
‚úÖ Job submitted successfully!

üìã Job ID: raysubmit_gTc7afNCS7x6xzys
‚è±Ô∏è  Status: PENDING

üí° Monitor in next cell


In [50]:
import time

print("Monitoring training progress...")
print("=" * 70)

check_interval = 30  # seconds between polls
max_checks = 60      # 60 polls x 30 s = 30 minutes max

# Only log lines containing one of these (case-insensitive) are echoed.
keywords = ['step', 'epoch', 'loss', 'loading', 'error', 'training',
            'completed', 'saving', 'eval', 'config']
terminal_states = ["SUCCEEDED", "FAILED", "STOPPED"]


def _emit_new_log_lines(logs, offset, flush=False):
    """Print unseen log lines that mention a keyword; return the new offset.

    Progress is tracked as a character offset into ``logs`` rather than a line
    count. Counting the pieces of ``logs.split('\\n')`` also counts the empty
    string after a trailing newline, which made the first new line of every
    later poll get skipped.

    A trailing line without its newline is held back until it is complete,
    unless ``flush`` is true (used once the job has finished).
    """
    unseen = logs[offset:]
    if not flush:
        # rfind() returns -1 when there is no newline, giving an empty slice.
        unseen = unseen[:unseen.rfind('\n') + 1]
    for line in unseen.split('\n'):
        if any(kw in line.lower() for kw in keywords):
            print(line)
    return offset + len(unseen)


log_offset = 0  # number of log characters already processed

for i in range(max_checks):
    # Read the status BEFORE the logs so that, once the status is terminal,
    # the log text fetched afterwards includes the job's last lines.
    status = client.get_job_status(submission_id)
    logs = client.get_job_logs(submission_id) or ""
    job_finished = status in terminal_states

    log_offset = _emit_new_log_lines(logs, log_offset, flush=job_finished)

    if job_finished:
        print("\n" + "=" * 70)
        if status == "SUCCEEDED":
            print("üéâ Training completed successfully!")
            print(f"\nüìÅ Model saved to: {OUTPUT_DIR}")
        elif status == "FAILED":
            print("‚ùå Training failed!")
            print("\nüìã Last 1000 chars of logs:")
            print(logs[-1000:])
        else:
            print(f"‚èπÔ∏è  Job stopped")
        print("=" * 70)
        break

    # Periodic update (every 10th poll)
    if i % 10 == 0:
        elapsed = i * check_interval
        print(f"\n[{elapsed}s] Status: {status}")

    time.sleep(check_interval)
else:
    # Reached only when the loop ran out of polls without hitting `break`.
    print("\n‚ö†Ô∏è  Monitoring timeout - job may still be running")
    print(f"   Current status: {client.get_job_status(submission_id)}")

Monitoring training progress...
[TRAIN] Starting training script
[TRAIN] Config:
[TRAIN]   Max steps: 30
[TRAIN] Loading GSM8K dataset...
[TRAIN] Dataset loaded: 7473 train, 1319 eval
[TRAIN] Loading model: Qwen/Qwen2.5-0.5B
[TRAIN] Training arguments configured
[TRAIN] Starting training...
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
{'loss': 1.9064, 'grad_norm': 5.1875, 'learning_rate': 1.4482758620689657e-05, 'epoch': 0.01}
{'loss': 0.1662, 'grad_norm': 3.734375, 'learning_rate': 7.586206896551724e-06, 'epoch': 0.02}
{'loss': 0.1525, 'grad_norm': 4.71875, 'learning_rate': 6.896551724137931e-07, 'epoch': 0.03}
[A{'eval_loss': 0.46082746982574463, 'eval_runtime': 31.644, 'eval_samples_per_second': 41.682, 'eval_steps_per_second': 20.857, 'epoch': 0.03}
{'train_ru