# 06E module: works for Qwen and Llama models (requires a Hugging Face login). This notebook performs LoRA fine-tuning.

# Set your HF_TOKEN in this notebook before running it.

In [None]:
import subprocess, json

# OpenShift namespace that is queried for the RayCluster below.
NS = "ray-finetune-llm-deepspeed002"


def run(cmd):
    """Execute *cmd* (an argv list) and return its stdout, whitespace-trimmed.

    Any exception raised by ``subprocess.check_output`` (for example on a
    non-zero exit status) propagates to the caller.
    """
    stdout = subprocess.check_output(cmd, text=True)
    return stdout.strip()

# Print a short health report for the notebook's OpenShift / Ray environment.
# Every probe is best-effort: a failing `oc` call is reported inline instead
# of aborting the cell, so the remaining checks still run.
print("=" * 70)
print("ENVIRONMENT VERIFICATION")
print("=" * 70)

print("\n1. OpenShift User:")
try:
    print(f"   {run(['oc', 'whoami'])}")
except Exception as e:
    # A missing `oc` binary or an expired login must not hide the cluster check.
    print(f"   ‚ùå Error: {e}")

print("\n2. RayCluster Status:")
try:
    data = json.loads(run(["oc", "get", "raycluster", "ray", "-n", NS, "-o", "json"]))
    status = data.get("status", {})
    state = status.get("state", "unknown")
    workers = status.get("availableWorkerReplicas", 0)
    print(f"   State: {state}")
    print(f"   Workers: {workers}")

    # Look for a GPU toleration on the head pod; `or []` covers a key that is
    # present but null in the returned JSON.
    head_spec = data["spec"]["headGroupSpec"]["template"]["spec"]
    head_tols = head_spec.get("tolerations") or []
    has_gpu = any(t.get("key") == "nvidia.com/gpu" for t in head_tols)
    print(f"   GPU toleration: {has_gpu}")

    if state != "ready":
        print("   ‚ö†Ô∏è  Cluster not ready yet")
except Exception as e:
    print(f"   ‚ùå Error: {e}")

print("\n" + "=" * 70)
print("‚úÖ Environment check complete")
print("=" * 70)

In [None]:
import subprocess
from codeflare_sdk import TokenAuthentication

# Reuse the credentials of the current `oc` session: first the bearer token,
# then the API server URL.
token_bytes = subprocess.check_output(["oc", "whoami", "-t"])
token = token_bytes.decode().strip()
server_bytes = subprocess.check_output(["oc", "whoami", "--show-server=true"])
server = server_bytes.decode().strip()

# NOTE(review): skip_tls=True presumably disables TLS certificate
# verification — confirm this is acceptable for the target cluster.
auth = TokenAuthentication(token=token, server=server, skip_tls=True)
auth.login()

print(f"‚úÖ Authenticated to: {server}")

In [None]:
# Scratch check — normally there is no need to run this cell.
# Reports whether `p` is a directory on the notebook's local filesystem and,
# if so, whether it looks like a model folder. os.path only inspects local
# paths, so it does not look inside a bucket for an s3:// URI.

import os

# Local alternative: p = "/opt/app-root/src/models/facebook/opt-125m"
p = "s3://ocpmodel"

is_dir = os.path.isdir(p)
print("dir exists:", is_dir)
if is_dir:
    config_file = os.path.join(p, "config.json")
    print("has config.json:", os.path.exists(config_file))
    listing = sorted(os.listdir(p))
    print("files:", listing[:30])


In [None]:
import sys

# Install pinned data-stack versions with the interpreter that backs this
# kernel (hence `sys.executable`). `!` is IPython shell-escape syntax, so this
# cell only runs inside a notebook.
# NOTE(review): the Ray job's runtime_env later in this notebook pins
# datasets==2.19.0 while this cell installs 2.18.0 — confirm the mismatch is
# intentional.
!{sys.executable} -m pip install --upgrade --quiet --no-cache-dir \
    "numpy==1.26.4" \
    "pyarrow==15.0.2" \
    "datasets==2.18.0"

# NOTE(review): printed unconditionally; presumably a failed pip run does not
# stop the cell — confirm before trusting this message.
print("‚úÖ Dependencies installed")

In [None]:
import os

# =============================================================================
# MODEL CONFIGURATION
# =============================================================================

# Bucket URI forwarded to the training job as MODEL_BUCKET_URI.
MODEL_BUCKET_URI = "s3://ocpmodel"

# Base model to fine-tune. Alternatives previously used with this notebook:
#   Local uploaded copies (no download needed):
#     "/opt/app-root/src/models/llama-3.2-1b-instruct"
#     "/opt/app-root/src/models/facebook/opt-125m"
#   Bucket copy:
#     "s3://ocpmodel/Qwen2.5-0.5B"
#   Hugging Face hub IDs (require network access; gated models need a token):
#     "facebook/opt-125m", "gpt2", "Qwen/Qwen2.5-0.5B",
#     "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "meta-llama/Llama-3.2-1B-Instruct"
LLM_MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"

# Hugging Face token (only needed for gated models like Llama). Taken from the
# HF_TOKEN environment variable; otherwise the "YOURTOKEN" placeholder is used
# and must be replaced by hand.
HF_TOKEN = os.getenv("HF_TOKEN", "YOURTOKEN")

# =============================================================================
# DATASET CONFIGURATION
# =============================================================================

# Two dataset modes are supported:
#   "jsonl" - custom JSONL files, paths relative to the working_dir, e.g.
#             TRAIN_JSONL = EVAL_JSONL =
#                 "artifacts/datasets/commscom_llama_prompts.jsonl"
#   "gsm8k" - GSM8K demo dataset (math problems); the JSONL paths are unused.
DATASET_TYPE = "gsm8k"
TRAIN_JSONL, EVAL_JSONL = "", ""

# =============================================================================
# TRAINING CONFIGURATION
# =============================================================================

# Where the fine-tuned model is written. A local alternative used earlier was
# "/opt/app-root/src/models/llama-finetuned".
OUTPUT_DIR = "s3://ocpmodel/outputdir"

# Base path for the Hugging Face caches handed to the job. Earlier values were
# "/opt/app-root/src" and "s3://ocpmodel".
STORAGE_PATH = "/opt/app-root/src/ray_results"

# Training hyperparameters
MAX_STEPS = 30            # Small for demo; set to 0 to use NUM_TRAIN_EPOCHS
NUM_TRAIN_EPOCHS = 1
TRAIN_BATCH_SIZE = 2      # Can use 2 for 1B models, 1 for 8B
EVAL_BATCH_SIZE = 2
MAX_SEQ_LENGTH = 512
GRAD_ACCUM_STEPS = 4
LEARNING_RATE = 2e-5
WARMUP_RATIO = 0.03
SAVE_STEPS = 30
EVAL_STEPS = 30
SAVE_TOTAL_LIMIT = 2
NUM_DEVICES = 7

# Precision (use BF16 for modern GPUs like L4, A100)
USE_BF16, USE_FP16 = True, False

# =============================================================================
# DISPLAY CONFIGURATION
# =============================================================================
# Echo the effective settings so a misconfiguration is visible before a job is
# submitted.

# Classify where the model comes from. A bucket URI, a filesystem path and a
# Hugging Face hub ID are three different sources; the earlier two-way check
# reported bucket URIs as "Local" and real local paths as "HuggingFace".
if LLM_MODEL_ID.startswith("s3://"):
    model_source = "S3"
elif os.path.isabs(LLM_MODEL_ID) or LLM_MODEL_ID.startswith(("./", "../")):
    model_source = "Local"
else:
    model_source = "HuggingFace"

print("=" * 70)
print("TRAINING CONFIGURATION")
print("=" * 70)
print(f"\nüì¶ Model:")
print(f"   ID: {LLM_MODEL_ID}")
print(f"   Type: {model_source}")

print(f"\nüìä Dataset:")
print(f"   Type: {DATASET_TYPE}")
if DATASET_TYPE == "jsonl":
    print(f"   Train: {TRAIN_JSONL}")
    print(f"   Eval: {EVAL_JSONL}")

    # Verify the training file exists under the project checkout.
    full_path = os.path.join("/opt/app-root/src/MLforEng", TRAIN_JSONL)
    if os.path.exists(full_path):
        with open(full_path, encoding="utf-8") as f:
            # Blank lines are not examples, so leave them out of the count.
            num_lines = sum(1 for line in f if line.strip())
        print(f"   ‚úÖ File found: {num_lines} examples")
    else:
        print(f"   ‚ùå File not found: {full_path}")
else:
    print(f"   Using GSM8K (will download automatically)")

# BF16 takes precedence over FP16; FP32 is reported when neither flag is set.
if USE_BF16:
    precision = "BF16"
elif USE_FP16:
    precision = "FP16"
else:
    precision = "FP32"

print(f"\n‚öôÔ∏è  Training:")
print(f"   Max steps: {MAX_STEPS}")
print(f"   Epochs: {NUM_TRAIN_EPOCHS}")
print(f"   Batch size: {TRAIN_BATCH_SIZE}")
print(f"   Max sequence: {MAX_SEQ_LENGTH}")
print(f"   Learning rate: {LEARNING_RATE}")
print(f"   Precision: {precision}")

print(f"\nüíæ Output:")
print(f"   {OUTPUT_DIR}")

print("\n" + "=" * 70)

In [None]:
import os
import json

def diagnose_jsonl_file(file_path):
    """
    Check that a JSONL file exists, is readable, and contains valid JSON.

    The file is read as UTF-8 and a leading byte-order mark (BOM) is accepted,
    so a BOM-prefixed file is not rejected on its first line. Progress and
    problems are reported with ``print``.

    Args:
        file_path: Path of the JSONL file to inspect.

    Returns:
        True when every non-blank line parses as JSON and at least one such
        line exists. False when the file is missing, unreadable, empty, not
        UTF-8, or contains an invalid line (validation stops at the first one).
    """
    print(f"üîç Diagnosing file: {file_path}")

    # 1. Check existence and permissions
    if not os.path.exists(file_path):
        print("‚ùå ERROR: File does not exist at the specified path.")
        return False
    print("‚úÖ File exists.")

    if not os.access(file_path, os.R_OK):
        print("‚ùå ERROR: File exists but is not readable (check permissions).")
        return False
    print("‚úÖ File is readable.")

    # 2. Check file size
    file_size = os.path.getsize(file_path)
    print(f"üìè File size: {file_size} bytes")
    if file_size == 0:
        print("‚ùå ERROR: File is empty.")
        return False

    # 3. Validate JSONL format line by line
    print("üß™ Validating JSONL format...")
    valid_lines = 0
    total_lines = 0

    try:
        # 'utf-8-sig' decodes plain UTF-8 unchanged and drops a leading BOM,
        # which json.loads would otherwise reject on the first line.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            for i, line in enumerate(f, 1):
                total_lines += 1
                line = line.strip()
                if not line:  # Blank lines are counted but not validated
                    continue

                try:
                    json.loads(line)
                    valid_lines += 1
                except json.JSONDecodeError as e:
                    print(f"   ‚ùå Line {i}: JSON decode error - {e}")
                    print(f"      Problematic line content: '{line[:100]}...'")
                    return False

    except UnicodeDecodeError:
        print("‚ùå ERROR: File is not UTF-8 encoded. Try different encoding.")
        return False
    except Exception as e:
        print(f"‚ùå ERROR: Could not read file - {e}")
        return False

    # 4. Summary
    print(f"üìä File Summary: {valid_lines} valid JSON objects out of {total_lines} lines.")

    if valid_lines > 0:
        print("‚úÖ File appears to be a valid JSONL.")
        return True

    print("‚ùå File contains no valid JSON objects.")
    return False

# Validate the custom prompt dataset; when validation fails, list
# troubleshooting hints.
file_path = "/opt/app-root/src/MLforEng/artifacts/datasets/commscom_llama_prompts.jsonl"
is_valid = diagnose_jsonl_file(file_path)

if not is_valid:
    print("\nüí° Next steps:")
    for hint in (
        "1. Check the specific line errors above",
        "2. Open the file and verify it's pure JSONL (one JSON per line)",
        "3. Ensure no BOM (Byte Order Mark) at file start",
        "4. Try `encoding='utf-8-sig'` if you suspect BOM issues",
    ):
        print(hint)

In [None]:
from ray.job_submission import JobSubmissionClient

# Namespace of the RayCluster; the head service is addressed through its
# in-cluster DNS name on port 8265.
NS = "ray-finetune-llm-deepspeed002"
ray_dashboard_url = "http://ray-head-svc.{}.svc.cluster.local:8265".format(NS)

client = JobSubmissionClient(ray_dashboard_url)

# Listing the jobs doubles as a connectivity check before reporting success.
jobs = client.list_jobs()
existing_job_count = len(jobs)
print(f"‚úÖ Connected to Ray: {ray_dashboard_url}")
print(f"   Existing jobs: {existing_job_count}")

In [None]:
import os
import json

MLFORENG_ROOT = "/opt/app-root/src/MLforEng"

# Verify the DeepSpeed config exists inside the MLforEng checkout; when it is
# missing, write a default ZeRO-3 (CPU offload) config to that same path.
ds_config_path = f"{MLFORENG_ROOT}/mlforeng/llm_finetune/deepspeed_configs/zero_3_offload_optim_param.json"

if os.path.exists(ds_config_path):
    with open(ds_config_path, 'r') as f:
        ds_config = json.load(f)

    print(f"‚úÖ DeepSpeed config found: {ds_config_path}")
    print(f"   ZeRO stage: {ds_config.get('zero_optimization', {}).get('stage', 'N/A')}")
else:
    print(f"‚ùå DeepSpeed config NOT found: {ds_config_path}")
    print("\nCreating default DeepSpeed ZeRO-3 config...")

    # Create the directory the file is actually written to. Creating
    # "./deepspeed_configs" instead left the target directory missing, so the
    # open() below could fail with FileNotFoundError.
    os.makedirs(os.path.dirname(ds_config_path), exist_ok=True)

    # NOTE(review): this default enables fp16 while the configuration cell sets
    # USE_BF16 = True — confirm which precision the training script expects.
    default_config = {
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
        "gradient_accumulation_steps": "auto",
        "gradient_clipping": 1.0,
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {
                "device": "cpu",
                "pin_memory": True
            },
            "offload_param": {
                "device": "cpu",
                "pin_memory": True
            },
            "overlap_comm": True,
            "contiguous_gradients": True,
            "sub_group_size": 1e9,
            "reduce_bucket_size": 5e8,
            "stage3_prefetch_bucket_size": 5e8,
            "stage3_param_persistence_threshold": 1e6,
            "stage3_max_live_parameters": 1e9,
            "stage3_max_reuse_distance": 1e9,
            "stage3_gather_16bit_weights_on_model_save": True
        },
        "fp16": {
            "enabled": True,
            "loss_scale": 0,
            "loss_scale_window": 1000,
            "initial_scale_power": 16,
            "hysteresis": 2,
            "min_loss_scale": 1
        }
    }

    with open(ds_config_path, 'w') as f:
        json.dump(default_config, f, indent=2)

    # Keep `ds_config` defined on both branches.
    ds_config = default_config

    print(f"‚úÖ Created: {ds_config_path}")

In [None]:
# base64 encoding:
import base64
import json

# Turn the DeepSpeed config file into a base64 string. If anything goes wrong
# while reading or encoding it, encode a minimal built-in ZeRO-3 config instead.
ds_config_path = "/opt/app-root/src/MLforEng/mlforeng/llm_finetune/deepspeed_configs/zero_3_offload_optim_param.json"
try:
    with open(ds_config_path, 'r') as f:
        ds_config_content = f.read()
    ds_config_b64 = str(base64.b64encode(ds_config_content.encode()), "ascii")
    print(f"‚úÖ DeepSpeed config encoded to base64 ({len(ds_config_b64)} chars)")
except Exception as read_err:
    print(f"‚ùå Failed to read config file: {read_err}")
    # Fallback: ZeRO stage 3 with optimizer and parameter offload to CPU.
    default_config = {
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {
                "device": "cpu",
                "pin_memory": True,
            },
            "offload_param": {
                "device": "cpu",
                "pin_memory": True,
            },
            "overlap_comm": True,
            "contiguous_gradients": True,
        },
        "fp16": {
            "enabled": True,
        },
    }
    config_json = json.dumps(default_config)
    ds_config_b64 = str(base64.b64encode(config_json.encode()), "ascii")

In [None]:
# For S3: point Ray's default storage path at the bucket. The same variable
# can alternatively be passed to the job through the runtime_env env_vars.
# NOTE(review): this sets the variable only in the notebook kernel's process;
# presumably the submitted job sees it only if it is also in runtime_env — confirm.
os.environ["RAY_AIR_DEFAULT_STORAGE"] = "s3://ocpmodel/ray-results"

In [None]:
# Submit the LoRA fine-tuning job to the Ray cluster.
# All configuration travels as environment variables inside the runtime_env;
# the entrypoint itself only carries the --lora flag.
import os

print("Submitting training job...")
print("=" * 70)

# Build runtime environment
runtime_env = {
    # Package the mlforeng module and use the project checkout as working dir.
    "py_modules": ["/opt/app-root/src/MLforEng/mlforeng"],
    "working_dir": "/opt/app-root/src/MLforEng",
    # Packages installed into the job environment. Some are pinned exactly,
    # others only have a lower bound. ("torch>=2.6.0" is deliberately left out.)
    "pip": [
        "deepspeed>=0.9.0",
        "bitsandbytes>=0.39.0",
        "scipy",
        "scikit-learn",
        "accelerate==0.31.0",
        "awscliv2==2.3.0",
        "datasets==2.19.0",
        "peft==0.11.1",
        "transformers==4.44.0",
        "sentencepiece",
    ],
    "env_vars": {
        # Model config
        "LLM_MODEL_ID": LLM_MODEL_ID,  # e.g., "Qwen/Qwen2.5-0.5B"
        "HF_TOKEN": HF_TOKEN,
        "MODEL_BUCKET_URI": MODEL_BUCKET_URI,

        # Dataset config; the JSONL paths are only forwarded in "jsonl" mode.
        "DATASET_TYPE": DATASET_TYPE,   # e.g., "gsm8k" or "jsonl"
        "TRAIN_JSONL": TRAIN_JSONL if DATASET_TYPE == "jsonl" else "",
        "EVAL_JSONL": EVAL_JSONL if DATASET_TYPE == "jsonl" else "",

        # S3 credentials: prefer the notebook's environment so real keys need
        # not be typed into the notebook; the placeholders remain the fallback.
        "AWS_ACCESS_KEY_ID": os.environ.get("AWS_ACCESS_KEY_ID", "<ACCESSKEY>"),
        "AWS_SECRET_ACCESS_KEY": os.environ.get("AWS_SECRET_ACCESS_KEY", "<SECRET KEY>"),
        "AWS_DEFAULT_REGION": os.environ.get("AWS_DEFAULT_REGION", "us-east-2"),

        # Training config. MAX_STEPS and USE_BF16 now follow the configuration
        # cell instead of being hard-coded to "30" and "1".
        "OUTPUT_DIR": OUTPUT_DIR,
        "MAX_STEPS": str(MAX_STEPS),
        "NUM_TRAIN_EPOCHS": str(NUM_TRAIN_EPOCHS),
        "TRAIN_BATCH_SIZE": str(TRAIN_BATCH_SIZE),
        "EVAL_BATCH_SIZE": str(EVAL_BATCH_SIZE),
        # NOTE(review): "0" assumes the training script treats it as false —
        # confirm how USE_BF16 is parsed on the job side.
        "USE_BF16": "1" if USE_BF16 else "0",

        # DeepSpeed config, base64-encoded, sent under both variable names.
        # (An earlier variant passed the file name
        # "zero_3_offload_optim_param.json" as DEEPSPEED_CONFIG.)
        "DEEPSPEED_CONFIG": ds_config_b64,
        "DEEPSPEED_CONFIG_BASE64": ds_config_b64,

        # Cache paths
        "HF_HOME": f"{STORAGE_PATH}/.cache",
        "TRANSFORMERS_CACHE": f"{STORAGE_PATH}/.cache/transformers",
    },
}

# Simple entrypoint: everything except --lora is configured through the
# environment variables above.
entrypoint = "python -m mlforeng.llm_finetune.ray_finetune_llm_deepspeed --lora"

try:
    submission_id = client.submit_job(
        entrypoint=entrypoint,
        runtime_env=runtime_env
    )
    print(f"‚úÖ Job submitted successfully!")
    print(f"\nüìã Job ID: {submission_id}")

except Exception as e:
    # Report the failure, then re-raise so the cell is marked as failed.
    print(f"‚ùå Submission failed: {e}")
    raise


# --- DEBUG: show what this notebook THINKS it's using ---
# Client-side view only: nothing below runs inside the Ray job.

import sys, os, importlib, subprocess, textwrap
import json

print("\n=== RAY JOB RUNTIME_ENV (client-side view) ===")
try:
    print("RAY jobs client address:", getattr(client, "address", "<not available>"))
except NameError:
    print("RAY jobs client is not defined in this notebook cell.")

print("RAY_ADDRESS used for submission:", ray_dashboard_url)
print("working_dir:", runtime_env.get("working_dir"))
print("py_modules:", runtime_env.get("py_modules"))
print("pip packages passed into runtime_env:")
for p in runtime_env.get("pip", []):
    print("  -", p)

# Mask credentials before printing: notebook output is saved with the notebook,
# so tokens and access keys must not appear in clear text.
_SENSITIVE_MARKERS = ("TOKEN", "SECRET", "KEY", "PASSWORD")
_redacted_env_vars = {
    name: (
        "***REDACTED***"
        if value and any(marker in name.upper() for marker in _SENSITIVE_MARKERS)
        else value
    )
    for name, value in runtime_env.get("env_vars", {}).items()
}

print("\nenv_vars passed into runtime_env:")
print(json.dumps(_redacted_env_vars, indent=2))

print("\n=== LOCAL NOTEBOOK PYTHON & PATHS (not inside Ray job) ===")
print("Notebook sys.executable:", sys.executable)
print("Notebook CWD:", os.getcwd())

# Where is the 'ray' CLI coming from?
try:
    ray_path = subprocess.run(
        ["which", "ray"], capture_output=True, text=True, check=False
    ).stdout.strip()
    print("`ray` CLI path:", ray_path or "<which ray returned nothing>")
except Exception as e:
    print("Could not run `which ray`:", e)

# Where is the training module located (as seen from notebook)?
try:
    import mlforeng.llm_finetune.ray_finetune_llm_deepspeed as rfl

    print("Training module file:", os.path.abspath(rfl.__file__))
except Exception as e:
    print("Could not import mlforeng.llm_finetune.ray_finetune_llm_deepspeed:", e)

print("\n=== KEY LIBRARY VERSIONS IN NOTEBOOK ENV ===")
for name in ["torch", "transformers", "deepspeed", "accelerate", "datasets", "peft"]:
    try:
        m = importlib.import_module(name)
        version = getattr(m, "__version__", "unknown")
        mod_file = getattr(m, "__file__", "<no __file__>")
        print(f"  {name}: {version} ({mod_file})")
    except Exception as e:
        print(f"  {name}: NOT INSTALLED ({e})")

# requirements.txt sanity check (as seen from notebook)
req_path = os.path.join("/opt/app-root/src/MLforEng", "requirements.txt")
print("\nrequirements.txt exists:", os.path.exists(req_path), "->", req_path)
if os.path.exists(req_path):
    print("First 20 lines of requirements.txt:")
    try:
        with open(req_path) as f:
            for i, line in enumerate(f):
                if i >= 20:
                    break
                print("   ", line.rstrip())
    except Exception as e:
        print("Could not read requirements.txt:", e)
