In [3]:
# Standard library imports
import logging
import os
import sys
import time
from io import StringIO

In [None]:

from kubernetes import client as k8s, config as k8s_config
# Connection settings are read from the environment; edit to match your specific settings
api_server = os.getenv("OPENSHIFT_API_URL")
token = os.getenv("NOTEBOOK_USER_TOKEN")
PVC_NAME = os.getenv("SHARED_PVC_NAME", "shared")

# Fail fast: without these the client would be built with host=None and a
# literal "Bearer None" header, which only surfaces later as an obscure API error.
_missing_settings = [
    env_name
    for env_name, env_value in (
        ("OPENSHIFT_API_URL", api_server),
        ("NOTEBOOK_USER_TOKEN", token),
    )
    if not env_value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

configuration = k8s.Configuration()
configuration.host = api_server
# WARNING: TLS certificate verification is DISABLED here so that clusters whose API
# server uses a self-signed certificate or an un-trusted CA still work. The bearer
# token is sent over this connection; delete this line (or set it to True) when the
# API server certificate is signed by a trusted CA.
configuration.verify_ssl = False
configuration.api_key = {"authorization": f"Bearer {token}"}
api_client = k8s.ApiClient(configuration)

# Path at which the shared PVC is mounted
PVC_MOUNT_PATH = "/opt/app-root/src"

In [None]:
import os
import gzip
import shutil
import socket
import time

import boto3
from botocore.config import Config as BotoConfig
from botocore.exceptions import ClientError

# --- Global networking safety net: every socket operation gives up after 10 seconds ---
socket.setdefaulttimeout(10)

# Notebook's PVC mount path (per Notebook CR). Training pods will mount the same PVC
# at /opt/app-root/src, so anything written below is visible to them as well.
PVC_NOTEBOOK_PATH = "/opt/app-root/src"
DATASET_ROOT_NOTEBOOK = PVC_NOTEBOOK_PATH
TABLE_GPT_DIR = os.path.join(DATASET_ROOT_NOTEBOOK, "table-gpt-data", "train")
MODEL_DIR = os.path.join(DATASET_ROOT_NOTEBOOK, "Qwen", "Qwen2.5-1.5B-Instruct")

# Make sure both target directories exist before anything is downloaded into them
for _required_dir in (TABLE_GPT_DIR, MODEL_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# S3/MinIO settings from the environment; each falls back to "" when unset.
# s3_prefix is the key prefix inside the bucket, e.g. "osft-data".
s3_endpoint, s3_access_key, s3_secret_key, s3_bucket, s3_prefix = (
    os.getenv(_env_name, "")
    for _env_name in (
        "AWS_DEFAULT_ENDPOINT",
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_STORAGE_BUCKET",
        "AWS_STORAGE_BUCKET_DATA_DIR",
    )
)

def stream_download(s3, bucket, key, dst):
    """
    Download an object from S3/MinIO using get_object and streaming reads.

    The payload is streamed in 1MB chunks into ``<dst>.part`` and only renamed
    to ``dst`` once every chunk has been written. A failed or timed-out
    transfer therefore never leaves a truncated file at ``dst`` (callers that
    skip downloads when ``dst`` already exists would otherwise treat such a
    file as complete). The response body is always closed.

    Args:
        s3: client object exposing ``get_object(Bucket=..., Key=...)``.
        bucket: bucket name.
        key: object key inside the bucket.
        dst: local destination file path; its parent directory must exist.

    Returns:
        True on success, False on any error (errors are printed, not raised).
    """
    print(f"[notebook] STREAM download s3://{bucket}/{key} -> {dst}")
    t0 = time.time()

    try:
        resp = s3.get_object(Bucket=bucket, Key=key)
    except ClientError as e:
        err = e.response.get("Error", {})
        print(f"[notebook] CLIENT ERROR (get_object) for {key}: {err}")
        return False
    except Exception as e:
        print(f"[notebook] OTHER ERROR (get_object) for {key}: {e}")
        return False

    body = resp["Body"]
    tmp_dst = f"{dst}.part"
    completed = False
    try:
        with open(tmp_dst, "wb") as f:
            while True:
                chunk = body.read(1024 * 1024)  # 1MB per chunk
                if not chunk:
                    break
                f.write(chunk)
        # Atomic publish: dst only ever appears fully written
        os.replace(tmp_dst, dst)
        completed = True
    except socket.timeout as e:
        print(f"[notebook] socket.timeout while reading {key}: {e}")
    except Exception as e:
        print(f"[notebook] ERROR writing to {dst} for {key}: {e}")
    finally:
        # Release the underlying connection whether or not the transfer succeeded
        try:
            body.close()
        except Exception as e:
            print(f"[notebook] WARNING: failed to close response body for {key}: {e}")
        if not completed:
            # Drop the partial download so a later run starts from scratch
            try:
                os.remove(tmp_dst)
            except FileNotFoundError:
                pass
            except OSError as e:
                print(f"[notebook] WARNING: could not remove partial file {tmp_dst}: {e}")

    if not completed:
        return False

    t1 = time.time()
    print(f"[notebook] DONE  stream {key} in {t1 - t0:.2f}s")
    return True


# Populate the shared PVC with the dataset/model files.
# - When an S3/MinIO endpoint and bucket are configured, every object under the prefix is
#   mirrored locally (dataset files -> TABLE_GPT_DIR, model files -> MODEL_DIR).
# - Otherwise a 100-sample subset of Table-GPT is pulled from HuggingFace.
# The S3 branch is idempotent: files that already exist locally (including the
# decompressed form of a .gz object) are not downloaded again.
if s3_endpoint and s3_bucket:
    try:
        # Normalize endpoint URL: a bare host[:port] is assumed to be served over HTTPS
        endpoint_url = (
            s3_endpoint
            if s3_endpoint.startswith("http")
            else f"https://{s3_endpoint}"
        )
        prefix = (s3_prefix or "").strip("/")

        print(
            f"S3 configured (boto3, notebook): "
            f"endpoint={endpoint_url}, bucket={s3_bucket}, prefix={prefix or '<root>'}"
        )

        # Boto config: single attempt, reasonable connect/read timeouts
        boto_cfg = BotoConfig(
            signature_version="s3v4",
            s3={"addressing_style": "path"},
            retries={"max_attempts": 1, "mode": "standard"},
            connect_timeout=5,
            read_timeout=10,
        )

        # Create S3/MinIO client.
        # NOTE(security): verify=False disables TLS certificate validation for the
        # object store; switch to a CA bundle path if the endpoint has a trusted cert.
        s3 = boto3.client(
            "s3",
            endpoint_url=endpoint_url,
            aws_access_key_id=s3_access_key,
            aws_secret_access_key=s3_secret_key,
            config=boto_cfg,
            verify=False,
        )

        # List and download all objects under the prefix
        paginator = s3.get_paginator("list_objects_v2")
        pulled_any = False
        file_count = 0

        print(f"[notebook] Starting S3 download from prefix: {prefix}")
        for page in paginator.paginate(Bucket=s3_bucket, Prefix=prefix or ""):
            contents = page.get("Contents", [])
            if not contents:
                print("[notebook] No contents found in this page")
                continue

            print(f"[notebook] Found {len(contents)} objects in this page")

            for obj in contents:
                key = obj["Key"]

                # Skip "directory markers" (they are not files, so they are not counted)
                if key.endswith("/"):
                    print(f"[notebook] Skipping directory marker: {key}")
                    continue

                file_count += 1

                # Determine relative path under prefix for local storage
                rel = key[len(prefix):].lstrip("/") if prefix else key
                print(f"[notebook] Processing key={key}, rel={rel}")

                # Route to appropriate directory based on content type
                if "table-gpt" in rel.lower() or rel.endswith(".jsonl"):
                    dst = os.path.join(TABLE_GPT_DIR, os.path.basename(rel))
                    print(f"[notebook] Routing to dataset dir: {dst}")
                elif "qwen" in rel.lower() or any(rel.endswith(ext) for ext in [".bin", ".json", ".model", ".safetensors", ".txt"]):
                    # Preserve directory structure for model files
                    dst = os.path.join(MODEL_DIR, rel.split("Qwen2.5-1.5B-Instruct/")[-1] if "Qwen2.5-1.5B-Instruct" in rel else os.path.basename(rel))
                    print(f"[notebook] Routing to model dir: {dst}")
                else:
                    # Default: use the relative path as-is
                    dst = os.path.join(DATASET_ROOT_NOTEBOOK, rel)
                    print(f"[notebook] Routing to default dir: {dst}")

                # For .gz objects the file that matters locally is the decompressed one.
                # The archive is deleted after extraction, so checking only for the
                # archive would re-download it on every run.
                is_gz = dst.endswith(".gz")
                out_path = os.path.splitext(dst)[0] if is_gz else None
                if is_gz and os.path.exists(out_path):
                    print(f"[notebook] Skipping {key}: decompressed file already exists at {out_path}")
                    pulled_any = True
                    continue

                os.makedirs(os.path.dirname(dst), exist_ok=True)

                # Download only if missing
                if not os.path.exists(dst):
                    ok = stream_download(s3, s3_bucket, key, dst)
                    if not ok:
                        print(f"[notebook] Download failed for {key}")
                        continue
                    pulled_any = True
                else:
                    print(f"[notebook] Skipping existing file {dst}")
                    pulled_any = True

                # If the file is .gz, decompress and remove the .gz
                if is_gz and os.path.exists(dst):
                    print(f"[notebook] Decompressing {dst} -> {out_path}")
                    # Decompress into a temporary name so an interrupted run never
                    # leaves a truncated file that later runs would accept as complete.
                    tmp_out = f"{out_path}.part"
                    try:
                        with gzip.open(dst, "rb") as f_in, open(tmp_out, "wb") as f_out:
                            shutil.copyfileobj(f_in, f_out)
                        os.replace(tmp_out, out_path)
                    except Exception as e:
                        print(f"[notebook] Failed to decompress {dst}: {e}")
                        try:
                            os.remove(tmp_out)
                        except FileNotFoundError:
                            pass
                        except OSError as cleanup_err:
                            print(f"[notebook] Could not remove partial file {tmp_out}: {cleanup_err}")
                    else:
                        # Best-effort removal of the archive; report instead of hiding failures
                        try:
                            os.remove(dst)
                        except OSError as e:
                            print(f"[notebook] Could not remove archive {dst}: {e}")

        print(f"[notebook] S3 download complete. Processed {file_count} files, pulled_any={pulled_any}")

    except Exception as e:
        # Any S3 error is reported here; the verification step that follows decides
        # whether the notebook can continue.
        print(f"[notebook] S3 fetch failed: {e}")
        import traceback
        traceback.print_exc()
else:
    print("[notebook] S3 not configured: missing endpoint or bucket env vars")
    # Fallback to HuggingFace if S3 is not configured
    print("[notebook] Falling back to HuggingFace dataset download...")
    import json
    import random
    from datasets import load_dataset

    # Load the Table-GPT dataset
    print("Loading Table-GPT dataset...")
    dataset = load_dataset("LipengCS/Table-GPT", "All")

    # Get the training split and create a random subset of 100 samples
    train_data = dataset["train"]
    print(f"Original training set size: {len(train_data)}")

    # Create a random subset of (at most) 100 samples
    random.seed(42)  # For reproducibility
    subset_indices = random.sample(range(len(train_data)), min(100, len(train_data)))
    subset_data = train_data.select(subset_indices)

    print(f"Subset size: {len(subset_data)}")

    # Save the subset to a JSONL file (one JSON document per line)
    output_file = os.path.join(TABLE_GPT_DIR, "train_All_100.jsonl")
    with open(output_file, "w", encoding="utf-8") as f:
        for example in subset_data:
            f.write(json.dumps(example) + "\n")

    print(f"Subset saved to {output_file}")

# The dataset is mandatory: stop here if neither the S3 sync nor the fallback produced it
dataset_file = os.path.join(TABLE_GPT_DIR, "train_All_100.jsonl")
if not os.path.exists(dataset_file):
    raise RuntimeError(f"Dataset file not found: {dataset_file}")
print(f"[notebook] Dataset ready: {dataset_file}")

# The model is optional: an empty or missing directory only produces a warning
model_files = os.listdir(MODEL_DIR) if os.path.exists(MODEL_DIR) else []
if not model_files:
    print(f"[notebook] Warning: Model directory is empty or missing: {MODEL_DIR}")
    print("[notebook] Training will attempt to download from HuggingFace during execution")
else:
    print(f"[notebook] Model files ready in: {MODEL_DIR}")
    print(f"[notebook] Model files: {model_files[:5]}...")  # Show first 5 files

In [None]:
# Determine model path based on whether S3 download succeeded
import os
LOCAL_MODEL_PATH = "/opt/app-root/src/Qwen/Qwen2.5-1.5B-Instruct"
HUGGINGFACE_MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"

# The model counts as "downloaded" when the local directory exists and is non-empty.
# isdir (rather than exists) keeps os.listdir from raising if the path is a regular file.
model_downloaded = os.path.isdir(LOCAL_MODEL_PATH) and len(os.listdir(LOCAL_MODEL_PATH)) > 0

if model_downloaded:
    model_path_to_use = LOCAL_MODEL_PATH
    print(f"✓ Using local model from S3: {model_path_to_use}")
else:
    # No local copy: pass the hub ID instead of a filesystem path
    model_path_to_use = HUGGINGFACE_MODEL_ID
    print(f"✓ Using HuggingFace model ID: {model_path_to_use}")

# Arguments handed to the training function (passed as func_args when the job is submitted)
params = {
    ###########################################################################
    # 🤖 Model + Data Paths                                                   #
    ###########################################################################
    "model_path": model_path_to_use,
    "data_path": "/opt/app-root/src/table-gpt-data/train/train_All_100.jsonl",
    "ckpt_output_dir": "/opt/app-root/src/checkpoints-logs-dir",
    "data_output_path": "/opt/app-root/src/osft-json/_data",
    ###########################################################################
    # 🏋️‍♀️ Training Hyperparameters                                          #
    ###########################################################################
    # Important for OSFT
    "unfreeze_rank_ratio": 0.25,
    # Standard parameters
    "effective_batch_size": 128,
    "learning_rate": 5.0e-6,
    "num_epochs": 1,
    "lr_scheduler": "cosine",
    "warmup_steps": 0,
    "seed": 42,
    ###########################################################################
    # 🏎️ Performance Hyperparameters                                         #
    ###########################################################################
    "use_liger": True,
    "max_tokens_per_gpu": 32000,
    "max_seq_len": 2048,
    ###########################################################################
    # 💾 Checkpointing Settings                                               #
    ###########################################################################
    # Here we only want to save the very last checkpoint
    "save_final_checkpoint": True,
    "checkpoint_at_epoch": False,
    # "nproc_per_node": 2,
    # "nnodes": 2,
    # Please note that the distributed training parameters are removed because they are
    # delegated to Kubeflow Trainer
}


In [None]:
from kubeflow.trainer import TrainerClient
from kubeflow.trainer.rhai import TrainingHubAlgorithms
from kubeflow.trainer.rhai import TrainingHubTrainer
from kubeflow_trainer_api import models
from kubeflow.common.types import KubernetesBackendConfig

# Key part: hand the token-authenticated configuration of api_client to the
# trainer backend so the TrainerClient uses the same API server and credentials.
backend_cfg = KubernetesBackendConfig(client_configuration=api_client.configuration)

client = TrainerClient(backend_cfg)
print(client)

In [None]:
# Pick the first runtime named "training-hub-2node-1gpu"; abort if the cluster has none
th_runtime = next(
    (candidate for candidate in client.list_runtimes() if candidate.name == "training-hub-2node-1gpu"),
    None,
)

if th_runtime is None:
    raise RuntimeError("Required runtime 'training-hub-2node-1gpu' not found")

print("Found runtime: " + str(th_runtime))

In [None]:

from kubeflow.trainer.options.kubernetes import (
    PodTemplateOverrides,
    PodTemplateOverride,
    PodSpecOverride,
    ContainerOverride,
)

# Cache locations live under /opt/app-root/src, where the PVC is mounted below
cache_root = "/opt/app-root/src/.cache/huggingface"
triton_cache = "/opt/app-root/src/.triton"

# Environment passed to the trainer
trainer_env = {
    "HF_HOME": cache_root,
    "TRITON_CACHE_DIR": triton_cache,
    "XDG_CACHE_HOME": "/opt/app-root/src/.cache",
    "NCCL_DEBUG": "INFO",
}

# Volume backed by the PVC, plus its read-write mount inside the "node" container
work_volume = {"name": "work", "persistentVolumeClaim": {"claimName": PVC_NAME}}
work_mount = {"name": "work", "mountPath": "/opt/app-root/src", "readOnly": False}

osft_trainer = TrainingHubTrainer(
    algorithm=TrainingHubAlgorithms.OSFT,
    func_args=params,
    env=trainer_env,
)

node_pod_override = PodTemplateOverride(
    target_jobs=["node"],
    spec=PodSpecOverride(
        volumes=[work_volume],
        containers=[ContainerOverride(name="node", volume_mounts=[work_mount])],
    ),
)

# Submit the job; the returned value is the job name used by the cells that follow
job_name = client.train(
    trainer=osft_trainer,
    options=[PodTemplateOverrides(node_pod_override)],
    runtime=th_runtime,
)

In [None]:
# Wait for the running status, then wait for completion or failure
# Using reasonable timeout for OSFT training
client.wait_for_job_status(name=job_name, status={"Running"}, timeout=300)  # up to 5 minutes to start
client.wait_for_job_status(name=job_name, status={"Complete", "Failed"}, timeout=1800)  # 30 minutes for training

# Re-read the job so the check below sees its final status
job = client.get_job(name=job_name)

# Only "Complete" counts as success; any other status stops the notebook with an error
if job.status == "Failed":
    print("ERROR: Training job failed")
    raise RuntimeError(f"Training job failed with status: {job.status}")
elif job.status == "Complete":
    print("✓ Training job completed successfully")
else:
    # Unexpected status
    print(f"ERROR: Unexpected job status: {job.status}")
    raise RuntimeError(f"Training job ended with unexpected status: {job.status}")

In [None]:
# One line per step of the job: name, status, and device type x device count
finished_job = client.get_job(name=job_name)
for step in finished_job.steps:
    print(f"Step: {step.name}, Status: {step.status}, Devices: {step.device} x {step.device_count}\n")

In [None]:
# Print the job's logs as they currently stand (follow=False)
job_log_lines = client.get_job_logs(job_name, follow=False)
for log_entry in job_log_lines:
    print(log_entry)

In [None]:
# Clean up: delete the training job identified by job_name.
client.delete_job(job_name)