In [3]:
# Standard library imports
import logging
import os
import sys
import time
from io import StringIO

In [None]:

from kubernetes import client as k8s, config as k8s_config
# Cluster connection settings are read from the environment; export these
# variables (or edit the defaults) to match your cluster.
api_server = os.getenv("OPENSHIFT_API_URL")
token = os.getenv("NOTEBOOK_USER_TOKEN")
PVC_NAME = os.getenv("SHARED_PVC_NAME", "shared")

# Fail fast with a clear message: without this check an unset variable would
# silently produce `host = None` or an "authorization: Bearer None" header.
missing_env = [
    env_name
    for env_name, env_value in (
        ("OPENSHIFT_API_URL", api_server),
        ("NOTEBOOK_USER_TOKEN", token),
    )
    if not env_value
]
if missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env)
    )

configuration = k8s.Configuration()
configuration.host = api_server
# WARNING: TLS certificate verification is DISABLED by default, which is only
# appropriate when the API server uses a self-signed certificate or an
# un-trusted CA. Set OPENSHIFT_VERIFY_SSL=true to turn verification on.
configuration.verify_ssl = os.getenv("OPENSHIFT_VERIFY_SSL", "false").strip().lower() in {
    "1",
    "true",
    "yes",
}
configuration.api_key = {"authorization": f"Bearer {token}"}
api_client = k8s.ApiClient(configuration)

# Path at which the shared PVC is mounted (used for data, checkpoints and caches).
PVC_MOUNT_PATH = "/opt/app-root/src"

In [None]:
import json
import random

from datasets import load_dataset

# Load the Table-GPT dataset
print("Loading Table-GPT dataset...")
dataset = load_dataset("LipengCS/Table-GPT", "All")

# Take the training split
train_data = dataset["train"]
print(f"Original training set size: {len(train_data)}")

# Draw a random subset of at most 100 samples; the fixed seed makes the
# selection reproducible across runs.
random.seed(42)
subset_indices = random.sample(range(len(train_data)), min(100, len(train_data)))
subset_data = train_data.select(subset_indices)

print(f"Subset size: {len(subset_data)}")

# Save the subset as JSONL under the shared PVC mount. The path is absolute
# (anchored on PVC_MOUNT_PATH) so that it does not depend on the notebook's
# current working directory and matches the location the training job reads.
output_dir = os.path.join(PVC_MOUNT_PATH, "table-gpt-data", "train")
output_file = os.path.join(output_dir, "train_All_100.jsonl")

print(f"Creating directory: {output_dir}")
os.makedirs(output_dir, exist_ok=True)

# One JSON object per line
with open(output_file, "w", encoding="utf-8") as f:
    for example in subset_data:
        f.write(json.dumps(example) + "\n")

print(f"Subset saved to {output_file}")

In [None]:
# Arguments forwarded to the training function. All filesystem locations are
# derived from PVC_MOUNT_PATH so the mount point is defined in a single place.
params = {
    ###########################################################################
    # 🤖 Model + Data Paths                                                   #
    ###########################################################################
    "model_path": "Qwen/Qwen2.5-1.5B-Instruct",
    "data_path": f"{PVC_MOUNT_PATH}/table-gpt-data/train/train_All_100.jsonl",
    "ckpt_output_dir": f"{PVC_MOUNT_PATH}/checkpoints-logs-dir",
    "data_output_path": f"{PVC_MOUNT_PATH}/osft-json/_data",
    ############################################################################
    # 🏋️‍♀️ Training Hyperparameters                                              #
    ############################################################################
    # Important for OSFT
    "unfreeze_rank_ratio": 0.25,
    # Standard parameters
    "effective_batch_size": 128,
    "learning_rate": 5.0e-6,
    "num_epochs": 1,
    "lr_scheduler": "cosine",
    "warmup_steps": 0,
    "seed": 42,
    ###########################################################################
    # 🏎️ Performance Hyperparameters                                          #
    ###########################################################################
    "use_liger": True,
    "max_tokens_per_gpu": 32000,
    "max_seq_len": 2048,
    ############################################################################
    # 💾 Checkpointing Settings                                                #
    ############################################################################
    # Here we only want to save the very last checkpoint
    "save_final_checkpoint": True,
    "checkpoint_at_epoch": False,
    # "nproc_per_node": 2,
    # "nnodes": 2,
    # Please note that the distributed training parameters are removed because they are
    # delegated to Kubeflow Trainer
}


⚙️  Training Hyperparameters


In [None]:
from kubeflow.trainer import TrainerClient
from kubeflow.trainer.rhai import TrainingHubAlgorithms
from kubeflow.trainer.rhai import TrainingHubTrainer
from kubeflow_trainer_api import models
from kubeflow.common.types import KubernetesBackendConfig

# Key part: hand the trainer backend the Configuration carried by the
# ApiClient built earlier, so it talks to the same API server with the same
# credentials.
backend_cfg = KubernetesBackendConfig(client_configuration=api_client.configuration)
client = TrainerClient(backend_cfg)

# Print the client object as a quick check that it was constructed.
print(client)

In [None]:
# Pick the first runtime whose name matches the one this notebook requires;
# `None` when the cluster does not offer it.
th_runtime = next(
    (
        candidate
        for candidate in client.list_runtimes()
        if candidate.name == "training-hub-2node-1gpu"
    ),
    None,
)

# Stop here rather than submitting a job without a runtime.
if th_runtime is None:
    raise RuntimeError("Required runtime 'training-hub-2node-1gpu' not found")

print("Found runtime: " + str(th_runtime))

In [None]:

from kubeflow.trainer.options.kubernetes import (
    PodTemplateOverrides,
    PodTemplateOverride,
    PodSpecOverride,
    ContainerOverride,
)

# Cache directories live under the PVC mount point. They are derived from
# PVC_MOUNT_PATH so the mount path is defined once instead of being repeated
# as a literal in every setting below.
cache_home = f"{PVC_MOUNT_PATH}/.cache"
cache_root = f"{cache_home}/huggingface"
triton_cache = f"{PVC_MOUNT_PATH}/.triton"

# Submit the OSFT training job; `client.train` returns the job name used by
# the status, log and delete calls.
job_name = client.train(
    trainer=TrainingHubTrainer(
        algorithm=TrainingHubAlgorithms.OSFT,
        func_args=params,
        env={
            "HF_HOME": cache_root,
            "TRITON_CACHE_DIR": triton_cache,
            "XDG_CACHE_HOME": cache_home,
            "NCCL_DEBUG": "INFO",
        },
    ),
    options=[
        PodTemplateOverrides(
            PodTemplateOverride(
                target_jobs=["node"],
                spec=PodSpecOverride(
                    # Attach the shared PVC to the pod ...
                    volumes=[
                        {"name": "work", "persistentVolumeClaim": {"claimName": PVC_NAME}},
                    ],
                    # ... and mount it read-write in the "node" container at the
                    # path that the training parameters and caches point to.
                    containers=[
                        ContainerOverride(
                            name="node",
                            volume_mounts=[
                                {"name": "work", "mountPath": PVC_MOUNT_PATH, "readOnly": False},
                            ],
                        )
                    ],
                ),
            )
        )
    ],
    runtime=th_runtime,
)

In [None]:
# Block until the job reports "Running", then until it reports "Complete";
# each stage has its own timeout.
for awaited_status, stage_timeout in (("Running", 300), ("Complete", 600)):
    client.wait_for_job_status(
        name=job_name,
        status={awaited_status},
        timeout=stage_timeout,
    )

In [None]:
# Summarise every step of the submitted job: name, status and device usage.
job_steps = client.get_job(name=job_name).steps
for step in job_steps:
    print(f"Step: {step.name}, Status: {step.status}, Devices: {step.device} x {step.device_count}\n")

In [None]:
# Print the job's logs line by line; follow=False is passed so the call is
# not asked to keep streaming.
job_log_lines = client.get_job_logs(job_name, follow=False)
for log_line in job_log_lines:
    print(log_line)

In [None]:
# Clean up: delete the training job submitted above. Run this only once the
# logs have been inspected — presumably they are no longer retrievable
# afterwards (TODO confirm).
client.delete_job(job_name)