In [None]:
import os
from azure.ai.ml import MLClient, Input, Output, PyTorchDistribution, command
from azure.ai.ml.entities import (
    AmlCompute, Environment, BuildContext, Data,
    ManagedOnlineEndpoint, ManagedOnlineDeployment, CodeConfiguration, OnlineRequestSettings
)
from azure.identity import DefaultAzureCredential
from azure.ai.ml.constants import AssetTypes
import datetime

from dotenv import load_dotenv
load_dotenv(override=True)

# Azure ML workspace configuration (values are read from the environment / .env file)
SUBSCRIPTION_ID = os.getenv("SUBSCRIPTION_ID")
RESOURCE_GROUP = os.getenv("RESOURCE_GROUP")
WORKSPACE_NAME = os.getenv("WORKSPACE_NAME")
COMPUTE_CLUSTER = "demo-gpucluster01"
HF_TOKEN = os.getenv("HF_TOKEN")  # may be unset; only forwarded to later cells

# Fail fast with a readable message instead of an opaque SDK error further down.
_missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("SUBSCRIPTION_ID", SUBSCRIPTION_ID),
        ("RESOURCE_GROUP", RESOURCE_GROUP),
        ("WORKSPACE_NAME", WORKSPACE_NAME),
    )
    if not setting_value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# authentication via managed identity or service principal (no hard-coded creds)
ml_client = MLClient(DefaultAzureCredential(), SUBSCRIPTION_ID, RESOURCE_GROUP, WORKSPACE_NAME)

# Check that the compute cluster can be retrieved. This cell only reports a
# failure; it does NOT create the cluster.
try:
    ml_client.compute.get(COMPUTE_CLUSTER)
except Exception as exc:
    # Every failure lands here (not only "not found"), so show the real cause too.
    print(f"{COMPUTE_CLUSTER} was not found")
    print(f"  details: {type(exc).__name__}: {exc}")

<h5>Prepare Environment</h5>

In [None]:
# Create the Docker build-context folder that the %%writefile cells below write into.
# NOTE(review): %%writefile is a cell magic and must be the first line of its cell —
# confirm this statement lives in its own cell in the actual .ipynb.
os.makedirs("environment", exist_ok=True)

%%writefile ./environment/Dockerfile
# Image definition for the Azure ML environment built from ./environment.
# requirements.txt is expected next to this file in the build context.
FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2204-cu118-py310-torch271:biweekly.202508.1

# Install pip dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt --no-cache-dir
# Upgrade known vulnerable packages again just to be safe
# (`|| true` makes the step best-effort: a failed upgrade does not fail the build)
RUN pip install --upgrade \
    requests==2.32.4 \
    urllib3==2.5.0 \
    pillow==11.3.0 || true

# Repeat for other envs if applicable
# (same pins, applied through the pip of /opt/conda; also best-effort)
RUN /opt/conda/bin/pip install --upgrade \
    requests==2.32.4 \
    urllib3==2.5.0 \
    pillow==11.3.0 || true

# Same pins again through the pip of the `ptca` conda env; also best-effort
RUN /opt/conda/envs/ptca/bin/pip install --upgrade \
    requests==2.32.4 \
    urllib3==2.5.0 \
    pillow==11.3.0 || true

# Inference requirements
# Copy /artifacts from the o16n-base python-assets image into /var/ ...
COPY --from=mcr.microsoft.com/azureml/o16n-base/python-assets:20230419.v1 /artifacts /var/
# ... then install the system packages and wire the copied rsyslog/nginx
# configuration into place (the default nginx site is removed).
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libcurl4 \
        liblttng-ust1 \
        libunwind8 \
        libxml++2.6-2v5 \
        nginx-light \
        psmisc \
        rsyslog \
        runit \
        unzip && \
    apt-get clean && rm -rf /var/lib/apt/lists/* && \
    cp /var/configuration/rsyslog.conf /etc/rsyslog.conf && \
    cp /var/configuration/nginx.conf /etc/nginx/sites-available/app && \
    ln -sf /etc/nginx/sites-available/app /etc/nginx/sites-enabled/app && \
    rm -f /etc/nginx/sites-enabled/default

# Upgrade (never newly install: --only-upgrade) the listed system packages.
RUN apt-get update && \
    apt-get install -y --only-upgrade \
        libpython3.10-stdlib \
        python3.10 \
        libpython3.10-minimal \
        python3.10-minimal \
        libpam0g \
        libpam-modules-bin \
        libpam-modules \
        libpam-runtime \
        sudo && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# NOTE(review): SVDIR / WORKER_TIMEOUT are presumably read by the runit-based
# inference server stack copied above — confirm against the o16n-base image docs.
ENV SVDIR=/var/runit
ENV WORKER_TIMEOUT=400
EXPOSE 5001 8883 8888

# support Deepspeed launcher requirement of passwordless ssh login
# `apt-get update` and `apt-get install` share one RUN so a cached, stale package
# index layer can never be combined with a fresh install; the apt lists are removed
# afterwards to keep the layer small (same pattern as the RUN steps above).
RUN apt-get update && \
    apt-get install -y openssh-server openssh-client && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

In [None]:
%%writefile ./environment/requirements.txt
# Python dependencies installed by environment/Dockerfile (pip install -r requirements.txt)
azureml-core==1.60.0.post1
azureml-dataset-runtime==1.60.0
azureml-defaults==1.60.0
azure-ml==0.0.1
azure-ml-component==0.9.18.post2
azureml-mlflow==1.60.0.post1
azureml-contrib-services==1.60.0
azureml-inference-server-http
inference-schema
MarkupSafe==2.1.2
regex
pybind11
urllib3==2.5.0
requests==2.32.4
pillow==11.3.0
cryptography>=42.0.4
aiohttp>=3.12.14
py-spy==0.3.12
debugpy~=1.6.3
ipykernel~=6.0
tensorboard
psutil~=5.8.0
matplotlib~=3.5.0
tqdm~=4.66.3
py-cpuinfo==5.0.0
torch-tb-profiler~=0.4.0
trl>=0.20.0
peft>=0.17.0
transformers>=4.55.0
trackio

<h5>Create Environment</h5>

In [None]:
# Register an Azure ML environment whose image is built from the local Docker
# context folder (the Dockerfile and requirements.txt written above).
env_name = "env-gpt-oss-01"
docker_dir = "./environment"

env_docker_image = Environment(
    name=env_name,
    description="Environment created from a Docker context.",
    build=BuildContext(path=docker_dir),
)
# The returned asset is kept in `env_asset`.
env_asset = ml_client.environments.create_or_update(env_docker_image)

In [None]:
#!pip install huggingface_hub
#!pip install ipywidgets
#!pip install transformers  # Uncomment if transformers not preinstalled

<h5>Login to Huggingface</h5>

In [None]:
from huggingface_hub import login, notebook_login

# Prefer the token already loaded into HF_TOKEN by the setup cell so the notebook
# can run unattended (Restart & Run All); fall back to the interactive widget
# only when no token is configured.
if HF_TOKEN:
    login(token=HF_TOKEN)
else:
    notebook_login()

<h5>Prepare Training Dataset</h5>

In [None]:
# pip install datasets pandas pyarrow
import re
from datasets import load_dataset, Dataset, DatasetDict

APTO_DS = "APTOinc/japanese-reasoning-dataset-sample"  # Sample. Same applies to real data.

# Extract <think> ... </think> and return (thinking, final).
# Leading whitespace before <think> is tolerated: `.match` anchors at position 0,
# so without the leading `\s*` an answer starting with a newline or space would
# fall through to the no-tag fallback and keep the raw tags in the final text.
THINK_RE = re.compile(r"\s*<think>\s*(.*?)\s*</think>\s*(.*)", re.DOTALL)

def split_think_final(answer_text: str):
    """Split a raw answer into its reasoning part and its final reply.

    Args:
        answer_text: Text of the form ``<think>reasoning</think>final``; may be
            empty or ``None``.

    Returns:
        A ``(thinking, final)`` tuple of stripped strings. ``("", "")`` for
        empty/``None`` input; ``("", stripped_text)`` when no leading
        ``<think>...</think>`` section is present.
    """
    if not answer_text:
        return "", ""
    m = THINK_RE.match(answer_text)
    if m is None:
        # Fallback: no <think> section, so the whole text is the final reply.
        return "", answer_text.strip()
    return m.group(1).strip(), m.group(2).strip()

# 1) Load APTO dataset
ds = load_dataset(APTO_DS, split="train")

# 2) Assemble into Harmony format (messages column)
def to_harmony(example):
    """Convert one dataset row into a Harmony-style ``messages`` list.

    Args:
        example: Row mapping; the ``"answer"`` and ``"question"`` keys are read.
            Missing keys and ``None`` values are treated as empty text.

    Returns:
        ``{"messages": [system, user, assistant]}`` where the assistant entry
        carries both ``thinking`` and ``content``.
    """
    thinking, final = split_think_final(example.get("answer", ""))
    # `or ""` also covers rows where the key exists but holds None, which would
    # otherwise raise AttributeError on .strip().
    question = (example.get("question") or "").strip()

    # Following the Cookbook style, put 'reasoning language: Japanese' in the system message
    # (Minimal structure aligned with Multilingual-Thinking and Cookbook examples)
    messages = [
        {"role": "system",
         "content": "reasoning language: Japanese\n\nYou are an AI chatbot with a lively and energetic personality."},
        {"role": "user",
         "content": question},
        # The gpt-oss chat template lets the assistant have both thinking and content
        {"role": "assistant",
         "thinking": thinking,
         "content": final}
    ]
    return {"messages": messages}

# Replace every original column with the single `messages` column.
dataset = ds.map(to_harmony, remove_columns=ds.column_names)

# 3) Save (either format is fine)
# Parquet (Multilingual-Thinking in the Cookbook is distributed as parquet)
dataset.to_parquet("apto_reasoning_harmony.parquet")
# JSONL (easier for humans to inspect)
dataset.to_json("apto_reasoning_harmony.jsonl", lines=True)

# If you'd prefer readable (non-escaped) Japanese in the JSONL file:
# NOTE(review): Dataset.to_json forwards extra keyword arguments to
# pandas.DataFrame.to_json, so adding force_ascii=False to the call above should
# do it — confirm against the installed `datasets` version.
# (The previous hand-written writer lived in a dead string literal and referenced
# the undefined names `converted` and `json`, so it was removed.)

print(dataset[0]["messages"])


In [None]:
# Number of rows in the converted dataset.
print(len(dataset))

<h5>Register Dataset</h5>

In [None]:
# Register the parquet file written above as a data asset (name + explicit version).
data_uri = "./apto_reasoning_harmony.parquet"

data = Data(
    name="apto_reasoning_harmony",
    version="1",
    type=AssetTypes.URI_FILE,
    path=data_uri,
    description="gpt-oss-20b",
)
ml_client.data.create_or_update(data)


<h5> Peek the 1st record of the dataset using Chat Template in Tokenizer </h5>

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the base model; its chat template is what renders `messages` below.
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")

# Render the first converted record as text (tokenize=False) to inspect the
# format the chat template produces.
messages = dataset[0]["messages"]
conversation = tokenizer.apply_chat_template(messages, tokenize=False)
print(conversation)

<h5>Load model on this compute resource</h5>

In [None]:
# LOAD MODEL ON THIS COMPUTE INSTANCE
import torch
from transformers import AutoModelForCausalLM, Mxfp4Config

# NOTE(review): dequantize=True presumably expands the MXFP4 checkpoint to
# `torch_dtype` at load time — confirm against the transformers Mxfp4Config docs.
quantization_config = Mxfp4Config(dequantize=True)
# NOTE(review): use_cache=False is passed here and this same model is later used
# for generate(); confirm that disabling the KV cache is intended for inference.
model_kwargs = dict(
    attn_implementation="eager",
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    use_cache=False,
    #device_map="auto",
)
model = AutoModelForCausalLM.from_pretrained("openai/gpt-oss-20b", **model_kwargs)
# Explicitly place the model on CPU. As the last expression of the cell, the
# returned module is also what the cell displays.
model.to("cpu")


<h5>Run the model on this compute resource</h5>

In [None]:
# Single-turn test prompt (this rebinds `messages` from the chat-template peek cell).
messages = [
    {"role": "user", "content": "オーストラリアの首都はどこですか?"},
]

# Render the prompt with the chat template, ending with the generation prompt,
# and move the resulting tensor to the device the model lives on.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

# Generate up to 512 new tokens; the decoded text of the first sequence is
# printed in full (prompt included, special tokens not stripped).
output_ids = model.generate(input_ids, max_new_tokens=512)
response = tokenizer.batch_decode(output_ids)[0]
print(response)

<h5> Save Training script to src folder </h5>

In [None]:
# Create the folder that the %%writefile cells write train.py and score.py into.
# NOTE(review): %%writefile is a cell magic and must be the first line of its cell —
# confirm this statement lives in its own cell in the actual .ipynb.
os.makedirs("src", exist_ok=True)

%%writefile ./src/train.py
import os
import argparse
import json
import torch

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Mxfp4Config,
)
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTConfig, SFTTrainer


def get_args():
    """Parse the command-line options of the training script.

    Only ``--train_file`` is required; every other option has a default.

    Returns:
        argparse.Namespace holding the parsed values.
    """
    cli = argparse.ArgumentParser()

    # Data / output / Hugging Face Hub options
    cli.add_argument(
        "--train_file", required=True, type=str,
        help="Converted Harmony data (parquet or jsonl)",
    )
    cli.add_argument(
        "--output_dir", default="./outputs", type=str,
        help="Output directory",
    )
    cli.add_argument(
        "--push_to_hub", action="store_true",
        help="Specify to push to Hugging Face Hub",
    )
    cli.add_argument(
        "--hub_model_id", default=None, type=str,
        help="Model name on Hub (org/name). If omitted, uses output_dir name",
    )
    cli.add_argument(
        "--hf_token", default=None, type=str,
        help="Hugging Face access token (or via env var HF_TOKEN)",
    )

    # Training hyper-parameters: (flag, value type, default)
    hyper_parameters = (
        ("--lr", float, 2e-4),
        ("--epochs", int, 1),
        ("--per_device_train_batch_size", int, 2),
        ("--grad_accum_steps", int, 8),
        ("--max_seq_len", int, 2048),
        ("--warmup_ratio", float, 0.03),
        ("--cosine_min_lr_rate", float, 0.1),
        ("--logging_steps", int, 10),
        ("--seed", int, 42),
    )
    for flag, value_type, default_value in hyper_parameters:
        cli.add_argument(flag, type=value_type, default=default_value)

    return cli.parse_args()


def load_harmony_dataset(train_file):
    """Load the training split from a Harmony-format data file.

    Args:
        train_file: Path to a ``.parquet``, ``.jsonl`` or ``.json`` file. The
            extension is matched case-insensitively.

    Returns:
        The ``"train"`` split returned by ``datasets.load_dataset``.

    Raises:
        ValueError: If the file extension is not supported, or the loaded data
            has no ``messages`` column.
    """
    train_file = os.path.abspath(train_file)

    # Pick the `datasets` builder from the (lower-cased) file extension.
    lowered = train_file.lower()
    if lowered.endswith(".parquet"):
        builder = "parquet"
    elif lowered.endswith((".jsonl", ".json")):
        builder = "json"
    else:
        raise ValueError(
            f"train_file must be .parquet, .jsonl or .json (got: {train_file})"
        )

    ds = load_dataset(builder, data_files=train_file, split="train")
    if "messages" not in ds.column_names:
        raise ValueError("'messages' column not found. Please convert to Harmony format.")
    return ds


def build_formatting_func(tokenizer):
    """Create the per-example formatter that turns ``messages`` into text.

    Args:
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        A function taking one example (with a ``"messages"`` entry) and
        returning the chat-template output with ``tokenize=False``.
    """
    def _render(example):
        # No generation prompt: for supervised fine-tuning the rendered text
        # must end with the final response already present in the messages.
        return tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
            add_generation_prompt=False,
        )

    return _render


def main():
    """Fine-tune openai/gpt-oss-20b with LoRA through TRL's SFTTrainer.

    Steps: parse CLI arguments, load model and tokenizer, wrap the model with
    a LoRA adapter, train on the Harmony-format dataset, save the adapter and
    tokenizer to ``output_dir`` and, best effort, a merged model to
    ``output_dir/merged``.

    Tokenizer and merged-model files are written by the main process only, so
    several ranks of a distributed run never write the same files concurrently.
    """
    args = get_args()

    # Hub token (needed only when pushing)
    hf_token = args.hf_token or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACEHUB_API_TOKEN")

    # --- Model & Tokenizer ---
    # Use MXFP4 in-quantized (dequantize=False recommended) to save memory
    # NOTE(review): the notebook's local-inference cell loads with dequantize=True;
    # confirm that LoRA on the expert parameters and the later merge_and_unload()
    # work with quantized weights (a failed merge is only reported as a warning).
    quantization_config = Mxfp4Config(dequantize=False)

    # Flexible dtype/device so it runs even on a CPU smoke test
    cuda_available = torch.cuda.is_available()
    use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
    if use_bf16:
        torch_dtype = torch.bfloat16
    elif cuda_available:
        torch_dtype = torch.float16
    else:
        torch_dtype = torch.float32

    # In AzureML distributed environments accelerate is used, so avoid specifying device_map
    # NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
    # to run; confirm it is still needed for this model.
    model = AutoModelForCausalLM.from_pretrained(
        "openai/gpt-oss-20b",
        attn_implementation="eager",
        torch_dtype=torch_dtype,
        quantization_config=quantization_config,
        use_cache=False,
        trust_remote_code=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        "openai/gpt-oss-20b",
        trust_remote_code=True,
        use_fast=True,
    )
    # Minimal settings for chat / long text
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # LoRA on all linear layers plus the expert projections of layers 7, 15 and 23.
    peft_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules="all-linear",
        target_parameters=[
            "7.mlp.experts.gate_up_proj",
            "7.mlp.experts.down_proj",
            "15.mlp.experts.gate_up_proj",
            "15.mlp.experts.down_proj",
            "23.mlp.experts.gate_up_proj",
            "23.mlp.experts.down_proj",
        ],
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # --- Dataset ---
    train_ds = load_harmony_dataset(args.train_file)
    formatting_func = build_formatting_func(tokenizer)

    # Default Hub repo name = last component of output_dir. normpath drops a
    # trailing slash first, so "out/" yields "out" instead of an empty name.
    default_hub_model_id = os.path.basename(os.path.normpath(args.output_dir))

    # --- SFT settings ---
    # report_to can be "none" / "tensorboard" / "wandb" etc
    training_args = SFTConfig(
        learning_rate=args.lr,
        gradient_checkpointing=True,
        #gradient_checkpointing_kwargs={"use_reentrant": False},  # added
        #ddp_find_unused_parameters=False,                        # may cause OOM
        #packing=False,                                           # added
        ##max_seq_length=args.max_seq_len,                         # may hit trl bug
        num_train_epochs=args.epochs,
        logging_steps=args.logging_steps,
        per_device_train_batch_size=args.per_device_train_batch_size,
        gradient_accumulation_steps=args.grad_accum_steps,
        warmup_ratio=args.warmup_ratio,
        lr_scheduler_type="cosine_with_min_lr",
        lr_scheduler_kwargs={"min_lr_rate": args.cosine_min_lr_rate},
        output_dir=args.output_dir,
        report_to="trackio", # changed from none
        bf16=use_bf16,  # enabled only if GPU supports bf16
        fp16=(cuda_available and not use_bf16),
        seed=args.seed,
        push_to_hub=args.push_to_hub,
        hub_model_id=(args.hub_model_id or default_hub_model_id) if args.push_to_hub else None,
        hub_token=hf_token if args.push_to_hub else None,
        save_total_limit=2,
        save_steps=500,
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        processing_class=tokenizer,
        formatting_func=formatting_func,  # Convert messages to a plain string
    )

    trainer.train()
    # Called on every rank, exactly as before.
    trainer.save_model(args.output_dir)

    # Everything below writes plain files into output_dir; with more than one
    # process per job only the main process may do that.
    if not trainer.is_world_process_zero():
        return

    tokenizer.save_pretrained(args.output_dir)

    # === Merge LoRA into base model and save ===
    # Best effort: a failure is reported but does not fail the job.
    try:
        print("Merging LoRA adapter into base model...")
        merged_model = model.merge_and_unload()
        merged_dir = os.path.join(args.output_dir, "merged")
        os.makedirs(merged_dir, exist_ok=True)
        merged_model.save_pretrained(merged_dir)
        tokenizer.save_pretrained(merged_dir)
        print(f"Merged model saved at: {merged_dir}")
    except Exception as e:
        print(f"[WARN] Could not merge model automatically: {e}")


if __name__ == "__main__":
    main()


<h5>Start Training Job</h5>

In [None]:
# job configuration
NUM_NODES = 1
NUM_GPU_PER_NODE = 1

# Environment variables passed to the training container.
job_environment_variables = {
    "NCCL_DEBUG": "WARN",
    #"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    # NOTE(review): HF_HOME points inside ./outputs, so the Hugging Face cache
    # ends up next to the training outputs — confirm this is intended.
    "HF_HOME": "./outputs/hfhome",
    "TRACKIO_PROJECT": "ft-project",
}
# Forward the token only when it is set, so a missing HF_TOKEN does not put a
# None value into the job definition.
# NOTE(review): environment variables are part of the submitted job spec;
# consider a workspace secret store instead of passing the token here.
if HF_TOKEN:
    job_environment_variables["HF_TOKEN"] = HF_TOKEN

job = command(
    code="./src",  # Root (refers to src/train.py)
    command=(
        "python train.py "
        "--train_file ${{inputs.train_file}} "
        #"--output_dir ${{outputs.model_dir}} "   # default is ./outputs
        "--epochs 8 --per_device_train_batch_size 2 --grad_accum_steps 8"
    ),
    inputs={
        "train_file": Input(
            type=AssetTypes.URI_FILE,
            path="apto_reasoning_harmony@latest"
        )
    },
    # Declared but not referenced by the command line above (the --output_dir
    # argument is commented out, so train.py writes to its default ./outputs).
    outputs={"model_dir": {"mode": "rw_mount", "path": "azureml://datastores/workspaceblobstore/paths/models/gpt-oss-20b-jp-reasoner"}},
    # Use the most recent version of the environment registered above, matching
    # the deployment cell; a pinned ":2" only exists after a second registration.
    # The image is built from environment/requirements.txt
    # (trl>=0.20.0, peft>=0.17.0, transformers>=4.55.0).
    environment="env-gpt-oss-01@latest",
    compute=COMPUTE_CLUSTER,
    display_name="gpt-oss-20b-01",
    experiment_name="gpt-oss-20b-01-exp",
    instance_count=NUM_NODES,
    distribution=PyTorchDistribution(
        process_count_per_instance=NUM_GPU_PER_NODE,
        node_count=NUM_NODES
    ),
    environment_variables=job_environment_variables,
)
returned_job = ml_client.jobs.create_or_update(job)
print(returned_job.studio_url)

<h5>Register Model</h5>

In [None]:
# URI of the merged model inside the outputs of the training job submitted above
# (train.py saves it to <output_dir>/merged, with output_dir defaulting to ./outputs).
# NOTE(review): Azure ML documentation writes job-output URIs as
# azureml://jobs/<job-name>/outputs/artifacts/paths/<path>; confirm that the form
# used here (without "paths/") resolves as intended.
model_path_from_job = f"azureml://jobs/{returned_job.name}/outputs/artifacts/outputs/merged"

print("path to register model: ", model_path_from_job)

In [None]:
from azure.ai.ml.entities import Model

# Use a dedicated name for the registry entity: `model` already refers to the
# transformers model loaded earlier in this notebook (the local generation cell
# reads `model.device`), and rebinding it here would break re-running that cell.
model_asset = Model(
    path=model_path_from_job,  # Reference to training job output
    name="gpt-oss-20b-jp-reasoner-01-pre",
    description="LoRA fine-tuned gpt-oss-20b model for Japanese reasoning",
    type="custom_model",   # Hugging Face models are commonly registered as custom_model
    version="3",             # Explicit version (or omit for auto increment)
)

registered_model = ml_client.models.create_or_update(model_asset)
print(f"Registered: {registered_model.name}:{registered_model.version}")

<h5>Save scoring script to src folder</h5>

In [None]:
%%writefile ./src/score.py
# score.py
import os, json, torch
from transformers import AutoTokenizer, AutoModelForCausalLM

REASONING_LANGUAGE_DEFAULT = "Japanese"

def init():
    """Load tokenizer and model into module globals for later ``run`` calls.

    The model folder is ``$AZUREML_MODEL_DIR/$MODEL_SUBDIR``, defaulting to
    ``./merged`` when neither variable is set. Both loads use
    ``local_files_only=True``, so nothing is downloaded.
    """
    global model, tokenizer

    base_dir = os.environ.get("AZUREML_MODEL_DIR", ".")
    sub_dir = os.getenv("MODEL_SUBDIR", "merged")  # Read merged/
    model_dir = os.path.join(base_dir, sub_dir)

    tokenizer = AutoTokenizer.from_pretrained(
        model_dir, trust_remote_code=True, local_files_only=True
    )

    # bf16 + automatic device placement on a capable GPU; library defaults otherwise.
    has_cuda = torch.cuda.is_available()
    if has_cuda and torch.cuda.is_bf16_supported():
        dtype = torch.bfloat16
    else:
        dtype = None
    placement = "auto" if has_cuda else None

    model = AutoModelForCausalLM.from_pretrained(
        model_dir,
        trust_remote_code=True,
        torch_dtype=dtype,
        device_map=placement,
        local_files_only=True,
    )
    model.eval()

def _build_messages(data):
    # Input can be either 1) {"prompt": "..."} or 2) {"messages": [...]} 
    system_prompt = data.get(
        "system_prompt",
        f"reasoning language: {data.get('reasoning_language', REASONING_LANGUAGE_DEFAULT)}"
    )

    if "messages" in data and isinstance(data["messages"], list):
        msgs = data["messages"]
        # Prepend system if the first message isn't system
        if not msgs or msgs[0].get("role") != "system":
            msgs = [{"role": "system", "content": system_prompt}] + msgs
        return msgs

    # Default messages if only a "prompt" was provided
    user_prompt = data.get("prompt", "")
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

def run(raw_data):
    """Score one request and return the result as a JSON string.

    Args:
        raw_data: JSON text or an already-decoded mapping. Recognised keys:
            those read by ``_build_messages`` plus ``max_new_tokens``
            (default 128), ``do_sample`` (default False) and ``temperature``
            (used only when sampling).

    Returns:
        ``{"output": <generated text>}`` on success or ``{"error": <message>}``
        on any exception, serialised with ``ensure_ascii=False``.
    """
    try:
        request = raw_data if not isinstance(raw_data, str) else json.loads(raw_data)

        # ---- Build messages ----
        chat = _build_messages(request)

        # ---- Generation parameters (lightweight defaults) ----
        token_budget = int(request.get("max_new_tokens", 128))
        sampling = bool(request.get("do_sample", False))
        gen_kwargs = {"max_new_tokens": token_budget, "do_sample": sampling}
        # Temperature is forwarded only when sampling; otherwise it triggers a warning
        if sampling and "temperature" in request:
            gen_kwargs["temperature"] = float(request["temperature"])

        # ---- chat template -> input_ids on the model's device ----
        prompt_ids = tokenizer.apply_chat_template(
            chat,
            add_generation_prompt=True,
            return_tensors="pt",
        )
        prompt_ids = prompt_ids.to(model.device)

        # ---- Generate without tracking gradients ----
        with torch.no_grad():
            output_ids = model.generate(prompt_ids, **gen_kwargs)

        # ---- Drop the echoed prompt: keep only tokens after the prompt length ----
        prompt_length = prompt_ids.shape[-1]
        new_token_ids = output_ids[0, prompt_length:]
        text = tokenizer.decode(new_token_ids, skip_special_tokens=True)

        # Return JSON UTF-8 (simplifies client display)
        return json.dumps({"output": text}, ensure_ascii=False)

    except Exception as e:
        # Errors are reported in-band, also as JSON
        return json.dumps({"error": str(e)}, ensure_ascii=False)


<h5>Create Endpoint</h5>

In [None]:
# The endpoint name gets a month/day/hour/minute suffix from the current local
# time, so re-running this cell later produces a different name.
endpoint_name = f"gptoss-jp-{datetime.datetime.now():%m%d%H%M}"

# 1) Create endpoint
endpoint = ManagedOnlineEndpoint(
    name=endpoint_name, 
    description="gpt-oss-20b",
    auth_mode="key")
# .result() waits for the create/update operation to finish.
ml_client.begin_create_or_update(endpoint).result()

<h5>Deploy the model</h5>

In [None]:
# 2) Deploy (refer to the already registered "merged" model in Model Registry)
registered = ml_client.models.get("gpt-oss-20b-jp-reasoner-01-pre", version="3")

# One request at a time per instance; 180 s request timeout and 180 s max queue wait.
blue_request_settings = OnlineRequestSettings(
    max_concurrent_requests_per_instance=1,
    request_timeout_ms=180000,
    max_queue_wait_ms=180000,
)

# "blue" deployment: the registered model + score.py from ./src on one H100 instance.
deploy = ManagedOnlineDeployment(
    name="blue",
    endpoint_name=endpoint_name,
    model=registered.id,
    environment="env-gpt-oss-01@latest",
    code_configuration=CodeConfiguration(code="./src", scoring_script="score.py"),
    instance_type="Standard_NC40ads_H100_v5",
    instance_count=1,
    request_settings=blue_request_settings,
)
ml_client.begin_create_or_update(deploy).result()

<h5>Change the traffic</h5>

In [None]:
# 3) Traffic routing
ep = ml_client.online_endpoints.get(endpoint_name)
ep.traffic = {"blue": 100}
ml_client.begin_create_or_update(ep).result()

print("endpoint:", endpoint_name)

<h5>Send Request </h5>

In [None]:
from azure.ai.ml.entities import OnlineRequestSettings
from azure.ai.ml import MLClient
import json, requests

# Primary key and scoring URL of the endpoint created above.
keys = ml_client.online_endpoints.get_keys(name=endpoint_name).primary_key
scoring_url = ml_client.online_endpoints.get(endpoint_name).scoring_uri
headers = {"Authorization": f"Bearer {keys}"}

payload = {
  "messages": [
    {"role": "system", "content": "reasoning language: Japanese"},
    {"role": "user", "content": "オーストラリアの首都はどこですか？"}
  ],
  "max_new_tokens": 500
}

res = requests.post(scoring_url, headers=headers, json=payload, timeout=300)
print(res.status_code)

if not res.ok:
    # Non-2xx responses may not carry JSON at all; show the raw body instead of
    # failing inside res.json().
    print(res.text[:1000])
else:
    data = res.json()                 # First-level decode
    if isinstance(data, str):         # If still a string, decode second-level
        data = json.loads(data)

    # score.py reports failures in-band as {"error": "..."}, so the "output"
    # key is not guaranteed to exist.
    if "error" in data:
        print("scoring error:", data["error"])
    else:
        print(data["output"])

<h5>Delete Endpoint</h5>

In [None]:
# Start deleting the endpoint and wait for the operation to finish.
ml_client.online_endpoints.begin_delete(name=endpoint_name).wait()

<h5>Problem Detection</h5>

In [None]:
# Inspect the last HTTP response (`res` comes from the "Send Request" cell):
# status code, content type and the first 1000 characters of the body.
print(res.status_code)
print(res.headers.get("content-type"))
print(res.text[:1000])  

In [None]:
# Fetch the last 200 log lines of the "blue" deployment.
# NOTE(review): this cell comes after the "Delete Endpoint" cell; in a top-to-bottom
# run the endpoint is already gone at this point — presumably it is meant to be
# run manually before deleting. Confirm the intended order.
ml_client.online_deployments.get_logs(
    name="blue", endpoint_name=endpoint_name, lines=200
)

<h5>End of notebook</h5>