In [1]:
import os
from datetime import datetime, timezone

from azure.ai.ml import MLClient, command, dsl, Input, Output
from azure.ai.ml.constants import AssetTypes, ModelType
from azure.ai.ml.entities import (
    AmlCompute,
    BuildContext,
    Environment,
    ManagedOnlineDeployment,
    ManagedOnlineEndpoint,
    Model,
)
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

# Interactive browser credential used to authenticate against the workspace.
credential = InteractiveBrowserCredential()

# Workspace client built from the connection details in the local config file.
ml_client = MLClient.from_config(
    credential=credential,
    path="sandbox_east2_config.json",
)

Found the config file in: sandbox_east2_config.json


In [42]:
# --- Project layout and environment names -----------------------------------
MODEL_NAME = "custom-decoder"
DATA_PREP_DIR = "data_prep"
TRAIN_DIR = "train"
ENV_FILENAME = "env.yaml"
DATA_PREP_ENV_NAME = "data_prep_env"
TRAIN_ENV_NAME = "train_env"

# --- Compute targets ---------------------------------------------------------
DATA_PREP_COMPUTE_NAME = "cpu-tokenization"
DATA_PREP_INSTANCE_TYPE = "Standard_DS3_v2"
TRAIN_COMPUTE_NAME = "run-clm-deepspeed"
TRAIN_INSTANCE_TYPE = "Standard_NC24s_v3"

# --- Datastore and data-prep settings ----------------------------------------
DATASTORE_NAME = "workspaceartifactstore"

DATASTORE_TOKENIZER_PATH = "blob_data/tokenizer"
DATASTORE_DATA_PATH = "blob_data/data"
SEQUENCE_LENGTH = 2048
DATA_PREP_NUM_PROCESSES = 4
TOKENIZED_DATASET_PATH = "blob_data/tokenized"
SAMPLES_PER_FILE = 200_000

# --- Training configuration files --------------------------------------------
# NOTE(review): despite the DATASTORE_ prefix, both values are relative paths
# that are handed to the training script as plain strings - confirm they are
# meant to resolve inside the job's code folder.
DATASTORE_MODEL_CONFIG_PATH = "./5B_model_config.json"
DEEPSPEED_CONFIG = "./deepspeed_config.json"

# UTC timestamp formatted as YearMonthDayHourMinute; used to make the training
# output folder and the experiment name unique per run.
# datetime.utcnow() is deprecated since Python 3.12 and returns a naive value;
# an aware "now" in UTC formats to exactly the same string.
timenow = datetime.now(timezone.utc).strftime("%Y%m%d%H%M")

TRAINING_OUTPUT_PATH = "gpt-bigcode-" + timenow

# Helper functions to get or create environments and compute targets

In [3]:
def get_or_create_compute_target(
    ml_client,
    compute_name,
    instance_type="STANDARD_DS3_v2",
    min_nodes=0,
    max_nodes=1,
    idle_time=300,
):
    """Return the name of a compute cluster, creating the cluster if needed.

    Args:
        ml_client: Workspace client exposing ``compute.get`` and
            ``compute.begin_create_or_update``.
        compute_name: Name of the compute target to look up or create.
        instance_type: VM size used when a new cluster has to be created.
        min_nodes: Forwarded as ``min_instances`` of the new cluster.
        max_nodes: Forwarded as ``max_instances`` of the new cluster.
        idle_time: Forwarded as ``idle_time_before_scale_down``.

    Returns:
        The name of the existing or newly created compute target.

    Raises:
        Whatever the creation operation raises if provisioning fails.
    """
    try:
        return ml_client.compute.get(compute_name).name
    except Exception:
        # Deliberately broad: any lookup failure is treated as "not found"
        # and triggers creation below.
        print(f"Creating a new {instance_type} compute target...")

    compute = AmlCompute(
        name=compute_name,
        size=instance_type,
        min_instances=min_nodes,
        max_instances=max_nodes,
        idle_time_before_scale_down=idle_time,
    )
    # begin_create_or_update only starts the operation and hands back a poller.
    # Block on it so the cluster exists (and provisioning errors surface here)
    # before any job is submitted against this compute name.
    ml_client.compute.begin_create_or_update(compute).result()
    return compute.name


def get_environment(
    environment_name, dependencies_dir, ml_client, gpu=False, dep_yaml=None
):
    """Return a registered environment, creating it when the lookup fails.

    Args:
        environment_name: Name of the environment in the workspace.
        dependencies_dir: Directory that contains the conda specification.
        ml_client: Workspace client exposing ``environments.get`` and
            ``environments.create_or_update``.
        gpu: Selects the CUDA base image when true, the CPU image otherwise.
        dep_yaml: File name of the conda specification inside
            ``dependencies_dir``. When omitted, the environment is created
            from the base image alone.

    Returns:
        The environment object returned by the workspace (existing or new).
    """
    try:
        # A lookup by name alone is rejected by the SDK (it needs a version or
        # a label), which previously sent every call down the creation path.
        # Ask for the latest registered version explicitly.
        return ml_client.environments.get(name=environment_name, label="latest")
    except Exception:
        # Deliberately broad: any lookup failure falls through to creation.
        pass

    if gpu:
        image = "mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04:latest"
    else:
        image = "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest"

    # dep_yaml defaults to None; joining a path with None would raise TypeError.
    conda_file = os.path.join(dependencies_dir, dep_yaml) if dep_yaml else None

    env = Environment(
        name=environment_name,
        description="Custom environment",
        conda_file=conda_file,
        image=image,
    )
    return ml_client.environments.create_or_update(env)

# Data Prep (Tokenization)

In [4]:
# CPU-only environment for the tokenization step, built from the conda file
# that lives in the data-prep source folder.
data_prep_environment = get_environment(
    DATA_PREP_ENV_NAME,
    DATA_PREP_DIR,
    ml_client,
    gpu=False,
    dep_yaml=ENV_FILENAME,
)

# Single-node cluster (min = max = 1) that runs the tokenization step.
data_prep_compute = get_or_create_compute_target(
    ml_client,
    DATA_PREP_COMPUTE_NAME,
    instance_type=DATA_PREP_INSTANCE_TYPE,
    min_nodes=1,
    max_nodes=1,
)

Creating a new Standard_DS3_v2 compute target...


In [19]:
# Base URI of the datastore; every input/output folder below hangs off it.
_prep_store_root = f"azureml://datastores/{DATASTORE_NAME}/paths"

# Read-only mounts for the raw train/eval splits and the tokenizer, plus the
# scalar settings forwarded to the script.
_prep_inputs = dict(
    train_data_dir=Input(
        type="uri_folder",
        path=f"{_prep_store_root}/{DATASTORE_DATA_PATH}/train",
        mode="ro_mount",
    ),
    eval_data_dir=Input(
        type="uri_folder",
        path=f"{_prep_store_root}/{DATASTORE_DATA_PATH}/eval",
        mode="ro_mount",
    ),
    max_seq_length=SEQUENCE_LENGTH,
    tokenizer_path=Input(
        type="uri_folder",
        path=f"{_prep_store_root}/{DATASTORE_TOKENIZER_PATH}",
        mode="ro_mount",
    ),
    num_proc=DATA_PREP_NUM_PROCESSES,
    samples_per_file=SAMPLES_PER_FILE,
)

# Writable mount that receives the tokenized dataset.
_prep_outputs = dict(
    output_dir=Output(
        type="uri_folder",
        path=f"{_prep_store_root}/{TOKENIZED_DATASET_PATH}",
        mode="rw_mount",
    ),
)

# Command line for the step. The trailing backslashes splice the literal into
# a single line; the ${{inputs.*}} / ${{outputs.*}} placeholders name the
# entries of the two dicts above.
_prep_cli = """python run.py \
            --train_data_dir ${{inputs.train_data_dir}} \
            --eval_data_dir ${{inputs.eval_data_dir}} \
            --max_seq_length ${{inputs.max_seq_length}} \
            --tokenizer_path ${{inputs.tokenizer_path}} \
            --num_proc ${{inputs.num_proc}} \
            --samples_per_file ${{inputs.samples_per_file}} \
            --output_dir ${{outputs.output_dir}} \
            """

data_prep_command = command(
    name="data_prep",
    display_name="Data preparation for pretraining",
    description="reads in json files, creates tokenized Dataset",
    inputs=_prep_inputs,
    outputs=_prep_outputs,
    # Source folder uploaded with the component.
    code="./data_prep",
    command=_prep_cli,
    environment="{}:{}".format(
        data_prep_environment.name, data_prep_environment.version
    ),
    compute=data_prep_compute,
)


# Train

In [37]:
# GPU environment for training, built from the conda file in the training
# source folder.
train_environment = get_environment(
    TRAIN_ENV_NAME,
    TRAIN_DIR,
    ml_client,
    gpu=True,
    dep_yaml=ENV_FILENAME,
)

# Single-node cluster (min = max = 1) that runs the training step.
train_compute = get_or_create_compute_target(
    ml_client,
    TRAIN_COMPUTE_NAME,
    instance_type=TRAIN_INSTANCE_TYPE,
    min_nodes=1,
    max_nodes=1,
)

In [47]:
# Base URI of the datastore used for the tokenizer input and the model output.
_train_store_root = f"azureml://datastores/{DATASTORE_NAME}/paths"

# `data_dir` has no path here; it is bound when the component is called inside
# the pipeline. The two config entries are passed through as plain strings.
_train_inputs = dict(
    data_dir=Input(type="uri_folder"),
    config_path=DATASTORE_MODEL_CONFIG_PATH,
    tokenizer_path=Input(
        type="uri_folder",
        path=f"{_train_store_root}/{DATASTORE_TOKENIZER_PATH}",
        mode="ro_mount",
    ),
    deepspeed_config_path=DEEPSPEED_CONFIG,
)

# Writable mount for the training outputs; the folder name carries the run
# timestamp.
_train_outputs = dict(
    output_dir=Output(
        type="uri_folder",
        path=f"{_train_store_root}/{TRAINING_OUTPUT_PATH}",
        mode="rw_mount",
    ),
)

# Launch line: one node, four worker processes. The trailing backslashes splice
# the literal into a single line; the ${{inputs.*}} / ${{outputs.*}}
# placeholders name the entries of the two dicts above.
_train_cli = """torchrun --nnodes 1 --nproc_per_node 4 run.py \
            --data_dir ${{inputs.data_dir}} \
            --config_path ${{inputs.config_path}} \
            --tokenizer_path ${{inputs.tokenizer_path}} \
            --do_train \
            --do_eval \
            --evaluation_strategy epoch \
            --save_strategy epoch \
            --logging_steps 25 \
            --per_device_train_batch_size 1 \
            --per_device_eval_batch_size 1 \
            --learning_rate 3e-5 \
            --num_train_epochs 10 \
            --weight_decay 0.01 \
            --optim adamw_torch \
            --warmup_steps 100 \
            --fp16 \
            --output_dir ${{outputs.output_dir}} \
            --logging_dir ${{outputs.output_dir}} \
            --dataloader_num_workers 4 \
            --gradient_checkpointing True \
            --gradient_accumulation_steps 1 \
            --seed 42 \
            --report_to mlflow \
            --deepspeed ${{inputs.deepspeed_config_path}}
            """

train_command = command(
    name="train",
    display_name="Train CLM model",
    description="Trains using CLM objective on tokenized data",
    inputs=_train_inputs,
    outputs=_train_outputs,
    # Source folder uploaded with the component.
    code="./train",
    command=_train_cli,
    environment="{}:{}".format(train_environment.name, train_environment.version),
    compute=train_compute,
)


In [48]:
# Two-step pipeline: run the data-preparation component, then train on the
# folder it produced.
@dsl.pipeline(
    description="Pretraining decoder-only model using Deepspeed",
    display_name=f"Deepspeed Pretraining",
)
def pipeline_func():

    # Step 1: data preparation; all of its inputs are already bound on the
    # component definition, so it is called without arguments.
    data_prep_job = data_prep_command()

    # Step 2: training, wired to the output folder of step 1.
    # NOTE(review): `train_job` is never read afterwards. The SDK may derive
    # pipeline node names from these local variable names - confirm before
    # renaming or removing either assignment.
    train_job = train_command(
        data_dir=data_prep_job.outputs.output_dir,
    )

    # Only the data-prep output is exposed as a pipeline-level output; the
    # training output is not returned here.
    return {
        "pipeline_job_train_data": data_prep_job.outputs.output_dir,
    }

# Build the pipeline definition from the decorated function.
pipeline = pipeline_func()

# Submit it to the workspace; the experiment name carries the run timestamp.
experiment_name = f"deepspeed{timenow}"
pipeline_job = ml_client.jobs.create_or_update(pipeline, experiment_name=experiment_name)

# Register model

Uploading train (0.01 MBs): 100%|██████████| 10551/10551 [00:00<00:00, 26083.73it/s]




In [45]:
# Show the submitted job's summary (experiment, name, type, status, Studio link).
pipeline_job

Experiment,Name,Type,Status,Details Page
deepspeed202306060411,keen_yogurt_7grjwmzvs8,pipeline,Preparing,Link to Azure Machine Learning studio
