### Train CNN LSTM

Reference: https://learn.microsoft.com/en-us/azure/machine-learning/how-to-train-pytorch?view=azureml-api-2

In [11]:
# Configure the workspace and authentication

from dotenv import load_dotenv
import os
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Load the .env file so the AZURE_* settings below can be read with os.getenv
load_dotenv()


def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here names the
            missing setting instead of silently handing ``None`` to MLClient.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "add it to the .env file or export it before running this notebook."
        )
    return value


# Workspace coordinates; all three are passed to MLClient below
subscription_id = _require_env("AZURE_SUBSCRIPTION_ID")
resource_group_name = _require_env("AZURE_RESOURCE_GROUP")
workspace_name = _require_env("AZURE_WORKSPACE_NAME")

credential = DefaultAzureCredential()

# Get a handle to the workspace
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group_name,
    workspace_name=workspace_name,
)

In [12]:
# Create gpu cluster if it doesn't exist

from azure.ai.ml.entities import AmlCompute

# Name of the compute cluster the training job is submitted to.
# Alternative cluster names kept for reference:
#   "gpuclutereastus2", "gpuclutercentralindia", "gpuclutereastus"
gpu_compute_target = "gpucluteruk"

# NOTE(review): any exception raised by the lookup (not only "cluster not found")
# is treated as "cluster missing" and falls through to creation — confirm that
# this is intended, since creation provisions an expensive VM size.
try:
    # Look the cluster up first so that an existing one is reused unchanged
    gpu_cluster = ml_client.compute.get(gpu_compute_target)
except Exception:
    print("Creating a new gpu compute target...")

    # Intended parameters of the Azure ML compute object
    cluster_settings = dict(
        # Name assigned to the compute cluster
        name=gpu_compute_target,
        # Azure ML Compute is the on-demand VM service
        type="amlcompute",
        # VM family. Options noted by the author:
        #   Standard_ND96amsr_A100_v4 (A100 80vRAM)
        #   Standard_ND40rs_v2 (V100 32vRAM)
        #   STANDARD_NC6s_v3
        size="Standard_ND96amsr_A100_v4",
        # Minimum number of nodes kept running when no job is running
        min_instances=1,
        # Maximum number of nodes in the cluster
        max_instances=4,
        # Seconds a node keeps running after a job terminates
        idle_time_before_scale_down=180,
        # Dedicated or LowPriority; the latter is cheaper but jobs may be terminated
        tier="Dedicated",
    )

    # Hand the compute object to MLClient and wait for provisioning to finish
    creation_poller = ml_client.begin_create_or_update(AmlCompute(**cluster_settings))
    gpu_cluster = creation_poller.result()
else:
    print(
        f"You already have a cluster named {gpu_compute_target}, we'll reuse it as is."
    )

# Printed in both the reuse and the creation case
cluster_summary = f"AMLCompute with name {gpu_cluster.name} is created, the compute size is {gpu_cluster.size}"
print(cluster_summary)

You already have a cluster named gpucluteruk, we'll reuse it as is.
AMLCompute with name gpucluteruk is created, the compute size is Standard_ND96amsr_A100_v4


In [13]:
# Environment variables handed to the training job; each value is copied from
# the variable of the same name in the local environment (None when unset).
env_vars = {
    var_name: os.getenv(var_name)
    for var_name in ("AZURE_STORAGE_ACCOUNT", "AZURE_STORAGE_KEY", "BLOB_CONTAINER")
}

# Environment the job runs in — presumably "<name>:<version>"; verify against the workspace
# Reference: https://learn.microsoft.com/en-us/azure/machine-learning/how-to-azure-container-for-pytorch-environment?view=azureml-api-2
custom_env_name = "custom-acpt-pytorch-113-cuda117:9"

In [14]:
# Training job

from azure.ai.ml import command
from azure.ai.ml import Input
from azure.ai.ml import PyTorchDistribution

# Known dataset identifiers; only CCCCII_DATASET_NAME is used by the job below
MOSMED_DATASET_NAME = 'mosmed'
COVIDCTMD_DATASET_NAME = 'covidctmd'
LUNA16_DATASET_NAME = 'luna16'
CCCCII_DATASET_NAME = 'ccccii'

# Job inputs. They are substituted into the command line through the
# ${{inputs.<name>}} placeholders and are also recorded as tags on the job.
# 'network' is not referenced by the command; it only ends up in the tags.
inputs = dict(
    network="cnn2d_multiclass",
    dataset=CCCCII_DATASET_NAME,
    k=5,  # presumably the number of folds — TODO confirm against the training script
    i=0,  # presumably the index of the fold to train — TODO confirm
    num_epochs=20,
    learning_rate=0.0005,
    batch_size=16,
)

# Command line run on the cluster; adjacent literals concatenate into one string
train_command = (
    "python -m scripts.train.train_cnn2d_multiclass"
    " --dataset ${{inputs.dataset}}"
    " --k ${{inputs.k}}"
    " --i ${{inputs.i}}"
    " --num_epochs ${{inputs.num_epochs}}"
    " --batch_size ${{inputs.batch_size}}"
    " --learning_rate ${{inputs.learning_rate}}"
)

# Tags must be strings, so every input value is converted with str()
job_tags = {input_name: str(setting) for input_name, setting in inputs.items()}

job = command(
    code="../",  # location of source code
    command=train_command,
    inputs=inputs,
    environment=custom_env_name,
    environment_variables=env_vars,
    compute=gpu_compute_target,
    # NOTE(review): "tran" looks like a typo for "train"; left unchanged because
    # renaming would place new runs in a different experiment — confirm before fixing.
    experiment_name="tran_cnn_2d_multiclass",
    display_name="train-cnn-2d-multiclass",
    tags=job_tags,
)

In [15]:
# Submit the job to the workspace; the returned job object is the cell's last
# expression so the notebook displays it.
submitted_job = ml_client.jobs.create_or_update(job)
submitted_job

Experiment,Name,Type,Status,Details Page
tran_cnn_2d_multiclass,helpful_feather_t01xb24yhw,command,Starting,Link to Azure Machine Learning studio
