### Train LSTM

Reference: https://learn.microsoft.com/en-us/azure/machine-learning/how-to-train-pytorch?view=azureml-api-2

In [8]:
# Import necessary libraries
import os
import urllib.parse
from datetime import datetime

from dotenv import load_dotenv

# Load the .env file so that the os.getenv(...) lookups in the later cells can
# see the settings stored in it. The call is the last expression of the cell,
# so its return value is what the cell displays.
load_dotenv()


True

In [9]:
# Azure ML specific imports
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import AmlCompute
from azure.ai.ml import command

# Configure the Azure workspace and authentication.
# The three workspace identifiers are read from the process environment.
subscription_id = os.getenv("AZURE_SUBSCRIPTION_ID")
resource_group_name = os.getenv("AZURE_RESOURCE_GROUP")
workspace_name = os.getenv("AZURE_WORKSPACE_NAME")

# Fail fast with a readable message instead of handing None to MLClient and
# discovering the problem later as an unrelated-looking service error.
missing_settings = [
    env_name
    for env_name, env_value in (
        ("AZURE_SUBSCRIPTION_ID", subscription_id),
        ("AZURE_RESOURCE_GROUP", resource_group_name),
        ("AZURE_WORKSPACE_NAME", workspace_name),
    )
    if not env_value
]
if missing_settings:
    raise ValueError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# Client used by every later cell to talk to the workspace.
credential = DefaultAzureCredential()
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group_name,
    workspace_name=workspace_name,
)

# Create or get the GPU cluster
gpu_compute_target = "gpucluteruk"
try:
    # Look the cluster up by name first; when the lookup succeeds it is reused untouched.
    gpu_cluster = ml_client.compute.get(gpu_compute_target)
    print(f"You already have a cluster named {gpu_compute_target}, we'll reuse it as is.")
except Exception:
    # NOTE(review): every failure of the lookup above (not only "cluster does
    # not exist") ends up here and triggers a creation attempt - confirm that
    # this is intended.
    print("Creating a new GPU compute target...")
    cluster_settings = dict(
        name=gpu_compute_target,
        type="amlcompute",
        size="Standard_ND96amsr_A100_v4",
        min_instances=1,
        max_instances=4,
        idle_time_before_scale_down=180,
        tier="Dedicated",
    )
    # .result() waits for the create/update operation and yields the cluster object.
    gpu_cluster = ml_client.begin_create_or_update(AmlCompute(**cluster_settings)).result()
# Printed on both paths (reuse and creation).
print(f"AMLCompute with name {gpu_cluster.name} is created, the compute size is {gpu_cluster.size}")


You already have a cluster named gpucluteruk, we'll reuse it as is.
AMLCompute with name gpucluteruk is created, the compute size is Standard_ND96amsr_A100_v4


In [10]:
# Azure ML environment and job setup
custom_env_name = "custom-acpt-pytorch-113-cuda117:10"

# Settings copied from this notebook's environment and handed to the job as
# environment variables, keyed by the same names.
# NOTE(review): this includes AZURE_STORAGE_KEY; presumably the value becomes
# part of the submitted job definition - confirm that this is acceptable.
env_vars = {
    setting: os.getenv(setting)
    for setting in (
        'AZURE_STORAGE_ACCOUNT',
        'AZURE_STORAGE_KEY',
        'BLOB_CONTAINER',
        'PRETRAINED_CNN_MODEL_URI',
    )
}

# Common variables
# k and i are forwarded to the training script as --k / --i; presumably the
# number of folds and the fold index - confirm against the training script.
network, dataset = "lstm_multiclass", "ccccii"
k, i = 5, 0
num_epochs, batch_size = 20, 16
learning_rate = 0.0005
max_samples = 0

# Job inputs, referenced from the job command line as ${{inputs.<name>}}.
inputs = dict(
    network=network,
    dataset=dataset,
    k=k,
    i=i,
    num_epochs=num_epochs,
    learning_rate=learning_rate,
    batch_size=batch_size,
    max_samples=max_samples,
)


def get_display_name(base_name):
    """Return ``base_name`` followed by a space and the current local time.

    The timestamp is formatted as ``YYYY-MM-DD HH:MM:SS``, so two calls made
    in different seconds produce different names.
    """
    timestamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    return "{} {}".format(base_name, timestamp)

experiment_name = "lstm_multiclass_kfold"

# Build a short label for the pretrained CNN from path components 3 and 5 of
# its URI (every 'dcid.' occurrence is stripped from the former). Validate the
# setting first so a missing or oddly shaped URI gives a readable error.
pretrained_cnn_model_uri = os.getenv('PRETRAINED_CNN_MODEL_URI')
if not pretrained_cnn_model_uri:
    raise ValueError("PRETRAINED_CNN_MODEL_URI is not set; cannot derive the pretrained-model tag.")
parsed_url = urllib.parse.urlparse(pretrained_cnn_model_uri)
path_parts = parsed_url.path.split('/')
if len(path_parts) < 6:
    raise ValueError(f"Unexpected PRETRAINED_CNN_MODEL_URI path: {parsed_url.path!r}")
pre_trained_model = path_parts[3].replace('dcid.', '') + '/' + path_parts[5]

# Tags: the pretrained-model label plus a stringified copy of every job input
# (which already covers 'k' and 'i').
tags = {'pretrained_model': pre_trained_model}
tags.update({key: str(value) for key, value in inputs.items()})

# NOTE(review): inputs['network'] is declared as a job input but is not
# referenced in the command line below.
job = command(
    inputs=inputs,
    compute=gpu_compute_target,
    environment=custom_env_name,
    code="../",  # location of source code
    command=(
        "python -m scripts.train.train_lstm_multiclass --run_cloud"
        " --dataset ${{inputs.dataset}}"
        " --k ${{inputs.k}}"
        " --i ${{inputs.i}}"
        " --num_epochs ${{inputs.num_epochs}}"
        " --batch_size ${{inputs.batch_size}}"
        " --learning_rate ${{inputs.learning_rate}}"
        " --max_samples ${{inputs.max_samples}}"
    ),
    environment_variables=env_vars,
    experiment_name=experiment_name,
    display_name=get_display_name(f"{experiment_name}_{k}_{i}"),
    tags=tags,
)

# Submit the job; the returned job object is the cell's displayed output.
ml_client.jobs.create_or_update(job)


Experiment,Name,Type,Status,Details Page
lstm_multiclass_kfold,crimson_foot_gxmh6gw0g2,command,Starting,Link to Azure Machine Learning studio


In [11]:
# Submission for i = 1. network, dataset and the hyperparameters are the
# "Common variables" defined in an earlier cell; only `i` differs here.
k = 5
i = 1
inputs = {
    'network': network,
    'dataset': dataset,
    'k': k,
    'i': i,
    'num_epochs': num_epochs,
    'learning_rate': learning_rate,
    'batch_size': batch_size,
    'max_samples': max_samples,
}
experiment_name = "lstm_multiclass_kfold"

# Short label for the pretrained CNN: path components 3 and 5 of its URI,
# with every 'dcid.' occurrence stripped from the former.
parsed_url = urllib.parse.urlparse(os.getenv('PRETRAINED_CNN_MODEL_URI'))
path_parts = parsed_url.path.split('/')
pre_trained_model = path_parts[3].replace('dcid.', '') + '/' + path_parts[5]

# Tags: the pretrained-model label plus a stringified copy of every job input
# (which already covers 'k' and 'i').
tags = {'pretrained_model': pre_trained_model}
tags.update({key: str(value) for key, value in inputs.items()})

job = command(
    inputs=inputs,
    compute=gpu_compute_target,
    environment=custom_env_name,
    code="../",  # location of source code
    command=(
        "python -m scripts.train.train_lstm_multiclass --run_cloud"
        " --dataset ${{inputs.dataset}}"
        " --k ${{inputs.k}}"
        " --i ${{inputs.i}}"
        " --num_epochs ${{inputs.num_epochs}}"
        " --batch_size ${{inputs.batch_size}}"
        " --learning_rate ${{inputs.learning_rate}}"
        " --max_samples ${{inputs.max_samples}}"
    ),
    environment_variables=env_vars,
    experiment_name=experiment_name,
    display_name=get_display_name(f"{experiment_name}_{k}_{i}"),
    tags=tags,
)
# Submit the job; the returned job object is the cell's displayed output.
ml_client.jobs.create_or_update(job)

Experiment,Name,Type,Status,Details Page
lstm_multiclass_kfold,heroic_plastic_qcxqpyn34b,command,Starting,Link to Azure Machine Learning studio


In [12]:
# Submission for i = 2. network, dataset and the hyperparameters are the
# "Common variables" defined in an earlier cell; only `i` differs here.
k = 5
i = 2
inputs = {
    'network': network,
    'dataset': dataset,
    'k': k,
    'i': i,
    'num_epochs': num_epochs,
    'learning_rate': learning_rate,
    'batch_size': batch_size,
    'max_samples': max_samples,
}
experiment_name = "lstm_multiclass_kfold"

# Short label for the pretrained CNN: path components 3 and 5 of its URI,
# with every 'dcid.' occurrence stripped from the former.
parsed_url = urllib.parse.urlparse(os.getenv('PRETRAINED_CNN_MODEL_URI'))
path_parts = parsed_url.path.split('/')
pre_trained_model = path_parts[3].replace('dcid.', '') + '/' + path_parts[5]

# Tags: the pretrained-model label plus a stringified copy of every job input
# (which already covers 'k' and 'i').
tags = {'pretrained_model': pre_trained_model}
tags.update({key: str(value) for key, value in inputs.items()})

job = command(
    inputs=inputs,
    compute=gpu_compute_target,
    environment=custom_env_name,
    code="../",  # location of source code
    command=(
        "python -m scripts.train.train_lstm_multiclass --run_cloud"
        " --dataset ${{inputs.dataset}}"
        " --k ${{inputs.k}}"
        " --i ${{inputs.i}}"
        " --num_epochs ${{inputs.num_epochs}}"
        " --batch_size ${{inputs.batch_size}}"
        " --learning_rate ${{inputs.learning_rate}}"
        " --max_samples ${{inputs.max_samples}}"
    ),
    environment_variables=env_vars,
    experiment_name=experiment_name,
    display_name=get_display_name(f"{experiment_name}_{k}_{i}"),
    tags=tags,
)
# Submit the job; the returned job object is the cell's displayed output.
ml_client.jobs.create_or_update(job)

Experiment,Name,Type,Status,Details Page
lstm_multiclass_kfold,lucid_owl_f0nk241yng,command,Starting,Link to Azure Machine Learning studio


In [13]:
# Submission for i = 3. network, dataset and the hyperparameters are the
# "Common variables" defined in an earlier cell; only `i` differs here.
k = 5
i = 3
inputs = {
    'network': network,
    'dataset': dataset,
    'k': k,
    'i': i,
    'num_epochs': num_epochs,
    'learning_rate': learning_rate,
    'batch_size': batch_size,
    'max_samples': max_samples,
}
experiment_name = "lstm_multiclass_kfold"

# Short label for the pretrained CNN: path components 3 and 5 of its URI,
# with every 'dcid.' occurrence stripped from the former.
parsed_url = urllib.parse.urlparse(os.getenv('PRETRAINED_CNN_MODEL_URI'))
path_parts = parsed_url.path.split('/')
pre_trained_model = path_parts[3].replace('dcid.', '') + '/' + path_parts[5]

# Tags: the pretrained-model label plus a stringified copy of every job input
# (which already covers 'k' and 'i').
tags = {'pretrained_model': pre_trained_model}
tags.update({key: str(value) for key, value in inputs.items()})

job = command(
    inputs=inputs,
    compute=gpu_compute_target,
    environment=custom_env_name,
    code="../",  # location of source code
    command=(
        "python -m scripts.train.train_lstm_multiclass --run_cloud"
        " --dataset ${{inputs.dataset}}"
        " --k ${{inputs.k}}"
        " --i ${{inputs.i}}"
        " --num_epochs ${{inputs.num_epochs}}"
        " --batch_size ${{inputs.batch_size}}"
        " --learning_rate ${{inputs.learning_rate}}"
        " --max_samples ${{inputs.max_samples}}"
    ),
    environment_variables=env_vars,
    experiment_name=experiment_name,
    display_name=get_display_name(f"{experiment_name}_{k}_{i}"),
    tags=tags,
)
# Submit the job; the returned job object is the cell's displayed output.
ml_client.jobs.create_or_update(job)

Experiment,Name,Type,Status,Details Page
lstm_multiclass_kfold,plum_grass_3bwdzfbz20,command,Starting,Link to Azure Machine Learning studio


In [14]:
# Submission for i = 4. network, dataset and the hyperparameters are the
# "Common variables" defined in an earlier cell; only `i` differs here.
k = 5
i = 4
inputs = {
    'network': network,
    'dataset': dataset,
    'k': k,
    'i': i,
    'num_epochs': num_epochs,
    'learning_rate': learning_rate,
    'batch_size': batch_size,
    'max_samples': max_samples,
}
experiment_name = "lstm_multiclass_kfold"

# Short label for the pretrained CNN: path components 3 and 5 of its URI,
# with every 'dcid.' occurrence stripped from the former.
parsed_url = urllib.parse.urlparse(os.getenv('PRETRAINED_CNN_MODEL_URI'))
path_parts = parsed_url.path.split('/')
pre_trained_model = path_parts[3].replace('dcid.', '') + '/' + path_parts[5]

# Tags: the pretrained-model label plus a stringified copy of every job input
# (which already covers 'k' and 'i').
tags = {'pretrained_model': pre_trained_model}
tags.update({key: str(value) for key, value in inputs.items()})

job = command(
    inputs=inputs,
    compute=gpu_compute_target,
    environment=custom_env_name,
    code="../",  # location of source code
    command=(
        "python -m scripts.train.train_lstm_multiclass --run_cloud"
        " --dataset ${{inputs.dataset}}"
        " --k ${{inputs.k}}"
        " --i ${{inputs.i}}"
        " --num_epochs ${{inputs.num_epochs}}"
        " --batch_size ${{inputs.batch_size}}"
        " --learning_rate ${{inputs.learning_rate}}"
        " --max_samples ${{inputs.max_samples}}"
    ),
    environment_variables=env_vars,
    experiment_name=experiment_name,
    display_name=get_display_name(f"{experiment_name}_{k}_{i}"),
    tags=tags,
)
# Submit the job; the returned job object is the cell's displayed output.
ml_client.jobs.create_or_update(job)

Experiment,Name,Type,Status,Details Page
lstm_multiclass_kfold,red_curtain_y0tv6rgvzs,command,Starting,Link to Azure Machine Learning studio
