In [43]:
import os
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F
import ray
from ray import tune
from ray.tune import Callback
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.mlflow import MLflowLoggerCallback
from ray.util.dask import ray_dask_get
from ray_on_aml.core import Ray_On_AML
import dask
import dask.array as da
import dask.dataframe as dd
from adlfs import AzureBlobFileSystem
from azureml.core import Run
import mlflow
from azureml.core import Workspace, Experiment, Environment, Datastore, Dataset, ScriptRunConfig
from azureml.core.runconfig import PyTorchConfiguration
# from azureml.widgets import RunDetails
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import PyTorchConfiguration
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies
from IPython.display import clear_output
import time
import platform
import sys
import importlib


# Make Dask execute its task graphs through the Ray-backed scheduler imported above.
dask.config.set(scheduler=ray_dask_get)

# Module-level accumulator: captureMetrics.on_trial_result appends every
# reported 'mean_accuracy' value to this list.
accList = []

class ConvNet(nn.Module):
    """Small fixed-architecture classifier: one 3x3 convolution plus one linear layer.

    The architecture is deliberately not part of the hyperparameter search;
    only the optimizer settings are tuned in this example.
    """

    def __init__(self):
        super().__init__()
        # 1 input channel -> 3 feature maps.
        self.conv1 = nn.Conv2d(1, 3, kernel_size=3)
        # 192 flattened features (3 x 8 x 8 for a 1x28x28 input) -> 10 class scores.
        self.fc = nn.Linear(192, 10)

    def forward(self, x):
        """Return per-class log-probabilities (log-softmax over dim 1) for batch ``x``."""
        pooled = F.max_pool2d(self.conv1(x), 3)
        activated = F.relu(pooled)
        # Flatten every sample to the 192 features expected by the linear layer.
        flattened = activated.view(-1, 192)
        logits = self.fc(flattened)
        return F.log_softmax(logits, dim=1)


class captureMetrics(Callback):
    """Tune callback that collects every reported ``mean_accuracy`` in ``accList``."""

    def on_trial_result(self, iteration, trials, trial, result, **info):
        # Reads the metric straight from the result dict (KeyError if it is
        # absent) and stores it in the module-level accumulator.
        latest_accuracy = result['mean_accuracy']
        accList.append(latest_accuracy)


# Change these values if you want the training to run quicker or slower.
# EPOCH_SIZE / TEST_SIZE bound how far train() / test() iterate over their
# loaders: each loop stops once batch_idx * len(batch) exceeds the value.
EPOCH_SIZE = 512
TEST_SIZE = 256
# Output directory used by train_mnist, relative to the current working directory.
OUTPUTPATH = './outputs'

def train(model, optimizer, train_loader):
    """Run one truncated training pass of ``model`` over ``train_loader``.

    Args:
        model: network to optimise; switched to training mode here.
        optimizer: optimizer stepping ``model``'s parameters.
        train_loader: iterable yielding ``(data, target)`` batches.

    The pass stops as soon as ``batch_idx * len(batch)`` exceeds ``EPOCH_SIZE``
    so the example runs quickly. Batches are moved to CUDA when it is
    available, otherwise to the CPU. Returns ``None``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        # Early exit keeps each "epoch" short for demonstration purposes.
        if step * len(inputs) > EPOCH_SIZE:
            break
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        predictions = model(inputs)
        batch_loss = F.nll_loss(predictions, labels)
        batch_loss.backward()
        optimizer.step()


def test(model, data_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(data_loader):
            # We set this just for the example to run quickly.
            if batch_idx * len(data) > TEST_SIZE:
                break
            data, target = data.to(device), target.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    return correct / total


def train_mnist(config):
    """Tune trainable: fit ``ConvNet`` on MNIST and report accuracy each iteration.

    Args:
        config: dict of sampled hyperparameters; reads ``config["lr"]`` and
            ``config["momentum"]`` for the SGD optimizer.

    Runs 10 train/evaluate iterations, reporting ``mean_accuracy`` to Tune after
    each one. At iterations 0 and 5 the model's ``state_dict`` is written to
    ``<OUTPUTPATH>/<iteration>/model.pth`` — the path that is printed — relative
    to the current working directory.
    """
    # Data Setup
    mnist_transforms = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    train_loader = DataLoader(
        datasets.MNIST("~/data", train=True, download=True, transform=mnist_transforms),
        batch_size=64,
        shuffle=True)
    test_loader = DataLoader(
        datasets.MNIST("~/data", train=False, transform=mnist_transforms),
        batch_size=64,
        shuffle=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = ConvNet()
    model.to(device)

    optimizer = optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"])

    # Create the output root once, up front; print the working directory the
    # first time so it is clear where the relative paths below resolve.
    if not os.path.exists(OUTPUTPATH):
        os.makedirs(OUTPUTPATH, exist_ok=True)
        print(os.getcwd())

    for i in range(10):
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)

        # Send the current training result back to Tune
        tune.report(mean_accuracy=acc)

        if i % 5 == 0:
            # Save to the per-iteration path that is printed. Previously the
            # file was always written to './model.pth', so the printed path
            # never existed and each checkpoint overwrote the previous one.
            checkpoint_dir = os.path.join(OUTPUTPATH, str(i))
            os.makedirs(checkpoint_dir, exist_ok=True)
            filename = os.path.join(checkpoint_dir, 'model.pth')
            print(filename)
            torch.save(model.state_dict(), filename)


# Hyperparameter search space passed to Tune; each trial's ``config`` is drawn from it.
search_space = dict(
    # lr = 10 ** (-10 * u) with u drawn from np.random.rand().
    lr=tune.sample_from(lambda spec: 10 ** (-10 * np.random.rand())),
    # SGD momentum sampled uniformly between 0.01 and 0.09.
    momentum=tune.uniform(0.01, 0.09),
)


In [47]:
# Point MLflow at the Azure ML workspace's tracking URI and select the
# experiment that the interactive run belongs to.
# NOTE(review): relies on `ws` and `run`, which are only created in cells that
# appear further down in this notebook (Workspace.from_config() and
# exp.start_logging()); those cells must have been executed first.
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment(run.experiment.name)

<Experiment: artifact_location='', experiment_id='68ed74d1-7579-44d2-879b-2c609da66ca5', lifecycle_stage='active', name='ray-on-aml-test', tags={}>

In [48]:
# Alternative way to obtain `ray` (kept for reference):
#   ray_on_aml = Ray_On_AML()
#   ray = ray_on_aml.getRay()
# NOTE(review): `ray` is also imported as a module at the top of the notebook,
# and a module object is always truthy, so the worker branch below only runs
# if `ray` has been rebound to a falsy value — confirm the intended check.
if not ray:
    print("in worker node")
else:
    # Head node: drive the experiment from here.
    print("head node detected")

    # Download MNIST into ~/data on this node before the trials start.
    datasets.MNIST("~/data", train=True, download=True)

    # Demonstrate parallel hyperparameter tuning, logging trials to MLflow.
    # Alternative kept for reference: pass callbacks=[captureMetrics()] and then
    # log the collected values with run.log_list('acc', accList).
    analysis = tune.run(
        train_mnist,
        config=search_space,
        callbacks=[
            MLflowLoggerCallback(
                experiment_name=run.experiment.name,
                tags={"Framework":"Ray 1.9.1"},
                save_artifact=True,
            )
        ],
    )


head node detected


Trial name,status,loc,lr,momentum
train_mnist_83579_00000,PENDING,,3.56896e-08,0.0612082


Result for train_mnist_83579_00000:
  date: 2022-01-07_05-56-12
  done: false
  experiment_id: 77839913ffe245f5aed9eca1e932d3d7
  hostname: hyssh1
  iterations_since_restore: 1
  mean_accuracy: 0.090625
  node_ip: 10.1.0.5
  pid: 28277
  time_since_restore: 0.5926167964935303
  time_this_iter_s: 0.5926167964935303
  time_total_s: 0.5926167964935303
  timestamp: 1641534972
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '83579_00000'
  


Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_mnist_83579_00000,RUNNING,10.1.0.5:28277,3.56896e-08,0.0612082,0.090625,1,0.592617


[2m[36m(ImplicitFunc pid=28277)[0m /home/azureuser/ray_results/train_mnist_2022-01-07_05-56-10/train_mnist_83579_00000_0_lr=3.569e-08,momentum=0.061208_2022-01-07_05-56-10
[2m[36m(ImplicitFunc pid=28277)[0m ./outputs/0/model.pth
Result for train_mnist_83579_00000:
  date: 2022-01-07_05-56-20
  done: false
  experiment_id: 77839913ffe245f5aed9eca1e932d3d7
  hostname: hyssh1
  iterations_since_restore: 3
  mean_accuracy: 0.09375
  node_ip: 10.1.0.5
  pid: 28277
  time_since_restore: 8.11816668510437
  time_this_iter_s: 2.7435052394866943
  time_total_s: 8.11816668510437
  timestamp: 1641534980
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: '83579_00000'
  


Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_mnist_83579_00000,RUNNING,10.1.0.5:28277,3.56896e-08,0.0612082,0.09375,3,8.11817


Result for train_mnist_83579_00000:
  date: 2022-01-07_05-56-27
  done: false
  experiment_id: 77839913ffe245f5aed9eca1e932d3d7
  hostname: hyssh1
  iterations_since_restore: 5
  mean_accuracy: 0.0625
  node_ip: 10.1.0.5
  pid: 28277
  time_since_restore: 15.188915729522705
  time_this_iter_s: 3.0587925910949707
  time_total_s: 15.188915729522705
  timestamp: 1641534987
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: '83579_00000'
  


Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_mnist_83579_00000,RUNNING,10.1.0.5:28277,3.56896e-08,0.0612082,0.0625,5,15.1889


[2m[36m(ImplicitFunc pid=28277)[0m ./outputs/5/model.pth
Result for train_mnist_83579_00000:
  date: 2022-01-07_05-56-33
  done: false
  experiment_id: 77839913ffe245f5aed9eca1e932d3d7
  hostname: hyssh1
  iterations_since_restore: 7
  mean_accuracy: 0.078125
  node_ip: 10.1.0.5
  pid: 28277
  time_since_restore: 21.44170045852661
  time_this_iter_s: 2.3324499130249023
  time_total_s: 21.44170045852661
  timestamp: 1641534993
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: '83579_00000'
  


Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_mnist_83579_00000,RUNNING,10.1.0.5:28277,3.56896e-08,0.0612082,0.078125,7,21.4417


Result for train_mnist_83579_00000:
  date: 2022-01-07_05-56-38
  done: false
  experiment_id: 77839913ffe245f5aed9eca1e932d3d7
  hostname: hyssh1
  iterations_since_restore: 9
  mean_accuracy: 0.078125
  node_ip: 10.1.0.5
  pid: 28277
  time_since_restore: 26.76071000099182
  time_this_iter_s: 2.6184041500091553
  time_total_s: 26.76071000099182
  timestamp: 1641534998
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: '83579_00000'
  


Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_mnist_83579_00000,RUNNING,10.1.0.5:28277,3.56896e-08,0.0612082,0.078125,9,26.7607


Result for train_mnist_83579_00000:
  date: 2022-01-07_05-56-41
  done: true
  experiment_id: 77839913ffe245f5aed9eca1e932d3d7
  experiment_tag: 0_lr=3.569e-08,momentum=0.061208
  hostname: hyssh1
  iterations_since_restore: 10
  mean_accuracy: 0.096875
  node_ip: 10.1.0.5
  pid: 28277
  time_since_restore: 29.490155935287476
  time_this_iter_s: 2.7294459342956543
  time_total_s: 29.490155935287476
  timestamp: 1641535001
  timesteps_since_restore: 0
  training_iteration: 10
  trial_id: '83579_00000'
  


Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_mnist_83579_00000,TERMINATED,10.1.0.5:28277,3.56896e-08,0.0612082,0.096875,10,29.4902


In [5]:
# Attach this notebook to a Ray cluster on the Azure ML compute cluster
# "worker-cpu-v3" (up to 5 nodes), using the workspace from the local config.
from ray_on_aml.core import Ray_On_AML
ws = Workspace.from_config()
ray_on_aml = Ray_On_AML(
    ws=ws,
    compute_cluster="worker-cpu-v3",
    # 'scikit-learn' is the actual PyPI distribution name; the former 'sklearn'
    # alias is a deprecated placeholder that no longer installs.
    additional_pip_packages=['torch==1.10.0', 'torchvision', 'scikit-learn'],
    maxnode=5,
)
# NOTE: this rebinds the name `ray`, shadowing any earlier `import ray`.
ray = ray_on_aml.getRay()

Cancel active AML runs if any
Shutting down ray if any
Found existing cluster worker-cpu-v3
Using azureml_py38 for the master node
Waiting for cluster to start
...................................

In [11]:
# Inspection only: this is an attribute access, not a call, so the cell just
# displays the `ray.runtime_context` module object.
ray.runtime_context

<module 'ray.runtime_context' from '/anaconda/envs/azureml_py38/lib/python3.8/site-packages/ray/runtime_context.py'>

In [19]:
# Handle to the Azure ML experiment named 'ray-on-aml-test' in workspace `ws`.
exp = Experiment(ws, 'ray-on-aml-test')

In [20]:
# Start an interactive run; other cells read `run.experiment.name` from it
# for the MLflow experiment name.
run = exp.start_logging()

In [26]:
# Display the id of the interactive run started above.
run.id

'5d13c4ab-79ab-41b1-8fb8-ee7c7006b4cd'

In [41]:
# NOTE(review): `model` exists only as a local variable inside train_mnist, so
# at notebook scope this cell raises NameError (see the recorded output).
# Leftover debugging cell — remove before sharing.
model

NameError: name 'model' is not defined