In [2]:
import azureml.core
from azureml.core.compute import ComputeTarget, BatchAiCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import EnvironmentDefinition
from azureml.train.estimator import Estimator
from azureml.core.workspace import Workspace
from azureml.core import Experiment
from azureml.train.widgets import RunDetails

import json
import sys
import os
import shutil

# Put the local 'src' directory on the import path before importing the
# project helpers below (registry / batch_ai presumably live there — confirm).
sys.path.append('src')

from registry import azure_container_registry_for
import batch_ai


In [3]:
# Report the installed Azure ML SDK version so the recorded results can be
# tied to a specific SDK release.
print("SDK version:", azureml.core.VERSION)

SDK version: 0.1.65


In [4]:
# Build the workspace handle from the JSON config file kept under configs/.
ws = Workspace.from_config(path='configs/azml_config.json')

Found the config file in: /workspace/configs/azml_config.json


In [5]:
# Show which workspace the notebook is bound to, one field per line.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

Workspace name: mstest
Azure region: eastus2
Subscription id: edf507a2-6235-46c5-b560-fd463ba2e771
Resource group: msazmlrg


In [6]:
# The workspace details expose the container registry as a '/'-separated
# identifier; only its final segment (the registry name) is needed here.
registry_name = ws.get_details()['containerRegistry'].rsplit('/', 1)[-1]
registry_name

'mstestacrhomithdz'

In [7]:
# Resolve the workspace's container registry via the local `registry` helper;
# the result is later assigned to env.docker.base_image_registry.
azr = azure_container_registry_for(ws.resource_group, registry_name, subscription_id=ws.subscription_id)

In [8]:
# Obtain the compute target named "gpucluster" through the local batch_ai helper
# (whether it creates or only looks up the cluster is not visible here — confirm).
compute_target = batch_ai.cluster(ws, "gpucluster")

In [9]:
# Folder that is later handed to the estimator as its source_directory;
# create it if it does not exist yet.
project_folder = './batchai_hyperdrive'
os.makedirs(project_folder, exist_ok=True)

In [10]:
%%writefile pytorch_train.py
from __future__ import print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import datasets, models, transforms
import numpy as np
import time
import os
import copy
import argparse

from azureml.core.run import Run
# Module-level handle to the current Azure ML run; train_model() and
# fine_tune_model() log their metrics through it.
run = Run.get_context()


def load_data(data_dir, batch_size=4, num_workers=0):
    """Build the train/val data loaders from an image-folder directory.

    Args:
        data_dir: Directory that contains a ``train`` and a ``val``
            sub-folder, each readable by ``datasets.ImageFolder``.
        batch_size: Number of samples per batch for both loaders. Defaults
            to 4, the value that used to be hard-coded.
        num_workers: Number of loader worker processes. Defaults to 0, the
            value that used to be hard-coded.

    Returns:
        A tuple ``(dataloaders, dataset_sizes, class_names)``. The first two
        are dicts keyed by ``'train'`` and ``'val'``; ``class_names`` is the
        ``classes`` attribute of the training dataset.
    """
    phases = ['train', 'val']

    # Both phases share the same per-channel mean/std normalization.
    normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

    # Data augmentation and normalization for training,
    # just resizing/cropping and normalization for validation.
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]),
        'val': transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]),
    }

    image_datasets = {
        phase: datasets.ImageFolder(os.path.join(data_dir, phase), data_transforms[phase])
        for phase in phases
    }
    # Both loaders shuffle, exactly as before.
    dataloaders = {
        phase: torch.utils.data.DataLoader(image_datasets[phase],
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=num_workers)
        for phase in phases
    }
    dataset_sizes = {phase: len(image_datasets[phase]) for phase in phases}
    class_names = image_datasets['train'].classes

    return dataloaders, dataset_sizes, class_names


def train_model(model, criterion, optimizer, scheduler, num_epochs, data_dir):
    """Train ``model`` and return it loaded with its best validation weights.

    Each epoch runs a training pass followed by a validation pass over the
    loaders returned by ``load_data(data_dir)``. The best validation accuracy
    seen so far is logged to the Azure ML run as ``best_val_acc`` after every
    phase (so twice per epoch).

    Args:
        model: Network to train; it must already be on the device selected
            here (``cuda:0`` when available, otherwise CPU).
        criterion: Loss callable applied to ``(outputs, labels)``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        num_epochs: Number of epochs to run.
        data_dir: Directory passed through to ``load_data``.

    Returns:
        The same ``model`` object, with the weights of the epoch that reached
        the highest validation accuracy loaded back in.
    """
    # load training/validation data (class names are not needed here)
    dataloaders, dataset_sizes, _ = load_data(data_dir)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            if is_training:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_training:
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            # Step the LR schedule only after this epoch's optimizer updates.
            # Calling it before the first optimizer.step() makes PyTorch >= 1.1
            # skip the first value of the schedule and emit a warning.
            if is_training:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

            # log the best val accuracy to AML run (after both phases, as
            # before). Built-in float() replaces np.float, which was removed
            # in NumPy 1.24.
            run.log('best_val_acc', float(best_acc))

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model


def fine_tune_model(num_epochs, data_dir, learning_rate, momentum):
    """Fine-tune a pretrained ResNet-18 with a fresh two-class output layer.

    Logs ``lr`` and ``momentum`` to the Azure ML run, replaces the final
    fully connected layer of the pretrained network, and delegates the
    training loop to ``train_model``.

    Args:
        num_epochs: Number of epochs to train for.
        data_dir: Directory of training data, forwarded to ``train_model``.
        learning_rate: SGD learning rate.
        momentum: SGD momentum.

    Returns:
        The model returned by ``train_model``.
    """
    # log the hyperparameter metrics to the AML run; built-in float() replaces
    # np.float, which was removed in NumPy 1.24
    run.log('lr', float(learning_rate))
    run.log('momentum', float(momentum))

    model_ft = models.resnet18(pretrained=True)
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Linear(num_ftrs, 2)  # only 2 classes to predict

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model_ft = model_ft.to(device)

    criterion = nn.CrossEntropyLoss()

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model_ft.parameters(), lr=learning_rate, momentum=momentum)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

    model = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs, data_dir)

    return model


def main():
    """Parse the command line, fine-tune the model and save it.

    Command-line arguments:
        --data_dir: Directory of training data (required).
        --num_epochs: Number of epochs to train (default 25).
        --output_dir: Directory the model file is written to
            (default ``./outputs``).
        --learning_rate: Learning rate (default 0.001).
        --momentum: Momentum (default 0.9).

    The trained model is written to ``<output_dir>/model.pt``.
    """
    # get command-line arguments
    parser = argparse.ArgumentParser()
    # Required: without it the script used to die with a TypeError on the
    # string concatenation below instead of a usable argparse message.
    parser.add_argument('--data_dir', type=str, required=True,
                        help='directory of training data')
    parser.add_argument('--num_epochs', type=int, default=25, help='number of epochs to train')
    # Defaulted: a missing value used to surface only after training had
    # finished, when os.makedirs(None) raised and the trained model was lost.
    parser.add_argument('--output_dir', type=str, default='./outputs',
                        help='output directory')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='learning rate')
    parser.add_argument('--momentum', type=float, default=0.9, help='momentum')
    args = parser.parse_args()

    print("data directory is: " + args.data_dir)
    model = fine_tune_model(args.num_epochs, args.data_dir, args.learning_rate, args.momentum)
    os.makedirs(args.output_dir, exist_ok=True)
    # Saves the whole model object (not just its state_dict), as before.
    torch.save(model, os.path.join(args.output_dir, 'model.pt'))


# Run the training entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Overwriting pytorch_train.py


In [11]:
# Copy the training script into the project folder, which is what the
# estimator later receives as its source_directory.
shutil.copy('pytorch_train.py', project_folder)

'./batchai_hyperdrive/pytorch_train.py'

In [12]:
import os
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded.
import urllib.request
from zipfile import ZipFile

# download data
download_url = 'https://download.pytorch.org/tutorial/hymenoptera_data.zip'
data_file = './hymenoptera_data.zip'
urllib.request.urlretrieve(download_url, filename=data_file)

# extract files into the current working directory
# (named `archive` so the builtin zip() is not shadowed for later cells)
with ZipFile(data_file, 'r') as archive:
    print('extracting files...')
    archive.extractall()
    print('done')

# delete zip file
os.remove(data_file)

extracting files...
done


In [31]:
from azureml.core.image import ContainerImage

In [13]:
# Use the workspace's default datastore and show its type, storage account
# and container so it is clear where the data will be uploaded.
ds = ws.get_default_datastore()
print(ds.datastore_type, ds.account_name, ds.container_name)

AzureFile msteststoragebnqmxwim azureml-filestore-8c170506-e529-4ebe-a306-a7789c1dbdbf


In [20]:
# Upload the extracted dataset to the datastore under 'hymenoptera_data'.
# As the recorded output shows, files already present at the target are skipped.
ds.upload(src_dir='./hymenoptera_data', target_path='hymenoptera_data')

Target already exists. Skipping upload for hymenoptera_data/train/bees/2345177635_caf07159b3.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/39747887_42df2855ee.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/95238259_98470c5b10.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/2477349551_e75c97cf4d.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/92663402_37f379e57a.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/2861002136_52c7c6f708.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/969455125_58c797ef17.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/90179376_abc234e5f4.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/3044402684_3853071a87.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/98391118_bdb1e80cce.jpg
Target already exists. Skipping upload fo

Target already exists. Skipping upload for hymenoptera_data/train/bees/1092977343_cb42b38d62.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/774440991_63a4aa0cbe.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/478701318_bbd5e557b8.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/1508176360_2972117c9d.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/472288710_2abee16fa0.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/3074585407_9854eb3153.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/2405441001_b06c36fa72.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/342758693_c56b89b6b6.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/21399619_3e61e5bb6f.jpg
Target already exists. Skipping upload for hymenoptera_data/train/bees/208702903_42fb4d9748.jpg
Target already exists. Skipping uploa

Target already exists. Skipping upload for hymenoptera_data/train/ants/384191229_5779cf591b.jpg
Target already exists. Skipping upload for hymenoptera_data/train/ants/822537660_caf4ba5514.jpg
Target already exists. Skipping upload for hymenoptera_data/train/ants/386190770_672743c9a7.jpg
Target already exists. Skipping upload for hymenoptera_data/train/ants/150801171_cd86f17ed8.jpg
Target already exists. Skipping upload for hymenoptera_data/train/ants/148715752_302c84f5a4.jpg
Target already exists. Skipping upload for hymenoptera_data/train/ants/424119020_6d57481dab.jpg
Target already exists. Skipping upload for hymenoptera_data/train/ants/2265825502_fff99cfd2d.jpg
Target already exists. Skipping upload for hymenoptera_data/train/ants/1225872729_6f0856588f.jpg
Target already exists. Skipping upload for hymenoptera_data/train/ants/795000156_a9900a4a71.jpg
Target already exists. Skipping upload for hymenoptera_data/train/ants/1924473702_daa9aacdbe.jpg
Target already exists. Skipping uploa

Target already exists. Skipping upload for hymenoptera_data/val/bees/290082189_f66cb80bfc.jpg
Target already exists. Skipping upload for hymenoptera_data/val/bees/151603988_2c6f7d14c7.jpg
Target already exists. Skipping upload for hymenoptera_data/val/bees/2525379273_dcb26a516d.jpg
Target already exists. Skipping upload for hymenoptera_data/val/bees/2841437312_789699c740.jpg
Target already exists. Skipping upload for hymenoptera_data/val/bees/2501530886_e20952b97d.jpg
Target already exists. Skipping upload for hymenoptera_data/val/bees/abeja.jpg
Target already exists. Skipping upload for hymenoptera_data/val/bees/350436573_41f4ecb6c8.jpg
Target already exists. Skipping upload for hymenoptera_data/val/bees/372228424_16da1f8884.jpg
Target already exists. Skipping upload for hymenoptera_data/val/bees/416144384_961c326481.jpg
Target already exists. Skipping upload for hymenoptera_data/val/bees/2457841282_7867f16639.jpg
Target already exists. Skipping upload for hymenoptera_data/val/bees/26

Target already exists. Skipping upload for hymenoptera_data/val/ants/1337725712_2eb53cd742.jpg
Target already exists. Skipping upload for hymenoptera_data/val/ants/445356866_6cb3289067.jpg
Target already exists. Skipping upload for hymenoptera_data/val/ants/1119630822_cd325ea21a.jpg
Target already exists. Skipping upload for hymenoptera_data/val/ants/161292361_c16e0bf57a.jpg
Target already exists. Skipping upload for hymenoptera_data/val/ants/2238242353_52c82441df.jpg
Target already exists. Skipping upload for hymenoptera_data/val/ants/153320619_2aeb5fa0ee.jpg
Target already exists. Skipping upload for hymenoptera_data/val/ants/2104709400_8831b4fc6f.jpg
Target already exists. Skipping upload for hymenoptera_data/val/ants/183260961_64ab754c97.jpg
Target already exists. Skipping upload for hymenoptera_data/val/ants/desert_ant.jpg
Target already exists. Skipping upload for hymenoptera_data/val/ants/10308379_1b6c72e180.jpg
Target already exists. Skipping upload for hymenoptera_data/val/ant

$AZUREML_DATAREFERENCE_a9144fdc86f1456cb312c1855c9db7c5

In [28]:
# Reference to the uploaded dataset folder on the datastore; it is passed to
# the training script as --data_dir. Printing it shows the placeholder
# ($AZUREML_DATAREFERENCE_...) that appears in the submitted run's arguments.
path_on_datastore = 'hymenoptera_data'
ds_data = ds.path(path_on_datastore)
print(ds_data)

$AZUREML_DATAREFERENCE_70ad34a962dc436e9323b4da0c3581fb


In [29]:
# Run configuration for the training job: a GPU-enabled Docker container
# based on the "azml" image pulled from the workspace registry (azr).
env = EnvironmentDefinition()
# Presumably tells Azure ML that the image already provides the Python
# packages, so no environment is built for the run — TODO confirm.
env.python.user_managed_dependencies=True
env.docker.enabled = True
env.docker.gpu_support = True
env.docker.shared_volumes = True
env.docker.base_image = "azml"
env.docker.base_image_registry=azr

In [33]:
from azureml.train.dnn import PyTorch

# Command-line arguments handed to pytorch_train.py.
# (A stray "S" after `ds_data,` made this cell a SyntaxError; removed.)
script_params = {
    '--data_dir': ds_data,
    '--num_epochs': 25,
    '--output_dir': './outputs'
}

# Estimator that runs pytorch_train.py from the project folder on the
# compute target, inside the Docker environment defined in `env`.
estimator = PyTorch(source_directory=project_folder,
                    script_params=script_params,
                    compute_target=compute_target,
                    entry_script='pytorch_train.py',
                    environment_definition=env)

In [34]:
# Experiment under which both the single run and the HyperDrive run are submitted.
experiment_name='batchai-hyperdrive'
experiment = Experiment(ws, name=experiment_name)

In [35]:
# Submit one training run with the fixed script parameters defined on the estimator.
run = experiment.submit(estimator)

In [37]:
# Materialize and display the runs recorded for this experiment.
list(experiment.get_runs())

[Run(Experiment: batchai-hyperdrive,
 Id: batchai-hyperdrive_1539538774868,
 Type: azureml.scriptrun,
 Status: Running)]

In [38]:
# Pretty-print the submitted run's details (status, arguments, data references, ...).
print(json.dumps(run.get_details(), indent=4))

{
    "runId": "batchai-hyperdrive_1539538774868",
    "target": "gpucluster",
    "status": "Running",
    "startTimeUtc": "2018-10-14T17:39:37.07345Z",
    "properties": {
        "azureml.runsource": "experiment",
        "ContentSnapshotId": "e75a0c7e-d875-4a0f-abb1-f506d4eb7169"
    },
    "runDefinition": {
        "Script": "pytorch_train.py",
        "Arguments": [
            "--data_dir",
            "$AZUREML_DATAREFERENCE_70ad34a962dc436e9323b4da0c3581fb",
            "--num_epochs",
            "25",
            "--output_dir",
            "./outputs"
        ],
        "Framework": 0,
        "Target": "gpucluster",
        "DataReferences": {
            "70ad34a962dc436e9323b4da0c3581fb": {
                "DataStoreName": "workspacefilestore",
                "Mode": "Mount",
                "PathOnDataStore": "hymenoptera_data",
                "PathOnCompute": null,
                "Overwrite": false
            }
        },
        "JobName": null,
        "AutoPrep

In [39]:
# Display the notebook widget for monitoring the single training run.
RunDetails(run).show()

_UserRun()

In [44]:
from azureml.train.hyperdrive import (BanditPolicy, 
                                      HyperDriveRunConfig, 
                                      RandomParameterSampling, 
                                      uniform, 
                                      PrimaryMetricGoal)

In [45]:
# Search space: learning rate and momentum are each sampled at random from a
# uniform range.
param_sampling = RandomParameterSampling({
    'learning_rate': uniform(0.0005, 0.005),
    'momentum': uniform(0.9, 0.99),
})

# Bandit early-termination policy (see the Azure ML HyperDrive documentation
# for the exact meaning of each setting).
early_termination_policy = BanditPolicy(
    slack_factor=0.15,
    evaluation_interval=1,
    delay_evaluation=10,
)

# HyperDrive configuration: reuse the estimator, maximize the 'best_val_acc'
# metric logged by the training script, and run 20 configurations in total
# with at most 4 at a time.
hyperdrive_run_config = HyperDriveRunConfig(
    estimator=estimator,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='best_val_acc',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4,
)

In [46]:
# Start the HyperDrive sweep by submitting its configuration to the same
# experiment used for the single run.
hyperdrive_run = experiment.submit(hyperdrive_run_config)

In [47]:
# Display the monitoring widget for the HyperDrive run.
RunDetails(hyperdrive_run).show()

_HyperDrive(widget_settings={'childWidgetDisplay': 'popup'})

_UserRun(widget_settings={'display': 'popup'})

_UserRun(widget_settings={'display': 'popup'})

_UserRun(widget_settings={'display': 'popup'})

In [49]:
# Inspect the raw data behind the widget (status, run properties, tags, ...).
RunDetails(hyperdrive_run).get_widget_data()

{'status': 'Running',
 'workbench_run_details_uri': 'https://mlworkspace.azure.ai/portal/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/msazmlrg/providers/Microsoft.MachineLearningServices/workspaces/mstest/experiment/batchai-hyperdrive/run/batchai-hyperdrive_1539540150583',
 'run_id': 'batchai-hyperdrive_1539540150583',
 'run_properties': {'run_id': 'batchai-hyperdrive_1539540150583',
  'created_utc': '2018-10-14T18:02:30.929855Z',
  'properties': {'primary_metric_config': '{"name": "best_val_acc", "goal": "maximize"}',
   'runTemplate': 'HyperDrive',
   'azureml.runsource': 'hyperdrive'},
  'tags': {'max_concurrent_jobs': '4',
   'max_total_jobs': '20',
   'max_duration_minutes': '10080',
   'policy_config': '{"name": "BANDIT", "properties": {"evaluation_interval": 1, "delay_evaluation": 10, "slack_factor": 0.15}}',
   'generator_config': '{"name": "RANDOM", "parameter_space": {"learning_rate": ["uniform", [0.0005, 0.005]], "momentum": ["uniform", [0.9, 0.99]]}}',


In [65]:
%%time
# Pick the child run with the best primary metric ('best_val_acc', maximized).
best_run = hyperdrive_run.get_best_run_by_primary_metric()

CPU times: user 11.9 s, sys: 421 ms, total: 12.4 s
Wall time: 57 s


In [66]:
%%time
# Fetch the metrics logged by the best run (kept in best_run_metrics, not
# displayed here) and print the run's summary.
best_run_metrics = best_run.get_metrics()
print(best_run)

Run(Experiment: batchai-hyperdrive,
Id: batchai-hyperdrive_1539540150583_16,
Type: azureml.scriptrun,
Status: Completed)
CPU times: user 56.5 ms, sys: 0 ns, total: 56.5 ms
Wall time: 338 ms
