In [1]:
from azureml.core.run import Run

In [169]:
%%writefile invasive_model.py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import os
import torch.nn.functional as F
import torchvision.models as models
from azureml.core import Run

print(torch.__version__)

import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--architecture', type=str)
parser.add_argument('--lr', type=float)
parser.add_argument('--n_neurons', type=int)
parser.add_argument('--epochs',type=int)
parser.add_argument('--data_path',type=str)
args = parser.parse_args()

if args.n_neurons == None: args.n_neurons = 512
if args.epochs == None: args.epochs = 10
if args.architecture == None: args.architecture = 'resnet34'
if args.lr == None: args.lr = 0.01

class early_termination():
    def __init__(self, patience, goal):
        self.best_score = 0
        self.patience = patience
        self.epochs_wo_improvement = 0
        #maximize or minimize
        self.goal = goal
        
    def register_score(self, score):
        if self.goal == 'maximize':
            if score > self.best_score:
                self.best_score = score
                self.epochs_wo_improvement = 0
            else:
                self.epochs_wo_improvement += 1

            
        if self.goal == 'minimize':
            if score < self.best_score:
                self.best_score = score
                self.epochs_wo_improvement = 0
            else:
                self.epochs_wo_improvement += 1
    
    def check_to_stop(self):
        if self.patience < self.epochs_wo_improvement:
            return True
        else:
            return False


def create_body(architecture):
    if architecture == 'resnet34':
      model = models.resnet34(pretrained=True)
    elif architecture == 'resnet18':
      model = models.resnet18(pretrained=True)
    "Cut off the body of a typically pretrained `model` at `cut` (int) or cut the model as specified by `cut(model)` (function)."
    return nn.Sequential(*list(model.children())[:-1])


class conv_net(nn.Module):
    def __init__(self,fc_neurons,architecture):
        super().__init__()

        self.body = create_body(architecture)

        conv_out_size = self._get_conv_out([3,224,224])
        self.fc = nn.Sequential(
            nn.Linear(conv_out_size, fc_neurons),
            nn.ReLU(),
            nn.Linear(fc_neurons, 2),
            nn.LogSoftmax()
        )

    def _get_conv_out(self, shape):
        o = self.body(torch.zeros(1, *shape))
        return int(np.prod(o.size()))

    def forward(self, x):
        conv_out = self.body(x).view(x.size()[0], -1)
        return self.fc(conv_out)


    def fit(self, train_loader, valid_loader, lr, epochs):
        criterion = nn.NLLLoss()
        optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
        print_every = 10

        
        for e in range(epochs):
            running_loss = 0
            for step, (x,y) in enumerate(train_loader):
                y = y.to(device)
                x = x.to(device)
                
                optimizer.zero_grad()
                
                # Forward and backward passes
                output = self.forward(x)
                loss = criterion(output, y)
                loss.backward()
                optimizer.step()
                
                running_loss += loss.item()
                
                if step > 1:
                    if step % print_every == 0: 
                        print("Epoch: {}/{}... ".format(e+1, epochs),
                        "Loss: {:.4f}".format(running_loss/print_every))
                        running_loss = 0
                
            loss, acc = self.score(valid_loader)
            print('\nTest set: Average loss: {:.4f}, Accuracy: {:.0f}%\n'.format(loss, acc))
            
            
            run.log('Validation Loss', loss)
            run.log('Validation Accuracy',acc)
            
            print(term_policy.epochs_wo_improvement)
            
            term_policy.register_score(acc)
            if term_policy.check_to_stop():
                break
                
            
    def score(self, valid_loader):
        test_loss = 0
        correct = 0
        with torch.no_grad():
          for data, target in valid_loader:
              data = data.to(device)
              target = target.to(device)
              output = self(data)
              test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
              pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
              correct += pred.eq(target.view_as(pred)).sum().item()

        test_loss /= len(valid_loader.dataset)

        test_acc = 100. * correct / len(valid_loader.dataset)
        
        return test_loss, test_acc

#log parameters
run = Run.get_context()
run.log('Architecture', args.architecture)
run.log('Learning Rate', args.lr)
run.log('Number of Neurons', args.n_neurons)
run.log('Epochs', args.epochs)

#data preprocessing and train/test splitting
transforms = transforms.Compose([transforms.Resize([224,224]),transforms.ToTensor()])

image_datasets = datasets.ImageFolder(args.data_path,transform=transforms)
train_set, val_set = torch.utils.data.random_split(image_datasets, [int(len(image_datasets)*0.8), int(len(image_datasets)*0.2)])
train_loader = torch.utils.data.DataLoader(train_set, batch_size=64, shuffle=True)
valid_loader = torch.utils.data.DataLoader(val_set, batch_size=64, shuffle=False)

print('train length ', len(train_loader), '; validation length ', len(valid_loader))
print('n classes ', image_datasets.classes)

#initialize/instantiate model
model = conv_net(args.n_neurons,args.architecture)

term_policy = early_termination(3, 'maximize')

#todo param for freezing base layers
n_layers = len(list(model.parameters()))-6
for layer in model.parameters():
    layer.requires_grad=False
    n_layers -=1
    if n_layers==0:
        break

#loss function
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)

#use GPU if available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)
    
model.to(device)

#fit
model.fit(train_loader, valid_loader, args.lr, args.epochs)

loss, acc = model.score(valid_loader)
run.log('Final Loss', loss)
run.log('Final Accuracy',acc)


os.makedirs('./outputs/model', exist_ok=True)

# todo model to onnx
torch.save(model.state_dict(), './outputs/model/invasive-model.state')


Overwriting invasive_model.py


In [170]:
%matplotlib inline
import numpy as np
import os
import matplotlib.pyplot as plt

import azureml
from azureml.core import Workspace, Datastore, Dataset

In [171]:
from azureml.core import Workspace, Experiment
import json
with open('/Users/paulbruffett/python_class/azure_credentials.json') as f:
    credentials = json.load(f)
    
auth = azureml.core.authentication.ServicePrincipalAuthentication(credentials['directory_id'],credentials['app_id'],credentials['app_key'])
ws = Workspace('3bdbda93-8c3a-472b-bdde-25e3028fc307','mlworkspace','pbml',auth=auth)
ws_details = ws.get_details()
print('Name:\t\t{}\nLocation:\t{}'
      .format(ws_details['name'],
              ws_details['location']))

Name:		pbml
Location:	eastus2


In [172]:
from azureml.core import Experiment

script_folder = './invasive_model'
os.makedirs(script_folder, exist_ok=True)

exp = Experiment(workspace=ws, name='invasive-species-pytorch')

In [173]:
datastore = Datastore.get(ws, 'workspaceblobstore')

In [174]:
from azureml.core.dataset import Dataset

datastore_paths = [(datastore, 'invasive/invasive/')]
invasive_ds = Dataset.File.from_files(path=datastore_paths)

In [175]:
invasive_ds = invasive_ds.register(workspace = ws,
                           name = 'invasive dataset',
                           description='training and test dataset',
                                  create_new_version=True)

In [176]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# choose a name for your cluster
cluster_name = "gpu-cluster"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', 
                                                           max_nodes=4)

    # create the cluster
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    # can poll for a minimum number of nodes and for a specific timeout. 
    # if no min node count is provided it uses the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

# use get_status() to get a detailed status for the current cluster. 
print(compute_target.get_status().serialize())

Found existing compute target
{'currentNodeCount': 4, 'targetNodeCount': 4, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 4, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-01-28T01:41:24.893000+00:00', 'errors': None, 'creationTime': '2020-01-24T17:32:16.091995+00:00', 'modifiedTime': '2020-01-24T23:25:44.867920+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT600S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}


In [177]:
import shutil

# the training logic is in the keras_mnist.py file.
shutil.copy('./invasive_model.py', script_folder)

'./invasive_model/invasive_model.py'

In [28]:
dataset = Dataset.get_by_name(ws, 'invasive dataset')

# list the files referenced by mnist dataset
dataset.to_path()

Please install pyarrow>=0.11.0 for improved performance of to_pandas_dataframe. You can ensure the correct version is installed by running: pip install azureml-dataprep[pandas].


array(['/invasive/100.jpg', '/invasive/1000.jpg', '/invasive/1002.jpg',
       ..., '/not%20invasive/997.jpg', '/not%20invasive/998.jpg',
       '/not%20invasive/999.jpg'], dtype=object)

In [12]:
from azureml.train.dnn import PyTorch


script_params = {
    '--data_path': dataset.as_named_input('invasive').as_mount(),
    '--lr': 0.01,
    '--n_neurons': 256,
    '--epochs': 10,
    '--architecture': 'resnet18'
}

est = PyTorch(source_directory=script_folder,
                 script_params=script_params,
                 compute_target=compute_target, 
                 entry_script='invasive_model.py', 
                  use_gpu=True,
                 pip_packages=['torch','torchvision','azureml-dataprep[pandas,fuse]','matplotlib'])



In [None]:
run = exp.submit(est)

## Hyperparameter Tuning

In [178]:
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice, loguniform



ps = RandomParameterSampling(
    {
        '--n_neurons': choice(32, 128, 256, 512, 1024),
        '--architecture': choice('resnet18','resnet34'),
        '--lr': loguniform(-6, -1)
    }
)

params = script_params = {
    '--data_path': dataset.as_named_input('invasive').as_mount(),
    '--epochs': 30,
}

In [179]:
est = PyTorch(source_directory=script_folder,
                 script_params=script_params,
                 compute_target=compute_target, 
                 entry_script='invasive_model.py', 
                  use_gpu=True,
                 pip_packages=['torch','torchvision','azureml-dataprep[pandas,fuse]','matplotlib'])



In [180]:
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

In [181]:
hdc = HyperDriveConfig(estimator=est, 
                       hyperparameter_sampling=ps, 
                       policy=policy, 
                       primary_metric_name='Validation Accuracy', 
                       primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, 
                       max_total_runs=20,
                       max_concurrent_runs=4)

In [182]:
hdr = exp.submit(config=hdc)

In [185]:
from azureml.widgets import RunDetails
RunDetails(hdr).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [184]:
hdr.wait_for_completion(show_output=True)

RunId: invasive-species-pytorch_1580182035079311
Web View: https://ml.azure.com/experiments/invasive-species-pytorch/runs/invasive-species-pytorch_1580182035079311?wsid=/subscriptions/3bdbda93-8c3a-472b-bdde-25e3028fc307/resourcegroups/mlworkspace/workspaces/pbml

Execution Summary
RunId: invasive-species-pytorch_1580182035079311
Web View: https://ml.azure.com/experiments/invasive-species-pytorch/runs/invasive-species-pytorch_1580182035079311?wsid=/subscriptions/3bdbda93-8c3a-472b-bdde-25e3028fc307/resourcegroups/mlworkspace/workspaces/pbml



{'runId': 'invasive-species-pytorch_1580182035079311',
 'target': 'gpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-01-28T03:27:15.355263Z',
 'endTimeUtc': '2020-01-28T04:57:49.857055Z',
 'properties': {'primary_metric_config': '{"name": "Validation Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'c5265672-2ee4-484b-a12d-3bf95850e946',
  'score': '93.89978213507625',
  'best_child_run_id': 'invasive-species-pytorch_1580182035079311_17',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://pbml5310218423.blob.core.windows.net/azureml/ExperimentRun/dcid.invasive-species-pytorch_1580182035079311/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=y8Gsj8gGVL9IFUdocfEa4ogTyU7VtRSJD8RYD%2BdeaQI%3D&st=2020-01-28T15%3A55%3A13Z&se=2020-01-29T00%3A05%3A13Z&sp=r'}}