# Deploy a DL model to AKS GPU cluster
This notebook shows the steps for deploying a pretrained deep learning model (ResNet 152 model on Imagenet dataset) on a GPU enabled Azure Kubernetes Service (AKS) cluster through [Azure Machine Learning](https://docs.microsoft.com/en-us/python/api/overview/azure/ml/intro?view=azure-ml-py).

Before proceeding, please make sure you have taken care of these [prerequisite steps](../README.md). 

This tutorial will take you through the following steps:
 * [Get Workspace](#get_workspace)
 * [Persist Pretrained Model](#persist_model)
 * [Register a Model](#register_model)
 * [Create an Image](#create_image)
 * [Provision a Cluster](#provision_cluster)
 * [Deploy Web Service to Cluster](#deploy_ws)
 * [Test Web Service](#test_ws)
 * [Clean Up Resources](#clean_up)

In [1]:
from azureml.core import Workspace
from azureml.core.compute import AksCompute, ComputeTarget
from azureml.core.webservice import Webservice, AksWebservice
from azureml.core.image import Image
from azureml.core.model import Model

In [2]:
import azureml.core
print(azureml.core.VERSION)

0.1.68


In [19]:
import os
curr_dir = os.getcwd()

# locations for deployment - change as necessary
temp_dir = os.path.join(curr_dir,'deployment')  
model_path = os.path.join(temp_dir,'model_resnet_weights.h5')
score_file_path = os.path.join(temp_dir, 'score.py')
test_image_path = os.path.join(temp_dir, '220px-Lynx_lynx_poing.jpg')


if not os.path.exists(temp_dir):
    os.makedirs(temp_dir)

# print(os.getcwd())
# print(os.listdir(os.getcwd()))

<a id='get_workspace'></a>
## Get workspace
Load existing workspace from the config file info.

In [4]:
from azureml.core.workspace import Workspace

ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

Found the config file in: /workspace/AKSDeploymentTutorial_AML/Keras_Tensorflow/aml_config/config.json
yanzamlworkspace
yanzamlworkspace
eastus2
edf507a2-6235-46c5-b560-fd463ba2e771


<a id='persist_model'></a>
## Persist Pretrained Model

In [4]:
#Creating the model pickle file
import tensorflow as tf
from resnet152 import ResNet152
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input, decode_predictions

model = ResNet152(weights='imagenet')

Using TensorFlow backend.


In [5]:
model

<keras.engine.training.Model at 0x7fa52898a7f0>

In [11]:
model.save_weights(model_path)

In [12]:
model.load_weights(model_path)

In [13]:
model

<keras.engine.training.Model at 0x7ff6f1ec13c8>

<a id='regisger_model'></a>
## Register the model
Register an existing trained model, add descirption and tags.

In [15]:
#Register the model
from azureml.core.model import Model
model = Model.register(model_path = model_path, # this points to a local file
                       model_name = "resnet_model", # this is the name the model is registered as
                       tags = {'model': "dl", 'framework': "resnet"},
                       description = "resnet 152 model",
                       workspace = ws)

print(model.name, model.description, model.version)

Registering model resnet_model
resnet_model resnet 152 model 1


## Test the scoring script at local host

In [16]:
#define init() function
def init():
    import tensorflow as tf
    from resnet152 import ResNet152
    from keras.preprocessing import image
    from keras.applications.imagenet_utils import preprocess_input, decode_predictions

    import numpy as np
    import timeit as t
    import base64
    import json
    from PIL import Image, ImageOps
    from io import BytesIO
    import logging

    global model
    model = ResNet152(weights='imagenet')
    print('Model loaded')

In [17]:
init()

Model loaded


In [6]:
#define run() function 
def run(inputString):
    
    import tensorflow as tf
    from resnet152 import ResNet152
    from keras.preprocessing import image
    from keras.applications.imagenet_utils import preprocess_input, decode_predictions

    import numpy as np
    import timeit as t
    import base64
    import json
    from PIL import Image, ImageOps
    from io import BytesIO
    import logging   
    
    model = ResNet152(weights='imagenet')
    print('Model loaded')
  
    
     
    responses = []
    base64Dict = json.loads(inputString)

    for k, v in base64Dict.items():
        img_file_name, base64Img = k, v
    decoded_img = base64.b64decode(base64Img)
    img_buffer = BytesIO(decoded_img)
    imageData = Image.open(img_buffer).convert("RGB")
    
    # Evaluate the model using the input data
    img = ImageOps.fit(imageData, (224,224), Image.ANTIALIAS)
    img = np.array(img) # shape: (224, 224, 3)
    
    img = np.expand_dims(img, axis=0)
    img = preprocess_input(img)
    
    preds = model.predict(img)
    print('Predicted:', decode_predictions(preds, top=3))
    resp = {img_file_name: str(decode_predictions(preds, top=3))}

    responses.append(resp)
    return json.dumps(responses)

In [48]:
from io import BytesIO
from PIL import Image, ImageOps
import base64
import json

img_path = test_image_path
encoded = None
with open(img_path, 'rb') as file:
  encoded = base64.b64encode(file.read())
img_dict = {img_path: encoded.decode('utf-8')}
body = json.dumps(img_dict)
resp = run(body)
print(resp)

Model loaded
Predicted: [[('n02127052', 'lynx', 0.9816487), ('n02128385', 'leopard', 0.007744099), ('n02123159', 'tiger_cat', 0.0036861112)]]
[{"220px-Lynx_lynx_poing.jpg": "[[('n02127052', 'lynx', 0.9816487), ('n02128385', 'leopard', 0.007744099), ('n02123159', 'tiger_cat', 0.0036861112)]]"}]


## Write and save scoring script

In [21]:
%%writefile $score_file_path
def init():
    import tensorflow as tf
    from resnet152 import ResNet152
    from keras.preprocessing import image
    from keras.applications.imagenet_utils import preprocess_input, decode_predictions

    import numpy as np
    import timeit as t
    import base64
    import json
    from PIL import Image, ImageOps
    from io import BytesIO
    import logging

    global model
    model = ResNet152(weights='imagenet')
    print('Model loaded')
    
def run(inputString):
    
    import tensorflow as tf
    from resnet152 import ResNet152
    from keras.preprocessing import image
    from keras.applications.imagenet_utils import preprocess_input, decode_predictions

    import numpy as np
    import timeit as t
    import base64
    import json
    from PIL import Image, ImageOps
    from io import BytesIO
    import logging   
    
    model = ResNet152(weights='imagenet')
    print('Model loaded')
  
    responses = []
    base64Dict = json.loads(inputString)

    for k, v in base64Dict.items():
        img_file_name, base64Img = k, v
    decoded_img = base64.b64decode(base64Img)
    img_buffer = BytesIO(decoded_img)
    imageData = Image.open(img_buffer).convert("RGB")
    
    # Evaluate the model using the input data
    img = ImageOps.fit(imageData, (224,224), Image.ANTIALIAS)
    img = np.array(img) # shape: (224, 224, 3)
    
    img = np.expand_dims(img, axis=0)
    img = preprocess_input(img)
    
    preds = model.predict(img)
    print('Predicted:', decode_predictions(preds, top=3))
    resp = {img_file_name: str(decode_predictions(preds, top=3))}

    responses.append(resp)
    return json.dumps(responses)    

Writing /workspace/AKSDeploymentTutorial_AML/Keras_Tensorflow/temp/score.py


<a id='create_image'></a>
## Create an Image
Create an image using the registered model the script that will load and run the model.

In [8]:
from azureml.core.image import ContainerImage

image_config = ContainerImage.image_configuration(execution_script = "score.py",
                                                  runtime = "python",
                                                  conda_file = "myenv_yz.yml",
                                                  docker_file = "mydockerfile",
                                                  description = "Image for AKS Deployment Tutorial",
                                                  tags = {"name":"AKS","project":"AML"}, 
                                                  dependencies = ["resnet152.py"],
                                                  enable_gpu = True
                                                 )

image = ContainerImage.create(name = "myimage15",
                              # this is the model object
                              models = [],                              
                              image_config = image_config,
                              workspace = ws)

image.wait_for_creation(show_output = True)

Creating image
Running................................................................................................................................................................................................................
SucceededImage creation operation finished for image myimage15:1, operation "Succeeded"


<a id='provision_cluster'></a>
## Provision the AKS Cluster¶ 
This is a one time setup. You can reuse this cluster for multiple deployments after it has been created. If you delete the cluster or the resource group that contains it, then you would have to recreate it.

In [28]:
resource_group = 'yanzamlworkspace'
aks_name = 'yanz-aksgpu-v10'

In [29]:
!az aks create --resource-group $resource_group --name $aks_name --node-count 1 --generate-ssh-keys -k "1.10.8" -s Standard_NC6

[33mSSH key files '/root/.ssh/id_rsa' and '/root/.ssh/id_rsa.pub' have been generated under ~/.ssh to allow SSH access to the VM. If using machines without permanent storage like Azure Cloud Shell without an attached file share, back up your keys to a safe location[0m
[K - Interrupted ..ncipal creation[##################################]  100.0000%[31mDeployment failed. Correlation ID: e7f2baa2-ccd2-4451-92f6-49b89f9e6a81. Provisioning of resource(s) for container service yanz-aksgpu-v10 in resource group yanzamlworkspace failed. Message: Operation results in exceeding quota limits of Core. Maximum allowed: 48, Current in use: 48, Additional requested: 6. Please read more about quota increase at http://aka.ms/corequotaincrease.. Details: [0m
[0m

In [None]:
#Provision AKS cluster with GPU machine
# Use the default configuration (can also provide parameters to customize)
prov_config = AksCompute.provisioning_configuration(vm_size='Standard_NC6')

aks_name = 'yanz-aks-1' 
# Create the cluster
aks_target = ComputeTarget.create(workspace = ws, 
                                  name = aks_name, 
                                  provisioning_configuration = prov_config)


## Optional step: Attach existing AKS cluster¶

In [5]:
'''
# Attach an existing AKS cluster
# Use the default configuration (can also provide parameters to customize)
resource_id = '/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourcegroups/yanzamlworkspace/providers/Microsoft.ContainerService/managedClusters/yanz-aks-1c6750233554'

create_name='my-gpu-aks' 
# Create the cluster
aks_target = AksCompute.attach(workspace=ws, name=create_name, resource_id=resource_id)
# Wait for the operation to complete
aks_target.wait_for_completion(True)

'''

Creating.
SucceededProvisioning operation finished, operation "Succeeded"


In [6]:
aks_target

<azureml.core.compute.aks.AksCompute at 0x7f8978236ef0>

In [29]:
#list images
images = ws.images()
images

{'myimage1': ContainerImage(workspace=<azureml.core.workspace.Workspace object at 0x7ff75fb7fd68>, name=myimage1, id=myimage1:1, tags={'name': 'AKS', 'project': 'AML'}, properties={}, version=1)}

In [24]:
#for img in ws.images():
#    if img.name == 'myimage1': img.delete()

In [15]:
%%time
aks_target.wait_for_completion(show_output = True)
print(aks_target.provisioning_state)
print(aks_target.provisioning_errors)

SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
None
CPU times: user 26 ms, sys: 7.33 ms, total: 33.3 ms
Wall time: 463 ms


In [26]:
!az aks get-credentials -n 'yanz-aks-cpu1549812594' -g yanzamlworkspace -a -f config_cpuaks

Merged "yanz-aks-cpu1549812594-admin" as current context in config_cpuaks


In [33]:
!kubectl --kubeconfig config get services

/bin/sh: 1: kubectl: not found


<a id='deploy_ws'></a>
## Deploy web service to AKS¶ 

In [13]:
'''
#Deploy web service to AKS
#Set the web service configuration (using default here)
aks_config = AksWebservice.deploy_configuration()
print(aks_config)
'''

<azureml.core.webservice.aks.AksServiceDeploymentConfiguration object at 0x7fa51aabca20>


In [9]:
#Deploy web service to AKS
#Set the web service configuration (using customized configuration)
aks_config = AksWebservice.deploy_configuration(memory_gb=2.0, enable_app_insights=True)
print(aks_config)

<azureml.core.webservice.aks.AksServiceDeploymentConfiguration object at 0x7f89524dbcf8>


In [47]:
# by default the 500MB -- memory_gb; up to 1.4 GB  - manually - profiling ;  cluster capacity is 24G
help(AksWebservice)

Help on class AksWebservice in module azureml.core.webservice.aks:

class AksWebservice(azureml.core.webservice.webservice.Webservice)
 |  Class for AzureML AKS Webservices
 |  
 |  Method resolution order:
 |      AksWebservice
 |      azureml.core.webservice.webservice.Webservice
 |      abc.ABC
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  add_properties(self, properties)
 |      Add key value pairs to this Webservice's properties dictionary.
 |      
 |      :param properties: The dictionary of properties to add
 |      :type properties: dict[str, str]
 |  
 |  add_tags(self, tags)
 |      Add key value pairs to this Webservice's tags dictionary
 |      
 |      :param tags: The dictionary of tags to add
 |      :type tags: dict[str, str]
 |      :raises: WebserviceException
 |  
 |  remove_tags(self, tags)
 |      Remove the specified keys from this Webservice's dictionary of tags.
 |      
 |      :param tags: The list of keys to remove
 |      :type tags: list[

In [55]:
image

ContainerImage(workspace=<azureml.core.workspace.Workspace object at 0x7fa5c417c5c0>, name=myimage12, id=myimage12:1, tags={'name': 'AKS', 'project': 'AML'}, properties={}, version=1)

In [10]:
%%time
aks_service_name ='yanz-aks-service-15'

aks_service = Webservice.deploy_from_image(workspace = ws, 
                                           name = aks_service_name,
                                           image = image,
                                           deployment_config = aks_config,
                                           deployment_target = aks_target)
aks_service.wait_for_deployment(show_output = True)
print(aks_service.state)

Creating service
Running......................................................................................................................
TimedOutAKS service creation operation finished, operation "TimedOut"
Transitioning
CPU times: user 2.14 s, sys: 146 ms, total: 2.29 s
Wall time: 10min 19s


<a id='test_ws'></a>
## Test Web Service¶ 
We test the web sevice by passing data.

In [18]:
from io import BytesIO
from PIL import Image, ImageOps
import base64
import json

img_path = test_image_path
encoded = None
with open(img_path, 'rb') as file:
  encoded = base64.b64encode(file.read())
img_dict = {img_path: encoded.decode('utf-8')}
body = json.dumps(img_dict)
resp = aks_service.run(input_data = body)
print(resp)

WebserviceException: Received bad response from service:
Response Code: 404
Headers: {'server': 'Cowboy', 'date': 'Mon, 05 Nov 2018 13:59:17 GMT', 'content-length': '9', 'cache-control': 'max-age=0, private, must-revalidate', 'x-ms-request-id': 'd8457edd-03d3-4ccf-a21e-d0e7bf8ad51c'}
Content: b'Not found'

In [27]:
aks_service.update(enable_app_insights=True)

In [17]:
print(aks_service.state)

Transitioning


In [16]:
aks_service.update_deployment_state()

In [46]:
!kubectl --kubeconfig config_cpuaks proxy --port 8011

/bin/sh: 1: kubectl: not found


In [58]:
log = aks_service.get_logs(5000)

In [59]:
with open("servicelog_ws14", "w") as json_file:
    json_file.write(log)

In [21]:
#debug web service failure
print(ws.webservices()['yanz-aks-service-11'].get_logs())

{"timestamp": "2018-11-01T10:32:31.167424Z", "path": "/home/mmlspark/lib/conda/lib/python3.5/site-packages/gunicorn/glogging.py", "level": "INFO", "logger": "gunicorn.access", "tags": "%(module)s, %(asctime)s, %(levelname)s, %(message)s", "message": "127.0.0.1 - - [01/Nov/2018:10:32:31 +0000] \"GET / HTTP/1.0\" 200 7 \"-\" \"kube-probe/1.11\"", "stack_info": null, "host": "yanz-aks-service-11-795577b6f-8dlq6"}
{"timestamp": "2018-11-01T10:32:32.167302Z", "path": "/home/mmlspark/lib/conda/lib/python3.5/site-packages/gunicorn/glogging.py", "level": "INFO", "logger": "gunicorn.access", "tags": "%(module)s, %(asctime)s, %(levelname)s, %(message)s", "message": "127.0.0.1 - - [01/Nov/2018:10:32:32 +0000] \"GET / HTTP/1.0\" 200 7 \"-\" \"kube-probe/1.11\"", "stack_info": null, "host": "yanz-aks-service-11-795577b6f-8dlq6"}
{"timestamp": "2018-11-01T10:32:33.167403Z", "path": "/home/mmlspark/lib/conda/lib/python3.5/site-packages/gunicorn/glogging.py", "level": "INFO", "logger": "gunicorn.acces

<a id='clean_up'></a>
## Clean Up Resources

In [None]:
#clean up resources
#aks_target = AksCompute(name='jaya-aks-1',workspace=ws)
#aks_target.delete()

In [None]:
#alternate code to clean up resources
#!az aks delete --resource-group jayavienna --name jaya-aks-2 --yes

In [None]:
#for s in ws.webservices():
#    print(s.name)

In [None]:
#s =  Webservice(ws, 'jaya-aks-service-2')
#s.delete()

In [None]:
#from azureml.core import Workspace
#from azureml.core.compute import AksCompute, ComputeTarget

#ws = Workspace.from_config()

#for c in ws.compute_targets():
#    print(c.name)
