Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# Image Classification Using Scikit-learn

## Prerequisites

*    [A Kubernetes cluster deployed on Azure Stack HCI, connected to Azure through Azure Arc](https://docs.microsoft.com/en-us/azure-stack/aks-hci/connect-to-arc).
     

*    [A datastore set up in the Azure Machine Learning workspace, backed by an Azure Stack Hub storage account](https://github.com/Azure/AML-Kubernetes/blob/master/docs/ASH/Train-AzureArc.md)


*    Last but not least, you need to be able to run a Notebook. (azureml-core, numpy, matplotlib, requests are required)

   If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, make sure you go through the configuration notebook located [here](https://github.com/Azure/MachineLearningNotebooks) first. This sets you up with a working config file that contains information about your workspace, subscription ID, etc.

## Initialize AzureML workspace

Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`. 

If you haven't done so already, open the `config.json` file and fill in your workspace information.

In [None]:
from azureml.core.workspace import Workspace,  ComputeTarget
from azureml.exceptions import ComputeTargetException

# Build the workspace handle from the details stored in the local config.json.
ws = Workspace.from_config()

# Report which workspace was loaded, one field per line.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

## Download mnist data

Run `pip install azureml-opendatasets` to install the open datasets package. The function defined below downloads the MNIST data; it is used again later so that you can avoid downloading the data again when you run this notebook multiple times. The download may take about 2 minutes.

In [None]:
from azureml.core import Dataset
from azureml.opendatasets import MNIST
import os

def download_mnist_data():
    """Download the MNIST open dataset into ``./mnist_data``.

    The target directory is created under the current working directory when
    it does not exist yet. The download is requested with ``overwrite=True``.

    Returns:
        The directory that contains the first downloaded file.
    """
    target_dir = os.path.join(os.getcwd(), 'mnist_data')
    os.makedirs(target_dir, exist_ok=True)

    downloaded_paths = MNIST.get_file_dataset().download(target_dir, overwrite=True)

    # The containing folder is derived from the first returned path.
    downloaded_folder = os.path.dirname(downloaded_paths[0])
    print("downloaded to", downloaded_folder)
    return downloaded_folder


download_mnist_data()


## Prepare the dataset

The `download_mnist_data()` function above downloads four files (`t10k-images-idx3-ubyte.gz`, `t10k-labels-idx1-ubyte.gz`, `train-images-idx3-ubyte.gz` and `train-labels-idx1-ubyte.gz`) to `downloaded_folder`. Your next step is to upload these files to the datastore of the workspace and then register them as a dataset in the workspace.

`datastore_name` is the name of the datastore you set up in [this step](https://github.com/Azure/AML-Kubernetes/blob/master/docs/ASH/Train-AzureArc.md).

Upload and dataset registration take less than 1 min.

In [None]:
from azureml.core import Workspace, Dataset, Datastore

dataset_name = "mnist_ash_o"
datastore_name = "ashdatastore"

# Download, upload and register only when the workspace does not already
# list a dataset under this name.
if dataset_name not in ws.datasets:
    downloaded_folder = download_mnist_data()
    datastore = Datastore.get(ws, datastore_name)

    # Copy the downloaded files into the datastore under a fixed folder name.
    src_dir = downloaded_folder
    target_path = 'mnistdataash'
    datastore.upload(src_dir, target_path)

    # Register the uploaded folder as an AML file dataset.
    datastore_paths = [(datastore, target_path)]
    mnist_ds = Dataset.File.from_files(path=datastore_paths)
    mnist_ds.register(
        ws,
        dataset_name,
        "mnist data from http://yann.lecun.com/exdb/mnist/",
    )

## Setup compute target

Find the Arc K8S Resource Id, e.g. /subscriptions/86204643-5a96-427b-b6bb-b35b2bd6e6ce/resourceGroups/AKS-HCI2/providers/Microsoft.Kubernetes/connectedClusters/my-workload-cluster and replace the resource id below.

In [None]:
from azureml.core.compute import KubernetesCompute
from azureml.core.compute import ComputeTarget
import os

# Re-create the workspace handle from config.json.
ws = Workspace.from_config()

# choose a name for your Azure Arc-enabled Kubernetes compute
amlarc_compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "amlarc-compute4")

# resource ID for your Azure Arc-enabled Kubernetes cluster
resource_id = "/subscriptions/86204643-5a96-427b-b6bb-b35b2bd6e6ce/resourceGroups/AKS-HCI2/providers/Microsoft.Kubernetes/connectedClusters/my-workload-cluster"

# Read the compute-target mapping once and reuse it for both the membership
# test and the lookup.
# NOTE(review): ws.compute_targets presumably queries the service on each
# access -- confirm.
existing_targets = ws.compute_targets

if amlarc_compute_name in existing_targets:
    amlarc_compute = existing_targets[amlarc_compute_name]
    # isinstance also accepts subclasses of KubernetesCompute.
    if isinstance(amlarc_compute, KubernetesCompute):
        print("found compute target: " + amlarc_compute_name)
    else:
        # Previously this case passed silently; make the mismatch visible
        # without aborting the cell.
        print(
            f"compute target {amlarc_compute_name} already exists but is "
            f"{type(amlarc_compute).__name__}, not KubernetesCompute"
        )
else:
    print("creating new compute target...")
    ns = "aml"

    # A single instance type with identical resource requests and limits.
    instance_types = {
        "defaultInstanceType": {
            "nodeSelector": None,
            "resources": {
                "requests": {
                    "cpu": "1",
                    "memory": "4Gi",
                    "nvidia.com/gpu": 0,
                },
                "limits": {
                    "cpu": "1",
                    "memory": "4Gi",
                    "nvidia.com/gpu": 0,
                },
            },
        },
    }

    amlarc_attach_configuration = KubernetesCompute.attach_configuration(
        resource_id=resource_id,
        namespace=ns,
        default_instance_type="defaultInstanceType",
        instance_types=instance_types,
    )

    amlarc_compute = ComputeTarget.attach(ws, amlarc_compute_name, amlarc_attach_configuration)
    amlarc_compute.wait_for_completion(show_output=True)

    # For a more detailed view of current KubernetesCompute status, use get_status()
    print(amlarc_compute.get_status().serialize())

print(f"compute target id in endpoint yaml: azureml:{amlarc_compute.name}")

In [None]:
from azureml.core.compute import KubernetesCompute

# Build a KubernetesCompute handle for the compute name chosen earlier.
attach_name = amlarc_compute_name
arcK_target = KubernetesCompute(workspace=ws, name=attach_name)

## Configure the training job and submit

### Create an experiment

In [None]:
from azureml.core import Experiment

# Name of the experiment in the workspace.
experiment_name = 'mnist-demo'
exp = Experiment(ws, experiment_name)

### Create an environment

In [None]:
# use a curated environment that has already been built for you

from azureml.core.environment import Environment
# Fetch version 1 of the named environment from the workspace.
env = Environment.get(
    ws,
    name="AzureML-Scikit-learn0.24-Cuda11-OpenMpi4.1.0-py36",
    version=1,
)

In [None]:
# customized environment

# from azureml.core.environment import Environment
# from azureml.core.conda_dependencies import CondaDependencies
# # to install required packages
# env = Environment('tutorial-env')
# cd = CondaDependencies.create(pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults'], conda_packages = ['scikit-learn==0.22.1'])

# env.python.conda_dependencies = cd

### Configure the training job

Training takes about 15 minutes on a VM size comparable to Standard_DS3_v2.

In [None]:
from azureml.core import ScriptRunConfig

# Command-line arguments for train.py: the registered dataset (requested as a
# mount) and the regularization value.
args = [
    '--data-folder', ws.datasets[dataset_name].as_mount(),
    '--regularization', 0.5,
]

# Folder that holds the training script.
script_folder = "mnist_script"

src = ScriptRunConfig(
    source_directory=script_folder,
    script='train.py',
    arguments=args,
    compute_target=arcK_target,
    environment=env,
)

### Submit the job

Run your experiment by submitting your ScriptRunConfig object. Note that this call is asynchronous.

In [None]:
# Submit the configured job to the experiment.
run = exp.submit(src)

# Wait for the run to finish; show_output=True gives a verbose log.
run.wait_for_completion(show_output=True)

### Register the model

Register the trained model.

In [None]:
# Name under which the trained model is registered and later looked up.
model_name = "sklearn_mnist"

In [None]:
# Register the pickle written by the run under `model_name`.
model = run.register_model(
    model_path='outputs/sklearn_mnist_model.pkl',
    model_name=model_name,
)

The machine learning model named "sklearn_mnist" should be registered in your AzureML workspace.

### Get the model

In [None]:
from azureml.core.model import Model
# Look the registered model up by name in the workspace.
model = Model(workspace=ws, name=model_name)

# Identifier in the form expected by endpoint.yml: azureml:<name>:<version>.
model_id = f"azureml:{model.name}:{model.version}"
print(f"Get {model.name}, latest version {model.version}, id in endpoint.yml: {model_id}")

## Deploy and score a machine learning model by using a managed online endpoint

Currently this step is supported only through the Azure CLI.

In [None]:
endpoint = 'sklearn-mnist-jiadu'

from pathlib import Path

# `__file__` is not defined inside a notebook kernel, so fall back to the
# current working directory there. When this code runs as a plain script the
# original behavior (folder of the script) is kept.
try:
    prefix = Path(__file__).parent
except NameError:
    prefix = Path.cwd()

# endpoint.yml is expected to sit next to this notebook/script.
endpoint_file = str(prefix.joinpath("endpoint.yml"))
print(f"Using Endpoint file: {endpoint_file}, please replace model id (e.g. azureml:sklearn_mnist:2) and compute target id (e.g. azureml:amlarc-compute4) according above output")

In [None]:
import helpers

# Re-create the workspace handle and report which workspace is targeted.
ws = Workspace.from_config()
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

# Create the endpoint from endpoint.yml by running the Azure CLI command.
create_cmd = f"az ml endpoint create -n {endpoint} -f {endpoint_file} -w {ws.name} -g {ws.resource_group}"
helpers.run(create_cmd)

## Test the trained model

### Test with inputs

Here you can use images from the test set. The first 30 images and their labels are shown below.

In [None]:
from mnist_script.utils import load_data
import os
import glob
import matplotlib.pyplot as plt
import numpy as np

data_folder = os.path.join(os.getcwd(), 'mnist_data')


def _find_mnist_file(file_name):
    """Return the first file called ``file_name`` found below ``data_folder``.

    Raises:
        FileNotFoundError: if no matching file exists, e.g. because the MNIST
            data has not been downloaded into ``data_folder`` yet.
    """
    matches = glob.glob(os.path.join(data_folder, "**/" + file_name), recursive=True)
    if not matches:
        raise FileNotFoundError(
            f"{file_name} not found under {data_folder}; download the MNIST data first"
        )
    return matches[0]


# Images are divided by 255.0 and labels are flattened to one dimension.
# NOTE(review): the boolean passed to load_data presumably marks a label file;
# confirm against mnist_script.utils.
X_test = load_data(_find_mnist_file("t10k-images-idx3-ubyte.gz"), False) / 255.0
y_test = load_data(_find_mnist_file("t10k-labels-idx1-ubyte.gz"), True).reshape(-1)

# Show the first `sample_size` images in one row with their labels above.
sample_size = 30
plt.figure(figsize=(16, 6))
# To show a random sample instead, iterate over
# np.random.permutation(X_test.shape[0])[:sample_size].
# The loop bound and the subplot grid both use sample_size so they cannot
# drift apart; `count` is the 1-based subplot position.
for count, i in enumerate(range(sample_size), start=1):
    plt.subplot(1, sample_size, count)
    # NOTE(review): axhline('')/axvline('') are kept from the original;
    # presumably meant to blank the axes -- confirm.
    plt.axhline('')
    plt.axvline('')
    plt.text(x=10, y=-10, s=y_test[i], fontsize=18)
    plt.imshow(X_test[i].reshape(28, 28), cmap=plt.cm.Greys)
plt.show()

Get `score_uri` and `access_token` through the Azure CLI (currently only the Azure CLI is supported).

In [None]:
# get predicted digits:
import helpers
from azureml.core.workspace import Workspace
ws = Workspace.from_config()

# Capture the endpoint description printed by the Azure CLI.
cmd = f"az ml endpoint show -n {endpoint} -w {ws.name} -g {ws.resource_group}"
properties = helpers.run(cmd, no_output=True, return_output=True)

# Capture the endpoint credentials printed by the Azure CLI.
cmd = f"az ml endpoint get-credentials -n {endpoint} -w {ws.name} -g {ws.resource_group}"
credentials = helpers.run(cmd, no_output=True, return_output=True)

Test with the second image, which should be predicted as 2.

In [None]:
import json

import requests

# Parse the JSON captured from the CLI and pull out the scoring URL and token.
prop_response = json.loads(properties.replace(os.linesep, ""))
score_uri = prop_response["scoring_uri"]

cred_response = json.loads(credentials.replace(os.linesep, ""))
access_token = cred_response["accessToken"]

# second number should be 2
# Slice first so only the requested image is converted to a Python list,
# instead of converting the whole test set and discarding the rest.
test = json.dumps({"data": X_test[1:2].tolist()})
headers = {'Content-Type': 'application/json', 'Authorization': f"Bearer {access_token}"}

# A timeout keeps the cell from hanging forever if the endpoint is unreachable.
r = requests.post(score_uri, data=test, headers=headers, timeout=60)
print(f"predictions: {r.content}")

## Next steps

1. Learn how to do [distributed training with PyTorch](../distributed-cifar10/distributed-pytorch-cifar10.ipynb)
2. Learn how to do [distributed training with TensorFlow](../distributed-cifar10/distributed-tf2-cifar10.ipynb)
3. Learn Pipeline Steps with [Object Segmentation](../object-segmentation-on-azure-stack/)