# Train an Automated Machine Learning Model for Trash Recycling using Instance Segmentation

This notebook is an adaptation from: https://github.com/Azure/azureml-examples/tree/main/sdk/python/jobs/automl-standalone-jobs/automl-image-instance-segmentation-task-fridge-items .
Please keep an eye on this repo, as it contains the most recent examples of SDK v2 usage and is subject to change.

<img src="media/trash_segmented.png" width="800"/>

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
from torch import nn
import os

# Import required libraries
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

from azure.ai.ml.constants import AssetTypes
from azure.ai.ml import Input
from azure.ai.ml.automl import ImageClassificationSearchSpace
from azure.ai.ml.sweep import (
    Choice,
    Uniform,
    BanditPolicy,
)
# import required libraries
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    Model,
    Environment,
    CodeConfiguration,
    ProbeSettings,
)

from azure.ai.ml.entities import Data
from azure.ai.ml.constants import InputOutputModes
from azure.ai.ml import automl

# IMPORT CUSTOM FUNCTION LIBRARY
from utils import *


## 1. Setup of ML Client

The ML client is the SDK object that allows us later to train the model

In [None]:
# Credential object handed to every MLClient constructor below.
credential = DefaultAzureCredential()
ml_client = None

try:
    # Preferred path: build the client from an already existing workspace config file.
    ml_client = MLClient.from_config(credential=credential)
except Exception as ex:
    # NOTE: Update the following workspace information if it was not configured before.
    client_config = {
        "subscription_id": "<SUBSCRIPTION ID>",
        "resource_group": "<RG_NAME>",
        "workspace_name": "<WORKSPACENAME>",
    }
    # Every value must be filled in; a half-edited config would otherwise be
    # written to disk and fail later with a far less helpful error.
    unset_keys = [key for key, value in client_config.items() if value.startswith("<")]
    if unset_keys:
        print(
            "Please replace the placeholder value(s) for "
            + ", ".join(unset_keys)
            + " in this notebook cell"
        )
        # Re-raise the original failure so the cell stops here.
        raise ex
    else:  # write and reload from config file
        import json
        import os

        config_path = "../.azureml/config.json"
        os.makedirs(os.path.dirname(config_path), exist_ok=True)
        with open(config_path, "w") as fo:
            fo.write(json.dumps(client_config))
        ml_client = MLClient.from_config(credential=credential, path=config_path)
print(ml_client)

## 2. Load pre-processed data 

We will now access the datasets that we have prepared for model training which can also be found in the data tile. You can also see the versions of the registered dataset in the UI to understand better which data to access.

In [None]:
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes


# Option A (not used here): MLTable folders defined locally, with local data to be uploaded.
# my_training_data_input = Input(type=AssetTypes.MLTABLE, path=training_mltable_path)
# my_validation_data_input = Input(type=AssetTypes.MLTABLE, path=validation_mltable_path)

# Option B (used here): MLTable assets that already exist in the workspace.
# The reference has the form "azureml:<asset_name>:<version_number>"; change the
# trailing number to pick a different registered version.

# Training split
my_training_data_input = Input(
    path="azureml:train_trash_jsonl:1",
    type=AssetTypes.MLTABLE,
    mode=InputOutputModes.DIRECT,
)

# Validation split
my_validation_data_input = Input(
    path="azureml:val_trash_jsonl:1",
    type=AssetTypes.MLTABLE,
    mode=InputOutputModes.DIRECT,
)

## 3. Compute target setup

You will need to provide a Compute Target that will be used for your AutoML model training. AutoML models for image tasks require GPU SKUs such as the ones from the NC, NCv2, NCv3, ND, NDv2 and NCasT4 series. We recommend using the NCsv3-series (with v100 GPUs) for faster training. Using a compute target with a multi-GPU VM SKU will leverage the multiple GPUs to speed up training. Additionally, setting up a compute target with multiple nodes will allow for faster model training by leveraging parallelism, when tuning hyperparameters for your model.


In [None]:
from azure.ai.ml.entities import AmlCompute
from azure.core.exceptions import ResourceNotFoundError

# Name of the GPU cluster used by the training jobs in this notebook.
compute_name = "gpu-cluster"

try:
    # Reuse the cluster when it already exists in the workspace.
    _ = ml_client.compute.get(compute_name)
    print("Found existing compute target.")
except ResourceNotFoundError:
    print("Creating a new compute target...")
    # NOTE(review): confirm that the "Standard_NC6" SKU is still offered in your
    # region/subscription; pick another GPU SKU from the list above otherwise.
    compute_config = AmlCompute(
        name=compute_name,
        type="amlcompute",
        size="Standard_NC6",
        idle_time_before_scale_down=120,
        min_instances=0,
        max_instances=4,
    )
    # begin_create_or_update only starts the operation and returns a poller
    # (azure-ai-ml >= 1.0). Wait for it so that later cells do not submit a job
    # to a cluster that does not exist yet, and so that failures surface here.
    ml_client.begin_create_or_update(compute_config).result()



```python
# For tuning task on separate cluster

compute_name_tuning = "gpu-cluster-tuning"
try:
    _ = ml_client.compute.get(compute_name_tuning)
    print("Found existing compute target.")
except ResourceNotFoundError:
    print("Creating a new compute target...")
    compute_config = AmlCompute(
        name=compute_name_tuning,
        type="amlcompute",
        size="Standard_NC6",
        idle_time_before_scale_down=120,
        min_instances=0,
        max_instances=4,
    )
    ml_client.begin_create_or_update(compute_config)
```

## 4. Configure and run the AutoML for Images Instance Segmentation training job

AutoML allows you to easily train models for Image Classification, Object Detection & Instance Segmentation on your image data. You can control the model algorithm to be used, specify hyperparameter values for your model as well as perform a sweep across the hyperparameter space to generate an optimal model.

When using AutoML for image tasks, you need to specify the model algorithms using the model_name parameter. You can either specify a single model or choose to sweep over multiple models. Please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models?tabs=CLI-v2#configure-model-algorithms-and-hyperparameters) for the list of supported model algorithms.

### 4.1. Using default hyperparameter values for the specified algorithm
Before doing a large sweep to search for the optimal models and hyperparameters, we recommend trying the default values for a given model to get a first baseline. Next, you can explore multiple hyperparameters for the same model before sweeping over multiple models and their parameters. This allows an iterative approach, as with multiple models and multiple hyperparameters for each (as we showcase in the next section), the search space grows exponentially, and  you need more iterations to find optimal configurations.

Following functions are used to configure the AutoML image job:
#### image_instance_segmentation() function parameters:
The `image_instance_segmentation()` factory function allows user to configure the training job.

- `compute` - The compute on which the AutoML job will run. In this example we are using a compute called 'gpu-cluster' present in the workspace. You can replace it with any other compute in the workspace.
- `experiment_name` - The name of the experiment. An experiment is like a folder with multiple runs in Azure ML Workspace that should be related to the same logical machine learning experiment.
- `name` - The name of the Job/Run. This is an optional property. If not specified, a random name will be generated.
- `primary_metric` - The metric that AutoML will optimize for model selection.
- `target_column_name` - The name of the column to target for predictions. It must always be specified. This parameter is applicable to 'training_data' and 'validation_data'.
- `training_data` - The data to be used for training. It should contain both training feature columns and a target column. Optionally, this data can be split for segregating a validation or test dataset. 
You can use a registered MLTable in the workspace using the format '<mltable_name>:<version>' OR you can use a local file or folder as a MLTable. For e.g Input(mltable='my_mltable:1') OR Input(mltable=MLTable(local_path="./data"))
The parameter 'training_data' must always be provided.

#### set_limits() parameters:
This is an optional configuration method to configure limits parameters such as timeouts.     
    
- `timeout_minutes` - Maximum amount of time in minutes that the whole AutoML job can take before the job terminates. If not specified, the default job's total timeout is 6 days (8,640 minutes).

#### set_image_model() function parameters:
This is an optional configuration method to configure fixed settings or parameters that don't change during the parameter space sweep. Some of the key parameters of this function are:

- `model_name` - The name of the ML algorithm that we want to use in training job. Please refer to this [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models?tabs=CLI-v2#supported-model-algorithms) for supported model algorithm.
- `number_of_epochs` - The number of training epochs. It must be a positive integer (default value is 15).
- `layers_to_freeze` - The number of layers to freeze in the model for transfer learning. It must be a non-negative integer (default value is 0).
- `early_stopping` - Enables early stopping logic during training. It must be a boolean value (default is True).
- `optimizer` - Type of optimizer to use in training. It must be either sgd, adam, adamw (default is sgd).
- `distributed` - Enables distributed training if the compute target contains multiple GPUs. It must be a boolean value (default is True).

If you wish to use the default hyperparameter values for a given algorithm (say `maskrcnn`), configure the job as shown in the cells below. For more info, check: https://learn.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models?tabs=python

In [None]:
# General job parameters: experiment name passed to the baseline AutoML job below.
exp_name = "trash-image-instance-segmentation-experiment_tutorial"

In [None]:
# Build the baseline AutoML instance segmentation job through its factory function.
image_instance_segmentation_job = automl.image_instance_segmentation(
    experiment_name=exp_name,
    compute=compute_name,
    target_column_name="label",
    training_data=my_training_data_input,
    validation_data=my_validation_data_input,
)

# Cap the whole AutoML job at 700 minutes. The larger the dataset, the larger the
# timeout that is needed. NOTE(review): a much smaller budget (e.g. 120 minutes)
# may suffice when training on a small subset - confirm which value is intended.
image_instance_segmentation_job.set_limits(timeout_minutes=700)

# Fix the algorithm to a single model; no other hyperparameters are overridden here.
image_instance_segmentation_job.set_image_model(model_name="maskrcnn_resnet50_fpn")

## 5. Submitting an AutoML job for Computer Vision tasks
Once you've configured your job, you can submit it as a job in the workspace in order to train a vision model using your training dataset.

In [None]:
# Submit the baseline AutoML job to the workspace; the returned job object is kept
# because later cells read its `.name`.
returned_job = ml_client.jobs.create_or_update(image_instance_segmentation_job)

print(f"Created job: {returned_job}")

In [None]:
# Stream the logs of the job submitted above into the notebook output.
# NOTE(review): presumably this blocks until the job finishes - confirm.
ml_client.jobs.stream(returned_job.name)

## 6. Hyperparameter sweeping for your AutoML models for computer vision tasks

When using AutoML for Images, you can perform a hyperparameter sweep over a defined parameter space to find the optimal model. In this example, we sweep over the hyperparameters for `maskrcnn_resnet50_fpn` model which is pretrained on COCO, a large-scale object detection, segmentation, and captioning dataset that contains over 200K labeled images with over 80 label categories, choosing from a range of values for learning_rate, optimizer, etc., to generate a model with the optimal 'MeanAveragePrecision'. If hyperparameter values are not specified, then default values are used for the specified algorithm.

set_sweep function is used to configure the sweep settings:
### set_sweep() parameters:

- `max_trials` - Required parameter for maximum number of configurations to sweep. Must be an integer between 1 and 1000. When exploring just the default hyperparameters for a given model algorithm, set this parameter to 1.
- `max_concurrent_trials` - Maximum number of runs that can run concurrently. If not specified, all runs launch in parallel. If specified, must be an integer between 1 and 100.
    NOTE: The number of concurrent runs is gated on the resources available in the specified compute target. Ensure that the compute target has the available resources for the desired concurrency.
- `sampling_algorithm` - Sampling method to use for sweeping over the defined parameter space. Please refer to this [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models?tabs=SDK-v2#sampling-methods-for-the-sweep) for list of supported sampling methods.
- `early_termination` - Early termination policy to end poorly performing runs. If no termination policy is specified, all configurations are run to completion. Please refer to this [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models?tabs=SDK-v2#early-termination-policies) for supported early termination policies.

We use Random Sampling to pick samples from this parameter space and try a total of 10 iterations with these different samples, running 2 iterations at a time on our compute target. Please note that the more parameters the space has, the more iterations you need to find optimal models.

We leverage the Bandit early termination policy which will terminate poor performing configs (those that are not within 20% slack of the best performing config), thus significantly saving compute resources.

For more details on model and hyperparameter sweeping, please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters).

In [None]:
# Create the AutoML job with the related factory-function.
from azure.ai.ml.automl import ImageObjectDetectionSearchSpace

# The dedicated tuning cluster name is only defined in the optional snippet of
# section 3, which is not an executable cell. Fall back to the main GPU cluster so
# that this cell does not fail with a NameError on a fresh "Restart & Run All".
try:
    sweep_compute_name = compute_name_tuning
except NameError:
    sweep_compute_name = compute_name

image_instance_segmentation_job = automl.image_instance_segmentation(
    compute=sweep_compute_name,
    experiment_name="trash_hyperparameter_job",
    training_data=my_training_data_input,
    validation_data=my_validation_data_input,
    target_column_name="label",
    primary_metric="MeanAveragePrecision",
    tags={"hackathon": "instance_segmentation"},
)

# Upper bound for the duration of the whole sweep.
image_instance_segmentation_job.set_limits(timeout_minutes=500)

# Search space: one model, with learning rate, optimizer and min_size being sampled.
image_instance_segmentation_job.extend_search_space(
    [
        ImageObjectDetectionSearchSpace(
            model_name=Choice(["maskrcnn_resnet50_fpn"]),
            learning_rate=Uniform(0.0001, 0.001),
            # warmup_cosine_lr_warmup_epochs=Choice([0, 3]),
            optimizer=Choice(["sgd", "adam", "adamw"]),
            min_size=Choice([600, 800]),
        ),
    ]
)

# Random sampling, 10 trials in total, 2 at a time. The Bandit policy stops
# configurations that are not within 20% slack of the best performing one.
image_instance_segmentation_job.set_sweep(
    max_trials=10,
    max_concurrent_trials=2,
    sampling_algorithm="Random",
    early_termination=BanditPolicy(
        evaluation_interval=2, slack_factor=0.2, delay_evaluation=6
    ),
)

In [None]:
# Submit the hyperparameter sweep job. Note that this rebinds `returned_job`, so
# from here on the name refers to the sweep job rather than the baseline job.
returned_job = ml_client.jobs.create_or_update(
    image_instance_segmentation_job
)  # submit the job to the backend

print(f"Created job: {returned_job}")

In [None]:
# Fetch the job whose name is the submitted job's name plus the "_HD" suffix.
# NOTE(review): presumably this is the hyperparameter sweep child job created by
# AutoML - confirm. The bare last expression displays it in the notebook.
hd_job = ml_client.jobs.get(returned_job.name + "_HD")
hd_job

## 7. Initialize MLFlow Client for tracking

The models and artifacts that are produced by AutoML can be accessed via the MLFlow interface.
Initialize the MLFlow client here, and set the backend as Azure ML via the MLFlow Client.

IMPORTANT, you need to have installed the latest MLFlow packages with:

    pip install azureml-mlflow

    pip install mlflow

In [None]:
import mlflow

# Look up the workspace this MLClient is bound to and read its MLflow tracking URI.
current_workspace = ml_client.workspaces.get(name=ml_client.workspace_name)
MLFLOW_TRACKING_URI = current_workspace.mlflow_tracking_uri

print(MLFLOW_TRACKING_URI)

In [None]:
# Point MLflow at the workspace tracking URI, then echo the active URI as a check.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

active_tracking_uri = mlflow.get_tracking_uri()
print(f"\nCurrent tracking uri: {active_tracking_uri}")

### 7.1 Get the AutoML parent Job

In [None]:
from mlflow.tracking.client import MlflowClient

# Initialize the MLflow client; it is created after the tracking URI was set above
# and is used below to fetch the parent and child runs.
mlflow_client = MlflowClient()

In [None]:
# Default to the job submitted earlier in this notebook so the notebook runs
# end-to-end. The job must have completed before its best child run is available.
job_name = returned_job.name

# To inspect a job from an earlier session instead, set its name/ID explicitly:
# job_name = "busy_street_1zqqwj9y3m"

# Get the parent run
mlflow_parent_run = mlflow_client.get_run(job_name)

print("Parent Run: ")
print(mlflow_parent_run)

In [None]:
# Print the parent run's tags. The 'automl_best_child_run_id' tag should be there;
# the next cell reads it to locate the best child run.
print(mlflow_parent_run.data.tags)

In [None]:
# Resolve the best child run: its ID is stored as a tag on the parent run.
parent_run_tags = mlflow_parent_run.data.tags
best_child_run_id = parent_run_tags["automl_best_child_run_id"]
print("Found best child run id: ", best_child_run_id)

# Fetch the full run record for that child.
best_run = mlflow_client.get_run(best_child_run_id)

print("Best child run: ", best_run, sep="\n")

### 7.2 Get best model run's metrics
Access the results (such as Models, Artifacts, Metrics) of a previously completed AutoML Run.

In [None]:
import pandas as pd

# Show the best run's metrics as a one-column table (one row per metric after .T).
pd.DataFrame(best_run.data.metrics, index=[0]).T

## 8. Deploy the model

### 8.1. Create endpoint name

In [None]:
# Build a date-stamped endpoint name (format MMDDYYYY). The stamp has day
# resolution only, so re-running on the same day yields the same name.
import datetime

date_stamp = datetime.datetime.now().strftime("%m%d%Y")
online_endpoint_name = "trash-granular-" + date_stamp

# Describe the online endpoint; it is submitted to the workspace in the next cell.
endpoint = ManagedOnlineEndpoint(
    name=online_endpoint_name,
    description="this is a sample online endpoint for deploying model",
    auth_mode="key",
    tags={"foo": "bar"},
)
print(online_endpoint_name)

In [None]:
# begin_create_or_update only starts the operation and returns a poller
# (azure-ai-ml >= 1.0). Wait for completion so the deployment created later does
# not target an endpoint that is still being provisioned.
ml_client.begin_create_or_update(endpoint).result()

### 8.2 Deploy the model

In [None]:
# import required libraries
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    Model,
    Environment,
    CodeConfiguration,
    ProbeSettings,
)

# Register the MLflow model produced by the best child run as a workspace model.
# The path points at the "outputs/mlflow-model/" folder in that run's job outputs.
model_name = "trash_detection_big_granular"
model = Model(
    path=f"azureml://jobs/{best_run.info.run_id}/outputs/artifacts/outputs/mlflow-model/",
    name=model_name,
    description="my trash instance segmentation model",
    type=AssetTypes.MLFLOW_MODEL,
)

# Alternative: register from a locally downloaded copy of the model folder.
# model = Model(
#     path="artifact_downloads/outputs/mlflow-model/",
#     name=model_name,
#     description="my sample instance segmentation model",
#     type=AssetTypes.MLFLOW_MODEL,
# )

# Keep the returned object: its `.id` is what the deployment below references.
registered_model = ml_client.models.create_or_update(model)

In [None]:
# Show the ID of the registered model; the deployment below passes it as `model`.
print(registered_model.id)

In [None]:
# Describe the deployment that serves the registered model behind the endpoint.
online_endpoint_name = endpoint.name
deployment_name = "trash-segmentation-granular"

deployment = ManagedOnlineDeployment(
    name=deployment_name,
    endpoint_name=online_endpoint_name,
    model=registered_model.id,
    instance_type="Standard_DS3_V2",
    instance_count=1,
    # Probe settings. NOTE(review): the values are presumably in seconds, and the
    # large initial_delay presumably gives the container time to load the model
    # before the first probe - confirm against the ProbeSettings documentation.
    liveness_probe=ProbeSettings(
        failure_threshold=30,
        success_threshold=1,
        timeout=2,
        period=10,
        initial_delay=2000,
    ),
    readiness_probe=ProbeSettings(
        failure_threshold=10,
        success_threshold=1,
        timeout=10,
        period=10,
        initial_delay=2000,
    ),
)

In [None]:
# This may take 30 minutes.
# Wait on the returned poller (azure-ai-ml >= 1.0): traffic can only be routed to
# the deployment once it exists, and provisioning failures are raised here instead
# of passing silently.
ml_client.online_deployments.begin_create_or_update(deployment).result()

In [None]:
# Route 100% of the endpoint's traffic to the trash segmentation deployment.
endpoint.traffic = {deployment_name: 100}
# Wait for the update to finish (azure-ai-ml >= 1.0 returns a poller) so the
# endpoint details read afterwards already reflect the new traffic split.
ml_client.begin_create_or_update(endpoint).result()

### 8.3. Get endpoint details

In [None]:
# Re-read the endpoint from the workspace to see its current state.
# To inspect an endpoint/deployment from an earlier session, override the names:
#online_endpoint_name = "trash-segmentation-big-09202022"
#deployment_name = "trash-segmentation-mlflow-dpl"

endpoint = ml_client.online_endpoints.get(name=online_endpoint_name)

# Existing traffic details (deployment name -> percentage)
print(endpoint.traffic)

# Get the scoring URI
print(endpoint.scoring_uri)

## 9. Test Deployment

In [None]:
import json
import base64


def read_image(image_path):
    """Return the complete contents of the file at ``image_path`` as bytes.

    The file is opened in binary mode, so the bytes are returned exactly as
    they are stored on disk.
    """
    with open(image_path, mode="rb") as image_file:
        raw_bytes = image_file.read()
    return raw_bytes
        
# Replace the placeholder with the path of a local image to score.
sample_image = "<path to local test image>"

# Base64-encode the raw image bytes and decode to text so they fit into JSON.
encoded_image = base64.encodebytes(read_image(sample_image)).decode("utf-8")

# Request body: a single "image" column holding one encoded image.
request_json = {
    "input_data": {
        "columns": ["image"],
        "data": [encoded_image],
    }
}

request_file_name = "sample_request_data.json"

# Persist the request; the invoke call below reads it from this file.
with open(request_file_name, "w") as request_file:
    json.dump(request_json, request_file)

In [None]:
# Optionally re-fetch the endpoint first (e.g. in a new session):
#endpoint = ml_client.online_endpoints.get(name=online_endpoint_name)


# Send the request file to the chosen deployment of the endpoint. The response is
# parsed with json.loads in the visualisation cells below.
resp = ml_client.online_endpoints.invoke(
    endpoint_name=endpoint.name,
    deployment_name=deployment_name,
    #deployment_name="trash-segmentation-big-untuned",
    request_file= request_file_name,
)


## 10. Visualise Detections

### 10.1. Polygon

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.patches as patches
from matplotlib.lines import Line2D
from PIL import Image
import numpy as np
import json

FIGURE_SIZE = (15, 15)
# Detections scoring at or below this value are skipped. Lower it to get more
# results, raise it to keep only high-confidence predictions.
MIN_CONFIDENCE = 0.6

img_np = mpimg.imread(sample_image)
# Take the pixel dimensions straight from the array (rows = height, columns =
# width). This works for any channel layout and needs no PIL round-trip.
img_height, img_width = img_np.shape[:2]

# A single figure/axes pair; no separate plt.figure() call, which would only
# produce an extra empty figure in the output.
fig, ax = plt.subplots(1, figsize=FIGURE_SIZE)
# Display the image
ax.imshow(img_np)

# Draw box, label and mask polygon for each sufficiently confident detection.
detections = json.loads(resp)[0]
for detect in detections["boxes"]:
    conf_score = detect["score"]
    if conf_score <= MIN_CONFIDENCE:
        continue

    label = detect["label"]
    box = detect["box"]
    polygon = detect["polygon"]

    # Box coordinates are scaled by the image size below, i.e. they are treated
    # as fractions of the image width/height.
    ymin, xmin, ymax, xmax = (
        box["topY"],
        box["topX"],
        box["bottomY"],
        box["bottomX"],
    )
    topleft_x, topleft_y = img_width * xmin, img_height * ymin
    width, height = img_width * (xmax - xmin), img_height * (ymax - ymin)
    print(
        "{}: [{}, {}, {}, {}], {}".format(
            label,
            round(topleft_x, 3),
            round(topleft_y, 3),
            round(width, 3),
            round(height, 3),
            round(conf_score, 3),
        )
    )

    # One random colour per detection, shared by box, label and polygon.
    color = np.random.rand(3)
    rect = patches.Rectangle(
        (topleft_x, topleft_y),
        width,
        height,
        linewidth=2,
        edgecolor=color,
        facecolor="none",
    )
    ax.add_patch(rect)
    ax.text(topleft_x, topleft_y - 10, label, color=color, fontsize=20)

    # Only the first polygon of the detection is drawn. It is a flat sequence
    # that is reshaped into (x, y) pairs and scaled to pixel coordinates.
    polygon_np = np.array(polygon[0], dtype=float).reshape(-1, 2)
    polygon_np[:, 0] *= img_width
    polygon_np[:, 1] *= img_height
    # `closed` must be passed by keyword; newer Matplotlib releases reject it
    # as a positional argument.
    poly = patches.Polygon(polygon_np, closed=True, facecolor=color, alpha=0.4)
    ax.add_patch(poly)
    poly_line = Line2D(
        polygon_np[:, 0],
        polygon_np[:, 1],
        linewidth=2,
        marker="o",
        markersize=8,
        markerfacecolor=color,
    )
    ax.add_line(poly_line)
plt.show()

In [None]:
# List the top-level keys of the first prediction entry parsed in the cell above.
detections.keys()

### 10.2. Bounding Box

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.patches as patches
from PIL import Image
import numpy as np
import json

FIGURE_SIZE = (15, 15)
# Detections scoring at or below this value are skipped. Lower it to get more
# results, raise it to keep only high-confidence predictions.
MIN_CONFIDENCE = 0.6

img_np = mpimg.imread(sample_image)
# Take the pixel dimensions straight from the array (rows = height, columns =
# width). This works for any channel layout and needs no PIL round-trip.
img_height, img_width = img_np.shape[:2]

# A single figure/axes pair; no separate plt.figure() call, which would only
# produce an extra empty figure in the output.
fig, ax = plt.subplots(1, figsize=FIGURE_SIZE)
# Display the image
ax.imshow(img_np)

# Draw box and label for each sufficiently confident detection.
detections = json.loads(resp)
for detect in detections[0]["boxes"]:
    conf_score = detect["score"]
    if conf_score <= MIN_CONFIDENCE:
        continue

    label = detect["label"]
    box = detect["box"]

    # Box coordinates are scaled by the image size below, i.e. they are treated
    # as fractions of the image width/height.
    ymin, xmin, ymax, xmax = (
        box["topY"],
        box["topX"],
        box["bottomY"],
        box["bottomX"],
    )
    topleft_x, topleft_y = img_width * xmin, img_height * ymin
    width, height = img_width * (xmax - xmin), img_height * (ymax - ymin)
    print(
        "{}: [{}, {}, {}, {}], {}".format(
            label,
            round(topleft_x, 3),
            round(topleft_y, 3),
            round(width, 3),
            round(height, 3),
            round(conf_score, 3),
        )
    )

    # One random colour per detection, shared by box and label.
    color = np.random.rand(3)
    rect = patches.Rectangle(
        (topleft_x, topleft_y),
        width,
        height,
        linewidth=3,
        edgecolor=color,
        facecolor="none",
    )
    ax.add_patch(rect)
    ax.text(topleft_x, topleft_y - 10, label, color=color, fontsize=20)
plt.show()