In [1]:
#%pip install azure-ai-formrecognizer==3.2.0
%pip install azure-ai-ml
%pip install azure-identity
%pip install azureml-sdk

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Handle to the workspace
from azure.ai.ml import MLClient

# Authentication package
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

try:
    credential = DefaultAzureCredential()
    # Check if given credential can get token successfully.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work
    credential = InteractiveBrowserCredential()

### Create a handle to workspace

In [3]:
# Get a handle to the workspace
ml_client = MLClient(
    credential=credential,
    subscription_id="d8fadbca-7cc9-4ea1-b3e6-79a0507ca5b7",
    resource_group_name="az-ml-group",
    workspace_name="random_forest_ws",
)

### Register local dataset

This registers a Data object tha can be consumed as an input within a pipeline

In [12]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

path = "../data.csv"

data = Data(
    name="data",
    path=path,
    type=AssetTypes.URI_FILE,
    description="Dataset for Interview",
    tags={"source_type": "web", "source": "AzureML examples blob"},
    version="1.0.0",
)

In [13]:
data = ml_client.data.create_or_update(data)
print(f"Dataset with name {data.name} was registered to workspace, the dataset version is {data.version}")

Dataset with name data was registered to workspace, the dataset version is 1.0.0


## Create a conda environment for pipeline 

In [14]:
import os

# define conda.yml in ./dependencies
dependencies_dir = "./dependencies"
os.makedirs(dependencies_dir, exist_ok=True)

In [15]:
%%writefile {dependencies_dir}/conda.yaml
name: model-env
channels:
  - conda-forge
dependencies:
  - python=3.8
  - numpy=1.21.2
  - pip=21.2.4
  - scikit-learn=0.24.2
  - scipy=1.7.1
  - pandas>=1.1,<1.2
  - pip:
    - inference-schema[numpy-support]==1.3.0
    - xlrd==2.0.1
    - azureml-mlflow==1.42.0

Writing ./dependencies/conda.yaml


### Create and register custom environment in the workspace:

In [16]:
from azure.ai.ml.entities import Environment

custom_env_name = "random_forest_model"

pipeline_job_env = Environment(
    name=custom_env_name,
    description="Custom environment for random forest model pipeline",
    tags={"scikit-learn": "0.24.2"},
    conda_file=os.path.join(dependencies_dir, "conda.yaml"),
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    version="0.1.1",
)
pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)

print(
    f"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}"
)

Environment with name random_forest_model is registered to workspace, the environment version is 0.1.1


### Build the pipeline

This will consist of:
1) data_transform component: Performs the task of splitting the data into train and test dataset
2) data_train component: Trains a RandomForestRegression Model

MLFlow will be used to log the parameters and metrics during our pipeline run.

In [19]:
import os

data_transform_src_dir = "./components/data_transform"
os.makedirs(data_transform_src_dir, exist_ok=True)

In [87]:
%%writefile {data_transform_src_dir}/data_transform.py

import os
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split
import logging
import mlflow


def main():
    """Main function of the script."""

    # input and output arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="path to input data")
    parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
    parser.add_argument("--train_data", type=str, help="path to train data")
    parser.add_argument("--test_data", type=str, help="path to test data")
    args = parser.parse_args()

    # Start Logging
    mlflow.start_run()

    print("input data:", args.data)
    delimeter = ";"
    df = pd.read_csv(args.data, sep=delimeter)
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace(" ", "_")
    # drop feature with missing values
    df.drop("feature_3", axis=1, inplace=True)
    # select best features + target
    cols = ["y","feature_1","feature_2", "feature_4","feature_9"]
    df = df[cols]

    mlflow.log_metric("num_samples", df.shape[0])
    mlflow.log_metric("num_features", df.shape[1] - 1)

    df_train, df_test = train_test_split(df,test_size=args.test_train_ratio,)

    # output paths are mounted as folder, therefore, we are adding a filename to the path
    df_train.to_csv(os.path.join(args.train_data, "data.csv"), index=False)

    df_test.to_csv(os.path.join(args.test_data, "data.csv"), index=False)

    # Stop Logging
    mlflow.end_run()


if __name__ == "__main__":
    main()

Overwriting ./components/data_transform/data_transform.py


- Create the Azure Ml component from data_transform.py script

In [88]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

data_transform_component = command(
    name="data_transform_model_pipeline_fixed",
    display_name="Data transform for training",
    description="reads in data and transform",
    inputs={
        "data": Input(type="uri_folder"),
        "test_train_ratio": Input(type="number"),
    },
    outputs=dict(
        train_data=Output(type="uri_folder", mode="rw_mount"),
        test_data=Output(type="uri_folder", mode="rw_mount"),
    ),
    # The source folder of the component
    code=data_transform_src_dir,
    command="""python data_transform.py \
            --data ${{inputs.data}} --test_train_ratio ${{inputs.test_train_ratio}} \
            --train_data ${{outputs.train_data}} --test_data ${{outputs.test_data}} \
            """,
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

- Register the component to the workspace
- Create (register) the component in the workspace

In [89]:
# register the component to the workspace
data_transform_component = ml_client.create_or_update(data_transform_component.component)

# Create (register) the component in the workspace
print(
    f"Component {data_transform_component.name} with Version {data_transform_component.version} is registered"
)

Uploading data_transform (0.0 MBs): 100%|##########| 1599/1599 [00:00<00:00, 23514.93it/s]




Component data_transform_model_pipeline_fixed with Version 2023-08-16-11-01-18-2189331 is registered


### Create training component 

- Consume the training and test data, train a tree based model and return the output model. You'll use Azure ML logging capabilities to record and visualize the learning progress.


In [90]:
import os

train_src_dir = "./components/train"
os.makedirs(train_src_dir, exist_ok=True)

In [97]:
%%writefile {train_src_dir}/train.py
import argparse
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import os
import pandas as pd
import mlflow


def select_first_file(path):
    """Selects first file in folder, use under assumption there is only one file in folder
    Args:
        path (str): path to directory or file to choose
    Returns:
        str: full path of selected file
    """
    files = os.listdir(path)
    return os.path.join(path, files[0])


# Start Logging
mlflow.start_run()

# enable autologging
mlflow.sklearn.autolog()

os.makedirs("./outputs", exist_ok=True)


def main():
    """Main function of the script."""

    # input and output arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_data", type=str, help="path to train data")
    parser.add_argument("--test_data", type=str, help="path to test data")
    parser.add_argument("--n_estimators", required=False, default=100, type=int)
    parser.add_argument("--max_leaf_nodes", required=False, default=50, type=int)
    parser.add_argument("--registered_model_name", type=str, help="model name")
    parser.add_argument("--model", type=str, help="path to model file")
    args = parser.parse_args()

    # paths are mounted as folder, therefore, we are selecting the file from folder
    train_df = pd.read_csv(select_first_file(args.train_data))

    # Extracting the label column
    y_train = train_df.pop("y")

    # convert the dframe values to array
    X_train = train_df.values

    # paths are mounted as folder, therefore, we are selecting the file from folder
    test_df = pd.read_csv(select_first_file(args.test_data))

    # Extracting the label column
    y_test = test_df.pop("y")

    # convert the dframe values to array
    X_test = test_df.values

    print(f"Training with data of shape {X_train.shape}")

    model = RandomForestRegressor(
        n_estimators=args.n_estimators, max_leaf_nodes=int(args.max_leaf_nodes)
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    print(mean_absolute_error(y_test, y_pred))

    # Registering the model to the workspace
    print("Registering the model via MLFlow")
    mlflow.sklearn.log_model(
        sk_model=model,
        registered_model_name=args.registered_model_name,
        artifact_path=args.registered_model_name,
    )

    # Saving the model to a file
    mlflow.sklearn.save_model(
        sk_model=model,
        path=os.path.join(args.model, "trained_model_3"),
    )

    # Stop Logging
    mlflow.end_run()


if __name__ == "__main__":
    main()

Overwriting ./components/train/train.py


- When the model is trained, the model file is saved and registered to the workspace. 


In [98]:
%%writefile {train_src_dir}/train.yml
# <component>
name: train_random_forest_model_3
display_name: Random Forest Train
# version: 1 # Not specifying a version will automatically update the version
type: command
inputs:
  train_data: 
    type: uri_folder
  test_data: 
    type: uri_folder
  max_leaf_nodes:
    type: number     
  registered_model_name:
    type: string
outputs:
  model:
    type: uri_folder
code: .
environment:
  # for this step, we'll use an AzureML curate environment
  azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:1
command: >-
  python train.py 
  --train_data ${{inputs.train_data}} 
  --test_data ${{inputs.test_data}} 
  --max_leaf_nodes ${{inputs.max_leaf_nodes}}
  --registered_model_name ${{inputs.registered_model_name}} 
  --model ${{outputs.model}}
# </component>


Overwriting ./components/train/train.yml


Create the component using `load_component()`. 

In [99]:
# importing the Component Package
from azure.ai.ml import load_component

# Loading the component from the yml file
train_component = load_component(source=os.path.join(train_src_dir, "train.yml"))

Create and register the component:

In [100]:
# Now we register the component to the workspace
train_component = ml_client.create_or_update(train_component)

# Create (register) the component in your workspace
print(f"Component {train_component.name} with Version {train_component.version} is registered")

Uploading train (0.0 MBs): 100%|##########| 3469/3469 [00:00<00:00, 12096.84it/s]




Component train_random_forest_model_3 with Version 2023-08-16-11-09-51-1559769 is registered


## Build the pipeline from components


- Using *input data*, *split ratio* and *registered model name* as input variables. 
- Call the components and connect them via their inputs /outputs identifiers. 

In [101]:
# the dsl decorator tells the sdk that we are defining an Azure ML pipeline
from azure.ai.ml import dsl, Input, Output


@dsl.pipeline(
    compute="serverless",
    description="test_data_transform_train_pipeline_3",
)
def model_pipeline_pipeline(
    pipeline_job_data_input,
    pipeline_job_test_train_ratio,
    pipeline_job_max_leaf_nodes,
    pipeline_job_registered_model_name,
):
    # using data_transform_function like a python call with its own inputs
    data_transform_job = data_transform_component(
        data=pipeline_job_data_input,
        test_train_ratio=pipeline_job_test_train_ratio,
    )

    # using train_func like a python call with its own inputs
    train_job = train_component(
        train_data=data_transform_job.outputs.train_data,  # note: using outputs from previous step
        test_data=data_transform_job.outputs.test_data,  # note: using outputs from previous step
        max_leaf_nodes=pipeline_job_max_leaf_nodes,  # note: using a pipeline input as parameter
        registered_model_name=pipeline_job_registered_model_name,
    )

    # a pipeline returns a dictionary of outputs
    # keys will code for the pipeline output identifier
    return {
        "pipeline_job_train_data": data_transform_job.outputs.train_data,
        "pipeline_job_test_data": data_transform_job.outputs.test_data,
    }

### Submit the job

- Use the pipeline definition to instantiate a pipeline with the: 
- dataset, split rate, max leaf nodes and the name of the model.

In [102]:
registered_model_name = "random_forest_pipeline_3"

# Let's instantiate the pipeline with the parameters of our choice
pipeline = model_pipeline_pipeline(
    pipeline_job_data_input=Input(type="uri_file", path=data.path),
    pipeline_job_test_train_ratio=0.25,
    pipeline_job_max_leaf_nodes=50,
    pipeline_job_registered_model_name=registered_model_name,
)


pipeline_job = ml_client.jobs.create_or_update(pipeline,experiment_name="random_forest_components_2",)

## Deploy the model as an online endpoint

In [108]:
import uuid
import datetime

#online_endpoint_name = "endpoint-" + datetime.datetime.now().strftime("%m%d%H%M%f")
# Creating a unique name for the endpoint

online_endpoint_name = "credit-endpoint-" + str(uuid.uuid4())[:8]
print(online_endpoint_name)

credit-endpoint-e02c62eb


In [109]:
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    Model,
    Environment,
)

# create an online endpoint
endpoint = ManagedOnlineEndpoint(
    name=online_endpoint_name,
    description="this is an online endpoint",
    auth_mode="key",
    tags={
        "training_dataset": "model_pipeline",
        "model_type": "sklearn.RandomForestRegressor",
    },
)

endpoint_result = ml_client.begin_create_or_update(endpoint).result()

print(f"Endpint {endpoint_result.name} provisioning state: {endpoint_result.provisioning_state}")

Endpint credit-endpoint-e02c62eb provisioning state: Succeeded


- Check the registered endpoint

In [110]:
endpoint = ml_client.online_endpoints.get(name=online_endpoint_name)

print(f'Endpint "{endpoint.name}" with provisioning state "{endpoint.provisioning_state}" is retrieved')

Endpint "credit-endpoint-e02c62eb" with provisioning state "Succeeded" is retrieved


### Deploy the model to the endpoint

In [122]:
# Let's pick the latest version of the model
latest_model_version = max([int(m.version) for m in ml_client.models.list(name=registered_model_name)])

print(latest_model_version)

1

In [114]:
# picking the model to deploy. Here we use the latest version of our registered model
model = ml_client.models.get(name=registered_model_name, version=latest_model_version)


# create an online deployment.
blue_deployment = ManagedOnlineDeployment(
    name="blue",
    endpoint_name=online_endpoint_name,
    model=model,
    instance_type="Standard_F4s_v2",
    instance_count=1,
)

blue_deployment_results = ml_client.online_deployments.begin_create_or_update(
    blue_deployment
).result()

print(
    f"Deployment {blue_deployment_results.name} provisioning state: {blue_deployment_results.provisioning_state}"
)

Check: endpoint credit-endpoint-e02c62eb exists


HttpResponseError: (BadRequest) The request is invalid.
Code: BadRequest
Message: The request is invalid.
Exception Details:	(InferencingClientCreateDeploymentFailed) InferencingClient HttpRequest error, error detail: {"errors":{"VmSize":["Not enough quota available for Standard_F4s_v2 in SubscriptionId d8fadbca-7cc9-4ea1-b3e6-79a0507ca5b7. Current usage/limit: 0/6. Additional needed: 8 Please see troubleshooting guide, available here: https://aka.ms/oe-tsg#error-outofquota"]},"type":"https://tools.ietf.org/html/rfc7231#section-6.5.1","title":"One or more validation errors occurred.","status":400,"traceId":"00-0bf2c08c1aeea2f9a8fd4589aa239b0f-1ab549d5122254eb-01"}
	Code: InferencingClientCreateDeploymentFailed
	Message: InferencingClient HttpRequest error, error detail: {"errors":{"VmSize":["Not enough quota available for Standard_F4s_v2 in SubscriptionId d8fadbca-7cc9-4ea1-b3e6-79a0507ca5b7. Current usage/limit: 0/6. Additional needed: 8 Please see troubleshooting guide, available here: https://aka.ms/oe-tsg#error-outofquota"]},"type":"https://tools.ietf.org/html/rfc7231#section-6.5.1","title":"One or more validation errors occurred.","status":400,"traceId":"00-0bf2c08c1aeea2f9a8fd4589aa239b0f-1ab549d5122254eb-01"}
Additional Information:Type: ComponentName
Info: {
    "value": "managementfrontend"
}Type: Correlation
Info: {
    "value": {
        "operation": "0bf2c08c1aeea2f9a8fd4589aa239b0f",
        "request": "51d086f5367637de"
    }
}Type: Environment
Info: {
    "value": "northeurope"
}Type: Location
Info: {
    "value": "northeurope"
}Type: Time
Info: {
    "value": "2023-08-16T12:09:27.3193443+00:00"
}

### Test with a sample query

Now that the model is deployed to the endpoint, you can run inference with it.

Create a sample request file following the design expected in the run method in the score script.

In [129]:
deploy_dir = "./deploy"
os.makedirs(deploy_dir, exist_ok=True)

In [130]:
%%writefile {deploy_dir}/sample-request.json
{
    "input_data": {
    "columns": ["feature_1","feature_2", "feature_4","feature_9"],
    "index": [0, 1, 2, 3],
    "data": [
            [16, 34472, 90, 1260],
            [20, 33329, 97, 1100],
            [20, 31850, 110, 1120],
            [17, 30351, 90, 1260]

    ]
  }
}

Writing ./deploy/sample-request.json


In [36]:
# test the blue deployment with some sample data
ml_client.online_endpoints.invoke(
    endpoint_name=online_endpoint_name,
    request_file="./deploy/sample-request.json",
    deployment_name="blue",
)

ValidationException: Deployment name blue not found for this endpoint

## Clean up endpoint



In [None]:
ml_client.online_endpoints.begin_delete(name=online_endpoint_name)

In [None]:
#run_id = "keen_rain_syxc8yg2tj"
!az ml model create --name model-pipeline-mixed --version 1 --path "azureml://locations/northeurope/workspaces/45824f33-6edb-4ec0-a385-a59e6afa8e5b/models/random_forest_pipeline_3/versions/1" --type mlflow_model
