**Import packages**

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

import azureml.core
from azureml.core import Workspace

# Print the installed Azure ML core SDK version so the recorded outputs in this
# notebook can be related to a specific SDK release.
print("Azure ML SDK Version: ", azureml.core.VERSION)

Azure ML SDK Version:  1.44.0


**Connect to workspace**

In [None]:
# Load the workspace configuration from the config.json file in the current
# folder, then print the workspace name, location and resource group
# (tab-separated) as a quick sanity check.
ws = Workspace.from_config()
print(ws.name, ws.location, ws.resource_group, sep='\t')
# Alternative that names the workspace explicitly instead of using config.json:
# workspace = Workspace(subscription_id, resource_group, workspace_name)

mlops_minhthy_4	southeastasia	mlops_minhthy_4


**Create experiment**

In [None]:
# Name of the experiment created in workspace `ws`.
experiment_name = 'Abalone'

from azureml.core import Experiment
# NOTE(review): `exp` is not referenced by the later cells shown in this
# notebook; the pipeline is submitted through `experiment`
# ('Abalone_Pipelines') instead.
exp = Experiment(workspace=ws, name=experiment_name)

**Create or Attach existing compute resource**

In [None]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# Cluster settings. Each one can be overridden through an environment variable.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpu-cluster")
# os.environ values are always strings, so convert explicitly: the node counts
# must be integers whether they come from the environment or from the defaults.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4))

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")


if compute_name in ws.compute_targets:
    # Reuse the compute target that already exists under this name.
    compute_target = ws.compute_targets[compute_name]
    if isinstance(compute_target, AmlCompute):
        print("found compute target: " + compute_name)
    else:
        # Previously this case passed silently; make the mismatch visible.
        print("compute target " + compute_name + " exists but is not an AmlCompute cluster")
else:
    print("creating new compute target...")
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

found compute target: cpu-cluster


Load the registered Abalone dataset

In [None]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset

# Reuse the workspace handle already loaded from config.json instead of
# rebuilding it from hard-coded subscription / resource-group identifiers, so
# the notebook is not tied to one subscription. The identifiers are still
# exposed under their original names, derived from that handle.
workspace = ws
subscription_id = workspace.subscription_id
resource_group = workspace.resource_group
workspace_name = workspace.name

# Fetch the registered dataset by name and preview its first five rows.
dataset = Dataset.get_by_name(workspace, name='abalone_age_new')
dataset.take(5).to_pandas_dataframe()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Age
0,M,455.0,365.0,95.0,514.0,2.245,101.0,0.15,15,165
1,M,0.35,265.0,0.09,2.255,995.0,485.0,0.07,7,85
2,F,0.53,0.42,135.0,677.0,2.565,1.415,0.21,9,105
3,M,0.44,365.0,125.0,516.0,2.155,114.0,155.0,10,115
4,I,0.33,255.0,0.08,205.0,895.0,395.0,55.0,7,85


Download the dataset files and register them under a new name

In [None]:
# azureml-core of version 1.0.72 or higher is required
from azureml.core import Workspace, Dataset

# Reuse the workspace handle loaded from config.json rather than hard-coding
# the subscription and resource-group identifiers a second time.
workspace = ws
subscription_id = workspace.subscription_id
resource_group = workspace.resource_group
workspace_name = workspace.name

# Local folder (./data under the current working directory) for the download.
data_folder = os.path.join(os.getcwd(), 'data')
os.makedirs(data_folder, exist_ok=True)

# NOTE: despite its name, `df` is the registered Azure ML dataset object, not a
# pandas DataFrame.
df = Dataset.get_by_name(workspace, name='abalone_1')
df.download(data_folder, overwrite=True)


# Register the same dataset in the workspace under a new name; an existing
# registration gets a new version instead of raising.
abalone_file_dataset = df.register(workspace=ws,
                                   name='abalone_opendataset',
                                   description='training and test dataset',
                                   create_new_version=True)


In [None]:
import pandas as pd

# `dataset` is an Azure ML TabularDataset, which does not support pandas
# operations such as `append` (see the AttributeError recorded for this cell).
# Materialise it as a DataFrame first and derive the column there. `dataset`
# itself is left untouched because later cells pass it to the pipeline steps.
abalone_frame = dataset.to_pandas_dataframe()

# Age is derived from the ring count: Age = Rings + 1.5.
dataset_age = abalone_frame.assign(Age=abalone_frame['Rings'] + 1.5)
dataset_age.head()

AttributeError: 'TabularDataset' object has no attribute 'append'

Create new or use an existing compute

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

amlcompute_cluster_name = "n105962831"

# Attach to the cluster if the workspace already has one under this name;
# ComputeTargetException signals that it does not, in which case a new
# cluster is provisioned.
try:
    aml_compute = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',
        max_nodes=4,
    )
    aml_compute = ComputeTarget.create(
        workspace=ws,
        name=amlcompute_cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print('Found existing cluster, use it.')

# Block until the cluster reports completion, in both branches.
aml_compute.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [None]:
from azureml.core.runconfig import RunConfiguration, DockerConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Docker settings handed to the run configuration below.
docker = DockerConfiguration(use_docker=True)

# Run configuration shared by the pipeline steps: it targets the cluster
# obtained above and runs inside Docker.
aml_run_config = RunConfiguration()
aml_run_config.target = aml_compute
aml_run_config.docker = docker

# Dependencies are not user-managed: the packages listed here are installed
# into the execution environment.
python_settings = aml_run_config.environment.python
python_settings.user_managed_dependencies = False
python_settings.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas', 'scikit-learn'],
    pip_packages=['azureml-sdk[automl]', 'pyarrow'],
)

In [None]:
# Display the environment definition held by the run configuration; its
# serialised form is what appears in this cell's output.

aml_run_config.environment

{
    "assetId": null,
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20220708.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "buildContext": null,
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": "2g"
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": null,
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "a

Define useful Columns

In [None]:
# Preview the first five rows to see which columns are available.
dataset.take(5).to_pandas_dataframe()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Age
0,M,455.0,365.0,95.0,514.0,2.245,101.0,0.15,15,165
1,M,0.35,265.0,0.09,2.255,995.0,485.0,0.07,7,85
2,F,0.53,0.42,135.0,677.0,2.565,1.415,0.21,9,105
3,M,0.44,365.0,125.0,516.0,2.155,114.0,155.0,10,115
4,I,0.33,255.0,0.08,205.0,895.0,395.0,55.0,7,85


In [None]:
from IPython.display import Image, display


# Columns the cleansing script should keep.
selected_column_names = [
    'Sex', 'Length', 'Diameter', 'Height',
    'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight',
    'Rings', 'Age',
]

# Rendered like a Python list literal but with ';' instead of ',' between the
# items; cleanse.py turns the ';' back into ',' before parsing the argument.
useful_columns = "[" + "; ".join(repr(name) for name in selected_column_names) + "]"

print("Selected columns: ", useful_columns)

Selected columns:  ['Sex'; 'Length'; 'Diameter'; 'Height'; 'Whole weight'; 'Shucked weight'; 'Viscera weight'; 'Shell weight'; 'Rings'; 'Age']


Create folder scripts_1/prepdata and write the cleansing script

In [None]:
os.mkdir("scripts_1/prepdata")
%%writefile "scripts_1/prepdata/cleanse.py"

import argparse
import os
from azureml.core import Run

print("Clean the input data")

run = Run.get_context()
raw_data = run.input_datasets["raw_data"]

parser = argparse.ArgumentParser("cleanse")
parser.add_argument("--output_cleanse", type=str, help="cleaned abalone data directory")
parser.add_argument("--useful_columns", type=str, help="useful columns to keep")
parser.add_argument("--columns", type=str, help="rename column pattern")

args = parser.parse_args()

print("Argument 1(columns to keep): %s" % str(args.useful_columns.strip("[]").split(r'\;')))
print("Argument 2(columns renaming mapping): %s" % str(args.columns.strip("{}").split(r'\;')))
print("Argument 3(output cleansed abalone data path): %s" % args.output_cleanse)

# These functions ensure that null data is removed from the dataset,
# which will help increase machine learning model accuracy.

useful_columns = eval(args.useful_columns.replace(';', ','))
columns = eval(args.columns.replace(';', ','))

new_df = (dataset.to_pandas_dataframe()
          .dropna(how='all')
          .rename(columns=columns))[useful_columns]

new_df.reset_index(inplace=True, drop=True)

if not (args.output_cleanse is None):
    os.makedirs(args.output_cleanse, exist_ok=True)
    print("%s created" % args.output_cleanse)
    path = args.output_cleanse + "/processed.parquet"
    write_df = new_df.to_parquet(path)


FileExistsError: [Errno 17] File exists: 'scripts_1/prepdata'

In [None]:
# Default datastore of the workspace; the pipeline outputs defined in the
# following cells are created on it.
default_store = ws.get_default_datastore() 

In [None]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Folder that holds the data-preparation scripts.
prepare_data_folder = './scripts_1/prepdata'


# Mapping from the original column names to the names used after cleansing.
column_rename_map = {
    'Sex': "sex",
    'Length': "length",
    'Diameter': "diameter",
    'Height': "height",
    'Whole weight': "whole_weight",
    'Shucked weight': "shucked_weight",
    'Viscera weight': "viscera_weight",
    'Shell weight': "shell_weight",
    'Rings': "rings",
    'Age': 'age',
}

# Rendered like a dict literal but with ';' instead of ',' between the entries;
# cleanse.py turns the ';' back into ',' before parsing the argument.
abalone_columns = "{" + "; ".join(
    "%r: %r" % (old_name, new_name)
    for old_name, new_name in column_rename_map.items()
) + "}"

# Pipeline output that receives the result of the cleansing step.
cleansed_abalone_data = PipelineData("cleansed_abalone_data", datastore=default_store).as_dataset()

print('Cleanse script is in {}.'.format(os.path.realpath(prepare_data_folder)))

# Command-line arguments for cleanse.py (see that script for how each is used).
cleanse_arguments = [
    "--useful_columns", useful_columns,
    "--columns", abalone_columns,
    "--output_cleanse", cleansed_abalone_data,
]

# The raw dataset is passed under the input name 'raw_data', which is the name
# cleanse.py reads it back under.
cleansingStep = PythonScriptStep(
    script_name="cleanse.py",
    name="Cleanse abalone Data",
    source_directory=prepare_data_folder,
    arguments=cleanse_arguments,
    inputs=[dataset.as_named_input('raw_data')],
    outputs=[cleansed_abalone_data],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=True,
)

print("cleansingStep created.")

Cleanse script is in /mnt/batch/tasks/shared/LS_root/mounts/clusters/mlops4/code/Users/minhthy1016/scripts_1/prepdata.
cleansingStep created.


Split the data into train and test sets

In [None]:
%%writefile "scripts_1/prepdata/split_data.py"

import argparse
import os
# import azureml.core
from azureml.core import Run
from sklearn.model_selection import train_test_split


def write_output(df, path):
    os.makedirs(path, exist_ok=True)
    print("%s created" % path)
    df.to_parquet(path + "/processed.parquet")


print("Split the data into train and test")
run = Run.get_context()
transformed_data = run.input_datasets['transformed_data']
transformed_df = transformed_data.to_pandas_dataframe()

parser = argparse.ArgumentParser("split")
parser.add_argument("--output_split_train", type=str, help="output split train data")
parser.add_argument("--output_split_test", type=str, help="output split test data")

args = parser.parse_args()

print("Argument 1(output training data split path): %s" % args.output_split_train)
print("Argument 2(output test data split path): %s" % args.output_split_test)

output_split_train, output_split_test = train_test_split(transformed_df, test_size=0.2, random_state=223)
output_split_train.reset_index(inplace=True, drop=True)
output_split_test.reset_index(inplace=True, drop=True)

if not (args.output_split_train
        is None and args.output_split_test is None):
    write_output(output_split_train, args.output_split_train)
    write_output(output_split_test, args.output_split_test)

Overwriting scripts_1/prepdata/split_data.py


In [None]:
train_model_folder = './scripts_1/prepdata/'

# train and test splits output
output_split_train = PipelineData("output_split_train", datastore=default_store).as_dataset()
output_split_test = PipelineData("output_split_test", datastore=default_store).as_dataset()

print('Data spilt script is in {}.'.format(os.path.realpath(train_model_folder)))

# test train split step creation
# See split_data.py for details about input and output
testTrainSplitStep = PythonScriptStep(
    name="Train Test Data Split",
    script_name="split_data.py",
    arguments=["--output_split_train", output_split_train,
               "--output_split_test", output_split_test],
    # split_data.py reads run.input_datasets['transformed_data'], so the input
    # has to be passed under that name; 'raw_data' made the lookup fail.
    # NOTE(review): this step still consumes the raw dataset. The output of the
    # cleansing step (cleansed_abalone_data) is not wired in here, and doing so
    # would change the column names the later cells rely on.
    inputs=[dataset.as_named_input('transformed_data')],
    outputs=[output_split_train, output_split_test],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    source_directory=train_model_folder,
    allow_reuse=True
)

print("testTrainSplitStep created.")

print("testTrainSplitStep created.")

Data spilt script is in /mnt/batch/tasks/shared/LS_root/mounts/clusters/mlops4/code/Users/minhthy1016/scripts_1/prepdata.
testTrainSplitStep created.


Automatically train a model
Create Experiment

In [None]:
from azureml.core import Experiment

# Experiment the pipeline run is submitted to (see the submit call further
# down in the notebook).
experiment = Experiment(ws, 'Abalone_Pipelines')

Define settings for autogeneration and tuning

In [None]:
from azureml.train.automl import AutoMLConfig

# Change iterations to a reasonable number (50) to get better accuracy
automl_settings = {
    "iteration_timeout_minutes" : 5,
    "iterations" : 2,
    "primary_metric" : 'spearman_correlation',
    "n_cross_validations": 5
}

# Columns of the abalone data used for training. The previous list named
# columns of a taxi-fare dataset ('pickup_weekday', ..., 'cost') that do not
# exist here. 'Rings' is deliberately left out: Age is computed directly from
# Rings, so keeping it would leak the label into the features.
automl_feature_columns = ['Sex', 'Length', 'Diameter', 'Height',
                          'Whole weight', 'Shucked weight',
                          'Viscera weight', 'Shell weight']
automl_label_column = 'Age'

# Training data: the train split produced by the split step, reduced to the
# feature columns plus the label.
training_dataset = output_split_train.parse_parquet_files().keep_columns(
    automl_feature_columns + [automl_label_column])

automl_config = AutoMLConfig(task = 'regression',
                             debug_log = 'automated_ml_errors.log',
                             path = train_model_folder,
                             compute_target = aml_compute,
                             featurization = 'auto',
                             training_data = training_dataset,
                             label_column_name = automl_label_column,
                             **automl_settings)
                             

Define AutoMLStep

In [None]:
from azureml.pipeline.steps import AutoMLStep

# Pipeline step that runs the AutoML configuration defined above.
trainWithAutomlStep = AutoMLStep(name='AutoML_Regression',
                                 automl_config=automl_config,
                                 allow_reuse=True)

Build and run the pipeline

In [None]:
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Only the AutoML step is listed explicitly. The recorded output of this cell
# shows the "Train Test Data Split" step being created as well.
pipeline_steps = [trainWithAutomlStep]

pipeline = Pipeline(workspace = ws, steps=pipeline_steps)

# Submit the pipeline to the 'Abalone_Pipelines' experiment.
pipeline_run = experiment.submit(pipeline, regenerate_outputs=False)



Created step AutoML_Regression [766db0a2][5d40244e-e80d-495c-b1a6-c6799f355490], (This step will run and generate new outputs)
Created step Train Test Data Split [6269c344][aaf56703-8ddd-43fe-bd76-cadbfcf119fa], (This step will run and generate new outputs)
Submitted PipelineRun ff45f7e5-4430-43e3-bdd2-e92cbd0a7d21
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/ff45f7e5-4430-43e3-bdd2-e92cbd0a7d21?wsid=/subscriptions/f548d18b-21f1-4160-b68c-b4577cad9721/resourcegroups/mlops_minhthy_4/workspaces/mlops_minhthy_4&tid=f8a4d2e4-e0f0-4ff9-8809-bd26c493be92


In [None]:
# Show the run-details widget for the submitted pipeline run.
RunDetails(pipeline_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

Explore the results

In [None]:
# Before we proceed we need to wait for the run to complete.
# show_output=False: wait without streaming the run's log output into the cell.
pipeline_run.wait_for_completion(show_output=False)

# functions to download output to local and fetch as dataframe
def get_download_path(download_path, output_name):
    """Return the location of `output_name` inside a downloaded step output.

    The download is expected to contain ``<download_path>/azureml/<folder>/``;
    the returned path is ``<download_path>/azureml/<folder>/<output_name>``.

    Args:
        download_path: local directory the output was downloaded to.
        output_name: name of the pipeline output.

    Returns:
        The path as a '/'-separated string.

    Raises:
        FileNotFoundError: if ``<download_path>/azureml`` does not exist or
            contains no entries.
    """
    azureml_dir = download_path + '/azureml'
    # os.listdir returns entries in arbitrary order; sort so the chosen folder
    # is reproducible when more than one is present.
    run_folders = sorted(os.listdir(azureml_dir))
    if not run_folders:
        # Previously an opaque IndexError from indexing the empty listing.
        raise FileNotFoundError("no downloaded output found under " + azureml_dir)
    return azureml_dir + '/' + run_folders[0] + '/' + output_name

def fetch_df(current_step, output_name):
    """Download one output of a pipeline step and load it as a DataFrame.

    The output is downloaded to ``./outputs/<output_name>`` (overwriting any
    earlier download) and ``processed.parquet`` is read from the location
    reported by get_download_path.

    Args:
        current_step: step run whose output should be fetched.
        output_name: name of the output to fetch.

    Returns:
        pandas DataFrame read from the downloaded parquet file.
    """
    local_dir = './outputs/' + output_name
    current_step.get_output_data(output_name).download(local_dir, overwrite=True)
    parquet_file = get_download_path(local_dir, output_name) + '/processed.parquet'
    return pd.read_parquet(parquet_file)

PipelineRunId: ff45f7e5-4430-43e3-bdd2-e92cbd0a7d21
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/ff45f7e5-4430-43e3-bdd2-e92cbd0a7d21?wsid=/subscriptions/f548d18b-21f1-4160-b68c-b4577cad9721/resourcegroups/mlops_minhthy_4/workspaces/mlops_minhthy_4&tid=f8a4d2e4-e0f0-4ff9-8809-bd26c493be92


View training data used by AutoML

In [None]:
# Locate the run of the split step by its name and fetch its training output
# as a DataFrame.
split_step = pipeline_run.find_step_run(testTrainSplitStep.name)[0]
train_split = fetch_df(split_step, output_split_train.name)

# Summary statistics followed by the first five rows.
display(train_split.describe())
display(train_split.head(5))

View test data used by AutoML

In [None]:
# Locate the run of the split step by its name and fetch its test output as a
# DataFrame.
split_step = pipeline_run.find_step_run(testTrainSplitStep.name)[0]
test_split = fetch_df(split_step, output_split_test.name)

# Summary statistics followed by the first five rows.
display(test_split.describe())
display(test_split.head(5))

View the details of the AutoML run

In [None]:
from azureml.train.automl.run import AutoMLRun
#from azureml.widgets import RunDetails

# Look the AutoML step run up by its name, the same way the split step is
# located elsewhere in this notebook. This replaces taking the first item of
# pipeline_run.get_steps(), which depended on the order the steps are returned.
automl_step = pipeline_run.find_step_run(trainWithAutomlStep.name)[0]
automl_step_run_id = automl_step.id
print(automl_step.name)
print(automl_step_run_id)

automl_run = AutoMLRun(experiment = experiment, run_id=automl_step_run_id)

RunDetails(automl_run).show()

**Retrieve the best model**

Select the best model from your iterations. The get_output function returns the best run and the fitted model for the last fit invocation. By using the overloads on get_output, you can retrieve the best run and fitted model for any logged metric or a particular iteration.


In [None]:
# get_output() returns the best run together with its fitted model.
best_run, fitted_model = automl_run.get_output()
print(best_run)
print(fitted_model)

**Test the best model accuracy**

Use the best model to run predictions on the test data set to predict abalone age. The function predict uses the best model and predicts the values of y, the age, from the test features. Print the first 10 predicted age values from y_predict.

In [None]:
# Feature columns handed to the model.
# NOTE(review): this list must match the columns the AutoML model was trained
# on - confirm against the AutoML configuration cell.
feature_columns = ['Sex', 'Length', 'Diameter', 'Height',
                   'Whole weight', 'Shucked weight',
                   'Viscera weight', 'Shell weight']

# Read the label without mutating test_split so the cell can be re-run;
# DataFrame.pop removed the column and raised KeyError on a second run.
y_test = test_split["Age"]

y_predict = fitted_model.predict(test_split[feature_columns])
print(y_predict[:10])

Calculate the root mean squared error of the results. Convert the y_test values to a list to compare to the predicted values. The function mean_squared_error takes two arrays of values and calculates the average squared error between them. Taking the square root of the result gives an error in the same units as the y variable, Age. It indicates roughly how far the age predictions are from the actual ages.

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Convert the actual label values to a plain list, then take the square root
# of the mean squared error between actual and predicted values.
y_actual = y_test.values.flatten().tolist()
rmse = sqrt(mean_squared_error(y_actual, y_predict))
rmse

Run the following code to calculate mean absolute percent error (MAPE) by using the full y_actual and y_predict data sets. This metric calculates an absolute difference between each predicted and actual value and sums all the differences. Then it expresses that sum as a percent of the total of the actual values.

In [None]:
# Running totals of the absolute errors and of the actual values.
sum_errors = 0
sum_actuals = 0

for actual_val, predict_val in zip(y_actual, y_predict):
    sum_errors += abs(actual_val - predict_val)
    sum_actuals += actual_val

# Total absolute error expressed as a fraction of the total of the actuals.
mean_abs_percent_error = sum_errors / sum_actuals

print("Model MAPE:")
print(mean_abs_percent_error)
print()
print("Model Accuracy:")
print(1 - mean_abs_percent_error)

*****Extra Part to consider ****

Create a training script

In [None]:
# %%writefile $scripts_1/train.py
%%writefile "scripts_1/prepdata/train.py"

import argparse
import os
import numpy as np
import glob

from sklearn.linear_model import LogisticRegression
import joblib

from azureml.core import Run
from utils import load_data
from azureml.core import Run, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# let user feed in 2 parameters, the dataset to mount or download, and the regularization rate of the logistic regression model
parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point')
parser.add_argument('--regularization', type=float, dest='reg', default=0.01, help='regularization rate')
args = parser.parse_args()

data_folder = args.data_folder
print('Data folder:', data_folder)


# Set regularization hyperparameter (passed as an argument to the script)
reg = args.reg_rate	

# Get the experiment run context
run = Run.get_context()

# Separate features and labels
X, y = dataset[['Sex','Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight']].values, dataset['Age'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))


os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()




Writing $scripts_1/train.py


FileNotFoundError: [Errno 2] No such file or directory: '$scripts_1/train.py'

In [None]:
# %%writefile $data/abalone_training.py
%%writefile "scripts_1/prepdata/abalone_training.py"
import os
import argparse
from azureml.core import Run, Dataset
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Get the script arguments (regularization rate and training dataset ID)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
parser.add_argument("--input-data", type=str, dest='training_dataset_id', help='training dataset')
args = parser.parse_args()

# Set regularization hyperparameter (passed as an argument to the script)
reg = args.reg_rate	

# Get the experiment run context
run = Run.get_context()

# Separate features and labels
X, y = dataset[['Sex','Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight']].values, dataset['Age'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))


os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

Writing $data/abalone_training.py


FileNotFoundError: [Errno 2] No such file or directory: '$data/abalone_training.py'