In [1]:
from azureml.core import Workspace

ws = Workspace.from_config()

# Training and Registering Model

In [2]:
from azureml.core import Experiment
from azureml.core import Model
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Create an Azure ML experiment in your workspace
experiment = Experiment(workspace=ws, name='mslearn-train-diabetes')
run = experiment.start_logging()
print("Starting experiment:", experiment.name)

# load the diabetes dataset
print("Loading Data...")
diabetes = pd.read_csv('data/diabetes.csv')

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a decision tree model
print('Training a decision tree model')
model = DecisionTreeClassifier().fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

# Save the trained model
model_file = 'diabetes_model.pkl'
joblib.dump(value=model, filename=model_file)
run.upload_file(name = 'outputs/' + model_file, path_or_stream = './' + model_file)

# Complete the run
run.complete()

# Register the model
run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',
                   tags={'Training context':'Inline Training'},
                   properties={'AUC': run.get_metrics()['AUC'], 'Accuracy': run.get_metrics()['Accuracy']})

print('Model trained and registered.')

Starting experiment: mslearn-train-diabetes
Loading Data...
Training a decision tree model
Accuracy: 0.8873333333333333
AUC: 0.8738743221055774
Model trained and registered.


# Generate and upload batch data

In [3]:
from azureml.core import Datastore, Dataset
import pandas as pd 
import os 

# Set default data store
ws.set_default_datastore('workspaceblobstore')
default_ds = ws.get_default_datastore()

# Enumerate all datastore, indicating which is the default
for ds_name in ws.datastores:
    print(ds_name, "- Default =", ds_name == default_ds.name)

# Load the diabetes data
diabetes = pd.read_csv('data/diabetes2.csv')

# Get a 100-item sample of the feature columns (not the diabetic label)
sample = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].sample(n=100).values

# Create a folder
batch_folder = './batch-data'
os.makedirs(batch_folder, exist_ok = True)

print("Folder Created..!")

# Save each sample as a seperate file
print("Saving files...")
for i in range(100):
    fname = str(i + 1) + '.csv'
    sample[i].tofile(os.path.join(batch_folder, fname), sep = ",")

print("Files saved")

# Upload the files to the default datastore
print("Uploading files to datastore...")
default_ds = ws.get_default_datastore()
default_ds.upload(src_dir = "batch-data", target_path = "batch-data", overwrite = True, show_progress = True)

# Register a dataset for the input data
batch_data_set = Dataset.File.from_files(path = (default_ds, 'batch-data/'), validate = False)

try:
    batch_data_set = batch_data_set.register(workspace = ws, name = 'batch-data', description = 'batch data', create_new_version = True)

except Exception as ex:
    print(ex)

print("Done!")

azureml_globaldatasets - Default = False
workspaceworkingdirectory - Default = False
workspaceartifactstore - Default = False
workspacefilestore - Default = False
workspaceblobstore - Default = True
Folder Created..!
Saving files...
Files saved
Uploading files to datastore...
Uploading an estimated of 102 files
Uploading batch-data/.amlignore
Uploaded batch-data/.amlignore, 1 files out of an estimated total of 102
Uploading batch-data/.amlignore.amltmp
Uploaded batch-data/.amlignore.amltmp, 2 files out of an estimated total of 102
Uploading batch-data/1.csv
Uploaded batch-data/1.csv, 3 files out of an estimated total of 102
Uploading batch-data/10.csv
Uploaded batch-data/10.csv, 4 files out of an estimated total of 102
Uploading batch-data/100.csv
Uploaded batch-data/100.csv, 5 files out of an estimated total of 102
Uploading batch-data/11.csv
Uploaded batch-data/11.csv, 6 files out of an estimated total of 102
Uploading batch-data/12.csv
Uploaded batch-data/12.csv, 7 files out of an e

"Datastore.upload" is deprecated after version 1.0.69. Please use "Dataset.File.upload_directory" to upload your files             from a local directory and create FileDataset in single method call. See Dataset API change notice at https://aka.ms/dataset-deprecation.


# Create Compute

In [5]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = 'your-compute-cluster'

try:
    inference_cluster = ComputeTarget(ws, cluster_name)
    print("Found existing cluster, use it.")

except ComputeTargetException:
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        inference_cluster = ComputeTarget.create(ws, cluster_name, compute_config)

        inference_cluster.wait_for_completion(show_output = True)
    
    except Exception as ex:
        print(ex)

Found existing cluster, use it.


# Pipeline for batch inferencing

In [9]:
import os

experiment_folder = 'batch_pipeline'
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

batch_pipeline


In [6]:
%%writefile $experiment_folder/batch_diabetes.py

import os 
import numpy as np 
from azureml.core import Model 
import joblib 

def init():

    global model 

    model_path = Model.get_model_path('diabetes_model')
    model = joblib.load(model_path) 

def run(mini_batch):

    resultList = []

    for f in mini_batch:
        data = np.genfromtxt(f, delimiter = ',')
        prediction = model.predict(data.reshape(1, -1))
        resultList.append("{}: {}".format(os.path.basename(f), prediction[0])) 
    
    return resultList

Overwriting batch_pipeline/batch_diabetes.py


In [7]:
%%writefile $experiment_folder/batch_environment.yml

name: batch_environment
dependencies:
- python=3.6.2
- scikit-learn
- pip
- pip:
    - azureml-defaults

Overwriting batch_pipeline/batch_environment.yml


In [8]:
from azureml.core import Environment
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

batch_env = Environment.from_conda_specification("experiment_env", experiment_folder + "/batch_environment.yml")
batch_env.docker.base_image = DEFAULT_CPU_IMAGE
print('Configuration ready.')

Configuration ready.


In [10]:
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.data import OutputFileDatasetConfig

output_dir = OutputFileDatasetConfig(name = 'inferences')

paralell_run_config = ParallelRunConfig(
    source_directory = experiment_folder,
    entry_script = 'batch_diabetes.py',
    mini_batch_size = "5",
    error_threshold = 10,
    output_action = 'append_row',
    environment = batch_env,
    compute_target = inference_cluster,
    node_count = 2
)

paralellrun_step = ParallelRunStep(
    name = 'batch-score-diabetes',
    parallel_run_config = paralell_run_config,
    inputs = [batch_data_set.as_named_input('diabetes_batch')],
    output = output_dir,
    arguments = [],
    allow_reuse = True
)

print('Steps are defined')

Steps are defined


In [11]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

pipeline = Pipeline(workspace = ws, steps = [paralellrun_step])
pipeline_run = Experiment(ws, 'mslearn-diabetes-batch').submit(pipeline)
pipeline_run.wait_for_completion(show_output = True)

Created step batch-score-diabetes [d30b5e20][65bd5e8b-af0b-4a0b-b4be-1222e1ac3a55], (This step will run and generate new outputs)
Submitted PipelineRun 026e297e-0475-4900-8c4a-42cbb33a064f
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/026e297e-0475-4900-8c4a-42cbb33a064f?wsid=/subscriptions/3571f8dc-3527-4993-9d2b-ac0812d807fd/resourcegroups/aml-resources/workspaces/aml-workspace&tid=78c76086-2fb7-4f6a-b684-c129ba0ea713
PipelineRunId: 026e297e-0475-4900-8c4a-42cbb33a064f
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/026e297e-0475-4900-8c4a-42cbb33a064f?wsid=/subscriptions/3571f8dc-3527-4993-9d2b-ac0812d807fd/resourcegroups/aml-resources/workspaces/aml-workspace&tid=78c76086-2fb7-4f6a-b684-c129ba0ea713
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 9658f6fa-1814-4415-a08c-dee2897bfd53
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/9658f6fa-1814-4415-a08c-dee2897bfd53?wsid=/subscriptions/3571f8dc-3527-4993-9

'Finished'

In [12]:
import pandas as pd
import shutil

# Remove the local results folder if left over from a previous run
shutil.rmtree('diabetes-results', ignore_errors=True)

# Get the run for the first step and download its output
prediction_run = next(pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path='diabetes-results')

# Traverse the folder hierarchy and find the results file
for root, dirs, files in os.walk('diabetes-results'):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root,file)

# cleanup output format
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]

# Display the first 20 results
df.head(20)

Unnamed: 0,File,Prediction
0,11.csv,0
1,12.csv,0
2,13.csv,0
3,14.csv,0
4,15.csv,1
5,16.csv,0
6,17.csv,0
7,18.csv,0
8,19.csv,0
9,2.csv,1


# Publishing the Pipeline

In [14]:
published_pipeline = pipeline_run.publish_pipeline(name = 'diabetes-batch-pipeline', description='Batch scoring of diabetes data', version='1.0')

published_pipeline

Name,Id,Status,Endpoint
diabetes-batch-pipeline,973c51a5-42d1-4fdb-8562-7a092d097d5f,Active,REST Endpoint


In [15]:
rest_endpoint = published_pipeline.endpoint
rest_endpoint

'https://centralindia.api.azureml.ms/pipelines/v1.0/subscriptions/3571f8dc-3527-4993-9d2b-ac0812d807fd/resourceGroups/aml-resources/providers/Microsoft.MachineLearningServices/workspaces/aml-workspace/PipelineRuns/PipelineSubmit/973c51a5-42d1-4fdb-8562-7a092d097d5f'

**To use the endpoint, client applications need to make a REST call over HTTP. This request must be authenticated, so an authorization header is required. To test this out, we'll use the authorization header from your current connection to your Azure workspace, which you can get using the following code:**

In [16]:
from azureml.core.authentication import InteractiveLoginAuthentication

from azureml.core.authentication import InteractiveLoginAuthentication

interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()
print('Authentication header ready.')

Authentication header ready.


In [17]:
import requests

rest_endpoint = published_pipeline.endpoint
response = requests.post(rest_endpoint, 
                         headers=auth_header, 
                         json={"ExperimentName": "mslearn-diabetes-batch"})
run_id = response.json()["Id"]
run_id

'eb687dc4-3cf9-4c16-b0d7-47227893bb99'

In [18]:
from azureml.pipeline.core.run import PipelineRun
from azureml.widgets import RunDetails

published_pipeline_run = PipelineRun(ws.experiments['mslearn-diabetes-batch'], run_id)

# Block until the run completes
published_pipeline_run.wait_for_completion(show_output=True)

PipelineRunId: eb687dc4-3cf9-4c16-b0d7-47227893bb99
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/eb687dc4-3cf9-4c16-b0d7-47227893bb99?wsid=/subscriptions/3571f8dc-3527-4993-9d2b-ac0812d807fd/resourcegroups/aml-resources/workspaces/aml-workspace&tid=78c76086-2fb7-4f6a-b684-c129ba0ea713

PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': 'eb687dc4-3cf9-4c16-b0d7-47227893bb99', 'status': 'Completed', 'startTimeUtc': '2022-08-26T11:17:04.829683Z', 'endTimeUtc': '2022-08-26T11:17:06.342999Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'Unavailable', 'runType': 'HTTP', 'azureml.parameters': '{}', 'azureml.continue_on_step_failure': 'False', 'azureml.continue_on_failed_optional_input': 'True', 'azureml.pipelineid': '973c51a5-42d1-4fdb-8562-7a092d097d5f', 'azureml.pipelineComponent': 'pipelinerun'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://amlworkspace256

'Finished'

In [19]:
import pandas as pd
import shutil

# Remove the local results folder if left over from a previous run
shutil.rmtree('diabetes-results', ignore_errors=True)

# Get the run for the first step and download its output
prediction_run = next(pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path='diabetes-results')

# Traverse the folder hierarchy and find the results file
for root, dirs, files in os.walk('diabetes-results'):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root,file)

# cleanup output format
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]

# Display the first 20 results
df.head(20)

Unnamed: 0,File,Prediction
0,11.csv,0
1,12.csv,0
2,13.csv,0
3,14.csv,0
4,15.csv,1
5,16.csv,0
6,17.csv,0
7,18.csv,0
8,19.csv,0
9,2.csv,1
