Azure ML Pipeline - Training & Registration

In [37]:
# Notebook-wide settings: tune these before running the rest of the notebook.
registered_env_name = 'experiment_env'   # name the environment is registered under
experiment_folder = "exp_pipeline"       # local folder for pipeline step files
dataset_prefix_name = "exp"              # prefix applied to every dataset name and path

In [38]:
# Import required packages
from azureml.core import Workspace, Experiment, Datastore, Environment, Dataset
from azureml.core.compute import ComputeTarget, AmlCompute, DataFactoryCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.pipeline.core import Pipeline, PipelineParameter, PipelineData
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import PipelineParameter, PipelineData
from azureml.data.output_dataset_config import OutputTabularDatasetConfig, OutputDatasetConfig, OutputFileDatasetConfig
from azureml.data.datapath import DataPath
from azureml.data.data_reference import DataReference
from azureml.data.sql_data_reference import SqlDataReference
from azureml.pipeline.steps import DataTransferStep
import logging

In [39]:

# Connect to the AML workspace
ws = Workspace.from_config()

# Select the AML compute cluster, provisioning it if it does not exist yet
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "mm-cluster"

try:
    # Check for existing compute target
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it. Any provisioning failure is allowed
    # to propagate: pipeline_cluster is required by later cells, so printing the
    # error and carrying on would only surface as an unrelated NameError later.
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
    pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
    pipeline_cluster.wait_for_completion(show_output=True)

# Get default datastore
default_ds = ws.get_default_datastore()

Found existing cluster, use it.


## Create Run configuration
The `RunConfiguration` defines the compute target and the environment shared by all the Python script steps in the pipeline.

In [40]:
# Path of the conda specification file that defines the experiment environment
conda_yml_file = "../configuration/environment.yml"

In [41]:
%%writefile $conda_yml_file
# Conda environment specification; this cell writes it to the path held in conda_yml_file.
name: experiment_env
dependencies:
# NOTE(review): the Python version is pinned to an exact patch release - confirm it is
# still supported by the azureml-defaults version that pip resolves.
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
# Packages installed through pip rather than conda
- pip:
  - azureml-defaults
  - pyarrow

Overwriting ../configuration/environment.yml


In [42]:
import os

# Create a folder for the pipeline step files
os.makedirs(experiment_folder, exist_ok=True)

# The %%writefile cells further down write the step scripts into
# ./pipeline_step_scripts; create that folder too so those cells do not fail
# with FileNotFoundError when the notebook runs on a fresh checkout.
os.makedirs('./pipeline_step_scripts', exist_ok=True)

print(experiment_folder)

exp_pipeline


In [43]:
# Echo the configured environment name as a quick sanity check
registered_env_name

'experiment_env'

In [46]:
from azureml.core import Environment
from azureml.core.runconfig import RunConfiguration

# Build the Python environment from the conda .yml file, register it in the
# workspace, then fetch the registered environment back by name.
experiment_env = Environment.from_conda_specification(name=registered_env_name,
                                                      file_path=conda_yml_file)
experiment_env.register(workspace=ws)
registered_env = Environment.get(workspace=ws, name=registered_env_name)

# Run configuration for the pipeline: it uses the registered environment and
# targets the compute cluster selected above.
pipeline_run_config = RunConfiguration()
pipeline_run_config.environment = registered_env
pipeline_run_config.target = pipeline_cluster

print("Run configuration created.")

Run configuration created.


## Define Output datasets

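Configure the output datasets used to pass data between pipeline steps. Each one is written to the default datastore under a per-run folder and registered in the workspace when the producing step completes.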

In [36]:
def _registered_output(title):
    """Build the output dataset configuration for the dataset called `title`.

    The output is named `<prefix><title>`, written to
    `<prefix>_<title lower-cased>/{run-id}` on the default datastore, read back
    as delimited files and registered as `<prefix>_<title>` on completion,
    where `<prefix>` is `dataset_prefix_name`.
    """
    # NOTE(review): the output name has no separator after the prefix (e.g. 'expRaw_Data'),
    # unlike the registered name ('exp_Raw_Data'). Kept unchanged - confirm this is intended.
    output_path = dataset_prefix_name + '_' + title.lower() + '/{run-id}'
    file_output = OutputFileDatasetConfig(name=dataset_prefix_name + title,
                                          destination=(default_ds, output_path))
    tabular_output = file_output.read_delimited_files()
    return tabular_output.register_on_complete(name=dataset_prefix_name + '_' + title)


exp_raw_data = _registered_output('Raw_Data')
exp_training_data = _registered_output('Training_Data')
exp_testing_data = _registered_output('Testing_Data')

In [47]:
%%writefile ./pipeline_step_scripts/get_data.py

from azureml.core import Run, Workspace, Datastore, Dataset
from azureml.data.datapath import DataPath
import pandas as pd
import os
import argparse
from sklearn import preprocessing
import numpy as np

#Parse input arguments
parser = argparse.ArgumentParser("Get data from and register in AML workspace")
parser.add_argument('--exp_raw_dataset', dest='exp_raw_dataset', required=True)

args, _ = parser.parse_known_args()
exp_raw_dataset = args.exp_raw_dataset

#Get current run
current_run = Run.get_context()

#Get associated AML workspace
ws = current_run.experiment.workspace

#Connect to ADLS Gen2 datastore
ds = Datastore.get(ws, 'adlsgen2datastore')

#Read all raw data from ADLS Gen2
csv_paths = [(ds, 'exp/training/')]
raw_ds = Dataset.Tabular.from_delimited_files(path=csv_paths)
raw_df = raw_ds.to_pandas_dataframe().astype(np.float64)

#Make directory on mounted storage
os.makedirs(autoencoder_raw_dataset, exist_ok=True)

#Upload modified dataframe
raw_df.to_csv(os.path.join(autoencoder_raw_dataset, 'autoencoder_raw_data.csv'), index=False)

Overwriting ./pipeline_step_scripts/get_data.py


In [None]:
%%writefile ./pipeline_step_scripts/split_and_scale.py

from azureml.core import Run, Workspace, Datastore, Dataset
from azureml.data.datapath import DataPath
import os
import argparse

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import joblib
from numpy.random import seed

#Parse input arguments
parser = argparse.ArgumentParser("Split raw data into train/test and scale appropriately")
parser.add_argument('--exp_training_data', dest='autoencoder_training_data', required=True)
parser.add_argument('--exp_testing_data', dest='autoencoder_testing_data', required=True)
parser.add_argument('--split_to_train_pipeline_data', dest='split_to_train_pipeline_data', required=True)

args, _ = parser.parse_known_args()
exp_training_data = args.exp_training_data
exp_testing_data = args.exp_testing_data
split_to_train_pipeline_data = args.split_to_train_pipeline_data

#Get current run
current_run = Run.get_context()

#Get associated AML workspace
ws = current_run.experiment.workspace

# Read input dataset to pandas dataframe
raw_datset = current_run.input_datasets['Exp_Raw_Data']
raw_df = raw_datset.to_pandas_dataframe().astype(np.float64)

scaler = preprocessing.MinMaxScaler()

X_train = pd.DataFrame(scaler.fit_transform(raw_df),
                      columns=raw_df.columns,
                      index=raw_df.index)

# Save train data to both train and test (reflects the usage pattern in this sample. Note: test/train sets are typically distinct data).
os.makedirs(exp_training_data, exist_ok=True)
os.makedirs(exp_testing_data, exist_ok=True)
X_train.to_csv(os.path.join(autoencoder_training_data, 'exp_training_data.csv'), index=False)
X_train.to_csv(os.path.join(autoencoder_testing_data, 'exp_testing_data.csv'), index=False)

# Save scaler to PipelineData and outputs for record-keeping
os.makedirs('./outputs', exist_ok=True)

os.makedirs(split_to_train_pipeline_data, exist_ok=True)
joblib.dump(scaler, os.path.join(split_to_train_pipeline_data, 'scaler.pkl'))