In [None]:
# Connection settings for the AML workspace, read from environment variables.
# Each setting falls back to an empty string when its variable is not set.
import os

subscription_id, resource_group, workspace_name, workspace_region = (
    os.environ.get(variable_name, "")
    for variable_name in ("SUBSCRIPTION_ID", "RESOURCE_GROUP", "WORKSPACE_NAME", "WORKSPACE_REGION")
)

In [None]:
#Create AML workspace connection
#First try to load the workspace described by subscription_id / resource_group /
#workspace_name. If loading fails, fall back to Workspace.create(...), which is called
#with create_resource_group=True and exist_ok=True.
from azureml.core import Workspace

try:
    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    print("Workspace configuration succeeded. Skip the workspace creation steps below")
except Exception as workspace_error:
    # Catch Exception rather than using a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate, and show the underlying reason so that problems other
    # than a missing workspace (e.g. bad credentials) are not hidden.
    print("Workspace does not exist. Creating workspace")
    print("Reason the existing workspace could not be loaded: {}".format(workspace_error))
    ws = Workspace.create(name=workspace_name, subscription_id=subscription_id, resource_group=resource_group,
                            location=workspace_region, create_resource_group=True, sku='enterprise', exist_ok=True)

In [None]:
# Show the details reported by the workspace, then write the workspace config file.
workspace_details = ws.get_details()
print(workspace_details)

ws.write_config()

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the CPU cluster to reuse or, if it cannot be found, to create
cpu_cluster_name = 'cpucluster'

try:
    # Look the cluster up by name in the workspace
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # Lookup failed: provision a new cluster that scales between 0 and 10 nodes
    # and block until provisioning has finished.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D13_V2',
        min_nodes=0,
        max_nodes=10,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
    cpu_cluster.wait_for_completion(show_output=True)
else:
    print('Found an existing cluster, using it instead.')

In [None]:
from azureml.core import Experiment

# Experiment that the pipeline run is submitted under
experiment_name = 'sample-dataprep-pipeline'
experiment = Experiment(workspace=ws, name=experiment_name)

In [None]:
#Specify folder which will contain data preparation scripts
import os

scripts_folder = './dataprep/'

# The %%writefile cell that writes gather_data.py into this folder does not create
# the folder itself, so make sure it exists (no error if it is already there).
os.makedirs(scripts_folder, exist_ok=True)

In [None]:
#Write script to be run in pipeline. Should gather data, upload to AML associated datastore,
#create and register datasets
%%writefile $scripts_folder/gather_data.py
import pandas as pd
import requests
from csv import reader
import os
import argparse
import json
from azureml.core import Run, Workspace, Datastore, Dataset, Experiment

parser = argparse.ArgumentParser("Aggregate Data")

parser.add_argument("--target_path", type=str, help="Target path for data upload")

args = parser.parse_args()

#Get current run
current_run = Run.get_context()
#Get associated AML workspace
ws = current_run.experiment.workspace
#Get default datastore - used to upload data
datastore = ws.get_default_datastore()

#Target path is passed as a variable argument (can be timestamped)
target_path = args.target_path

#Pull sample dataset and add to pandas dataframe
r = requests.get('https://dprepdata.blob.core.windows.net/demo/Titanic.csv')
rows = r.text.split('\r\n')
formatted_rows = []
for row in rows:
    read = reader([row], skipinitialspace=True)
    vals = [x for x in read]
    formatted_rows.append(vals[0])
df = pd.DataFrame(formatted_rows[1:], columns=formatted_rows[0])

#Partition source dataframe based on values in 'Embarked' column
df_s = df[df['Embarked']=='S']
df_c = df[df['Embarked']=='C']
df_q = df[df['Embarked']=='Q']

#Write partitioned dataframes to files in processed data
df_s.to_csv('./processed/sourcedata_s.csv', index=False)
df_c.to_csv('./processed/sourcedata_c.csv', index=False)
df_q.to_csv('./processed/sourcedata_q.csv', index=False)

#Upload processed directory to default datastore
datastore.upload(src_dir='./processed', target_path=target_path, overwrite=True)

from azureml.core.dataset import Dataset

# Create file datasets
ds_train = Dataset.File.from_files(path=datastore.path(target_path), validate=False)

# Register the file datasets
dataset_name = 'etf_data'
train_dataset_name = dataset_name + '_train'
ds_train.register(ws, train_dataset_name, create_new_version=True)

In [None]:
#Create timestamp
#Note: all files are uploaded to timestamped subdir in AML datastore
import time

secondsSinceEpoch = time.time()
timeObj = time.localtime(secondsSinceEpoch)

# Zero-padded YYYYMMDDHHMMSS in local time. Without padding, different moments can
# collapse to the same string (e.g. Jan 11 and Nov 1 both give "...111...") and the
# values do not sort chronologically.
timestamp = time.strftime('%Y%m%d%H%M%S', timeObj)

print(timestamp)

In [None]:
# Build the run configuration and the PythonScriptStep that runs gather_data.py
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE, RunConfiguration
from azureml.pipeline.core import Pipeline, PipelineParameter
from azureml.pipeline.steps import PythonScriptStep

# Fresh run configuration; the settings below are all applied to its environment
run_config = RunConfiguration()
step_environment = run_config.environment

# Run inside Docker, based on the default CPU image
step_environment.docker.enabled = True
step_environment.docker.base_image = DEFAULT_CPU_IMAGE

# Dependencies are not user-managed: a conda environment is created for execution
# from the conda packages listed here
step_environment.python.user_managed_dependencies = False
step_environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['requests', 'pandas']
)

# Pipeline parameter forwarded to the script as --target_path
pipeline_param = PipelineParameter(name="target_path", default_value="source_data_0000")

# Single step: run gather_data.py from scripts_folder on the CPU cluster
aggregateDataStep = PythonScriptStep(
    script_name="gather_data.py",
    source_directory=scripts_folder,
    arguments=["--target_path", pipeline_param],
    compute_target=cpu_cluster,
    runconfig=run_config,
)


In [None]:
#Create pipeline, execute pipeline, and wait for response
# `steps` is documented as a list of steps, so wrap the single step in a list.
pipeline = Pipeline(workspace=ws, steps=[aggregateDataStep])

# Override the target_path pipeline parameter so this run uploads to a timestamped folder
run = experiment.submit(pipeline, pipeline_parameters={"target_path": "source_data_{}".format(timestamp)})

# Block until the run has finished, streaming its output
run.wait_for_completion(show_output=True)

In [None]:
# Publish the pipeline so that it can be triggered through its endpoint
publish_options = {
    'name': 'many_models_data_prep',
    'description': 'Gathers and organizes data for many models training job',
    'version': '1',
    'continue_on_step_failure': False,
}
published_pipeline = pipeline.publish(**publish_options)

In [None]:
#Sample remote execution
#Pipeline execution via REST endpoint requires AAD Token (obtained here from service principal)
#Relevant docs:
#https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-pipelines
#https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/manage-azureml-service/authentication-in-azureml/authentication-in-azureml.ipynb

import requests
import os
from azureml.core.authentication import ServicePrincipalAuthentication

#Service principal creds stored as environment vars
client_id = os.environ.get('client_id')
tenant_id = os.environ.get('tenant_id')
service_principal_password = os.environ.get('service_principal_password')

#Fail early with a clear message when a credential is missing, rather than letting
#a None value surface as an obscure error during token acquisition
missing_credentials = [
    variable_name
    for variable_name, value in (('client_id', client_id),
                                 ('tenant_id', tenant_id),
                                 ('service_principal_password', service_principal_password))
    if not value
]
if missing_credentials:
    raise ValueError("Missing service principal environment variable(s): {}".format(
        ", ".join(missing_credentials)))

#Leverage ADAL library for obtaining token
from adal import AuthenticationContext

client_secret = service_principal_password
#Base URL of the login authority; the tenant id is appended to form the full authority
resource_url = "https://login.microsoftonline.com"
authority = "{}/{}".format(resource_url, tenant_id)

auth_context = AuthenticationContext(authority)
token_response = auth_context.acquire_token_with_client_credentials("https://management.azure.com/", client_id, client_secret)

#Format token response for API request to pipeline
headers = {'Authorization': 'Bearer {}'.format(token_response['accessToken'])}

#Trigger remote pipeline run
#Pipeline endpoint can be obtained from AML portal as well
response = requests.post(published_pipeline.endpoint,
                         headers=headers,
                         json={"ExperimentName": "REST_Pipeline_Trigger_Test",
                               "ParameterAssignments": {"target_path": "source_data_{}".format(timestamp)}},
                         timeout=60)

#Surface a rejected request (e.g. bad token, bad payload) instead of ignoring the response
response.raise_for_status()

#Show the id of the submitted run as returned by the endpoint
run_id = response.json().get('Id')
print('Submitted pipeline run: {}'.format(run_id))

In [None]:
#Sample code for pulling and partitioning Titanic dataset

import requests
import json
import pandas as pd
from csv import reader

r = requests.get('https://dprepdata.blob.core.windows.net/demo/Titanic.csv', timeout=60)
#Fail here on an HTTP error instead of parsing an error page as CSV
r.raise_for_status()

#splitlines() copes with both CRLF and LF line endings
rows = r.text.splitlines()
#Blank lines (such as the one left by a trailing newline) are dropped so they do not
#become empty rows in the dataframe
formatted_rows = [row for row in reader(rows, skipinitialspace=True) if row]

#First row is the header, the remaining rows are the records (all values stay strings)
df = pd.DataFrame(formatted_rows[1:], columns=formatted_rows[0])

#Partition on the 'Embarked' column
df_s = df[df['Embarked']=='S']
df_c = df[df['Embarked']=='C']
df_q = df[df['Embarked']=='Q']
df_q
