In [None]:
# %pip install --upgrade azureml-sdk

In [None]:
import azureml
from azureml.core import Workspace, Experiment, Datastore, Environment
from azureml.core.runconfig import RunConfiguration
from azureml.data.datapath import DataPath, DataPathComputeBinding
from azureml.data.data_reference import DataReference
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.pipeline.core import Pipeline, PipelineData, PipelineParameter
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.widgets import RunDetails
from azureml.train.estimator import Estimator
import os

# Report the installed SDK version so a run of this notebook can be tied to a specific release.
print("Azure ML SDK Version: ", azureml.core.VERSION)

# Setup Variables

In [None]:
# SECURITY: never store the storage account key in the notebook itself.
# A key was previously hard-coded in this cell; treat that key as compromised
# and rotate it in the Azure portal.
#
# Preferred: export STORAGE_ACCOUNT_KEY in the environment before starting
# Jupyter. If it is not set, prompt for it so the value lives only in the
# kernel's memory and is never written to the .ipynb file or its outputs.
from getpass import getpass

if not os.environ.get('STORAGE_ACCOUNT_KEY'):
    os.environ['STORAGE_ACCOUNT_KEY'] = getpass('Storage account key: ')

In [None]:
# Notebook-wide settings; later cells read these names directly.

# Datastore registration
datastorename = 'seerdata'
datastorepath = 'hardware'

# Backing blob storage (the key is read from the environment, never hard-coded here)
containername = 'seer-container'
storageaccountname = 'aiml50njs1storage'
storageaccountkey = os.getenv('STORAGE_ACCOUNT_KEY')

# Compute target name
computetarget = 'twtcluster'

# Register/Reference a Datastore

In [None]:
# workspace
# Load the workspace described by the local config file, then list the
# datastores currently registered in it.
ws = Workspace.from_config(path='./azureml-config.json')
print(ws.datastores)

In [None]:
# See if that datastore already exists and unregister it if so
# Re-register the datastore from scratch: drop any existing registration under
# `datastorename`, then register the blob container with the current settings.
try:
    existing_datastore = ws.datastores[datastorename]
except KeyError:
    # Only a missing name means "nothing to remove". Any other failure
    # (authentication, network, ...) propagates instead of being reported
    # as a missing datastore.
    print('Data store doesn\'t exist, no need to remove')
else:
    print('Unregistering existing datastore')
    # A failed unregister now surfaces here rather than being swallowed and
    # followed by a registration attempt against a stale entry.
    existing_datastore.unregister()

# register the datastore
datastore = Datastore.register_azure_blob_container(workspace=ws,
                                        datastore_name=datastorename,
                                        container_name=containername,
                                        account_name=storageaccountname,
                                        account_key=storageaccountkey,
                                        create_if_not_exists=True)

print('Datastore registered: ', datastore)

In [None]:
# data
datastore = ws.datastores['seerdata']
datareference = DataReference(datastore=datastore, 
                    data_reference_name="seerdata", 
                    path_on_datastore=datastorepath)


# Create Compute Resources

In [None]:
# Reuse the compute target named `computetarget` when the workspace already has
# one; otherwise provision a new AmlCompute cluster under that name.
# NOTE(review): STANDARD_NC6 is a GPU VM size, so the `cpu_cluster` name is
# misleading; it is kept unchanged in case other cells reference it.
try:
    cpu_cluster = ComputeTarget(name=computetarget, workspace=ws)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        min_nodes=1,
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(
        workspace=ws,
        name=computetarget,
        provisioning_configuration=compute_config,
    )

# Wait for any in-flight provisioning to finish, then fetch the target from
# the workspace by name for use in the pipeline steps.
cpu_cluster.wait_for_completion(show_output=True)
compute = ws.compute_targets[computetarget]

print('Compute registered: ', compute)

# Define Pipeline!

The following will be created and then run:

  1. Pipeline Parameters
  2. Data Process Step
  3. Training Step
  4. Model Registration Step
  5. Pipeline creation and publishing
  6. Submit the pipeline for execution


## Pipeline Parameters
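We need to tell the Pipeline where its training data lives, i.e. what it needs to learn to see! The `data` parameter defaults to the configured path on the datastore and is mounted on the compute target.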

In [None]:
# The "data" pipeline parameter: a path on the datastore (defaulting to
# `datastorepath`) paired with a binding that mounts it on the compute target.
datapath = DataPath(path_on_datastore=datastorepath, datastore=datastore)
data_path_pipeline_param = (
    PipelineParameter(name="data", default_value=datapath),
    DataPathComputeBinding(mode='mount'),
)
print(data_path_pipeline_param)

# Run configuration shared by the data prep and training steps; its
# environment is built from the pip requirements file.
dataprepRunConfig = RunConfiguration()
dataprepEnvironment = Environment.from_pip_requirements(
    name='dataprepenv',
    file_path='requirements-dataprepandtraining.txt',
)
dataprepRunConfig.environment = dataprepEnvironment

## Data Process Step

In [None]:
# Directory output of the preparation step, consumed by the training step.
seer_tfrecords = PipelineData("tfrecords_set", datastore=datastore, is_directory=True)

# Step 1: run parse.py against the mounted "data" parameter and write its
# results to `seer_tfrecords`.
prepStep = PythonScriptStep(
    name='Data Preparation',
    script_name='parse.py',
    source_directory='.',
    runconfig=dataprepRunConfig,
    compute_target=compute,
    inputs=[data_path_pipeline_param],
    outputs=[seer_tfrecords],
    arguments=[
        "--source_path", data_path_pipeline_param,
        "--target_path", seer_tfrecords,
    ],
)

print(prepStep)

## Training Step

In [None]:
# Directory output of the training step, consumed by the registration step.
seer_training = PipelineData("train", datastore=datastore, is_directory=True)

# Step 2: run train.py on the prepared records. It reuses the data prep run
# configuration; epochs, batch and lr are passed as script arguments.
trainStep = PythonScriptStep(
    'train.py',
    name='Model Training',
    source_directory='.',
    compute_target=compute,
    runconfig=dataprepRunConfig,
    inputs=[seer_tfrecords],
    outputs=[seer_training],
    arguments=[
        "--source_path", seer_tfrecords,
        "--target_path", seer_training,
        "--epochs", 5,
        "--batch", 10,
        "--lr", 0.001,
    ],
)

print(trainStep)

## Model Registration Step

In [None]:
# The registration step gets its own run configuration, with an environment
# built from a separate pip requirements file.
registerRunConfig = RunConfiguration()
registerEnvironment = Environment.from_pip_requirements(
    name='registerenv',
    file_path='requirements-registration.txt',
)
registerRunConfig.environment = registerEnvironment

# Directory output of the registration step.
seer_model = PipelineData("model", datastore=datastore, is_directory=True)

# Step 3: run register.py on the training output.
registerStep = PythonScriptStep(
    name='Model Registration',
    script_name='register.py',
    source_directory='.',
    runconfig=registerRunConfig,
    compute_target=compute,
    inputs=[seer_training],
    outputs=[seer_model],
    arguments=[
        "--source_path", seer_training,
        "--target_path", seer_model,
    ],
)

print(registerStep)

## Create and publish the Pipeline

In [None]:
# Assemble the three steps into one pipeline, then publish it to the workspace.
pipeline = Pipeline(
    workspace=ws,
    steps=[prepStep, trainStep, registerStep],
)

published_pipeline = pipeline.publish(
    description="Transfer learned image classifier. Uses folders as labels.",
    name="Seer Pipeline",
)

In [None]:
# Submit the pipeline to be run
pipeline_run = Experiment(ws, 'seer',).submit(published_pipeline)
print('Run created with ID: ', pipeline_run.id)

RunDetails(pipeline_run).show()