In [1]:
from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset
from azureml.core import Datastore, Experiment

from azureml.pipeline.core import Pipeline, PipelineData, TrainingOutput
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core.graph import PipelineParameter
from azureml.pipeline.steps import AutoMLStep

from azureml.widgets import RunDetails

In [2]:
# NOTE(review): pd and gc are not referenced in the cells shown here — confirm they are still needed.
import pandas as pd
import gc
import warnings
# Suppress all warnings for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides warnings raised by the libraries used below — confirm this is intended.
warnings.filterwarnings('ignore')

## Identify Azure ML Workspace by configuration

In [3]:
# Initiate default workspace
# No subscription / resource-group values are passed in; the workspace is
# obtained through Workspace.from_config().
ws = Workspace.from_config()

# Default datastore
# Used below as the datastore for every PipelineData output of the pipeline.
def_blob_store = ws.get_default_datastore()

## Select Compute Target

In [4]:
# Name of the compute cluster that the next cell looks up in the workspace
# and creates under this name when the lookup fails.
cpu_cluster_name = "ml-dev-clus"

In [5]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Reuse the cluster when it is already registered in the workspace;
# otherwise provision it and block until provisioning has finished.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # Lookup failed: create the cluster, scaling between 0 and 6 nodes.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D12_V2',
        min_nodes=0,
        max_nodes=6,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
    cpu_cluster.wait_for_completion(show_output=True)
else:
    print('Found an existing cluster, using it instead.')

Found an existing cluster, using it instead.


## Create Running Environment - Docker

In [6]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE, DockerConfiguration

# Run configuration shared by all PythonScriptStep definitions below.
run_config = RunConfiguration()

# Execute the scripts inside Docker, on top of the default CPU base image.
docker_config = DockerConfiguration(use_docker=True)
run_config.docker = docker_config
run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE

# Dependencies are not user-managed: the conda and pip packages are
# declared inline here instead.
run_config.environment.python.user_managed_dependencies = False
run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas', 'scikit-learn', 'numpy'],
    pip_packages=['azureml-sdk[automl]'],
)


## Initiate Input Data Channel
- train_transaction
- train_identity

In [7]:
# Look up the two datasets by name in the workspace; both are passed as
# named inputs to the cleaning step (clean_data.py) below.
train_transaction = Dataset.get_by_name(ws, name='CIS Fraud Detection_train_transaction')
train_identity = Dataset.get_by_name(ws, name = 'CIS Fraud Detection_train_identity')

### Pipeline Step 1 : Clean Data

In [8]:
# Folder passed to the step as the location of its script.
source_directory = "./data_prep"

# Output of the cleaning step, kept on the default datastore and wrapped
# as a dataset so the column-selection step can take it as input.
cleaned_data = PipelineData("cleaned_data", datastore=def_blob_store).as_dataset()

# Step 1: clean_data.py receives both raw datasets as named inputs and is
# given the output location through --output_combine.
CleanStep = PythonScriptStep(
    script_name="clean_data.py",
    source_directory=source_directory,
    arguments=["--output_combine", cleaned_data],
    inputs=[
        train_transaction.as_named_input('input_transaction'),
        train_identity.as_named_input('input_identity'),
    ],
    outputs=[cleaned_data],
    compute_target=cpu_cluster,
    runconfig=run_config,
    allow_reuse=True,
)

### Pipeline Step 2 : Select Columns

In [9]:
# Folder passed to the step as the location of its script.
source_directory = "./data_prep"

# Output of the column-selection step, kept on the default datastore and
# wrapped as a dataset so the feature-engineering step can take it as input.
selected_data = PipelineData("selected_data", datastore=def_blob_store).as_dataset()

# Step 2: select_col.py takes the cleaned data (parsed as parquet files) as
# input and is given the output location through --output_selected.
SelectedStep = PythonScriptStep(
    script_name="select_col.py",
    source_directory=source_directory,
    arguments=["--output_selected", selected_data],
    inputs=[cleaned_data.parse_parquet_files()],
    outputs=[selected_data],
    compute_target=cpu_cluster,
    runconfig=run_config,
    allow_reuse=True,
)

### Pipeline Step 3 : Feature Engineering

In [10]:
# Folder passed to the step as the location of its script.
source_directory = "./data_prep"

# Output of the feature-engineering step; it is later parsed as parquet and
# passed to AutoML as the training data.
train_data = PipelineData("train_data", datastore=def_blob_store).as_dataset()

# Step 3: feature_engineering.py takes the selected columns (parsed as
# parquet files) as input and is given the output location through
# --output_train_data.
FeatureEngineeringStep = PythonScriptStep(
    script_name="feature_engineering.py",
    source_directory=source_directory,
    arguments=["--output_train_data", train_data],
    inputs=[selected_data.parse_parquet_files()],
    outputs=[train_data],
    compute_target=cpu_cluster,
    runconfig=run_config,
    allow_reuse=True,
)

## Create Experiment Environment

In [11]:
# Experiment under which the pipeline run is submitted further below.
# NOTE(review): despite its name, this variable holds an Experiment, not a Pipeline.
automl_fraud_pipeline = Experiment(ws, 'fraud_detection_automl_pipeline')

## AutoML Configurations
- Classification Model

In [12]:
from azureml.train.automl import AutoMLConfig

# Folder handed to AutoMLConfig through its `path` argument.
# NOTE(review): presumably this folder has to exist next to the notebook — verify.
model_folder = "./train_model"

# Search budget and evaluation settings for the AutoML run.
# NOTE(review): iteration_timeout_minutes (120) is the same length as
# experiment_timeout_hours (2 h), so a single slow iteration could take the
# whole experiment budget — confirm this is intended.
automl_settings = {
    "iteration_timeout_minutes": 120,
    "experiment_timeout_hours": 2,
    "iterations": 50,
    "max_concurrent_iterations": 4,
    "primary_metric": "AUC_weighted",
    "n_cross_validations": 4
}

# Output of the feature-engineering step, parsed as parquet files, is the
# training data for AutoML.
train_dataset = train_data.parse_parquet_files()

# The keyword must be spelled `path`; with a misspelled keyword model_folder
# is never bound to AutoMLConfig's path parameter.
automl_config = AutoMLConfig(task = "classification",
                             debug_log = 'automated_ml_errors.log',
                             path = model_folder,
                             compute_target = cpu_cluster,
                             featurization = 'auto',
                             training_data = train_dataset,   ## Input from previous pipeline
                             label_column_name = 'isFraud',
                             **automl_settings)

## Create output of AutoML
- Metrics data
- Model data

In [13]:
# Pipeline-level output names attached to the two AutoML outputs below.
automl_metrics_output_name = 'metrics_output'
automl_best_model_outputname = 'best_model_output'

# Training output of type "Metrics", kept on the default datastore.
metrics_data = PipelineData(
    name="metrics_data",
    datastore=def_blob_store,
    pipeline_output_name=automl_metrics_output_name,
    training_output=TrainingOutput(type="Metrics"),
)

# Training output of type "Model", kept on the default datastore.
model_data = PipelineData(
    name="model_data",
    datastore=def_blob_store,
    pipeline_output_name=automl_best_model_outputname,
    training_output=TrainingOutput(type="Model"),
)

In [14]:
# Pipeline step that runs the AutoML configuration and emits the metrics
# and model outputs defined above.
fraud_automl = AutoMLStep(
    name="AutoML_FraudDetect",
    automl_config=automl_config,
    outputs=[metrics_data, model_data],
    allow_reuse=True,
)

## Create Pipeline Object

In [15]:
# Only the AutoML step is listed explicitly; the submission log below shows
# that the three data-prep steps are created as part of the pipeline too.
pipeline_steps = [fraud_automl]
pipeline_automl = Pipeline(workspace=ws, steps=pipeline_steps)

# Submit under the experiment created earlier, without forcing outputs to be
# regenerated.
automl_pipeline_run = automl_fraud_pipeline.submit(
    pipeline_automl,
    regenerate_outputs=False,
)



Created step AutoML_FraudDetect [c51f2d41][3d3d6f4f-45e1-48b2-b3bc-aca5cd2ea972], (This step will run and generate new outputs)
Created step feature_engineering.py [4eb13623][47b064f3-383e-4673-abbc-8c639a2f6ccd], (This step is eligible to reuse a previous run's output)
Created step select_col.py [6e6a6361][b3fdb1e7-9b65-4484-9f0b-4acb5e980ee2], (This step is eligible to reuse a previous run's output)
Created step clean_data.py [f3f60589][f98cf3d5-ac90-495d-bb39-f8b9b3803820], (This step is eligible to reuse a previous run's output)
Submitted PipelineRun 53776c7d-c0f1-4987-bfc0-ef4c62b0a464
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/53776c7d-c0f1-4987-bfc0-ef4c62b0a464?wsid=/subscriptions/2186b060-2874-42c6-b0f7-0335ccdedb37/resourcegroups/azure-ml-eng-dev/workspaces/azureml-eng-dev-generic&tid=271d5e7b-1350-4b96-ab84-52dbda4cf40c


## RunDetails

In [16]:
# Show the run-details widget for the submitted pipeline run.
RunDetails(automl_pipeline_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

