# Orchestrate Jobs to Train and Evaluate Models with Amazon SageMaker Pipelines


## Dataset

In [2]:
# !pip install -U sagemaker

In [3]:
!pip install opendatasets

[0m

In [4]:
!pip install imblearn

[0m

In [5]:
import sys
import os
import boto3
import sagemaker
import pandas as pd
from sagemaker.workflow.pipeline_context import PipelineSession

from imblearn.combine import SMOTEENN
import opendatasets as od
from sklearn.model_selection import train_test_split

sagemaker_session = sagemaker.session.Session()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()
pipeline_session = PipelineSession()
default_bucket = sagemaker_session.default_bucket()
model_package_group_name = f"DiabetesModelPackageGroupName"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


Now, upload the data into the default bucket. 

In [6]:
!mkdir -p data

In [7]:
od.download('https://www.kaggle.com/alexteboul/diabetes-health-indicators-dataset')
os.listdir('diabetes-health-indicators-dataset')

Skipping, found downloaded files in "./diabetes-health-indicators-dataset" (use force=True to force download)


['diabetes_012_health_indicators_BRFSS2015.csv',
 'diabetes_binary_health_indicators_BRFSS2015.csv',
 'diabetes_binary_5050split_health_indicators_BRFSS2015.csv']

In [8]:
# Since we get a headerless CSV file, we specify the column names here.
feature_names = [
    'HighBP', 
    'HighChol', 
    'CholCheck', 
    'BMI', 
    'Smoker',
    'Stroke', 
    'HeartDiseaseorAttack', 
    'PhysActivity', 
    'Fruits', 
    'Veggies',
    'HvyAlcoholConsump', 
    'AnyHealthcare', 
    'NoDocbcCost', 
    'GenHlth',
    'MentHlth', 
    'PhysHlth', 
    'DiffWalk', 
    'Sex', 
    'Age', 
    'Education',
    'Income'
]

In [9]:
dataset = pd.read_csv('./diabetes-health-indicators-dataset/diabetes_binary_health_indicators_BRFSS2015.csv')

dataClean = dataset.drop_duplicates()

X = dataClean.drop(['Diabetes_binary'], axis = 1).values
y = dataClean['Diabetes_binary']

# SMOTEENN
smoteenn = SMOTEENN(random_state=42)
X_resampled_smoteenn, y_resampled_smoteenn = smoteenn.fit_resample(X, y)

X_batch, _ = train_test_split(X_resampled_smoteenn, train_size=0.10, random_state=42)
batch_df = pd.DataFrame(X_batch, columns=feature_names)

In [10]:
# Convert y_resampled_smoteenn to a DataFrame
y_resampled_df = pd.DataFrame(y_resampled_smoteenn, columns=['Diabetes_binary'])

# Convert X_resampled_smoteenn to a DataFrame
X_resampled_df = pd.DataFrame(X_resampled_smoteenn, columns=feature_names)

# Concatenate features and labels
df = pd.concat([y_resampled_df, X_resampled_df], axis=1)

# Use 'csv' format to store the data
# The first column is expected to be the output column
df.to_csv('data/diabetes-dataset.csv', header=False)
batch_df.to_csv('data/diabetes-dataset-batch.csv', index=False, header=False)

In [11]:
local_path = "data/diabetes-dataset.csv"
base_uri = f"s3://{default_bucket}/diabetes"
input_data_uri = sagemaker.s3.S3Uploader.upload(
    local_path=local_path,
    desired_s3_uri=base_uri,
)
print(input_data_uri)

s3://sagemaker-us-east-1-741135916424/diabetes/diabetes-dataset.csv


Download a second dataset for batch transformation after model creation.

In [12]:
local_path = "data/diabetes-dataset-batch.csv"
base_uri = f"s3://{default_bucket}/diabetes"
batch_data_uri = sagemaker.s3.S3Uploader.upload(
    local_path=local_path,
    desired_s3_uri=base_uri,
)
print(batch_data_uri)

s3://sagemaker-us-east-1-741135916424/diabetes/diabetes-dataset-batch.csv


## Define Parameters to Parametrize Pipeline Execution

In [13]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)

processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
model_approval_status = ParameterString(
    name="ModelApprovalStatus", default_value="PendingManualApproval"
)
input_data = ParameterString(
    name="InputData",
    default_value=input_data_uri,
)
batch_data = ParameterString(
    name="BatchData",
    default_value=batch_data_uri,
)
mse_threshold = ParameterFloat(name="MseThreshold", default_value=6.0)

![Define Parameters](img/pipeline-1.png)

## Define a Processing Step for Feature Engineering

In [14]:
!mkdir -p code

In [15]:
%%writefile code/preprocessing.py
import argparse
import os
import requests
import tempfile

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Since we get a headerless CSV file, we specify the column names here.
feature_columns_names = [
    'HighBP', 
    'HighChol', 
    'CholCheck', 
    'BMI', 
    'Smoker',
    'Stroke', 
    'HeartDiseaseorAttack', 
    'PhysActivity', 
    'Fruits', 
    'Veggies',
    'HvyAlcoholConsump', 
    'AnyHealthcare', 
    'NoDocbcCost', 
    'GenHlth',
    'MentHlth', 
    'PhysHlth', 
    'DiffWalk', 
    'Sex', 
    'Age', 
    'Education',
    'Income'
]
label_column = "Diabetes_binary"

feature_columns_dtype = {
    'HighBP': np.float64, 
    'HighChol': np.float64, 
    'CholCheck': np.float64, 
    'BMI': np.float64, 
    'Smoker': np.float64,
    'Stroke': np.float64, 
    'HeartDiseaseorAttack': np.float64, 
    'PhysActivity': np.float64, 
    'Fruits': np.float64, 
    'Veggies': np.float64,
    'HvyAlcoholConsump': np.float64, 
    'AnyHealthcare': np.float64, 
    'NoDocbcCost': np.float64, 
    'GenHlth': np.float64,
    'MentHlth': np.float64, 
    'PhysHlth': np.float64, 
    'DiffWalk': np.float64, 
    'Sex': np.float64, 
    'Age': np.float64, 
    'Education': np.float64,
    'Income': np.float64
}
label_column_dtype = {"Diabetes_binary": np.float64}


def merge_two_dicts(x, y):
    z = x.copy()
    z.update(y)
    return z


if __name__ == "__main__":
    base_dir = "/opt/ml/processing"

    df = pd.read_csv(
        f"{base_dir}/input/diabetes-dataset.csv",
        header=None,
        names=feature_columns_names + [label_column],
        dtype=merge_two_dicts(feature_columns_dtype, label_column_dtype),
    )
    numeric_features = list(feature_columns_names)
    numeric_transformer = Pipeline(
        steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
    )

    preprocess = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
        ]
    )

    y = df.pop("Diabetes_binary")
    X_pre = preprocess.fit_transform(df)
    y_pre = y.to_numpy().reshape(len(y), 1)

    X = np.concatenate((y_pre, X_pre), axis=1)

    np.random.shuffle(X)
    train, validation, test = np.split(X, [int(0.7 * len(X)), int(0.85 * len(X))])

    pd.DataFrame(train).to_csv(f"{base_dir}/train/train.csv", header=False, index=False)
    pd.DataFrame(validation).to_csv(
        f"{base_dir}/validation/validation.csv", header=False, index=False
    )
    pd.DataFrame(test).to_csv(f"{base_dir}/test/test.csv", header=False, index=False)

Overwriting code/preprocessing.py


### Create an instance of a `SKLearnProcessor`.

In [16]:
from sagemaker.sklearn.processing import SKLearnProcessor


framework_version = "1.2-1"

sklearn_processor = SKLearnProcessor(
    framework_version=framework_version,
    instance_type="ml.m5.xlarge",
    instance_count=processing_instance_count,
    base_job_name="sklearn-diabetes-process",
    role=role,
    sagemaker_session=pipeline_session,
)

In [17]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

processor_args = sklearn_processor.run(
    inputs=[
        ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
    ],
    outputs=[
        ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
        ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
        ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
    ],
    code="code/preprocessing.py",
)

step_process = ProcessingStep(name="DiabetesProcess", step_args=processor_args)



## Define a Training Step to Train a Model

In [18]:
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput

model_path = f"s3://{default_bucket}/DiabetesTrain"
image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.0-1",
    py_version="py3",
    instance_type="ml.m5.xlarge",
)
xgb_train = Estimator(
    image_uri=image_uri,
    instance_type=instance_type,
    instance_count=1,
    output_path=model_path,
    role=role,
    sagemaker_session=pipeline_session,
)
xgb_train.set_hyperparameters(
    objective="reg:linear",
    num_round=50,
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.7,
)

train_args = xgb_train.fit(
    inputs={
        "train": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
            content_type="text/csv",
        ),
        "validation": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                "validation"
            ].S3Output.S3Uri,
            content_type="text/csv",
        ),
    }
)

In [19]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep


step_train = TrainingStep(
    name="DiabetesTrain",
    step_args=train_args,
)

## Define a Model Evaluation Step to Evaluate the Trained Model

First, develop an evaluation script that is specified in a Processing step that performs the model evaluation.

After pipeline execution, you can examine the resulting `evaluation.json` for analysis.

The evaluation script uses `xgboost` to do the following:

* Load the model.
* Read the test data.
* Issue predictions against the test data.
* Build a classification report, including accuracy and ROC curve.
* Save the evaluation report to the evaluation directory.

In [20]:
%%writefile code/evaluation.py
import json
import pathlib
import pickle
import tarfile

import joblib
import numpy as np
import pandas as pd
import xgboost

from sklearn.metrics import mean_squared_error


if __name__ == "__main__":
    model_path = f"/opt/ml/processing/model/model.tar.gz"
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")

    model = pickle.load(open("xgboost-model", "rb"))

    test_path = "/opt/ml/processing/test/test.csv"
    df = pd.read_csv(test_path, header=None)

    y_test = df.iloc[:, 0].to_numpy()
    df.drop(df.columns[0], axis=1, inplace=True)

    X_test = xgboost.DMatrix(df.values)

    predictions = model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    std = np.std(y_test - predictions)
    report_dict = {
        "regression_metrics": {
            "mse": {"value": mse, "standard_deviation": std},
        },
    }

    output_dir = "/opt/ml/processing/evaluation"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    evaluation_path = f"{output_dir}/evaluation.json"
    with open(evaluation_path, "w") as f:
        f.write(json.dumps(report_dict))

Overwriting code/evaluation.py


Next, create an instance of a `ScriptProcessor` processor and use it in the `ProcessingStep`.

In [21]:
from sagemaker.processing import ScriptProcessor


script_eval = ScriptProcessor(
    image_uri=image_uri,
    command=["python3"],
    instance_type="ml.m5.xlarge",
    instance_count=1,
    base_job_name="script-diabetes-eval",
    role=role,
    sagemaker_session=pipeline_session,
)

eval_args = script_eval.run(
    inputs=[
        ProcessingInput(
            source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
            destination="/opt/ml/processing/model",
        ),
        ProcessingInput(
            source=step_process.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
            destination="/opt/ml/processing/test",
        ),
    ],
    outputs=[
        ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
    ],
    code="code/evaluation.py",
)

Use the processor's arguments returned by `.run()` to construct a `ProcessingStep`, along with the input and output channels and the code that will be executed when the pipeline invokes pipeline execution.

In [22]:
from sagemaker.workflow.properties import PropertyFile


evaluation_report = PropertyFile(
    name="EvaluationReport", output_name="evaluation", path="evaluation.json"
)
step_eval = ProcessingStep(
    name="DiabetesEval",
    step_args=eval_args,
    property_files=[evaluation_report],
)

## Define a Create Model Step to Create a Model

In [23]:
from sagemaker.model import Model

model = Model(
    image_uri=image_uri,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    sagemaker_session=pipeline_session,
    role=role,
)

Define the `ModelStep` by providing the return values from `model.create()` as the step arguments.

In [24]:
from sagemaker.inputs import CreateModelInput
from sagemaker.workflow.model_step import ModelStep

step_create_model = ModelStep(
    name="DiabetesCreateModel",
    step_args=model.create(instance_type="ml.m5.large", accelerator_type="ml.eia1.medium"),
)

## Define a Transform Step to Perform Batch Transformation

In [25]:
from sagemaker.transformer import Transformer


transformer = Transformer(
    model_name=step_create_model.properties.ModelName,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    output_path=f"s3://{default_bucket}/DiabetesTransform",
)

Pass in the transformer instance and the `TransformInput` with the `batch_data` pipeline parameter defined earlier.

In [26]:
from sagemaker.inputs import TransformInput
from sagemaker.workflow.steps import TransformStep


step_transform = TransformStep(
    name="DiabetesTransform", transformer=transformer, inputs=TransformInput(data=batch_data)
)

## Define a Register Model Step to Create a Model Package

In [27]:
from sagemaker.model_metrics import MetricsSource, ModelMetrics

model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri="{}/evaluation.json".format(
            step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
        ),
        content_type="application/json",
    )
)

register_args = model.register(
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
    transform_instances=["ml.m5.xlarge"],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics,
)
step_register = ModelStep(name="DiabetesRegisterModel", step_args=register_args)



## Define a Fail Step to Terminate the Pipeline Execution and Mark it as Failed

In [28]:
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.functions import Join

step_fail = FailStep(
    name="DiabetesMSEFail",
    error_message=Join(on=" ", values=["Execution failed due to MSE >", mse_threshold]),
)

## Define a Condition Step to Check Accuracy and Conditionally Create a Model and Run a Batch Transformation and Register a Model in the Model Registry, Or Terminate the Execution in Failed State

In [29]:
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet


cond_lte = ConditionLessThanOrEqualTo(
    left=JsonGet(
        step_name=step_eval.name,
        property_file=evaluation_report,
        json_path="regression_metrics.mse.value",
    ),
    right=mse_threshold,
)

step_cond = ConditionStep(
    name="DiabetesMSECond",
    conditions=[cond_lte],
    if_steps=[step_register, step_create_model, step_transform],
    else_steps=[step_fail],
)

## Define a Pipeline of Parameters, Steps, and Conditions

In [30]:
from sagemaker.workflow.pipeline import Pipeline


pipeline_name = f"DiabetesPipeline"
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        processing_instance_count,
        instance_type,
        model_approval_status,
        input_data,
        batch_data,
        mse_threshold,
    ],
    steps=[step_process, step_train, step_eval, step_cond],
)

### Examining the pipeline definition

In [31]:
import json


definition = json.loads(pipeline.definition())
definition



{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceCount',
   'Type': 'Integer',
   'DefaultValue': 1},
  {'Name': 'TrainingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'ModelApprovalStatus',
   'Type': 'String',
   'DefaultValue': 'PendingManualApproval'},
  {'Name': 'InputData',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-us-east-1-741135916424/diabetes/diabetes-dataset.csv'},
  {'Name': 'BatchData',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-us-east-1-741135916424/diabetes/diabetes-dataset-batch.csv'},
  {'Name': 'MseThreshold', 'Type': 'Float', 'DefaultValue': 6.0}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'DiabetesProcess',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': 'ml.m5.xlarge',
      'InstanceCount': {'Ge

## Submit the pipeline to SageMaker and start execution

Submit the pipeline definition to the Pipeline service. The Pipeline service uses the role that is passed in to create all the jobs defined in the steps.

In [32]:
pipeline.upsert(role_arn=role)



{'PipelineArn': 'arn:aws:sagemaker:us-east-1:741135916424:pipeline/DiabetesPipeline',
 'ResponseMetadata': {'RequestId': '56fcd81b-d5d8-4aa9-b298-e3f3765cc614',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '56fcd81b-d5d8-4aa9-b298-e3f3765cc614',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '84',
   'date': 'Mon, 10 Jun 2024 00:25:17 GMT'},
  'RetryAttempts': 0}}

Start the pipeline and accept all the default parameters.

In [33]:
execution = pipeline.start()

## Pipeline Operations: Examining and Waiting for Pipeline Execution

Describe the pipeline execution.

In [34]:
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:741135916424:pipeline/DiabetesPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:741135916424:pipeline/DiabetesPipeline/execution/cxsii09n2x5n',
 'PipelineExecutionDisplayName': 'execution-1717979118000',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2024, 6, 10, 0, 25, 17, 940000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 6, 10, 0, 25, 17, 940000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:741135916424:user-profile/d-3sy8bv7rmcro/sostrovsky',
  'UserProfileName': 'sostrovsky',
  'DomainId': 'd-3sy8bv7rmcro',
  'IamIdentity': {'Arn': 'arn:aws:sts::741135916424:assumed-role/LabRole/SageMaker',
   'PrincipalId': 'AROA2ZDZETWEIWGGTF266:SageMaker'}},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:741135916424:user-profile/d-3sy8bv7rmcro/sostrovsky',
  'UserProfileName': 'sostrovsky',
  'DomainId': 'd-3sy8bv7rmcro',
  'IamId

Wait for the execution to complete.

In [35]:
try:
    execution.wait()
except Exception as error:
    print(error)

Waiter PipelineExecutionComplete failed: Waiter encountered a terminal failure state: For expression "PipelineExecutionStatus" we matched expected path: "Failed"


List the steps in the execution. These are the steps in the pipeline that have been resolved by the step executor service.

In [36]:
execution.list_steps()

[{'StepName': 'DiabetesTransform',
  'StartTime': datetime.datetime(2024, 6, 10, 0, 38, 26, 87000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 6, 10, 0, 43, 54, 613000, tzinfo=tzlocal()),
  'StepStatus': 'Failed',
  'FailureReason': 'ClientError: ClientError: See job logs for more information',
  'Metadata': {'TransformJob': {'Arn': 'arn:aws:sagemaker:us-east-1:741135916424:transform-job/pipelines-cxsii09n2x5n-DiabetesTransform-BPrK1KHxuz'}},
  'AttemptCount': 1},
 {'StepName': 'DiabetesCreateModel-CreateModel',
  'StartTime': datetime.datetime(2024, 6, 10, 0, 38, 24, 78000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 6, 10, 0, 38, 25, 97000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'Model': {'Arn': 'arn:aws:sagemaker:us-east-1:741135916424:model/pipelines-cxsii09n2x5n-DiabetesCreateModel--dl7FtxpRqv'}},
  'AttemptCount': 1},
 {'StepName': 'DiabetesRegisterModel-RegisterModel',
  'StartTime': datetime.datetime(2024, 6, 10, 0, 38, 24, 78000,

### Examining the Evaluation

Examine the resulting model evaluation after the pipeline completes. Download the resulting `evaluation.json` file from S3 and print the report.

In [37]:
from pprint import pprint


evaluation_json = sagemaker.s3.S3Downloader.read_file(
    "{}/evaluation.json".format(
        step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
    )
)
pprint(json.loads(evaluation_json))



{'regression_metrics': {'mse': {'standard_deviation': 1.6448157686436107,
                                'value': 2.705418933539871}}}


### Lineage

Review the lineage of the artifacts generated by the pipeline.

In [38]:
import time
from sagemaker.lineage.visualizer import LineageTableVisualizer


viz = LineageTableVisualizer(sagemaker.session.Session())
for execution_step in reversed(execution.list_steps()):
    print(execution_step)
    display(viz.show(pipeline_execution_step=execution_step))
    time.sleep(5)

{'StepName': 'DiabetesProcess', 'StartTime': datetime.datetime(2024, 6, 10, 0, 25, 19, 573000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2024, 6, 10, 0, 30, 47, 406000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:741135916424:processing-job/pipelines-cxsii09n2x5n-DiabetesProcess-5rl3M6Vx3Q'}}, 'AttemptCount': 1}


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...171de73ed36b4bac27aa0eb/preprocessing.py,Input,DataSet,ContributedTo,artifact
1,s3://...1135916424/diabetes/diabetes-dataset.csv,Input,DataSet,ContributedTo,artifact
2,68331...com/sagemaker-scikit-learn:1.2-1-cpu-py3,Input,Image,ContributedTo,artifact
3,s3://...cxsii09n2x5n/DiabetesProcess/output/test,Output,DataSet,Produced,artifact
4,s3://...9n2x5n/DiabetesProcess/output/validation,Output,DataSet,Produced,artifact
5,s3://...xsii09n2x5n/DiabetesProcess/output/train,Output,DataSet,Produced,artifact


{'StepName': 'DiabetesTrain', 'StartTime': datetime.datetime(2024, 6, 10, 0, 30, 48, 155000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2024, 6, 10, 0, 32, 55, 650000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'TrainingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:741135916424:training-job/pipelines-cxsii09n2x5n-DiabetesTrain-LhsvUOcFLs'}}, 'AttemptCount': 1}


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...9n2x5n/DiabetesProcess/output/validation,Input,DataSet,ContributedTo,artifact
1,s3://...xsii09n2x5n/DiabetesProcess/output/train,Input,DataSet,ContributedTo,artifact
2,68331...naws.com/sagemaker-xgboost:1.0-1-cpu-py3,Input,Image,ContributedTo,artifact
3,s3://...etesTrain-LhsvUOcFLs/output/model.tar.gz,Output,Model,Produced,artifact


{'StepName': 'DiabetesEval', 'StartTime': datetime.datetime(2024, 6, 10, 0, 32, 56, 134000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2024, 6, 10, 0, 38, 21, 999000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:741135916424:processing-job/pipelines-cxsii09n2x5n-DiabetesEval-z3O8OYhCvj'}}, 'AttemptCount': 1}


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...1efdc9e2d552feb7e97dadd942/evaluation.py,Input,DataSet,ContributedTo,artifact
1,s3://...cxsii09n2x5n/DiabetesProcess/output/test,Input,DataSet,ContributedTo,artifact
2,s3://...etesTrain-LhsvUOcFLs/output/model.tar.gz,Input,Model,ContributedTo,artifact
3,68331...naws.com/sagemaker-xgboost:1.0-1-cpu-py3,Input,Image,ContributedTo,artifact
4,s3://...024-06-10-00-25-15-485/output/evaluation,Output,DataSet,Produced,artifact


{'StepName': 'DiabetesMSECond', 'StartTime': datetime.datetime(2024, 6, 10, 0, 38, 23, 55000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2024, 6, 10, 0, 38, 23, 527000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'Condition': {'Outcome': 'True'}}, 'AttemptCount': 1}


None

{'StepName': 'DiabetesRegisterModel-RegisterModel', 'StartTime': datetime.datetime(2024, 6, 10, 0, 38, 24, 78000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2024, 6, 10, 0, 38, 24, 887000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'RegisterModel': {'Arn': 'arn:aws:sagemaker:us-east-1:741135916424:model-package/DiabetesModelPackageGroupName/9'}}, 'AttemptCount': 1}


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...etesTrain-LhsvUOcFLs/output/model.tar.gz,Input,Model,ContributedTo,artifact
1,68331...naws.com/sagemaker-xgboost:1.0-1-cpu-py3,Input,Image,ContributedTo,artifact
2,DiabetesModelPackageGroupName-9-PendingManualA...,Input,Approval,ContributedTo,action
3,DiabetesModelPackageGroupName-1717966411-aws-m...,Output,ModelGroup,AssociatedWith,context


{'StepName': 'DiabetesCreateModel-CreateModel', 'StartTime': datetime.datetime(2024, 6, 10, 0, 38, 24, 78000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2024, 6, 10, 0, 38, 25, 97000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'Model': {'Arn': 'arn:aws:sagemaker:us-east-1:741135916424:model/pipelines-cxsii09n2x5n-DiabetesCreateModel--dl7FtxpRqv'}}, 'AttemptCount': 1}


None

{'StepName': 'DiabetesTransform', 'StartTime': datetime.datetime(2024, 6, 10, 0, 38, 26, 87000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2024, 6, 10, 0, 43, 54, 613000, tzinfo=tzlocal()), 'StepStatus': 'Failed', 'FailureReason': 'ClientError: ClientError: See job logs for more information', 'Metadata': {'TransformJob': {'Arn': 'arn:aws:sagemaker:us-east-1:741135916424:transform-job/pipelines-cxsii09n2x5n-DiabetesTransform-BPrK1KHxuz'}}, 'AttemptCount': 1}


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...etesTrain-LhsvUOcFLs/output/model.tar.gz,Input,Model,ContributedTo,artifact
1,68331...naws.com/sagemaker-xgboost:1.0-1-cpu-py3,Input,Image,ContributedTo,artifact
2,s3://...6424/diabetes/diabetes-dataset-batch.csv,Input,DataSet,ContributedTo,artifact
3,s3://...us-east-1-741135916424/DiabetesTransform,Output,DataSet,Produced,artifact


### Parametrized Executions

In [39]:
execution = pipeline.start(
    parameters=dict(
        ModelApprovalStatus="Approved",
    )
)

In [40]:
try:
    execution.wait()
except Exception as error:
    print(error)

Waiter PipelineExecutionComplete failed: Waiter encountered a terminal failure state: For expression "PipelineExecutionStatus" we matched expected path: "Failed"


In [41]:
execution.list_steps()

[{'StepName': 'DiabetesTransform',
  'StartTime': datetime.datetime(2024, 6, 10, 0, 57, 36, 88000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 6, 10, 1, 3, 6, 343000, tzinfo=tzlocal()),
  'StepStatus': 'Failed',
  'FailureReason': 'ClientError: ClientError: See job logs for more information',
  'Metadata': {'TransformJob': {'Arn': 'arn:aws:sagemaker:us-east-1:741135916424:transform-job/pipelines-x1c300xbwm4g-DiabetesTransform-kuuxrCQgnK'}},
  'AttemptCount': 1},
 {'StepName': 'DiabetesCreateModel-CreateModel',
  'StartTime': datetime.datetime(2024, 6, 10, 0, 57, 34, 84000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 6, 10, 0, 57, 35, 477000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'Model': {'Arn': 'arn:aws:sagemaker:us-east-1:741135916424:model/pipelines-x1c300xbwm4g-DiabetesCreateModel--yBnpTb28gj'}},
  'AttemptCount': 1},
 {'StepName': 'DiabetesRegisterModel-RegisterModel',
  'StartTime': datetime.datetime(2024, 6, 10, 0, 57, 34, 84000, 

Apart from that, you might also want to adjust the MSE threshold to a smaller value and raise the bar for the accuracy of the registered model. In this case you can override the MSE threshold like the following:

In [42]:
execution = pipeline.start(parameters=dict(MseThreshold=3.0))

If the MSE threshold is not satisfied, the pipeline execution enters the `FailStep` and is marked as failed.

In [43]:
try:
    execution.wait()
except Exception as error:
    print(error)

Waiter PipelineExecutionComplete failed: Waiter encountered a terminal failure state: For expression "PipelineExecutionStatus" we matched expected path: "Failed"


In [44]:
execution.list_steps()

[{'StepName': 'DiabetesTransform',
  'StartTime': datetime.datetime(2024, 6, 10, 1, 17, 15, 54000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 6, 10, 1, 22, 34, 330000, tzinfo=tzlocal()),
  'StepStatus': 'Failed',
  'FailureReason': 'ClientError: ClientError: See job logs for more information',
  'Metadata': {'TransformJob': {'Arn': 'arn:aws:sagemaker:us-east-1:741135916424:transform-job/pipelines-49mz8d35jsio-DiabetesTransform-P3qd5Obkvz'}},
  'AttemptCount': 1},
 {'StepName': 'DiabetesCreateModel-CreateModel',
  'StartTime': datetime.datetime(2024, 6, 10, 1, 17, 13, 504000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 6, 10, 1, 17, 14, 626000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'Model': {'Arn': 'arn:aws:sagemaker:us-east-1:741135916424:model/pipelines-49mz8d35jsio-DiabetesCreateModel--j8rTFbFmRi'}},
  'AttemptCount': 1},
 {'StepName': 'DiabetesRegisterModel-RegisterModel',
  'StartTime': datetime.datetime(2024, 6, 10, 1, 17, 13, 5040