# AAI-540 Group 6 Final Project CI/CD Pipeline

Authors: Alden Caterio, Gary Takahashi, Paul Parks

This notebook defines and runs the SageMaker CI/CD pipeline for our email spam detection model.

In [111]:
%pip install --disable-pip-version-check -q PyAthena
%pip install --disable-pip-version-check -q awswrangler

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [112]:
import os
import pandas as pd
import numpy as np
import boto3
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker import get_execution_role
from time import gmtime, strftime, time
from botocore.client import ClientError
from pyathena import connect
import pandas as pd
import boto3
import sagemaker
import awswrangler as wr
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sagemaker.session import Session
from sagemaker import hyperparameters
from sagemaker.tuner import ContinuousParameter, IntegerParameter, HyperparameterTuner
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep, TuningStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker import Model
from sagemaker import image_uris, model_uris, script_uris
from sagemaker.workflow.model_step import ModelStep
from sagemaker.processing import (
    ProcessingInput,
    ProcessingOutput,
    Processor,
    ScriptProcessor,
)
from sagemaker.inputs import TrainingInput
from sagemaker.model_metrics import (
    MetricsSource,
    ModelMetrics,
)
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.functions import Join

In [113]:
# Region comes from the default boto3 session.
session = boto3.Session()
region = session.region_name

# SageMaker session, execution role and the session's default S3 bucket.
sagemaker_session = sagemaker.Session()
role = get_execution_role()
default_bucket = sagemaker_session.default_bucket()

# Model package group name handed to model.register() further down.
model_package_group_name = "EmailSpamDetectionGroup"

In [114]:
# Session handed to every processor/estimator/model that feeds a pipeline step.
pipeline_session = PipelineSession()

# Dataset location: local copy and the S3 prefix it is uploaded under.
filename = 'CEAS_08.csv'
local_csv_path = './dataset/' + filename
base_uri = f"s3://{default_bucket}/email-spam-detection"

# Set to False to skip the upload when the CSV is already present under base_uri.
upload_dataset_to_s3 = True
if upload_dataset_to_s3:
    input_data_uri = sagemaker.s3.S3Uploader.upload(
        local_path=local_csv_path,
        desired_s3_uri=base_uri,
    )
else:
    # Without this branch input_data_uri would be undefined and the InputData
    # parameter cell would fail with NameError. The uploader stores the file
    # as <base_uri>/<filename>, so point at that same key.
    input_data_uri = f"{base_uri}/{filename}"

# Setup the Pipeline Inputs

In [115]:
# Pipeline parameters that can be overridden per execution via pipeline.start(parameters=...)
from sagemaker.workflow.parameters import (
    ParameterFloat,
    ParameterInteger,
    ParameterString,
)

# S3 URI of the raw dataset consumed by the preprocessing step.
input_data = ParameterString(name="InputData", default_value=input_data_uri)

# Approval status given to the model package when it is registered.
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval",
)

# F1 cut-off that the condition step compares the evaluation report against.
f1_threshold = ParameterFloat(name="F1Threshold", default_value=0.8)


In [116]:
import os

# Directory that receives the scripts written by the %%writefile cells below.
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs("code", exist_ok=True)

# PreProcessing step for Spam Dataset

In [117]:
%%writefile code/preprocessing.py
import argparse
import os
import requests
import tempfile

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

if __name__ == "__main__":
    # Root of the processing container's file system layout; the input CSV is
    # read from <base_dir>/input and each split is written to <base_dir>/<split>.
    base_dir = "/opt/ml/processing"

    df = pd.read_csv(f"{base_dir}/input/CEAS_08.csv")

    # A missing subject or body would turn the whole concatenation into NaN
    # (which the training/evaluation scripts then stringify as "nan"), so
    # treat a missing field as empty text instead.
    subject = df['subject'].fillna('').astype(str)
    body = df['body'].fillna('').astype(str)
    df['text'] = subject + ' ' + body

    df = df[['text', 'label']]

    # 80% of the rows go to training; the remaining 20% hold-out is split
    # evenly into test and validation. Seeds are fixed for reproducibility.
    df_holdout, df_train = train_test_split(df, test_size=0.80, random_state=42)
    df_test, df_validation = train_test_split(df_holdout, test_size=0.50, random_state=42)

    # Write every split as <base_dir>/<split>/<split>.csv, creating the
    # directory first so the script also works when it does not exist yet.
    for split_name, split_df in (
        ("train", df_train),
        ("validation", df_validation),
        ("test", df_test),
    ):
        split_dir = os.path.join(base_dir, split_name)
        os.makedirs(split_dir, exist_ok=True)
        split_df.to_csv(os.path.join(split_dir, f"{split_name}.csv"), index=False)

Overwriting code/preprocessing.py


In [118]:
from sagemaker.sklearn.processing import SKLearnProcessor

# scikit-learn container version used for the preprocessing job.
framework_version = "1.2-1"

# Single-instance processor bound to the pipeline session.
sklearn_processor = SKLearnProcessor(
    role=role,
    sagemaker_session=pipeline_session,
    base_job_name="sklearn-email-spam-process",
    framework_version=framework_version,
    instance_count=1,
    instance_type="ml.m5.large",
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [119]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

# One named output per split directory written by code/preprocessing.py.
split_outputs = [
    ProcessingOutput(output_name=split, source=f"/opt/ml/processing/{split}")
    for split in ("train", "validation", "test")
]

# The raw dataset (InputData parameter) is mounted where the script reads it.
processor_args = sklearn_processor.run(
    code="code/preprocessing.py",
    inputs=[ProcessingInput(source=input_data, destination="/opt/ml/processing/input")],
    outputs=split_outputs,
)

step_process = ProcessingStep(name="EmailSpamProcess", step_args=processor_args)

# Model Training Step

In [120]:
%%writefile code/train.py

import argparse
import os
import pandas as pd
import joblib

from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def model_fn(model_dir):
    """Load the saved model and vectorizer from ``model_dir``.

    Reads ``model.joblib`` and ``vectorizer.joblib`` and returns them as a
    ``(model, vectorizer)`` tuple.
    """
    classifier, tfidf = (
        joblib.load(os.path.join(model_dir, artifact_name))
        for artifact_name in ("model.joblib", "vectorizer.joblib")
    )
    return classifier, tfidf

def predict_fn(input_data, model_and_vectorizer):
    """Predict a label for one piece of text.

    ``input_data`` is coerced to ``str`` and treated as a single document.
    It is transformed by the vectorizer, converted to a dense array and
    passed to the model. Returns whatever ``model.predict`` yields for that
    one-row input.
    """
    classifier, tfidf = model_and_vectorizer

    # The model is fed dense arrays, so densify the vectorizer output first.
    dense_features = tfidf.transform([str(input_data)]).toarray()
    return classifier.predict(dense_features)

if __name__ == '__main__':
    # Both arguments default to the SM_MODEL_DIR / SM_CHANNEL_TRAIN environment variables.
    cli = argparse.ArgumentParser()
    cli.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    cli.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    args = cli.parse_args()

    logger.info("Loading training data...")
    train_data = pd.read_csv(os.path.join(args.train, 'train.csv'))

    # Features are the combined text column, the target is the label column.
    X_train = train_data['text'].astype(str)
    y_train = train_data['label']

    logger.info("Training the TF-IDF vectorizer...")
    vectorizer = TfidfVectorizer(max_features=10000)
    # Sparse TF-IDF output is converted to a dense array for GaussianNB.
    X_train_dense = vectorizer.fit_transform(X_train).toarray()

    logger.info("Training the model...")
    model = GaussianNB()
    model.fit(X_train_dense, y_train)

    logger.info("Saving the model...")
    # Persist both artifacts; model_fn loads them back by these file names.
    for artifact, artifact_name in (
        (model, "model.joblib"),
        (vectorizer, "vectorizer.joblib"),
    ):
        joblib.dump(artifact, os.path.join(args.model_dir, artifact_name))


Overwriting code/train.py


In [121]:
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.sklearn.estimator import SKLearn

# S3 prefix that receives the training output.
model_path = f"s3://{default_bucket}/EmailSpamTrain"

# scikit-learn 0.23-1 image; also reused for evaluation and for the registered model.
# NOTE(review): preprocessing runs on the 1.2-1 image - confirm the version gap is intended.
image_uri = sagemaker.image_uris.retrieve(
    framework="sklearn",
    version="0.23-1",
    py_version="py3",
    region=region,
    instance_type="ml.m5.xlarge",
)

sklearn_estimator = Estimator(
    role=role,
    sagemaker_session=pipeline_session,
    image_uri=image_uri,
    entry_point='code/train.py',
    output_path=model_path,
    instance_count=1,
    instance_type='ml.m5.large',
)

# Wire the preprocessing outputs into the training channels of the same name.
processing_outputs = step_process.properties.ProcessingOutputConfig.Outputs
train_args = sklearn_estimator.fit(
    inputs={
        channel: TrainingInput(
            s3_data=processing_outputs[channel].S3Output.S3Uri,
            content_type="text/csv",
        )
        for channel in ("train", "validation")
    }
)

In [122]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

# Training step built from the estimator's deferred fit() arguments.
step_train = TrainingStep(step_args=train_args, name="EmailSpamTrain")

# Model Evaluation Step

In [123]:
%%writefile code/evaluation.py
import json
import pathlib
import pickle
import tarfile
import joblib
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if __name__ == "__main__":

    logger.info("Loading model...")
    # Unpack the training archive into the working directory, then load both artifacts.
    with tarfile.open("/opt/ml/processing/model/model.tar.gz") as model_archive:
        model_archive.extractall(path=".")

    model = joblib.load("model.joblib")
    vectorizer = joblib.load("vectorizer.joblib")

    logger.info("Loading test csv...")
    df = pd.read_csv("/opt/ml/processing/test/test.csv")

    # Target labels and raw text features.
    y_test = df['label'].to_numpy()
    X_test = df['text'].astype(str)

    logger.info("Vectorize...")
    # Reuse the fitted vectorizer; densify because the model was trained on dense input.
    X_test_dense = vectorizer.transform(X_test).toarray()

    logger.info("Predict...")
    predictions = model.predict(X_test_dense)

    logger.info("Eval...")
    # The report layout is what the pipeline's JsonGet path reads.
    report_dict = {
        "classification_metrics": {
            "f1_score": {"value": f1_score(y_test, predictions, average='binary')},
        },
    }

    logger.info("Saving...")
    output_dir = pathlib.Path("/opt/ml/processing/evaluation")
    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / "evaluation.json").write_text(json.dumps(report_dict))

Overwriting code/evaluation.py


In [124]:
from sagemaker.processing import ScriptProcessor

# Evaluation runs on the same image that was used for training.
script_eval = ScriptProcessor(
    role=role,
    sagemaker_session=pipeline_session,
    base_job_name="script-email-spam-eval",
    image_uri=image_uri,
    command=["python3"],
    instance_count=1,
    instance_type="ml.m5.xlarge",
)

# Trained model archive and the held-out test split, mounted where evaluation.py expects them.
eval_model_input = ProcessingInput(
    source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    destination="/opt/ml/processing/model",
)
eval_test_input = ProcessingInput(
    source=step_process.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
    destination="/opt/ml/processing/test",
)

eval_args = script_eval.run(
    code="code/evaluation.py",
    inputs=[eval_model_input, eval_test_input],
    outputs=[
        ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
    ],
)

In [125]:
from sagemaker.workflow.properties import PropertyFile

# Exposes evaluation.json from the "evaluation" output so the condition step can query it.
evaluation_report = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json",
)

step_eval = ProcessingStep(
    name="EmailSpamEval",
    property_files=[evaluation_report],
    step_args=eval_args,
)

# Model Registry Step

In [126]:
from sagemaker.model import Model

# Model object backed by the training step's artifact; used for both create and register.
model = Model(
    role=role,
    sagemaker_session=pipeline_session,
    image_uri=image_uri,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
)

In [127]:
from sagemaker.inputs import CreateModelInput
from sagemaker.workflow.model_step import ModelStep

# Create the SageMaker model from the trained artifact.
# The Elastic Inference accelerator ("ml.eia1.medium") was dropped: the model
# uses a CPU scikit-learn image, and AWS no longer onboards new customers to
# Elastic Inference, so requesting one is misleading at best.
step_create_model = ModelStep(
    name="EmailSpamCreateModel",
    step_args=model.create(instance_type="ml.m5.large"),
)

In [128]:
from sagemaker.model_metrics import MetricsSource, ModelMetrics

# Location of the evaluation report: first (only) output of the evaluation step.
evaluation_output_uri = step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]

# Attach the F1 report to the model package as its model statistics.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=f"{evaluation_output_uri}/evaluation.json",
        content_type="application/json",
    )
)

# Register the model in the package group with the requested approval status.
register_args = model.register(
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
    transform_instances=["ml.m5.xlarge"],
)
step_register = ModelStep(name="EmailSpamRegisterModel", step_args=register_args)



# Pipeline Failure Condition

In [129]:
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.functions import Join

# Terminal step for the failing branch of the F1 condition.
# That branch is taken when the measured F1 is less than or equal to the
# threshold, so the message must say "<=" (it previously said ">", which
# described the passing case).
step_fail = FailStep(
    name="EmailSpamF1Fail",
    error_message=Join(on=" ", values=["Execution failed due to F1 <=", f1_threshold]),
)

In [130]:
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet

# F1 value pulled out of the evaluation step's property file.
reported_f1 = JsonGet(
    step_name=step_eval.name,
    property_file=evaluation_report,
    json_path="classification_metrics.f1_score.value",
)

# True when the reported F1 is at or below the configured threshold.
cond_lte = ConditionLessThanOrEqualTo(left=reported_f1, right=f1_threshold)

# Condition true -> fail the run; otherwise register and create the model.
step_cond = ConditionStep(
    name="EmailSpamF1Cond",
    conditions=[cond_lte],
    if_steps=[step_fail],
    else_steps=[step_register, step_create_model],
)

# Pipeline Creation

In [131]:
from sagemaker.workflow.pipeline import Pipeline

# Assemble the pipeline. The parameters list previously contained a stray
# literal `1` and a keyword-style `instance_type="ml.m5.xlarge"` entry, which
# is a SyntaxError inside a list. The recorded pipeline definition shows a
# String parameter named TrainingInstanceType with that default, so it is
# declared explicitly here.
# NOTE(review): no step visible in this notebook references TrainingInstanceType;
# wire it into the estimator or drop it.
pipeline_name = "EmailSpamPipeline"
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge"),
        model_approval_status,
        input_data,
        f1_threshold,
    ],
    steps=[step_process, step_train, step_eval, step_cond],
)

### Verify Pipeline

In [132]:
import json


definition = json.loads(pipeline.definition())
definition



{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'TrainingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'ModelApprovalStatus',
   'Type': 'String',
   'DefaultValue': 'PendingManualApproval'},
  {'Name': 'InputData',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-us-east-1-692501163596/email-spam-detection/CEAS_08.csv'},
  {'Name': 'F1Threshold', 'Type': 'Float', 'DefaultValue': 0.8}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'EmailSpamProcess',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': 'ml.m5.large',
      'InstanceCount': 1,
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3',
     'ContainerEntrypoint': ['python3',
      '/opt/ml/processing/input/

In [133]:
pipeline.upsert(role_arn=role)



{'PipelineArn': 'arn:aws:sagemaker:us-east-1:692501163596:pipeline/EmailSpamPipeline',
 'ResponseMetadata': {'RequestId': '5dc9db54-4146-4174-9491-06cd44d59372',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '5dc9db54-4146-4174-9491-06cd44d59372',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '85',
   'date': 'Sat, 12 Oct 2024 03:11:05 GMT'},
  'RetryAttempts': 0}}

# Pipeline Run #1

In [134]:
execution = pipeline.start()

In [135]:
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:692501163596:pipeline/EmailSpamPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:692501163596:pipeline/EmailSpamPipeline/execution/ew6shfl4rn0k',
 'PipelineExecutionDisplayName': 'execution-1728702666305',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2024, 10, 12, 3, 11, 6, 212000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 10, 12, 3, 11, 6, 212000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:692501163596:user-profile/d-hbhsbx20lpn1/default-1725649993774',
  'UserProfileName': 'default-1725649993774',
  'DomainId': 'd-hbhsbx20lpn1',
  'IamIdentity': {'Arn': 'arn:aws:sts::692501163596:assumed-role/LabRole/SageMaker',
   'PrincipalId': 'AROA2CPCJFZGH7DFUTPCN:SageMaker'}},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:692501163596:user-profile/d-hbhsbx20lpn1/default-1725649993774',
  'UserProfileName': 'default-1725649993

In [138]:
execution.wait()

In [139]:
execution.list_steps()

[{'StepName': 'EmailSpamCreateModel-CreateModel',
  'StartTime': datetime.datetime(2024, 10, 12, 3, 19, 1, 550000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 10, 12, 3, 19, 3, 429000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'Model': {'Arn': 'arn:aws:sagemaker:us-east-1:692501163596:model/pipelines-ew6shfl4rn0k-EmailSpamCreateModel-cmZL3A7XLr'}},
  'AttemptCount': 1},
 {'StepName': 'EmailSpamRegisterModel-RegisterModel',
  'StartTime': datetime.datetime(2024, 10, 12, 3, 19, 1, 550000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 10, 12, 3, 19, 3, 515000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'RegisterModel': {'Arn': 'arn:aws:sagemaker:us-east-1:692501163596:model-package/EmailSpamDetectionGroup/1'}},
  'AttemptCount': 1},
 {'StepName': 'EmailSpamF1Cond',
  'StartTime': datetime.datetime(2024, 10, 12, 3, 19, 0, 832000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 10, 12, 3, 19, 1, 65000, tzinfo=tzlocal()),

In [140]:
from pprint import pprint

# Download the evaluation report from the evaluation step's output location and show it.
evaluation_output_uri = step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
evaluation_json = sagemaker.s3.S3Downloader.read_file(f"{evaluation_output_uri}/evaluation.json")
pprint(json.loads(evaluation_json))



{'classification_metrics': {'f1_score': {'value': 0.9877626414223043}}}


In [141]:
# Import only sleep: a plain `import time` here would rebind the `time`
# function imported at the top of the notebook to the module.
from time import sleep
from sagemaker.lineage.visualizer import LineageTableVisualizer


# Show the lineage table of every step. list_steps() is reversed so the
# recorded output starts with the processing step.
viz = LineageTableVisualizer(sagemaker.session.Session())
for execution_step in reversed(execution.list_steps()):
    print(execution_step)
    display(viz.show(pipeline_execution_step=execution_step))
    sleep(5)  # pause between lineage lookups (presumably to avoid API throttling)

{'StepName': 'EmailSpamProcess', 'StartTime': datetime.datetime(2024, 10, 12, 3, 11, 7, 655000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2024, 10, 12, 3, 13, 41, 103000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:692501163596:processing-job/pipelines-ew6shfl4rn0k-EmailSpamProcess-wSZu2gLYOq'}}, 'AttemptCount': 1}


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...503255e4584b28350f81c61/preprocessing.py,Input,DataSet,ContributedTo,artifact
1,s3://...1163596/email-spam-detection/CEAS_08.csv,Input,DataSet,ContributedTo,artifact
2,68331...com/sagemaker-scikit-learn:1.2-1-cpu-py3,Input,Image,ContributedTo,artifact
3,s3://...w6shfl4rn0k/EmailSpamProcess/output/test,Output,DataSet,Produced,artifact
4,s3://...4rn0k/EmailSpamProcess/output/validation,Output,DataSet,Produced,artifact
5,s3://...6shfl4rn0k/EmailSpamProcess/output/train,Output,DataSet,Produced,artifact


{'StepName': 'EmailSpamTrain', 'StartTime': datetime.datetime(2024, 10, 12, 3, 13, 41, 969000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2024, 10, 12, 3, 16, 24, 985000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'TrainingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:692501163596:training-job/pipelines-ew6shfl4rn0k-EmailSpamTrain-Ce7aRNlPp1'}}, 'AttemptCount': 1}


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...4rn0k/EmailSpamProcess/output/validation,Input,DataSet,ContributedTo,artifact
1,s3://...6shfl4rn0k/EmailSpamProcess/output/train,Input,DataSet,ContributedTo,artifact
2,68331...om/sagemaker-scikit-learn:0.23-1-cpu-py3,Input,Image,ContributedTo,artifact
3,s3://...SpamTrain-Ce7aRNlPp1/output/model.tar.gz,Output,Model,Produced,artifact


{'StepName': 'EmailSpamEval', 'StartTime': datetime.datetime(2024, 10, 12, 3, 16, 25, 976000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2024, 10, 12, 3, 18, 59, 857000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:692501163596:processing-job/pipelines-ew6shfl4rn0k-EmailSpamEval-dHt9Nkx0d1'}}, 'AttemptCount': 1}


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...42dabcf22542b85d6f240a4d25/evaluation.py,Input,DataSet,ContributedTo,artifact
1,s3://...w6shfl4rn0k/EmailSpamProcess/output/test,Input,DataSet,ContributedTo,artifact
2,s3://...SpamTrain-Ce7aRNlPp1/output/model.tar.gz,Input,Model,ContributedTo,artifact
3,68331...om/sagemaker-scikit-learn:0.23-1-cpu-py3,Input,Image,ContributedTo,artifact
4,s3://...024-10-12-03-11-03-671/output/evaluation,Output,DataSet,Produced,artifact


{'StepName': 'EmailSpamF1Cond', 'StartTime': datetime.datetime(2024, 10, 12, 3, 19, 0, 832000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2024, 10, 12, 3, 19, 1, 65000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'Condition': {'Outcome': 'False'}}, 'AttemptCount': 1}


None

{'StepName': 'EmailSpamRegisterModel-RegisterModel', 'StartTime': datetime.datetime(2024, 10, 12, 3, 19, 1, 550000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2024, 10, 12, 3, 19, 3, 515000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'RegisterModel': {'Arn': 'arn:aws:sagemaker:us-east-1:692501163596:model-package/EmailSpamDetectionGroup/1'}}, 'AttemptCount': 1}


Unnamed: 0,Name/Source,Direction,Type,Association Type,Lineage Type
0,s3://...SpamTrain-Ce7aRNlPp1/output/model.tar.gz,Input,Model,ContributedTo,artifact
1,68331...om/sagemaker-scikit-learn:0.23-1-cpu-py3,Input,Image,ContributedTo,artifact
2,EmailSpamDetectionGroup-1-PendingManualApprova...,Input,Approval,ContributedTo,action
3,EmailSpamDetectionGroup-1728703143-aws-model-p...,Output,ModelGroup,AssociatedWith,context


{'StepName': 'EmailSpamCreateModel-CreateModel', 'StartTime': datetime.datetime(2024, 10, 12, 3, 19, 1, 550000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2024, 10, 12, 3, 19, 3, 429000, tzinfo=tzlocal()), 'StepStatus': 'Succeeded', 'Metadata': {'Model': {'Arn': 'arn:aws:sagemaker:us-east-1:692501163596:model/pipelines-ew6shfl4rn0k-EmailSpamCreateModel-cmZL3A7XLr'}}, 'AttemptCount': 1}


None

# Pipeline Run #2 with Approval

In [142]:
# Start a run that overrides ModelApprovalStatus so the model is registered as approved.
execution = pipeline.start(parameters={"ModelApprovalStatus": "Approved"})

In [143]:
execution.wait()

In [144]:
execution.list_steps()

[{'StepName': 'EmailSpamCreateModel-CreateModel',
  'StartTime': datetime.datetime(2024, 10, 12, 3, 30, 15, 91000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 10, 12, 3, 30, 17, 243000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'Model': {'Arn': 'arn:aws:sagemaker:us-east-1:692501163596:model/pipelines-xry3da4cgopu-EmailSpamCreateModel-5GjqQeHCQb'}},
  'AttemptCount': 1},
 {'StepName': 'EmailSpamRegisterModel-RegisterModel',
  'StartTime': datetime.datetime(2024, 10, 12, 3, 30, 15, 91000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 10, 12, 3, 30, 16, 991000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'RegisterModel': {'Arn': 'arn:aws:sagemaker:us-east-1:692501163596:model-package/EmailSpamDetectionGroup/2'}},
  'AttemptCount': 1},
 {'StepName': 'EmailSpamF1Cond',
  'StartTime': datetime.datetime(2024, 10, 12, 3, 30, 13, 697000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 10, 12, 3, 30, 14, 25000, tzinfo=tzlocal

# Pipeline Run #3 With Failing F1 Threshold

In [145]:
# Raise the threshold above the F1 recorded in Run #1 (~0.988) so the condition takes the fail branch.
execution = pipeline.start(parameters=dict(F1Threshold=0.99))

In [146]:
# This run is expected to fail; print the waiter error instead of letting it stop the notebook.
try:
    execution.wait()
except Exception as wait_error:
    print(wait_error)

Waiter PipelineExecutionComplete failed: Waiter encountered a terminal failure state: For expression "PipelineExecutionStatus" we matched expected path: "Failed"


In [147]:
execution.list_steps()

[{'StepName': 'EmailSpamF1Fail',
  'StartTime': datetime.datetime(2024, 10, 12, 3, 38, 19, 79000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 10, 12, 3, 38, 19, 793000, tzinfo=tzlocal()),
  'StepStatus': 'Failed',
  'FailureReason': 'Execution failed due to F1 > 0.99',
  'Metadata': {'Fail': {'ErrorMessage': 'Execution failed due to F1 > 0.99'}},
  'AttemptCount': 1},
 {'StepName': 'EmailSpamF1Cond',
  'StartTime': datetime.datetime(2024, 10, 12, 3, 38, 18, 79000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 10, 12, 3, 38, 18, 460000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'Condition': {'Outcome': 'True'}},
  'AttemptCount': 1},
 {'StepName': 'EmailSpamEval',
  'StartTime': datetime.datetime(2024, 10, 12, 3, 35, 43, 894000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 10, 12, 3, 38, 17, 573000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:692501163596:proces