In [None]:
import boto3
import sagemaker
import sagemaker.session


In [None]:
!pip install -U sagemaker --quiet

In [None]:
# Shared configuration used by the rest of the notebook.
session = sagemaker.session.Session()          # SageMaker session used for uploads
region = boto3.Session().region_name           # region of the current boto3 session
role = sagemaker.get_execution_role()          # execution role passed to jobs and the pipeline
bucket = session.default_bucket()              # the session's default S3 bucket

# Name of the model package group that models are registered under.
model_package_group_name = "CreditRiskModelPackageGroupName"

# S3 key prefix for everything this notebook uploads.
prefix = "sagemaker/credit-xgboost"

In [None]:
# Upload the raw CSV to S3 under "<prefix>/data" and keep the returned URI,
# which is used as the default pipeline input.
input_data_uri = session.upload_data(
    path="dataset/UCI_Credit_Card.csv",
    key_prefix=prefix + "/data",
)
print('Data set uploaded to ', input_data_uri)

### Pipeline input parameters

Pipeline parameters are defined with default values, which can be overridden by passing different values when calling the `pipeline.start()` method.


In [None]:

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)

# Pipeline parameters. Each one carries a default value that is used unless a
# different value is supplied when the pipeline is started.

# Number of instances to use for processing.
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)

# Instance type to use for processing.
processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.large")

# Instance type to use for training.
training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")

# S3 location of the input data; defaults to the dataset uploaded earlier in the notebook.
input_data = ParameterString(name="InputData", default_value=input_data_uri)

# Approval status given to the model when it is registered with the model registry.
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval")


## Pipeline Step: Preprocess data (step_preprocess_data)
In the first step, we create an sklearn processor and use it in a `ProcessingStep`. We also create a caching configuration, which we pass to each step so that its results are cached for a given time.

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep, CacheConfig

# Caching configuration shared by the pipeline steps.
# NOTE(review): "T30m" is presumably a 30-minute expiry - confirm against the CacheConfig docs.
cache_config = CacheConfig(enable_caching=True, expire_after="T30m")

# SKLearn processor that runs the preprocessing script.
sklearn_processor = SKLearnProcessor(
    framework_version="0.20.0",
    role=role,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    base_job_name="credit-processing-job",
)

# Wrap the processor in a SageMaker Pipelines ProcessingStep.
# Each (output name, sub-directory) pair below becomes one step output that is
# collected from /opt/ml/processing/output/<sub-directory> in the container.
step_preprocess_data = ProcessingStep(
    name="PreprocessCreditData",
    processor=sklearn_processor,
    cache_config=cache_config,
    code="preprocessing.py",
    inputs=[
        ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
    ],
    outputs=[
        ProcessingOutput(output_name=output_name, source=f"/opt/ml/processing/output/{subdir}")
        for output_name, subdir in (
            ("train", "train"),
            ("validation", "validation"),
            ("test", "test"),
            ("baseline_with_headers", "baseline"),
        )
    ],
)

## Pipeline Step: Train a model (step_train_model)
In the second step, we use the train and validation outputs from the previous processing step.

We retrieve the XGBoost container, create an XGBoost estimator, specify hyper parameters, and create the step.

In [None]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep
from sagemaker.estimator import Estimator

# S3 location that the training job writes the trained model to.
model_path = f"s3://{bucket}/CreditTrain"

# Look up the XGBoost container image used for training.
image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    version="1.2-2",
    py_version="py3",
    region=region,
    instance_type=training_instance_type,
)

# Estimator that trains with the XGBoost image on a single instance and
# writes the model artifact to `model_path`.
xgb_estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type=training_instance_type,
    output_path=model_path,
    disable_profiler=True,
)

# Hyperparameters for the XGBoost training job.
xgb_hyperparameters = dict(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    objective='binary:logistic',
    num_round=25,
)
xgb_estimator.set_hyperparameters(**xgb_hyperparameters)

# Wrap the estimator in a SageMaker Pipelines TrainingStep.
# NOTE: the training channels reference the outputs of the preprocessing step
# directly, which is what makes this step depend on the preprocessing step's data.
preprocessing_outputs = step_preprocess_data.properties.ProcessingOutputConfig.Outputs

step_train_model = TrainingStep(
    name="TrainCreditModel",
    estimator=xgb_estimator,
    cache_config=cache_config,
    inputs={
        channel: TrainingInput(
            s3_data=preprocessing_outputs[channel].S3Output.S3Uri,
            content_type="text/csv",
        )
        for channel in ("train", "validation")
    },
)

## Pipeline Step: Evaluate model (evaluate_model_processor)
To evaluate the model we just trained, we need to write an evaluation script that we run in a processing job.

In [None]:
%%writefile evaluation.py
import json
import pathlib
import pickle
import tarfile
import joblib
import numpy as np
import pandas as pd
import xgboost


from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve

if __name__ == "__main__":
    model_path = f"/opt/ml/processing/model/model.tar.gz"
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")
    
    model = pickle.load(open("xgboost-model", "rb"))

    test_path = "/opt/ml/processing/test/test.csv"
    df = pd.read_csv(test_path, header=None)
    
    y_test = df.iloc[:, 0].to_numpy()
    df.drop(df.columns[0], axis=1, inplace=True)
    
    X_test = xgboost.DMatrix(df.values)
    
    prediction_probabilities = model.predict(X_test)
    predictions = np.round(prediction_probabilities)
    
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)
    conf_matrix = confusion_matrix(y_test, predictions)
    fpr, tpr, _ = roc_curve(y_test, prediction_probabilities)

    
    print('Accuracy: ', accuracy)
    print('Precision: ', precision)
    print('Recall: ', recall)
    print('Confusion matrix: ', conf_matrix)

    # Available metrics to add to model: https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor-model-quality-metrics.html
    report_dict = {
        "binary_classification_metrics": {
            "accuracy": {
                "value": accuracy,
                "standard_deviation" : "NaN"
            },
            "precision": {
                "value": precision,
                "standard_deviation" : "NaN"
            },
            "recall": {
                "value": recall,
                "standard_deviation" : "NaN"
            },
            "confusion_matrix" : {
              "0" : {
                "0" : int(conf_matrix[0][0]),
                "1" : int(conf_matrix[0][1])
              },
              "1" : {
                "0" : int(conf_matrix[1][0]),
                "1" : int(conf_matrix[1][1])
              },
            },
            "receiver_operating_characteristic_curve" : {
              "false_positive_rates" : list(fpr),
              "true_positive_rates" : list(tpr)
            },
        },
    }

    output_dir = "/opt/ml/processing/evaluation"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    evaluation_path = f"{output_dir}/evaluation.json"
    with open(evaluation_path, "w") as f:
        f.write(json.dumps(report_dict))

Using the evaluation script, we create a ScriptProcessor object and use it in a ProcessingStep.

In [None]:
from sagemaker.processing import ScriptProcessor
from sagemaker.workflow.properties import PropertyFile

# Script processor that runs the evaluation script with python3.
# It reuses the XGBoost training image (`image_uri`) as its container.
evaluate_model_processor = ScriptProcessor(
    image_uri=image_uri,
    command=["python3"],
    role=role,
    instance_count=1,
    instance_type=processing_instance_type,
    base_job_name="script-credit-eval",
)

# Property file describing the JSON report written by the evaluation step.
# A PropertyFile lets other steps (for instance the condition step later on)
# reference values from a processing step's output.
# For more information, visit https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-propertyfile.html
evaluation_report = PropertyFile(name="EvaluationReport", output_name="evaluation", path="evaluation.json")

# Wrap the evaluation processor in a SageMaker Pipelines ProcessingStep.
# Inputs: the model artifact produced by the training step and the "test"
# output of the preprocessing step.
trained_model_artifact = step_train_model.properties.ModelArtifacts.S3ModelArtifacts
test_data_uri = step_preprocess_data.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri

step_evaluate_model = ProcessingStep(
    name="EvaluateCreditModel",
    processor=evaluate_model_processor,
    cache_config=cache_config,
    code="evaluation.py",
    inputs=[
        ProcessingInput(source=trained_model_artifact, destination="/opt/ml/processing/model"),
        ProcessingInput(source=test_data_uri, destination="/opt/ml/processing/test"),
    ],
    outputs=[
        ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
    ],
    # Exposes evaluation.json so that later steps can read values from it.
    property_files=[evaluation_report],
)

## Pipeline Step: Create model (step_create_model)

In [None]:
# NOTE: This step is currently disabled. The code below is commented out and
# `step_create_model` is not part of the pipeline's step list.
#from sagemaker.model import Model
#from sagemaker.workflow.steps import CreateModelStep

#model = Model(
#    image_uri=image_uri,
#    model_data=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,
#    sagemaker_session=session,
#    role=role,
#)


#step_create_model = CreateModelStep(
#    name="CreateCreditModel",
#    model=model,
#)

## Pipeline Step: Register model (step_register_model)

**TODO:** The `model_metrics` passed to the register step below may not be working as intended; this still needs to be verified.

In [None]:
from sagemaker.model_metrics import MetricsSource, ModelMetrics 
from sagemaker.workflow.step_collections import RegisterModel


# Point the model metrics at the evaluation.json produced by the evaluation step.
# NOTE(review): Outputs[0] is a positional lookup; it relies on "evaluation"
# being the first output declared on step_evaluate_model.
evaluation_output_uri = step_evaluate_model.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]

model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=f"{evaluation_output_uri}/evaluation.json",
        content_type="application/json",
    )
)

# Create a RegisterModel step, which registers the trained model with the
# SageMaker Model Registry under `model_package_group_name`.
step_register_model = RegisterModel(
    name="RegisterCreditModel",
    estimator=xgb_estimator,
    model_data=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge", "ml.m5.large"],
    transform_instances=["ml.m5.xlarge"],
)

## Pipeline Step: Compute Model Monitor data quality baseline (step_create_data_baseline)


In [None]:

# Create a Processor object using the Model Monitor image.
# The image is looked up for the session's own region (`region`) rather than a
# hard-coded one, so the notebook also works outside eu-west-1.
baseline_processor = sagemaker.processing.Processor(
    base_job_name="credit-risk-baseline-processor",
    image_uri=sagemaker.image_uris.retrieve(framework='model-monitor', region=region),
    role=role,
    instance_count=1,
    instance_type=processing_instance_type,
    # Environment variables passed to the container.
    env={
        "dataset_format": "{\"csv\": {\"header\": true} }",
        "dataset_source": "/opt/ml/processing/sm_input",
        "output_path": "/opt/ml/processing/sm_output",
        "publish_cloudwatch_metrics": "Disabled"
    }
)

# Wrap the baseline processor in a SageMaker Pipelines ProcessingStep.
# Input: the "baseline_with_headers" output of the preprocessing step.
# Output: whatever the container writes to /opt/ml/processing/sm_output,
# stored under s3://<bucket>/<prefix>/baseline.
# NOTE(review): the step name says "ModelQuality" although this section is about
# a data quality baseline; the name is left unchanged here.
baseline_input_uri = step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
    "baseline_with_headers"
].S3Output.S3Uri

step_create_data_baseline = ProcessingStep(
    name="CreateModelQualityBaseline",
    processor=baseline_processor,
    cache_config=cache_config,
    inputs=[
        ProcessingInput(source=baseline_input_uri, destination="/opt/ml/processing/sm_input"),
    ],
    outputs=[
        ProcessingOutput(
            output_name="baseline_result",
            source="/opt/ml/processing/sm_output",
            destination=f"s3://{bucket}/{prefix}/baseline",
        ),
    ],
)

## Pipeline Condition Step: Meets accuracy requirements? (cond_gte)
Adding conditions to the pipeline is done with a ConditionStep.
In this case, we only want to register the model with model registry, and compute a new data quality baseline for model monitoring, if the new model meets some quality condition

In [None]:
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.condition_step import (
    ConditionStep,
    JsonGet,
)

# Condition: the accuracy value in the evaluation report must be >= 0.7.
evaluated_accuracy = JsonGet(
    step=step_evaluate_model,
    property_file=evaluation_report,
    json_path="binary_classification_metrics.accuracy.value",
)
cond_gte = ConditionGreaterThanOrEqualTo(left=evaluated_accuracy, right=0.7)

# ConditionStep: register the model only when the condition holds; do nothing otherwise.
# NOTE(review): the section text says the data quality baseline should also be
# conditional, but step_create_data_baseline is added to the pipeline as an
# unconditional step. Confirm which behaviour is intended.
step_cond = ConditionStep(
    name="AccuracyCondition",
    conditions=[cond_gte],
    if_steps=[step_register_model],
    else_steps=[],
)

## Pipeline Creation: Orchestrate all steps

Now that we've created all steps we need in our pipeline, we need to orchestrate all the steps.

In [None]:
from sagemaker.workflow.pipeline import Pipeline

# Assemble the SageMaker Pipeline from its parameters and steps.
pipeline_name = "CreditPipeline"

pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        processing_instance_type,
        processing_instance_count,
        training_instance_type,
        model_approval_status,
        input_data,
    ],
    steps=[
        step_preprocess_data,
        step_train_model,
        step_evaluate_model,
        step_create_data_baseline,
        step_cond,
    ],
)

#### Submit pipeline, and start it.

In [None]:
import json

# Uncomment the next line to inspect the generated pipeline definition.
#json.loads(pipeline.definition())

# Submit the pipeline definition, using the notebook's execution role.
pipeline.upsert(role_arn=role)

# Start a pipeline execution with the default parameter values.
execution = pipeline.start()

# Block until the execution has finished.
execution.wait()

# List the execution steps to check their status and artifacts
# (last expression in the cell, so the result is displayed).
execution.list_steps()