In [2]:
!pip install -U sagemaker

Collecting sagemaker
  Using cached sagemaker-2.222.0-py3-none-any.whl.metadata (14 kB)
Using cached sagemaker-2.222.0-py3-none-any.whl (1.5 MB)
Installing collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.215.0
    Uninstalling sagemaker-2.215.0:
      Successfully uninstalled sagemaker-2.215.0
Successfully installed sagemaker-2.222.0
[0m

In [3]:
import sys

import boto3
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession

# Standard SDK session; the default bucket and region below are read from it.
sagemaker_session = sagemaker.session.Session()
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name

# Execution role returned by the SDK; it is passed to the estimator further down.
role = sagemaker.get_execution_role()

# Session object handed to the estimator whose `.fit()` result feeds the training step.
pipeline_session = PipelineSession()

# boto3 resource handle for S3.
s3 = boto3.resource("s3")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [5]:
# S3 layout used by this notebook: s3://<bucket>/<prefix>/data/{train,validation}
prefix = "olist_review_score"
base_uri = "s3://{}/{}".format(bucket, prefix)
base_data_uri = base_uri + "/data"

# Location of the training split and of the validation split.
input_data_uri = base_data_uri + "/train"
batch_data_uri = base_data_uri + "/validation"


## Define Parameters to Parametrize Pipeline Execution ##

In [6]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)

# Pipeline parameters; each carries the default used when no override is supplied.
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)
instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge",
)
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval",
)

# S3 locations of the training split and of the validation split.
input_data = ParameterString(name="InputData", default_value=input_data_uri)
batch_data = ParameterString(name="BatchData", default_value=batch_data_uri)

# Currently disabled: MSE threshold parameter.
# mse_threshold = ParameterFloat(name="MseThreshold", default_value=6.0)

## Define Training Inputs and the k-NN Estimator ##

In [8]:
from sagemaker.inputs import TrainingInput

def _csv_channel(s3_uri):
    """Build a `TrainingInput` for CSV data at `s3_uri` (S3Prefix, FullyReplicated)."""
    return TrainingInput(
        s3_uri,
        distribution="FullyReplicated",
        content_type="text/csv",
        s3_data_type="S3Prefix",
    )


# Input channels for the training job, keyed by channel name.
train_data = _csv_channel(input_data_uri)
validation_data = _csv_channel(batch_data_uri)
data_channels = dict(train=train_data, validation=validation_data)

# Pipeline parameter holding the S3 location of the data for the evaluation script.
eval_data = ParameterString(name="InputDataUrl", default_value=batch_data_uri)

In [19]:
from io import StringIO
import pandas as pd

def get_s3_data_shape(bucket_name, file_key):
    """Return the ``(rows, columns)`` shape of a CSV object stored in S3.

    The object is downloaded in full, decoded as UTF-8 and parsed with
    ``pandas.read_csv`` using its default options, so the first line is
    consumed as the header and is not counted as a row.

    Args:
        bucket_name: Name of the S3 bucket that holds the object.
        file_key: Key of the CSV object inside the bucket.

    Returns:
        The ``shape`` tuple of the parsed DataFrame.
    """
    response = boto3.client('s3').get_object(Bucket=bucket_name, Key=file_key)
    csv_text = response['Body'].read().decode('utf-8')
    return pd.read_csv(StringIO(csv_text)).shape

# Shape of the validation CSV; the estimator hyperparameters further down are derived from it.
validation_key = f'{prefix}/data/validation/validation_data.csv'
X_val_shape = get_s3_data_shape(bucket, validation_key)
X_val_shape

(8677, 10)

In [20]:
from sagemaker import hyperparameters
from sagemaker.estimator import Estimator
from sagemaker.image_uris import retrieve

# S3 prefix passed to the estimator as its output path.
model_path = f"s3://{bucket}/{prefix}/CICD/"

# Container image for the "knn" framework in the session's region.
image_uri = retrieve(
    framework="knn",
    region=region,
    py_version="py3",
    instance_type="ml.m5.xlarge",
)

knn_train = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type=instance_type,  # pipeline parameter "TrainingInstanceType"
    output_path=model_path,
    sagemaker_session=pipeline_session,
)

# NOTE(review): both sizes are taken from the validation CSV and assume exactly one
# non-feature column in it — confirm this matches the training data.
knn_hyperparameters = {
    "k": 10,  # number of neighbors
    "predictor_type": "classifier",
    "feature_dim": X_val_shape[1] - 1,
    "sample_size": X_val_shape[0],
}
knn_train.set_hyperparameters(**knn_hyperparameters)

# NOTE(review): the built-in k-NN algorithm documents `train` and `test` channels —
# confirm that a channel named `validation` is accepted.
# The returned value is passed to `TrainingStep` as `step_args`.
train_args = knn_train.fit(
    inputs=dict(train=train_data, validation=validation_data),
)



## Define a Training Step to Train a Model ##

In [22]:
from sagemaker.workflow.steps import TrainingStep

# Pipeline step built from the arguments returned by `knn_train.fit(...)`.
step_train = TrainingStep(name="ReviewScoreTrain", step_args=train_args)

## Define Model Evaluation Processing Step ##

In [7]:
from pathlib import Path

# Create the local directory that the `%%writefile code/...` cell writes into.
# Pure Python instead of a shell call, so it does not depend on a POSIX `mkdir`.
Path("code").mkdir(parents=True, exist_ok=True)

In [None]:
%%writefile code/evaluation.py
import json
import pathlib
import pickle
import tarfile

import joblib
import numpy as np
import pandas as pd
import xgboost

from sklearn.metrics import mean_squared_error


if __name__ == "__main__":
    # Evaluation entry point: unpack the model archive, score the test CSV and
    # write an MSE report as JSON.
    #
    # NOTE(review): this script unpickles a file named "xgboost-model" and reports
    # regression metrics, while the notebook that writes it trains a "knn" estimator
    # with predictor_type='classifier' — confirm the script matches the trained model.

    # Model archive is expected at this fixed path; extract it into the working directory.
    model_path = "/opt/ml/processing/model/model.tar.gz"
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")

    # Open the artifact in a context manager so the file handle is always closed.
    # NOTE: pickle can execute arbitrary code on load; only use with trusted artifacts.
    with open("xgboost-model", "rb") as model_file:
        model = pickle.load(model_file)

    # The test CSV is read without a header row.
    test_path = "/opt/ml/processing/test/test.csv"
    df = pd.read_csv(test_path, header=None)

    # Column 0 is used as the target; every remaining column is a feature.
    y_test = df.iloc[:, 0].to_numpy()
    features = df.iloc[:, 1:]

    X_test = xgboost.DMatrix(features.values)
    predictions = model.predict(X_test)

    # Mean squared error plus the standard deviation of the residuals.
    mse = mean_squared_error(y_test, predictions)
    std = np.std(y_test - predictions)
    report_dict = {
        "regression_metrics": {
            "mse": {"value": mse, "standard_deviation": std},
        },
    }

    # Make sure the output directory exists before writing the report.
    output_dir = "/opt/ml/processing/evaluation"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    evaluation_path = f"{output_dir}/evaluation.json"
    with open(evaluation_path, "w") as f:
        json.dump(report_dict, f)