In [13]:
import boto3
import json
import numpy as np
import pandas as pd
import os
from PIL import Image
import sagemaker
import shutil
import tarfile

#Dataset 
from sklearn.datasets import load_breast_cancer

import urllib.request
from urllib.error import HTTPError
from sklearn.model_selection import train_test_split

import zipfile

# Bucket that receives the CSV splits uploaded later in the notebook
bucket = "ta-ml-deployment-sagemaker"

# SageMaker SDK session; its underlying boto session tells us the active region
sess = sagemaker.Session()
region = sess.boto_session.region_name

# Low-level SageMaker client, used later to describe jobs and delete the endpoint
sm_boto3 = boto3.client("sagemaker")

print("Using Bucket: " + bucket, f"In region: {region}", sep="\n")

Using Bucket: ta-ml-deployment-sagemaker
In region: us-east-2


## Loading Breast Cancer Dataset

In [14]:
# Fetch the scikit-learn breast cancer dataset bundle
breast_cancer = load_breast_cancer()

# Build a frame whose columns carry the dataset's feature names,
# then append the labels as a trailing 'target' column
df = pd.DataFrame(data=breast_cancer.data, columns=breast_cancer.feature_names)
df['target'] = breast_cancer.target

# Separate the label vector (y) from the feature matrix (X)
y = df['target']
X = df.drop(columns='target')

# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [15]:
# Re-attach the labels as the last column of each split; script.py treats the
# final CSV column as the label.
X_train = X_train.assign(target=y_train)
X_test = X_test.assign(target=y_test)

In [16]:
# Write each split to a local CSV (header row included, index column omitted)
for split_frame, csv_name in ((X_train, "train-v-1.csv"), (X_test, "test-v-1.csv")):
    split_frame.to_csv(csv_name, index=False)

## ML Deployment

### Send Data to S3

In [17]:
# Key prefix under which both CSV splits are stored in the bucket
sm_data_prefix = "sagemaker/breast-cancer"

# Destination arguments shared by both uploads
upload_target = dict(bucket=bucket, key_prefix=sm_data_prefix)

# Upload the training split, then the test split; each call returns the S3 URI
trainpath = sess.upload_data(path="train-v-1.csv", **upload_target)
testpath = sess.upload_data(path='test-v-1.csv', **upload_target)

print("Upload to S3 bucket successful!\n")
print(f"trainpath = {trainpath}")
print(f"testpath = {testpath}")

Upload to S3 bucket successful!

trainpath = s3://ta-ml-deployment-sagemaker/sagemaker/breast-cancer/train-v-1.csv
testpath = s3://ta-ml-deployment-sagemaker/sagemaker/breast-cancer/test-v-1.csv


### Write The Script.py File to Deploy To SageMaker

In [18]:
%%writefile script.py

import sklearn
import pathlib
import json
import boto3
from io import StringIO
import numpy as np
import pandas as pd
import argparse
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
import os

def model_fn(model_dir):
    """Load the trained estimator from ``model_dir``.

    Args:
        model_dir: Directory that contains ``model.joblib``, the file written
            by the training block of this script.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    artifact_path = os.path.join(model_dir, 'model.joblib')
    return joblib.load(artifact_path)


if __name__=='__main__':
    # Training entry point: parse arguments, read the train/test CSVs, fit a
    # RandomForestClassifier, save it as model.joblib, and print test metrics.
    print("[INFO] Extracting args...")
    parser = argparse.ArgumentParser()

    # Model hyperparameters
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=42)

    # Directories default to the SM_* environment variables; file names default
    # to the CSVs produced by the notebook.
    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train_file", type=str, default="train-v-1.csv")
    parser.add_argument("--test_file", type=str, default="test-v-1.csv")

    # parse_known_args ignores any extra arguments instead of erroring out
    args, _ = parser.parse_known_args()

    print("[INFO] Reading data...")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The label is the last column of the training CSV; everything before it
    # is treated as a feature.
    features = list(train_df.columns)
    label = features.pop(-1)

    print("[INFO] Train, test, features, and labels successfully extracted!\n")

    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print(f"Features columns: {features}")
    print(f"Label column: {label}\n")

    print("[INFO] Training RandomForest Classifier...\n")
    model = RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state)
    model.fit(X_train, y_train)

    # Persist under the file name that model_fn loads at inference time
    model_path = os.path.join(args.model_dir, 'model.joblib')
    joblib.dump(model, model_path)

    print(f"Model path: {model_path}")

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # Fixed: precision was previously computed with accuracy_score, so the
    # "Test precision" line just repeated the accuracy value.
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print("[INFO] METRIC RESULTS ON TEST DATA\n")
    print(f"Test accuracy: {acc}")
    print(f"Test precision: {precision}")
    print(f"Test recall: {recall}")

Overwriting script.py


In [19]:
from sagemaker.sklearn.estimator import SKLearn

# Version tag of the SageMaker scikit-learn container; also used when the
# inference model is created later in the notebook.
FRAMEWORK_VERSION = "0.23-1"

# Estimator that runs script.py as a SageMaker training job on one ml.m4.xlarge
sklearn_estimator = SKLearn(
    entry_point='script.py',
    role="arn:aws:iam::361769570735:role/ta-ml-deployment",
    instance_count=1,
    framework_version=FRAMEWORK_VERSION,
    instance_type="ml.m4.xlarge",
    # Forwarded to script.py as --n_estimators / --random_state
    hyperparameters={
        "n_estimators": 100,
        "random_state": 42
    },
    base_job_name="RF-custom-sklearn",
    # Fixed: the SDK keyword is the plural `use_spot_instances`; the misspelled
    # `use_spot_instance` did not enable managed spot training.
    use_spot_instances=True,
    # Spot jobs need explicit limits (seconds): max_run caps training time and
    # max_wait (>= max_run) caps training time plus time waiting for capacity.
    max_run=3600,
    max_wait=7200,
)

In [20]:
# Start the training job defined above and block until it completes.
# One input channel per uploaded CSV: "train" and "test".
training_channels = {"train": trainpath, "test": testpath}
sklearn_estimator.fit(training_channels, wait=True)

INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-11-12-04-14-58-968


2024-11-12 04:15:03 Starting - Starting the training job...
2024-11-12 04:15:17 Starting - Preparing the instances for training...
2024-11-12 04:15:59 Downloading - Downloading the training image.....2024-11-12 04:16:51,218 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-11-12 04:16:51,222 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-11-12 04:16:51,269 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-11-12 04:16:51,458 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-11-12 04:16:51,472 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-11-12 04:16:51,485 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-11-12 04:16:51,494 sagemaker-training-toolkit INFO     Invoking user script
Training Env:
{
    "additional_framework_parameters": {},
    "channel_input_di

In [21]:
# Make sure the most recent training job has finished before reading its result
latest_job = sklearn_estimator.latest_training_job
latest_job.wait(logs="None")

# Look up the job by name and pull out the S3 location of the model artifact
job_description = sm_boto3.describe_training_job(TrainingJobName=latest_job.name)
model_output = job_description["ModelArtifacts"]["S3ModelArtifacts"]
print(f"Model output path: {model_output}")


2024-11-12 04:17:08 Starting - Preparing the instances for training
2024-11-12 04:17:08 Downloading - Downloading the training image
2024-11-12 04:17:08 Training - Training image download completed. Training in progress.
2024-11-12 04:17:08 Uploading - Uploading generated training model
2024-11-12 04:17:08 Completed - Training job completed
Model output path: s3://sagemaker-us-east-2-361769570735/RF-custom-sklearn-2024-11-12-04-14-58-968/output/model.tar.gz


### Create a SageMaker model from the training output

In [22]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Give the model a unique name by appending the current UTC timestamp
model_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = "RandomForestClassifier-sklearn-model-" + model_timestamp

# Wrap the trained artifact for hosting; script.py is passed as the entry
# point (it defines model_fn, which loads model.joblib).
model = SKLearnModel(
    name=model_name,
    model_data=model_output,
    role="arn:aws:iam::361769570735:role/ta-ml-deployment",
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)


### Endpoint deployment

In [23]:
# Endpoint name: fixed prefix plus the current UTC timestamp
endpoint_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = "RandomForestClassifier-sklearn-model-" + endpoint_timestamp
print("Model Endpoint={}".format(endpoint_name))

# Deploy the model behind an endpoint backed by a single ml.m4.xlarge instance
deployment_settings = {
    "initial_instance_count": 1,
    "instance_type": "ml.m4.xlarge",
    "endpoint_name": endpoint_name,
}
predictor = model.deploy(**deployment_settings)

Model Endpoint=RandomForestClassifier-sklearn-model-2024-11-12-04-17-28


INFO:sagemaker:Creating model with name: RandomForestClassifier-sklearn-model-2024-11-12-04-17-28
INFO:sagemaker:Creating endpoint-config with name RandomForestClassifier-sklearn-model-2024-11-12-04-17-28
INFO:sagemaker:Creating endpoint with name RandomForestClassifier-sklearn-model-2024-11-12-04-17-28


-----!

In [24]:
# Remove the label column so only feature columns are sent to the endpoint.
# Rebinding (rather than inplace=True) with errors='ignore' makes this cell
# safe to re-run: a second run no longer raises KeyError.
X_test = X_test.drop(columns='target', errors='ignore')

In [25]:
# Send the first two test rows to the deployed endpoint and show the predictions
sample_rows = X_test[0:2]
sample_predictions = predictor.predict(sample_rows)
print(sample_predictions)

[1 0]


### Delete the endpoint to avoid ongoing charges

In [None]:
# Tear down the endpoint created in the deployment cell
sm_boto3.delete_endpoint(
    EndpointName=endpoint_name,
)