# Develop, Train, Optimize and Deploy Scikit-Learn Random Forest

Original notebook https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-python-sdk/scikit_learn_randomforest/Sklearn_on_SageMaker_end2end.ipynb

* Doc https://sagemaker.readthedocs.io/en/stable/using_sklearn.html
* SDK https://sagemaker.readthedocs.io/en/stable/sagemaker.sklearn.html
* boto3 https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#client

In this notebook we show how to use Amazon SageMaker to develop, train, tune and deploy a Scikit-Learn based ML model (Random Forest). More info on Scikit-Learn can be found here https://scikit-learn.org/stable/index.html. We use the California Housing dataset, present in Scikit-Learn: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html. The California Housing dataset was originally published in:

> Pace, R. Kelley, and Ronald Barry. "Sparse spatial autoregressions." Statistics & Probability Letters 33.3 (1997): 291-297.
 
**This sample is provided for demonstration purposes; make sure to conduct appropriate testing if you derive code from it for your own use cases!**

In [37]:
import datetime
import time
from time import gmtime, strftime
import tarfile

import boto3
import pandas as pd
import json
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing


# Low-level boto3 client for the SageMaker API
sm_boto3 = boto3.client("sagemaker")

# SageMaker SDK session; the region and the default bucket are both read from it
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = sess.default_bucket()  # this could also be a hard-coded bucket name

print(f"Using bucket {bucket}")

Using bucket sagemaker-us-east-1-741453530198


## Prepare data
We load a dataset from sklearn, split it and send it to S3

In [38]:
# we use the California housing dataset
# The returned object is used below through its .data, .target and .feature_names attributes.
data = fetch_california_housing()

In [39]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=42,
)

# One frame per split: the named feature columns plus the label in a "target" column.
trainX = pd.DataFrame(X_train, columns=data.feature_names).assign(target=y_train)
testX = pd.DataFrame(X_test, columns=data.feature_names).assign(target=y_test)

In [40]:
# Preview the first rows of the training frame (feature columns plus "target").
trainX.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,4.2143,37.0,5.288235,0.973529,860.0,2.529412,33.81,-118.12,2.285
1,5.3468,42.0,6.364322,1.08794,957.0,2.404523,37.16,-121.98,2.799
2,3.9191,36.0,6.110063,1.059748,711.0,2.235849,38.45,-122.69,1.83
3,6.3703,32.0,6.0,0.990196,1159.0,2.272549,34.16,-118.41,4.658
4,2.3684,17.0,4.795858,1.035503,706.0,2.088757,38.57,-121.33,1.5


In [41]:
# Write both splits to local CSV files. The row index is written as the first column
# (index=True is the pandas default, spelled out here); the later read-back of the
# test file uses index_col=0 and relies on that column being present.
trainX.to_csv("california_housing_train.csv", index=True)
testX.to_csv("california_housing_test.csv", index=True)

In [42]:
# send data to S3. SageMaker will take training data from s3
s3_prefix = "sagemaker/sklearncontainer"  # both CSV files are uploaded under this key prefix

trainpath = sess.upload_data(
    path="california_housing_train.csv",
    bucket=bucket,
    key_prefix=s3_prefix,
)
print("trainpath:", trainpath)

testpath = sess.upload_data(
    path="california_housing_test.csv",
    bucket=bucket,
    key_prefix=s3_prefix,
)
print("testpath:", testpath)

trainpath: s3://sagemaker-us-east-1-741453530198/sagemaker/sklearncontainer/california_housing_train.csv
testpath: s3://sagemaker-us-east-1-741453530198/sagemaker/sklearncontainer/california_housing_test.csv


## Writing a *Script Mode* script
The script below contains both training and inference functionality and can run either on SageMaker Training hardware or locally (desktop, SageMaker notebook, on-premises, etc.). Detailed guidance is available here: https://sagemaker.readthedocs.io/en/stable/using_sklearn.html#preparing-the-scikit-learn-training-script

In [43]:
%%writefile script.py

import argparse
import joblib
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor


# inference functions ---------------
def model_fn(model_dir):
    """Load and return the model stored as ``model.joblib`` inside ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` file written by
            the training section of this script.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)


if __name__ == "__main__":
    # Training entry point: read the train/test CSV files, fit a RandomForestRegressor,
    # print absolute-error percentiles and persist the model as <model-dir>/model.joblib.

    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters
    parser.add_argument("--n-estimators", type=int, default=10)
    parser.add_argument("--min-samples-leaf", type=int, default=3)
    # Optional seed for the forest; the default (None) keeps the original unseeded behaviour.
    parser.add_argument("--random-state", type=int, default=None)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="california_housing_train.csv")
    parser.add_argument("--test-file", type=str, default="california_housing_test.csv")

    # in this script we ask user to explicitly name features and target; both are
    # required so that a missing value is reported by argparse instead of surfacing
    # later as an AttributeError / KeyError.
    parser.add_argument(
        "--features",
        type=str,
        required=True,
        help="whitespace-separated names of the feature columns",
    )
    parser.add_argument(
        "--target",
        type=str,
        required=True,
        help="name of the target column",
    )

    # parse_known_args tolerates extra command-line arguments this script does not declare
    args, _ = parser.parse_known_args()

    # The directory options fall back to environment variables that may be unset
    # (for example when running locally). Report that as a usage error rather than
    # letting os.path.join fail on None further down.
    directory_env_vars = {
        "model_dir": "SM_MODEL_DIR",
        "train": "SM_CHANNEL_TRAIN",
        "test": "SM_CHANNEL_TEST",
    }
    for option, env_var in directory_env_vars.items():
        if getattr(args, option) is None:
            parser.error(
                "--" + option.replace("_", "-") + " was not given and " + env_var + " is not set"
            )

    print("reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("building training and testing datasets")
    feature_names = args.features.split()  # split once, reuse for both frames
    X_train = train_df[feature_names]
    X_test = test_df[feature_names]
    y_train = train_df[args.target]
    y_test = test_df[args.target]

    # train
    print("training model")
    model = RandomForestRegressor(
        n_estimators=args.n_estimators,
        min_samples_leaf=args.min_samples_leaf,
        n_jobs=-1,
        random_state=args.random_state,
    )

    model.fit(X_train, y_train)

    # print abs error
    print("validating model")
    abs_err = np.abs(model.predict(X_test) - y_test)

    # print couple perf metrics
    # NOTE: the "AE-at-<q>th-percentile: <value>" line format is matched by a regex in the
    # job definition to extract the metric, so it must stay exactly as is.
    for q in [10, 50, 90]:
        print("AE-at-" + str(q) + "th-percentile: " + str(np.percentile(a=abs_err, q=q)))

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("model persisted at " + path)

Overwriting script.py


## SageMaker Training

### Launching a training job with the Python SDK

In [44]:
# We use the Estimator from the SageMaker Python SDK
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

# Regex used to pull the median absolute error out of the lines printed by script.py
training_metric_definitions = [
    {"Name": "median-AE", "Regex": "AE-at-50th-percentile: ([0-9.]+).*$"}
]

# Handed to script.py as command-line arguments (--n-estimators, --min-samples-leaf, ...)
training_hyperparameters = {
    "n-estimators": 100,
    "min-samples-leaf": 3,
    "features": "MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude",
    "target": "target",
}

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.c5.xlarge",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="rf-scikit",
    metric_definitions=training_metric_definitions,
    hyperparameters=training_hyperparameters,
)

In [45]:
# launch the training job; wait=True makes this call block until the job finishes,
# with the job's log output shown below the cell
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

2023-01-22 03:35:09 Starting - Starting the training job...
2023-01-22 03:35:34 Starting - Preparing the instances for trainingProfilerReport-1674358508: InProgress
......
2023-01-22 03:36:34 Downloading - Downloading input data.....[34m2023-01-22 03:37:12,618 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2023-01-22 03:37:12,621 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-01-22 03:37:12,656 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2023-01-22 03:37:12,802 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-01-22 03:37:12,812 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-01-22 03:37:12,822 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-01-22 03:37:12,831 sagemaker-training-toolkit INFO     Invoking 

### Launching a tuning job with the Python SDK

In [46]:
# we use the Hyperparameter Tuner
from sagemaker.tuner import HyperparameterTuner, IntegerParameter

# Define exploration boundaries
hyperparameter_ranges = {
    "n-estimators": IntegerParameter(20, 100),
    "min-samples-leaf": IntegerParameter(2, 6),
}

# Metric the tuner minimizes, extracted from the training logs with the regexp below
tuning_objective_name = "median-AE"
tuning_metric_definitions = [
    {"Name": "median-AE", "Regex": "AE-at-50th-percentile: ([0-9.]+).*$"}
]

# create Optimizer: at most 10 training jobs in total, 2 of them running at the same time
Optimizer = HyperparameterTuner(
    estimator=sklearn_estimator,
    hyperparameter_ranges=hyperparameter_ranges,
    base_tuning_job_name="RF-tuner",
    objective_type="Minimize",
    objective_metric_name=tuning_objective_name,
    metric_definitions=tuning_metric_definitions,
    max_jobs=10,
    max_parallel_jobs=2,
)

In [47]:
# Launch the tuning job on the same "train" / "test" channels used for the single training job.
Optimizer.fit({"train": trainpath, "test": testpath})

.........................................................................!


In [48]:
# get tuner results in a df
# Poll until the analytics frame has at least one row, but give up after 10 minutes so
# that a tuning job which never reports a training job cannot hang the notebook forever.
poll_deadline = time.monotonic() + 600
results = Optimizer.analytics().dataframe()
while results.empty:
    if time.monotonic() >= poll_deadline:
        raise TimeoutError("tuning job analytics are still empty after 600 seconds")
    time.sleep(1)
    results = Optimizer.analytics().dataframe()
results.head()

Unnamed: 0,min-samples-leaf,n-estimators,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,2.0,88.0,RF-tuner-230122-0337-010-9497f561,Completed,0.201341,2023-01-22 03:43:25+00:00,2023-01-22 03:43:57+00:00,32.0
1,3.0,55.0,RF-tuner-230122-0337-009-5be9a852,Completed,0.203131,2023-01-22 03:42:31+00:00,2023-01-22 03:43:27+00:00,56.0
2,2.0,87.0,RF-tuner-230122-0337-008-db696603,Completed,0.201737,2023-01-22 03:41:55+00:00,2023-01-22 03:43:05+00:00,70.0
3,3.0,32.0,RF-tuner-230122-0337-007-d4c46e9b,Completed,0.2077,2023-01-22 03:41:38+00:00,2023-01-22 03:42:04+00:00,26.0
4,3.0,53.0,RF-tuner-230122-0337-006-136afd39,Completed,0.205888,2023-01-22 03:41:13+00:00,2023-01-22 03:41:39+00:00,26.0


In [49]:
# Rank the tuning trials by objective value in ascending order; the tuner was configured
# with objective_type="Minimize", so the best trial comes first.
results.sort_values('FinalObjectiveValue')

Unnamed: 0,min-samples-leaf,n-estimators,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,2.0,88.0,RF-tuner-230122-0337-010-9497f561,Completed,0.201341,2023-01-22 03:43:25+00:00,2023-01-22 03:43:57+00:00,32.0
2,2.0,87.0,RF-tuner-230122-0337-008-db696603,Completed,0.201737,2023-01-22 03:41:55+00:00,2023-01-22 03:43:05+00:00,70.0
1,3.0,55.0,RF-tuner-230122-0337-009-5be9a852,Completed,0.203131,2023-01-22 03:42:31+00:00,2023-01-22 03:43:27+00:00,56.0
8,3.0,54.0,RF-tuner-230122-0337-002-40de2a08,Completed,0.204595,2023-01-22 03:39:08+00:00,2023-01-22 03:40:10+00:00,62.0
9,2.0,77.0,RF-tuner-230122-0337-001-5e95d02a,Completed,0.20475,2023-01-22 03:39:03+00:00,2023-01-22 03:40:10+00:00,67.0
6,4.0,35.0,RF-tuner-230122-0337-004-66112289,Completed,0.205314,2023-01-22 03:40:25+00:00,2023-01-22 03:40:51+00:00,26.0
7,3.0,72.0,RF-tuner-230122-0337-003-7e010ed3,Completed,0.205562,2023-01-22 03:40:21+00:00,2023-01-22 03:40:43+00:00,22.0
4,3.0,53.0,RF-tuner-230122-0337-006-136afd39,Completed,0.205888,2023-01-22 03:41:13+00:00,2023-01-22 03:41:39+00:00,26.0
3,3.0,32.0,RF-tuner-230122-0337-007-d4c46e9b,Completed,0.2077,2023-01-22 03:41:38+00:00,2023-01-22 03:42:04+00:00,26.0
5,4.0,87.0,RF-tuner-230122-0337-005-84e6e82f,Completed,0.210246,2023-01-22 03:40:55+00:00,2023-01-22 03:41:21+00:00,26.0


## Deploy to a serverless endpoint

In [50]:
# Serverless Endpoint Config
from sagemaker.serverless import ServerlessInferenceConfig

# 4096 MB of memory and at most 3 concurrent invocations for the serverless endpoint
serverless_config = ServerlessInferenceConfig(memory_size_in_mb=4096, max_concurrency=3)

In [51]:
# Serverless Endpoint Creation
# The endpoint name is a fixed prefix followed directly by a UTC timestamp.
endpoint_name = "sklearn-serverless-ep" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# NOTE(review): this deploys `sklearn_estimator` itself, not the best model found by the
# tuning job (`Optimizer`) - confirm that this is the intended model to serve.
sklearn_estimator.deploy(
    serverless_inference_config=serverless_config,
    endpoint_name=endpoint_name,
)

# The leading space separates the label from the progress marks printed on the same line.
print(" Endpoint Name:", endpoint_name)

-----! Endpoint Name: sklearn-serverless-ep2023-01-22-03-44-02


In [52]:
# Read the uploaded test CSV back (first column is the saved row index) and keep 5 rows
example = pd.read_csv(testpath, sep=",", index_col=0).head(5)
example

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,1.6812,25.0,4.192201,1.022284,1392.0,3.877437,36.06,-119.01,0.477
1,2.5313,30.0,5.039384,1.193493,1565.0,2.679795,35.14,-119.46,0.458
2,3.4801,52.0,3.977155,1.185877,1310.0,1.360332,37.8,-122.44,5.00001
3,5.7376,17.0,6.163636,1.020202,1705.0,3.444444,34.28,-118.72,2.186
4,3.725,34.0,5.492991,1.028037,1063.0,2.483645,36.62,-121.93,2.78


In [53]:
# Runtime client used to call the deployed endpoint
client = boto3.client("sagemaker-runtime")

# Request body: the feature columns only (target dropped), as CSV without header or index
csv_payload = example.drop(columns="target").to_csv(header=False, index=False).encode("utf-8")

response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="text/csv",
    Body=csv_payload,
)

# The response body is read once and decoded as JSON to get the predictions
result = response["Body"].read()
json.loads(result.decode("utf-8"))

[0.4889784360112106,
 0.7285699512987014,
 4.824959212499994,
 2.6290830394383393,
 2.3370831850649347]

## Don't forget to delete the endpoint!

In [54]:
from sagemaker.predictor import Predictor
# Build a predictor handle from the endpoint name alone
predictor = Predictor(endpoint_name=endpoint_name)
# Delete the endpoint; delete_endpoint_config=True asks for its endpoint configuration to be removed as well
predictor.delete_endpoint(delete_endpoint_config=True)