* changed by nov05 on 2024-12-10   
* All operations are executed from local conda env `awsmle_py310`.  
* changed dataset, separated `train.py` and `inference.py`, added testing code, etc.   
* auto-scaling tutorial  
    https://youtu.be/afIMvatnkzE  
    https://youtu.be/bbcf1xyGIYw  
    https://youtu.be/ai9RexqqjVs  


In [1]:
## show the notebook's working directory; the relative paths used below (train.csv, test.csv, ./) resolve against it
%pwd

'd:\\github\\udacity-aws-mle-nano-course5\\excercise_4.8'

In [117]:
## open the local AWS credentials file in Notepad so it can be updated
## NOTE(review): Windows-only, and the path is specific to the author's machine
! notepad C:\Users\guido\.aws\credentials

In [1]:
## reset the session after updating credentials
import boto3 # type: ignore
import sagemaker # type: ignore
from sagemaker import get_execution_role # type: ignore

## drop the cached default boto3 session so the client below is built from the updated credentials
boto3.DEFAULT_SESSION = None
sagemaker_client = boto3.client("sagemaker")

## Define IAM role
role_arn = get_execution_role()  ## get role ARN
is_sagemaker_execution_role = 'AmazonSageMaker-ExecutionRole' in role_arn
if not is_sagemaker_execution_role:
    ## the resolved role is not a SageMaker execution role: show it, then use the explicit one
    print(f"Role ARN (voclabs): {role_arn}")  ## arn:aws:iam::026211625715:role/voclabs
    ## your own role here
    role_arn = "arn:aws:iam::026211625715:role/service-role/AmazonSageMaker-ExecutionRole-20241209T041445"

## SageMaker session, plus the region and default bucket used by the rest of the notebook
session = sagemaker.Session()
region, bucket = session.boto_region_name, session.default_bucket()
print("AWS Region: {}".format(region))
print("Default SageMaker Bucket: {}".format(bucket))
print("Role Arn (SageMaker): {}".format(role_arn))



sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\guido\AppData\Local\sagemaker\sagemaker\config.yaml


Role ARN (voclabs): arn:aws:iam::026211625715:role/voclabs
AWS Region: us-east-1
Default SageMaker Bucket: sagemaker-us-east-1-026211625715
Role Arn (SageMaker): arn:aws:iam::026211625715:role/service-role/AmazonSageMaker-ExecutionRole-20241209T041445


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

## `load_boston` has been removed from scikit-learn since version 1.2, so the import
## itself raises ImportError there. Catch it and print the message instead of letting
## the cell fail, so the notebook still runs top to bottom.
try:
    from sklearn.datasets import load_boston
except ImportError as import_error:
    ## don't panic. the California housing cell below replaces this dataset.
    ## read the error message if you are interested.
    print(f"⚠️ Skipping the Boston dataset: {import_error}")
else:
    ## only reached on scikit-learn < 1.2, where the dataset still exists
    data = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.25, random_state=42
    )
    trainX = pd.DataFrame(X_train, columns=data.feature_names)
    trainX["target"] = y_train
    testX = pd.DataFrame(X_test, columns=data.feature_names)
    testX["target"] = y_test

In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

## load the California housing data and hold out 25% of it for testing (fixed seed)
data = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=42,
)

## one DataFrame per split: named feature columns followed by a trailing "target" column
trainX = pd.DataFrame(X_train, columns=data.feature_names).assign(target=y_train)
testX = pd.DataFrame(X_test, columns=data.feature_names).assign(target=y_test)

In [9]:
## peek at 5 randomly chosen training rows
trainX.sample(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
7237,6.981,31.0,6.645833,1.01875,1362.0,2.8375,34.21,-118.33,4.029
11051,3.1205,30.0,4.13253,0.971888,753.0,3.024096,36.97,-122.0,2.405
3751,7.5443,9.0,6.723404,0.995745,1675.0,3.56383,37.33,-121.77,3.484
7602,3.4868,36.0,5.227666,1.008646,886.0,2.553314,33.9,-118.13,2.084
357,5.2043,15.0,5.551195,0.977816,1408.0,2.40273,35.64,-117.68,0.957


In [17]:
## write both splits to local CSV files
## (the DataFrame index is written as the first, unnamed column)
trainX.to_csv("train.csv")
testX.to_csv("test.csv")
# send data to S3. SageMaker will take training data from s3
## both files are uploaded under the same key prefix of the default bucket;
## upload_data returns the S3 location of each uploaded file
train_s3_path, test_s3_path = [
    session.upload_data(
        path=csv_file,
        bucket=bucket,
        key_prefix="california_housing",
    )
    for csv_file in ("train.csv", "test.csv")
]

In [67]:
%%writefile train.py
import argparse
import joblib
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

if __name__ == "__main__":

    print("extracting arguments...")
    parser = argparse.ArgumentParser()
    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters
    parser.add_argument("--n-estimators", type=int, default=10)
    parser.add_argument("--min-samples-leaf", type=int, default=3)
    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train.csv")
    parser.add_argument("--test-file", type=str, default="test.csv")
    parser.add_argument(
        "--features", type=str
    )  # in this script we ask user to explicitly name features
    parser.add_argument(
        "--target", type=str
    )  # in this script we ask user to explicitly name the target
    args, _ = parser.parse_known_args()

    print("reading data...")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("building training and testing datasets...")
    features = args.features.split()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[args.target]
    y_test = test_df[args.target]

    # train
    print("training model...")
    model = RandomForestRegressor(
        n_estimators=args.n_estimators, 
        min_samples_leaf=args.min_samples_leaf, 
        n_jobs=-1
    )
    model.fit(X_train, y_train)

    # print abs error
    print("validating model...")
    abs_err = np.abs(model.predict(X_test) - y_test)
    # print couple perf metrics
    for q in [10, 50, 90]:
        print("AE-at-" + str(q) + "th-percentile: " + str(np.percentile(a=abs_err, q=q)))

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("👉 model persisted at " + path)

Overwriting train.py


In [None]:
%%writefile inference.py
import joblib
import os

## inference functions
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))  ## classifier
    return clf

Overwriting inference.py


In [80]:
## every column except the last one ("target"), joined into one space-separated string
feature_names = list(trainX.columns[:-1])
features = " ".join(feature_names)
features

'MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude'

In [None]:
## fit and save model locally 
## it has nothing to do with the code that comes after
!python train.py --n-estimators 100 \
--min-samples-leaf 2 \
--model-dir ./ \
--train ./ \
--test ./ \
--features f"\"{features_str}\"" \
--target target

extracting arguments...
reading data...
building training and testing datasets...
training model...
validating model...
AE-at-10th-percentile: 0.030122602056276572
AE-at-50th-percentile: 0.19986898214285687
AE-at-90th-percentile: 0.7725330218809531
👉 model persisted at ./model.joblib


In [81]:
%%time
# We use the Estimator from the SageMaker Python SDK
from sagemaker.sklearn.estimator import SKLearn
FRAMEWORK_VERSION = "1.0-1" ## "0.23-1", "1.0-1"
sklearn_estimator = SKLearn(
    entry_point="train.py",
    role=role_arn,
    instance_count=1,
    instance_type="ml.m4.xlarge", # "ml.c5.xlarge",
    framework_version=FRAMEWORK_VERSION,
    dependencies=["requirements.txt"],   
    base_job_name="rf-scikit-train",
    metric_definitions=[{
        "Name": "median-AE", 
        "Regex": "AE-at-50th-percentile: ([0-9.]+).*$"
    }],
    hyperparameters={
        "n-estimators": 100,
        "min-samples-leaf": 3,
        # "features": "CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT",
        "features": features,
        "target": "target",
    },
)
# launch training job, with asynchronous call
sklearn_estimator.fit(
    {
        "train": train_s3_path, 
        "test": test_s3_path
    }, 
    wait=True
)
## e.g. Creating training-job with name: rf-scikit-train-2024-12-11-07-41-02-083
## CPU times: total: 4.81 s
## Wall time: 3min 29s

2024-12-11 08:26:43 Starting - Starting the training job...
2024-12-11 08:26:57 Starting - Preparing the instances for training...
2024-12-11 08:27:21 Downloading - Downloading input data...
2024-12-11 08:27:51 Downloading - Downloading the training image...
2024-12-11 08:28:52 Training - Training image download completed. Training in progress...2024-12-11 08:29:03,941 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-12-11 08:29:03,945 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-12-11 08:29:03,948 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-12-11 08:29:03,966 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-12-11 08:29:04,231 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
/miniconda3/bin/python -m pip install -r requirements.txt
2024-12-11 08:29:05,202 sagemaker-training-toolkit INFO 

In [82]:
## print out training job history
latest_job = sklearn_estimator.latest_training_job
latest_job.wait(logs="None")

## look up where the finished job stored its model archive in S3
job_description = sagemaker_client.describe_training_job(TrainingJobName=latest_job.name)
model_artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]
print("Model artifact persisted at " + model_artifact)


2024-12-11 08:29:30 Starting - Preparing the instances for training
2024-12-11 08:29:30 Downloading - Downloading the training image
2024-12-11 08:29:30 Training - Training image download completed. Training in progress.
2024-12-11 08:29:30 Uploading - Uploading generated training model
2024-12-11 08:29:30 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-026211625715/rf-scikit-train-2024-12-11-08-26-36-576/output/model.tar.gz


In [None]:
%%time
## deploy the trained model
## The SageMaker scikit-learn environment includes the following by default:
## scikit-learn, joblib, numpy, pandas, Other commonly used ML and data processing libraries
from sagemaker.sklearn.model import SKLearnModel
# from sagemaker.serializers import CSVSerializer
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
try:
    model_artifact
except:
    model_artifact = "s3://sagemaker-us-east-1-026211625715/rf-scikit-train-2024-12-11-08-26-36-576/output/model.tar.gz"
model = SKLearnModel(
    model_data=model_artifact,
    role=role_arn,
    entry_point="inference.py",
    framework_version=FRAMEWORK_VERSION,
    dependencies=["deploy_requirements.txt"],
)
predictor = model.deploy(
    instance_type="ml.c5.large", ## "ml.m4.xlarge", "ml.c5.large"
    initial_instance_count=1)
## e.g. endpoint: sagemaker-scikit-learn-2024-12-11-06-38-03-653  
## Wall time: 3min 35s

------!CPU times: total: 1.53 s
Wall time: 3min 35s


In [63]:
import numpy as np
import pandas as pd 
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
## reuse the predictor from the deploy cell when it exists in this kernel; otherwise
## attach to an already-running endpoint by name. Only NameError is caught so that
## unrelated errors are not hidden by a bare `except`.
try:
    predictor
except NameError:
    ## instantiate a predictor 
    endpoint_name = "sagemaker-scikit-learn-2024-12-11-11-19-45-103"
    predictor = Predictor(endpoint_name=endpoint_name)
## requests are serialized with JSONSerializer and responses parsed with JSONDeserializer
predictor.serializer = JSONSerializer()
predictor.deserializer = JSONDeserializer()
## read test.csv to testX dataframe
## index_col=0 restores the index from the file's first column instead of adding a new one
testX = pd.read_csv("test.csv", index_col=0)
testX.sample(2)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
3516,1.8299,9.0,3.730475,1.102603,1780.0,2.725881,32.72,-117.13,1.375
4591,2.9107,28.0,4.083333,0.981481,504.0,4.666667,34.07,-117.98,1.525


In [65]:
## randomly select data points from the test dataset
row = testX.sample(3)  ## <class 'pandas.core.frame.DataFrame'>
input_data = row.drop('target', axis=1).to_numpy()
## make a prediction
prediction = predictor.predict(input_data)
print(f"👉\tPredition: {prediction},", "type:", type(prediction))
print("\tAbsolute Error:", np.abs(prediction - row['target'].values))
print("👉\tInput data:")
print(input_data)
testX.iloc[row.index,:]

👉	Predition: [2.617300347402599, 1.9462303535353538, 1.7003786709956707], type: <class 'list'>
	Absolute Error: [0.20230035 0.31176965 0.07337867]
👉	Input data:
[[ 4.95500000e+00  4.00000000e+00  5.39045936e+00  1.09893993e+00
   3.77500000e+03  2.22320377e+00  3.30000000e+01 -1.17070000e+02]
 [ 3.99290000e+00  1.40000000e+01  4.57104796e+00  1.04973357e+00
   3.17100000e+03  2.81616341e+00  3.37300000e+01 -1.17920000e+02]
 [ 3.86720000e+00  4.20000000e+01  4.41666667e+00  9.82456140e-01
   7.76000000e+02  3.40350877e+00  3.39000000e+01 -1.18070000e+02]]


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
2128,4.955,4.0,5.390459,1.09894,3775.0,2.223204,33.0,-117.07,2.415
1498,3.9929,14.0,4.571048,1.049734,3171.0,2.816163,33.73,-117.92,2.258
2971,3.8672,42.0,4.416667,0.982456,776.0,3.403509,33.9,-118.07,1.627


In [None]:
## generate concurrent load against the endpoint to exercise auto-scaling
## NOTE(review): this cell only sends requests; no scaling policy is configured here
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm


def send_request():
    """Send one prediction request built from 2 randomly sampled test rows."""
    row = testX.sample(2)  
    input_data = row.drop('target', axis=1).to_numpy()
    predictor.predict(input_data)


num_requests = 10  ## concurrent requests per round; adjust the number to simulate load
num_rounds = 100   ## number of rounds, each sending `num_requests` requests
failed_requests = 0
## one thread pool is reused for all rounds instead of being rebuilt every iteration
with ThreadPoolExecutor(max_workers=num_requests) as executor:
    for _ in tqdm(range(num_rounds)):
        futures = [executor.submit(send_request) for _ in range(num_requests)]
        ## wait for the whole round, and count failed requests instead of silently
        ## discarding their exceptions (a load test must not hide throttling or errors)
        failed_requests += sum(future.exception() is not None for future in futures)
print(f"requests sent: {num_rounds * num_requests}, failed: {failed_requests}")

100%|██████████| 1000/1000 [04:15<00:00,  3.92it/s]


## **⚠️ Delete endpoint model and endpoint (IMPORTANT)**   

In [70]:
import boto3

## find the model behind the endpoint: endpoint -> endpoint config -> first production variant
deployed_endpoint = predictor.endpoint_name
endpoint_config_name = sagemaker_client.describe_endpoint(
    EndpointName=deployed_endpoint
)['EndpointConfigName']
production_variants = sagemaker_client.describe_endpoint_config(
    EndpointConfigName=endpoint_config_name
)['ProductionVariants']
model_name = production_variants[0]['ModelName']

## delete the endpoint model
sagemaker_client.delete_model(ModelName=model_name)
print("🟢 Model deleted:", model_name)

## delete the endpoint
predictor.delete_endpoint()
print("🟢 Endpoint deleted:", deployed_endpoint)

🟢 Model deleted: sagemaker-scikit-learn-2024-12-11-11-19-44-029


🟢 Endpoint deleted: sagemaker-scikit-learn-2024-12-11-11-19-45-103


---   

* 🟢⚠️ Issue in the deployment solved:     

  1. add the missing imports at the top of `inference.py`   
      ```python
      %%writefile inference.py
      import joblib
      import os
      ```   
  2. pass the feature names to `train.py` as a single space-separated string (from a notebook shell cell: `!python train.py --features "{features}"`)  
      ```python
      hyperparameters={
          "features": features,
      },
      ```

  ```text
  UnexpectedStatusException: Error hosting endpoint sagemaker-scikit-learn-2024-12-11-03-37-43-750: Failed. Reason: 
  The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch
  logs for this endpoint.. Try changing the instance type or reference the troubleshooting page 
  https://docs.aws.amazon.com/sagemaker/latest/dg/async-inference-troubleshooting.html  
  ```  

  ```text

  2024-12-11T04:01:16.378Z
  169.254.178.2 - - [11/Dec/2024:04:01:12 +0000] "GET /ping HTTP/1.1" 500 141 "-" "AHC/2.0"
  2024-12-11T04:01:17.307Z
  2024-12-11 04:01:17,128 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)
  2024-12-11T04:01:17.307Z
  2024-12-11 04:01:17,129 INFO - sagemaker_sklearn_container.serving - Encountered an unexpected error.
  2024-12-11T04:01:17.307Z
  [2024-12-11 04:01:17 +0000] [19] [ERROR] Error handling request /ping
  2024-12-11T04:01:17.307Z
  Traceback (most recent call last): File "/miniconda3/lib/python3.7/site-packages/gunicorn/workers/base_async.py", line 55, in handle self.handle_request(listener_name, req, client, addr) File "/miniconda3/lib/python3.7/site-packages/gunicorn/workers/ggevent.py", line 143, in handle_request super().handle_request(listener_name, req, sock, addr) File "/miniconda3/lib/python3.7/site-packages/gunicorn/workers/base_async.py", line 106, in handle_request respiter = self.wsgi(environ, resp.start_response) File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/serving.py", line 128, in main serving_env.module_dir) File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/serving.py", line 105, in import_module user_module = importlib.import_module(module_name) File "/miniconda3/lib/python3.7/importlib/__init__.py", line 118, in import_module if name.startswith('.'):
  2024-12-11T04:01:17.307Z
  AttributeError: 'NoneType' object has no attribute 'startswith'
  ```

* Check the container packages   
  https://github.com/aws/sagemaker-scikit-learn-container/blob/master/requirements.txt    

* Check the args documentation  
  https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html  