In [4]:
pip install sagemaker -U

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting sagemaker
  Downloading sagemaker-2.126.0.tar.gz (654 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m654.9/654.9 KB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting botocore<1.30.0,>=1.29.35
  Downloading botocore-1.29.41-py3-none-any.whl (10.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m90.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-2.126.0-py2.py3-none-any.whl size=890086 sha256=a08966ef2cde0c8fb0537b8387b87d64c6a5d7ec402d2213358376d18ae2d950
  Stored in directory: /home/ec2-user/.cache/pip/wheels/2f/4e/63/345e2f96c60d3f77a2b8be1182a430341092f763b4479dc578
Successfully built sagemaker
Installing collect

In [1]:
import datetime
import time
import tarfile

import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston

# boto3 client requested under the 'runtime.sagemaker' service name.
# NOTE(review): `sm_boto3` is rebound to a 'sagemaker' client in a later cell.
sm_boto3 = boto3.client('runtime.sagemaker')

# SageMaker SDK session wrapping a freshly created boto3 session.
sess = sagemaker.Session(boto3.session.Session())
region = sess.boto_session.region_name

# Bucket reported by the session; a hard-coded bucket name would work here too.
bucket = sess.default_bucket()

print(f"Using bucket {bucket}")



Using bucket sagemaker-us-east-1-408035773647


In [2]:
# Upload the local train/test CSVs to S3 under a shared key prefix; the
# returned values are kept as `trainpath` / `testpath` and handed to the
# estimator's fit() call later in the notebook.
trainpath, testpath = (
    sess.upload_data(
        path=local_csv,
        bucket=bucket,
        key_prefix="sagemaker/sklearncontainer",
    )
    for local_csv in ("data/train.csv", "data/test.csv")
)

In [9]:
%%writefile script.py

import argparse
import joblib
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor


# inference functions ---------------
def model_fn(model_dir):
    """Load and return the object stored as ``model.joblib`` inside ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` artifact.

    Returns:
        Whatever object ``joblib.load`` deserializes from that file.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)


if __name__ == "__main__":
    # Training entry point: parse CLI arguments, read the train/test CSVs,
    # fit a scaler + random-forest pipeline, print a few metrics and persist
    # the fitted pipeline as model.joblib inside --model-dir.

    # Only the estimators that training actually uses are imported here.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters.
    # Both default to None, meaning "keep the scikit-learn default"; that keeps
    # the no-flag behaviour identical to the previous version of this script,
    # which parsed these flags but never forwarded them to the model.
    parser.add_argument("--n-estimators", type=int, default=None)
    parser.add_argument("--min-samples-leaf", type=int, default=None)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train.csv")
    parser.add_argument("--test-file", type=str, default="test.csv")
    # --features is accepted for backward compatibility but is not read below:
    # every column except the target is used as a feature.
    parser.add_argument("--features", type=str)
    # name of the label column in both CSV files
    parser.add_argument("--target", type=str, default="offer_completed")

    # parse_known_args: unrecognised extra arguments are ignored instead of
    # aborting the run.
    args, _ = parser.parse_known_args()

    print("reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("building training and testing datasets")
    print(train_df.columns)

    X_train = train_df.drop(columns=[args.target])
    X_test = test_df.drop(columns=[args.target])
    y_train = train_df[args.target]
    y_test = test_df[args.target]

    # train
    print("training model")

    # Hard-coded settings from the original script, plus any hyperparameters
    # explicitly supplied on the command line.
    forest_params = {"max_depth": 6, "min_samples_split": 5}
    if args.n_estimators is not None:
        forest_params["n_estimators"] = args.n_estimators
    if args.min_samples_leaf is not None:
        forest_params["min_samples_leaf"] = args.min_samples_leaf

    # The scaler and the classifier are trained and persisted together as one
    # Pipeline. Previously the scaler was fitted here but only the bare forest
    # was saved, so model_fn() returned a model that would see unscaled
    # features at inference time.
    model = Pipeline(
        steps=[
            ("scaler", StandardScaler()),
            ("classifier", RandomForestClassifier(**forest_params)),
        ]
    )
    model.fit(X_train, y_train)

    print("traning Accuracy: ", model.score(X_train, y_train))
    print("testing Accuracy: ", model.score(X_test, y_test))

    # print abs error
    print("validating model")
    abs_err = np.abs(model.predict(X_test) - y_test)

    # print couple perf metrics
    for q in [10, 50, 90]:
        print("AE-at-" + str(q) + "th-percentile: " + str(np.percentile(a=abs_err, q=q)))

    # persist model (the whole fitted pipeline, scaler included)
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("model persisted at " + path)

Overwriting script.py


In [10]:
! python script.py --model-dir ./ \
                   --train ./data \
                   --test ./data \
                   --features 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT' \
                   --target offer_completed

extracting arguments
reading data
building training and testing datasets
Index(['time', 'age', 'income', 'reward', 'difficulty', 'duration',
       'offer_type_bogo', 'offer_type_discount', 'offer_type_informational',
       'offer_id_0b1e1539f2cc45b7b9fa7c272da2e1d7',
       'offer_id_2298d6c36e964ae4a3e7e9706d1fb8c2',
       'offer_id_2906b810c7d4411798c6938adc9daaa5',
       'offer_id_3f207df678b143eea3cee63160fa8bed',
       'offer_id_4d5c57ea9a6940dd891ad53e9dbe8da0',
       'offer_id_5a8bc65990b245e5a138643cd4eb9837',
       'offer_id_9b98b8c7a33c4b65b9aebfe6a799e6d9',
       'offer_id_ae264e3637204a6fb9bb56bc8210ddfd',
       'offer_id_f19421c1d4aa40978ebb69ca19b0e20d',
       'offer_id_fafdcd668e3743c1bb461111dcafc2a4', 'email', 'mobile',
       'social', 'web', 'offer_completed'],
      dtype='object')
training model
traning Accuracy:  0.7950541718890572
testing Accuracy:  0.7948221109855212
validating model
AE-at-10th-percentile: 0.0
AE-at-50th-percentile: 0.0
AE-at-90th-perc

In [11]:
# We use the Estimator from the SageMaker Python SDK
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

# Training job definition: runs script.py on a single ml.m4.xlarge instance
# under the notebook's execution role.
sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m4.xlarge",
    framework_version=FRAMEWORK_VERSION,
)

# Launch the training job. wait=True makes this a blocking call: the cell
# returns only once the job has finished. The "train" / "test" keys name the
# input channels; script.py reads its data directories from SM_CHANNEL_TRAIN
# and SM_CHANNEL_TEST (presumably populated from these channel names).
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2023-01-01-18-21-57-786


2023-01-01 18:21:58 Starting - Starting the training job......
2023-01-01 18:22:33 Starting - Preparing the instances for training.........
2023-01-01 18:24:21 Downloading - Downloading input data.....[34m2023-01-01 18:25:08,246 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2023-01-01 18:25:08,249 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-01-01 18:25:08,287 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2023-01-01 18:25:08,473 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-01-01 18:25:08,487 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-01-01 18:25:08,502 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-01-01 18:25:08,515 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining En

In [17]:
# Rebind `sm_boto3` to a client for the 'sagemaker' service and look up where
# the most recent training job stored its model artifact.
sm_boto3 = boto3.client('sagemaker')

training_job_name = sklearn_estimator.latest_training_job.name
job_description = sm_boto3.describe_training_job(TrainingJobName=training_job_name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print(f"Model artifact persisted at {artifact}")

Model artifact persisted at s3://sagemaker-us-east-1-408035773647/sagemaker-scikit-learn-2023-01-01-18-21-57-786/output/model.tar.gz


In [18]:
from sagemaker.sklearn.model import SKLearnModel

# Wrap the trained artifact in a deployable model; script.py is supplied as the
# entry point again, and it is the file that defines model_fn().
model = SKLearnModel(
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    model_data=artifact,
    role=get_execution_role(),
)

# Stand up an endpoint backed by one ml.c5.large instance.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.c5.large",
)

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2023-01-01-18-29-08-491
INFO:sagemaker:Creating endpoint-config with name sagemaker-scikit-learn-2023-01-01-18-29-08-933
INFO:sagemaker:Creating endpoint with name sagemaker-scikit-learn-2023-01-01-18-29-08-933


-----!

In [19]:
# Tear down the endpoint created by model.deploy(); the API response is left
# as the cell's last expression so it is displayed.
# NOTE(review): relies on `sm_boto3` being the 'sagemaker' client bound in an earlier cell.
endpoint_name = predictor.endpoint_name
sm_boto3.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': 'c01859a0-9d13-4971-9e2f-b4ca47a87ea7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c01859a0-9d13-4971-9e2f-b4ca47a87ea7',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sun, 01 Jan 2023 18:56:09 GMT'},
  'RetryAttempts': 0}}