# SageMaker Studio Notebook for Data Transformation, Training, Model Registration and Deployment

## Transform the data and train a model inside a Jupyter notebook.

In this workshop we will demonstrate the traditional approach to model development and training directly in parameterized Jupyter notebooks.

In this notebook we will predict house prices based on the well-known Boston Housing dataset with a simple regression model in Tensorflow 2. This public dataset contains 13 features regarding housing stock of towns in the Boston area. Features include average number of rooms, accessibility to radial highways, adjacency to a major river, etc.

To begin, we'll import some necessary packages and set up directories for training and test data. We'll also set up a SageMaker Session to perform various operations.

In [None]:
!pip install matplotlib seaborn

In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import tensorflow as tf
import sagemaker
import boto3

In [None]:
import os

# Resolve every dataset folder relative to the notebook's working directory.
data_dir = os.path.join(os.getcwd(), 'data')          # root of all dataset folders
train_dir = os.path.join(os.getcwd(), 'data/train')   # x_train.npy / y_train.npy are saved here later
test_dir = os.path.join(os.getcwd(), 'data/test')     # x_test.npy / y_test.npy are saved here later
raw_dir = os.path.join(os.getcwd(), 'data/raw')       # unmodified arrays straight from the dataset
batch_dir = os.path.join(os.getcwd(), 'data/batch')

# Create the folders up front (root first); exist_ok keeps re-runs of this cell harmless.
for folder in (data_dir, train_dir, test_dir, raw_dir, batch_dir):
    os.makedirs(folder, exist_ok=True)

# Dataset transformation <a class="anchor" id="SageMakerProcessing"></a>

Next, we'll transform the dataset.

We'll now save the raw feature data, and also save the labels for training and testing.

In [None]:
# Import through the public `tensorflow.keras` namespace: `tensorflow.python.*`
# is TensorFlow's private implementation package and not a supported import path.
from tensorflow.keras.datasets import boston_housing

# Fetch the dataset already split into train and test features/labels.
(x_train, y_train), (x_test, y_test) = boston_housing.load_data()
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)


In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Persist the untouched arrays under raw_dir, one .npy file per array.
for npy_name, array in (
    ('x_train.npy', x_train),
    ('x_test.npy', x_test),
    ('y_train.npy', y_train),
    ('y_test.npy', y_test),
):
    np.save(os.path.join(raw_dir, npy_name), array)

Next, we'll execute the data preprocessing as shown below.

In [None]:
import glob
import numpy as np
import os
from sklearn.preprocessing import StandardScaler


# Fit the scaler on the raw training features only; the same fitted scaler is
# applied to every feature file in the next cell.
x_train = np.load(os.path.join(raw_dir, 'x_train.npy'))
scaler = StandardScaler().fit(x_train)

Next, we will apply the fitted scaler to the raw feature files and save the transformed features, together with the unchanged labels, into the train and test directories created above.

In [None]:
# Scale the raw feature files with the fitted scaler and copy the label files
# unchanged, routing each result into the train or test directory.
input_files = glob.glob('{}/raw/*.npy'.format(data_dir))
print('\nINPUT FILE LIST: \n{}\n'.format(input_files))
for file in input_files:
    # Classify by the file's own name rather than the full path: a parent
    # directory whose name contains 'train' or 'y_' would otherwise misroute
    # every file.
    npy_name = os.path.basename(file)
    is_label = npy_name.startswith('y_')
    is_train = 'train' in npy_name

    raw = np.load(file)
    # only transform feature columns; labels are saved as loaded
    output = raw if is_label else scaler.transform(raw)

    output_path = os.path.join(
        train_dir if is_train else test_dir,
        '{}_{}.npy'.format('y' if is_label else 'x', 'train' if is_train else 'test'),
    )
    np.save(output_path, output)
    print('SAVED {} {} DATA FILE\n'.format(
        'LABEL' if is_label else 'TRANSFORMED',
        'TRAINING' if is_train else 'TEST',
    ))

# Training <a class="anchor" id="SageMakerHostedTraining"></a>

Now that we've prepared a dataset, we can move on to model training.

In [None]:
# Limit TensorFlow's compat.v1 logger to ERROR-level messages.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

def get_train_data(train_dir):
    """Load the training arrays stored under *train_dir*.

    Reads ``x_train.npy`` and ``y_train.npy`` from the directory, prints the
    shape of both arrays, and returns them as an ``(x_train, y_train)`` tuple.
    """
    features_path = os.path.join(train_dir, 'x_train.npy')
    labels_path = os.path.join(train_dir, 'y_train.npy')
    features, labels = np.load(features_path), np.load(labels_path)
    print('x train', features.shape,'y train', labels.shape)
    return features, labels


def get_test_data(test_dir):
    """Load the held-out arrays stored under *test_dir*.

    Reads ``x_test.npy`` then ``y_test.npy`` from the directory, prints both
    shapes, and returns the pair as ``(x_test, y_test)``.
    """
    features, labels = (
        np.load(os.path.join(test_dir, npy_name))
        for npy_name in ('x_test.npy', 'y_test.npy')
    )
    print('x test', features.shape,'y test', labels.shape)
    return features, labels

def get_model():
    """Build the (uncompiled) Keras regression network.

    Architecture: a 13-value input, a 13-unit tanh layer, a 6-unit sigmoid
    layer, and a single linear output unit.
    """
    features = tf.keras.Input(shape=(13,))

    # Stack the two hidden layers in order on top of the input tensor.
    hidden = features
    for units, activation in ((13, 'tanh'), (6, 'sigmoid')):
        hidden = tf.keras.layers.Dense(units, activation=activation)(hidden)

    prediction = tf.keras.layers.Dense(1)(hidden)
    return tf.keras.Model(inputs=features, outputs=prediction)


## Parameterized Cell

The cell below is parameterized: the parameter values set for the job execution are applied right after this cell.

In the cells after it, the values for 'batch_size', 'epochs' and 'learning_rate' are provided at runtime by the notebook job as the variables `param_batch_size`, `param_epochs` and `param_learning_rate`.

We will use a SageMaker session to retrieve the session client information and the default S3 bucket.

In [None]:
#parameterized cell
# Later cells read param_batch_size, param_epochs and param_learning_rate,
# none of which are assigned in this notebook.
# NOTE(review): presumably the notebook job injects them after this cell — confirm.
sess = sagemaker.Session()
bucket = sess.default_bucket()                    # Default S3 bucket of the session; the model archive is uploaded to it later

In [None]:
# Train the regression model on the prepared data and report the test-set MSE.
x_train, y_train = get_train_data(train_dir)
x_test, y_test = get_test_data(test_dir)

# SM_MODEL_DIR may be unset (for example when the notebook is run
# interactively); fall back to a local folder instead of raising KeyError.
model_dir = os.environ.get('SM_MODEL_DIR', os.path.join(os.getcwd(), 'model'))

device = '/cpu:0'
print(device)

# The param_* values are supplied at runtime; cast them explicitly in case
# they arrive as strings.
batch_size = int(param_batch_size)
epochs = int(param_epochs)
learning_rate = float(param_learning_rate)
print('batch_size = {}, epochs = {}, learning rate = {}'.format(batch_size, epochs, learning_rate))

with tf.device(device):
    model = get_model()
    optimizer = tf.keras.optimizers.SGD(learning_rate)
    model.compile(optimizer=optimizer, loss='mse')
    # The test split doubles as validation data during fitting.
    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
              validation_data=(x_test, y_test))

    # evaluate on test set
    scores = model.evaluate(x_test, y_test, batch_size, verbose=2)
    print("\nTest MSE :", scores)


The unzipped model saved to the local instance directory will include the assets required by TensorFlow Serving to load and serve the model, including a .pb file.

In [None]:
# Write the trained model into a 'trainedmodel' sub-folder of model_dir.
model.save(f'{model_dir}/trainedmodel')

## Save the model in S3 bucket.
Upload the trained model along with other model assets in the 'Default S3 bucket' after creating a zipped tar file.

In [None]:
import tarfile
import os.path
def make_tarfile(output_filename, source_dir):
    """Pack *source_dir* into a gzip-compressed tar archive.

    Args:
        output_filename: Path of the ``.tar.gz`` file to write.
        source_dir: Directory to archive. Its contents are stored under a
            single top-level entry named after the directory itself.
    """
    # Normalise first: basename('/opt/ml/model/') is '', which would drop the
    # top-level folder from the archive when the path ends with a separator.
    archive_root = os.path.basename(os.path.normpath(source_dir))
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=archive_root)

# Archive model_dir, then upload the archive to the root of the default bucket.
file_name = 'model.tar.gz'
make_tarfile(file_name, model_dir)

s3_client = boto3.client('s3')
# The archive's file name doubles as the S3 object key.
response = s3_client.upload_file(Filename=file_name, Bucket=bucket, Key=file_name)

# Publish the Model in the SageMaker Model Registry
## Create a Model package group.


In [None]:
import time
import os
from sagemaker import get_execution_role, session
import boto3

# AWS region configured for the default boto3 session.
region = boto3.Session().region_name

# Execution role; passed as ExecutionRoleArn when the model is created later.
role = get_execution_role()

# Low-level SageMaker client used for the model package group and model
# package calls in the following cells.
sm_client = boto3.client('sagemaker', region_name=region)


In [None]:
# Suffix the group name with the current Unix time (rounded to seconds) so
# repeated runs do not reuse the same name.
model_package_group_name = f"BlogPostGroup{round(time.time())}"
model_package_group_input_dict = dict(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageGroupDescription="Sample model package group",
)

create_model_package_group_response = sm_client.create_model_package_group(
    **model_package_group_input_dict
)
print(f"ModelPackageGroup Arn : {create_model_package_group_response['ModelPackageGroupArn']}")


Use Image location for the selected algorithm as per documentation [provided here](https://docs.aws.amazon.com/sagemaker/latest/dg/neo-deployment-hosting-services-container-images.html). Also provide the S3 Bucket URL that contains the model artifacts.

In [None]:
# Container entry of the model package: serving image plus the model archive
# uploaded to the default bucket.
# NOTE(review): the image URI is pinned to us-west-2 — confirm it matches the region in use.
inference_container = {
    "Image": '301217895009.dkr.ecr.us-west-2.amazonaws.com/sagemaker-inference-tensorflow:2.9-cpu-py3',
    "ModelDataUrl": 's3://'+bucket+'/model.tar.gz',
}

modelpackage_inference_specification = {
    "InferenceSpecification": {
        "Containers": [inference_container],
        "SupportedContentTypes": ["text/csv"],
        "SupportedResponseMIMETypes": ["text/csv"],
    }
}

In [None]:
# Register a new model version in the package group. It starts as
# PendingManualApproval.
create_model_package_input_dict = {
    "ModelPackageGroupName" : model_package_group_name,
    # Describe the model actually trained in this notebook (house-price
    # regression); the earlier text described an unrelated iris classifier.
    "ModelPackageDescription" : "TensorFlow regression model that predicts Boston house prices",
    "ModelApprovalStatus" : "PendingManualApproval"
}
# Merge in the container / content-type details defined in the previous cell.
create_model_package_input_dict.update(modelpackage_inference_specification)
create_model_package_response = sm_client.create_model_package(**create_model_package_input_dict)
model_package_arn = create_model_package_response["ModelPackageArn"]
print('ModelPackage Version ARN : {}'.format(model_package_arn))

# Deploy model for Real time inference.

In [None]:
!pip install -U sagemaker
from sagemaker import image_uris

# SageMaker client used for the model, endpoint-config and endpoint calls below.
sagemaker_client = boto3.client('sagemaker', region_name=region)

# Framework (or algorithm) and its version, e.g. 'xgboost' with '0.90-1'.
framework = 'tensorflow'
version = '2.9'

# Ask for the inference image for the instance type the endpoint will run on.
image_scope = 'inference'
instance_type = 'ml.m5.large'

# Look up the AWS container image URI matching the settings above.
container = image_uris.retrieve(
    framework=framework,
    region=region,
    version=version,
    image_scope=image_scope,
    instance_type=instance_type,
)

In [None]:
# Bucket that holds the model archive (the session's default bucket).
s3_bucket = bucket

# Name of the model artifact; the S3 key is the bare file name.
model_filename = 'model.tar.gz'
model_s3_key = model_filename

# Full S3 URI of the model artifact: bucket name plus key.
model_url = 's3://{}/{}'.format(s3_bucket, model_s3_key)
                        

Create a Model to be deployed by a real time inference endpoint.

In [None]:
model_name = 'BlogPostModelDeployment'

# Serving container: the retrieved image plus the model archive in S3.
primary_container = {
    'Image': container,
    'ModelDataUrl': model_url,
}

# Create the SageMaker model that the endpoint configuration refers to by name.
create_model_response = sagemaker_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)

Create Endpoint Configuration

In [None]:
import datetime
from time import gmtime, strftime

# Fixed endpoint config name; the same name is passed to CreateEndpoint afterwards.
endpoint_config_name = 'BlogPostModelDeploymentEndpointConfig'

# The single production variant hosted behind the endpoint.
production_variant = {
    "VariantName": "variant1",       # name of the production variant
    "ModelName": model_name,
    "InstanceType": instance_type,   # compute instance type
    "InitialInstanceCount": 1,       # number of instances to launch initially
}

endpoint_config_response = sagemaker_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    # One ProductionVariant entry per model hosted at the endpoint.
    ProductionVariants=[production_variant],
)

print(f"Created EndpointConfig: {endpoint_config_response['EndpointConfigArn']}")

Create Deployment Endpoint

In [None]:
# Endpoint names must be unique within an AWS Region in your AWS account.
endpoint_name = 'BlogPostModelDeploymentEndpoint'

# Name of the endpoint configuration this endpoint is built from.
endpoint_config_name = 'BlogPostModelDeploymentEndpointConfig'

create_endpoint_response = sagemaker_client.create_endpoint(
    EndpointConfigName=endpoint_config_name,
    EndpointName=endpoint_name,
)
