In [277]:
import boto3
from sagemaker import session
from sagemaker.s3 import S3Downloader

from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer

import pandas as pd
import numpy as np

import io as StringIO
import time
import random

#### Globals

In [262]:
# Session
sm_session = session.Session(boto3.Session())

# S3 client
s3_client = boto3.client('s3')

# S3 bucket
bucket = 'sagemaker-demo-third-party-models'

# Model provider name
prefix = 'prosper'

# Model endpoint name
endpoint_name = 'third-party-model-endpoint'

# Data files
buyer_zip_code_features_data_file = 'data/sample_basic_zip.csv'
zip_code_features_data_file = 'data/zip_features.csv'
#training_data_file = 'train/train-no-headers.csv'
training_data_file = 'train/train.csv'

# Create predictor endpoint
predictor = Predictor(endpoint_name=endpoint_name, 
                      sagemaker_session=None, 
                      serializer=CSVSerializer())

#### Loading data files from S3

In [263]:
# Training data dictionary
training_data = {
    'basic_zip': {'file_name' : buyer_zip_code_features_data_file, 'file_uri':'', 'data':''},
    'zip_features': {'file_name' : zip_code_features_data_file, 'file_uri':'', 'data':''}}

# S3 data file locations
training_data['basic_zip']['file_uri'] = 's3://{}/{}/{}'.format(bucket, prefix, training_data['basic_zip']['file_name'])
training_data['zip_features']['file_uri'] = 's3://{}/{}/{}'.format(bucket, prefix, training_data['zip_features']['file_name'])

# Download files
training_data['basic_zip']['data'] = S3Downloader.read_file(training_data['basic_zip']['file_uri'])
training_data['zip_features']['data'] = S3Downloader.read_file(training_data['zip_features']['file_uri'])

In [264]:
# Testing

# print(training_data['basic_zip']['file_uri'])
# print(training_data['zip_features']['file_uri'])

# len(training_data['basic_zip']['data'].splitlines()) # 1000 rows
# len(training_data['zip_features']['data'].splitlines()) # 28845 rows

#### Helper functions

In [265]:
def get_zip_code_features(zip_features_df, zip_code):

    # Get the zip code features using a Dataframe query
    zip_features = zip_features_df[(zip_features_df.zip_code == zip_code)]
       
    # If no match found, then create empty encoding list
    result = np.zeros((np.add(num_cluster_classes, num_division_classes)), dtype=int)    

    # Defensive coding
    if(len(zip_features) > 0):

        # Get matching feature values
        cluster = zip_features['cluster'].values[0]
        division = zip_features['division'].values[0]

        # One-hot encode feature values
        cluster_encoded = np.eye(num_cluster_classes, dtype=int)[cluster]
        division_encoded = np.eye(num_division_classes, dtype=int)[division]

        # Concatenate the encoded features
        result = np.concatenate( (cluster_encoded, division_encoded) )
       
    # Convert array of integers to a comma-delimited string
    #result = ",".join(result.astype(str))    
    
    # Return            
    return result

In [266]:
def predict(sample, predictor=predictor, delay=0.5):
   
    # Defensive coding
    if(len(sample) > 0):

        # Invoke the model's inference endpoint
        response = predictor.predict(data=sample)

        # Decode bytes to string
        response = response.decode('utf-8')
        
        # Suspends execution for # milliseconds
        time.sleep(delay)        

        # Return 
        return response

#### Create baseline training dataset

In [267]:
# Total encoded zip classes
num_encoded_zip_classes = 25

# Zipcode cluster classes
num_cluster_classes = 16

# Zipcode division classes
num_division_classes = 9

# Labels for data files
cols = ['gender', 'age_range', 'household_income_range', 'zip_features']
encoded_zip_feature_cols = np.array(['zip_feature_{}'.format(i) for i in range(0, num_encoded_zip_classes)])

# Convert data files to dataframe
basic_zip_df = pd.read_csv(io.StringIO(training_data['basic_zip']['data']), names=cols, nrows=65536)

zip_features_df = pd.read_csv(io.StringIO(training_data['zip_features']['data'])) # File includes col headers

# Get list of one-hot encode zip features based on zipcode
encoded_zip_features = [get_zip_code_features(zip_features_df, zipcode) for zipcode in basic_zip_df['zip_features']]

# Update zip_features (zipcode) column with binary encoded zip features
# basic_zip_df['zip_features'] = encoded_zip_features

# Create dataframe for the encoded zip_features
encoded_zip_features_df = pd.DataFrame(encoded_zip_features, columns=encoded_zip_feature_cols)

# Concatenate the first three columns of the home buyers file with the encoded zip features
df = pd.concat([basic_zip_df[['gender', 'age_range', 'household_income_range']], encoded_zip_features_df], axis=1)

In [269]:
# Load dataset into memory (since it is a relatively small dataset) 
data_stream = io.StringIO()
df.to_csv(data_stream, sep=',', encoding='utf-8', index=False) #header=False

# Get stream data from memory
train_csv = data_stream.getvalue()

In [270]:
# Upload to S3 bucket
key = '{}/{}'.format(prefix, training_data_file)
s3_client.put_object(Body=train_csv,
                     Bucket=bucket, 
                     Key=key, 
                     ContentType='text/csv')

{'ResponseMetadata': {'RequestId': '2D32697A1395F4C3',
  'HostId': 'ktBlOHIhGHVR9gVeMXGhcIge6l533dYgB8h3kVr27qTDAH2kfiQgwl6J2qg36RKNEbZgmSFD6+w=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'ktBlOHIhGHVR9gVeMXGhcIge6l533dYgB8h3kVr27qTDAH2kfiQgwl6J2qg36RKNEbZgmSFD6+w=',
   'x-amz-request-id': '2D32697A1395F4C3',
   'date': 'Thu, 24 Dec 2020 01:35:47 GMT',
   'etag': '"7f597b051396972da8fad7d702d6078f"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"7f597b051396972da8fad7d702d6078f"'}

#### Create baseline training dataset with ground truth (optional)
##### Be sure to check hourly rates before running a model's Batch Transform job

1. Amazon SageMaker > Batch transform jobs > Create batch transform job
2. Input data configuration: Split type = Line || Content type = text/csv
3. Output data configuration: Assemble with = Line || Accept = text/csv
4. Input/output filtering and data joins: Join source = Input - merge input data with job output
5. Download output file from S3, rename to .csv, open file
6. Add a header row, copy/paste the headers from training_data_file
7. Insert a new first column
8. Shift/move last column to new first column and name as 'target'

In [271]:
# Convert buyer zip_code samples to list
samples = df.values.tolist()

# Convert zip_code samples to list
zip_samples = zip_features_df['zip_code'].values.tolist()

#### Generate baseline data to trigger 'No Issues'

In [300]:
# Replay the training dataset as sample inference data

# Invoke real-time inference endpoint using baseline data
for index, sample in enumerate(samples[0:10]):

    # Removes the open/close bracket from string -- not required
    # sample = str(sample)[1:-1] 
    
    # Get inference response
    response = predict(sample)
    
    # Display the model's prediction probability
    print('Sample {0} >> Input: {1}: >> Prediction: {2}'.format(index, sample, response))
    

Sample 0 >> Input: [0, 5, 10, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]: >> Prediction: 0.4805711507797241
Sample 1 >> Input: [1, 7, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]: >> Prediction: 0.46936097741127014
Sample 2 >> Input: [0, 4, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]: >> Prediction: 0.5250639915466309
Sample 3 >> Input: [1, 7, 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]: >> Prediction: 0.46936097741127014
Sample 4 >> Input: [0, 3, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]: >> Prediction: 0.5310230255126953
Sample 5 >> Input: [1, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]: >> Prediction: 0.46936097741127014
Sample 6 >> Input: [1, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]: >> Prediction: 0.49675580859184265
Sample 7 >> Input: [

#### Generate data to induce data quality constraint violations

In [304]:
# Get min/max ranges for 'gender', 'age_range', 'household_income_range'
gender_min, gender_max = min(df['gender']), max(df['gender'])
age_range_min, age_range_max = min(df['age_range']), max(df['age_range'])
household_income_range_min, household_income_range_max = min(df['household_income_range']), max(df['household_income_range'])

noise_factor = 1

# Invoke real-time inference endpoint using baseline data
for index in range(10):
                
    # Assign random values to each feature
    gender = random.randint(gender_min, gender_max) * noise_factor
    age_range = random.randint(gender_min, gender_max) * noise_factor
    household_income_range = random.randint(household_income_range_min, household_income_range_max) * noise_factor

    # Shuffle the zip code samples
    random.shuffle(zip_samples)

    # Get random zipcode value        
    zip_features = get_zip_code_features(zip_features_df, zip_samples[0]) * noise_factor
    zip_features = ",".join(zip_features.astype(str))    

    # format request data as comma-delimited string
    sample = f'{gender},{age_range},{household_income_range},{zip_features}'

    # Get inference response
    response = predict(sample)
    
    # Display the model's prediction probability
    print('Sample {0} >> Input: {1}: >> Prediction: {2}'.format(index, sample, response))


Sample 0 >> Input: 1,1,13,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0: >> Prediction: 0.5332071781158447
Sample 1 >> Input: 1,1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0: >> Prediction: 0.5360097289085388
Sample 2 >> Input: 1,1,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0: >> Prediction: 0.5332071781158447
Sample 3 >> Input: 1,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0: >> Prediction: 0.5304257273674011
Sample 4 >> Input: 0,1,4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0: >> Prediction: 0.5410555005073547
Sample 5 >> Input: 0,1,10,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0: >> Prediction: 0.5360277891159058
Sample 6 >> Input: 1,0,23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0: >> Prediction: 0.5373708009719849
Sample 7 >> Input: 0,0,22,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0: >> Prediction: 0.5401649475097656
Sample 8 >> Input: 1,1,25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0: >> Prediction: 0.5373708009719849
Sam

#### Monitoring Schedule management

In [None]:
# !aws sagemaker describe-monitoring-schedule --monitoring-schedule-name 'third-party-model-data-quality-schedule'
# !aws sagemaker list-monitoring-executions --monitoring-schedule-name 'third-party-model-data-quality-schedule'

#### Resource Cleanup

In [None]:
# Step 1.
# !aws sagemaker stop-monitoring-schedule --monitoring-schedule-name 'third-party-model-data-quality-schedule'
# time.sleep(30) # allow time for processing
# !aws sagemaker list-monitoring-schedules --endpoint-name 'third-party-model-endpoint'

# Step 2.
# !aws sagemaker delete-monitoring-schedule --monitoring-schedule-name 'third-party-model-data-quality-schedule'
# time.sleep(30) # allow time for processing
# !aws sagemaker list-monitoring-schedules --endpoint-name 'third-party-model-endpoint'

# Step 3.
# !aws sagemaker delete-endpoint --endpoint-name 'third-party-model-endpoint'
# time.sleep(30) # allow time for processing
# !aws sagemaker list-endpoints --name-contains 'third-party-model-endpoint'

# Step 4.
# !aws sagemaker delete-endpoint-config --endpoint-config-name 'third-party-model-endpoint-config'
# time.sleep(30) # allow time for processing
# !aws sagemaker list-endpoint-configs 'third-party-model-endpoint-config'

# Step 5.
# !aws sagemaker delete-model --model-name 'third-party-model'
# time.sleep(30) # allow time for processing
# !aws sagemaker list-models --name-contains 'third-party-model'