In [None]:
import boto3
from sagemaker import session

from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer

import pandas as pd
import numpy as np

import io
import time
import random

#### Globals

In [None]:
# SageMaker session built on a default boto3 session
sm_session = session.Session(boto3.Session())

# S3 client
s3_client = boto3.client('s3')

# S3 bucket
bucket = 'sagemaker-demo-third-party-models'

# Model provider name; also used as the S3 key prefix for the data files below
prefix = 'prosper'

# Model endpoint name
endpoint_name = 'third-party-model-endpoint'

# Data files (keys relative to `prefix`)
buyer_zip_code_features_data_file = 'train/sample_basic_zip.csv'
zip_code_features_data_file = 'train/zip_features.csv'
training_data_file = 'train/train.csv'

# Zipcode cluster classes
num_cluster_classes = 16

# Zipcode division classes
num_division_classes = 9

# Total encoded zip classes (16 + 9 = 25). Derived from the two counts above so the
# number of encoded column labels always matches the width of the one-hot encoding.
num_encoded_zip_classes = num_cluster_classes + num_division_classes

#### Loading data files from S3

In [None]:
# Column names assigned while parsing the file (passed as `names=` below)
cols = ['gender', 'age_range', 'household_income_range', 'zip_features']

# Object key of the sample_basic_zip file
key = f'{prefix}/{buyer_zip_code_features_data_file}'

# Fetch the object from S3
basic_zip_file_obj = s3_client.get_object(Key=key, Bucket=bucket)

# The file is relatively small (< 1 Mb), so copy its whole body into an
# in-memory binary stream and rewind it for reading
basic_zip_buf = io.BytesIO()
basic_zip_buf.write(basic_zip_file_obj['Body'].read())
basic_zip_buf.seek(0)

# Parse the CSV into a DataFrame
basic_zip_df = pd.read_csv(basic_zip_buf, encoding='utf-8', names=cols)

# Preview the first rows
basic_zip_df.head()

Unnamed: 0,gender,age_range,household_income_range,zip_features
0,0,5,10,17011
1,1,7,13,71923
2,0,4,21,84095
3,1,7,8,13905
4,0,3,4,22032


In [None]:
# Object key of the zip_features file (this file includes column headers)
key = f'{prefix}/{zip_code_features_data_file}'

# Fetch the object from S3
zip_features_file_obj = s3_client.get_object(Key=key, Bucket=bucket)

# The file is relatively small (< 1 Mb), so copy its whole body into an
# in-memory binary stream and rewind it for reading
zip_features_buf = io.BytesIO()
zip_features_buf.write(zip_features_file_obj['Body'].read())
zip_features_buf.seek(0)

# Parse the CSV into a DataFrame (header row supplies the column names)
zip_features_df = pd.read_csv(zip_features_buf, encoding='utf-8')

# Preview the first rows
zip_features_df.head()

Unnamed: 0,zip_code,cluster,division
0,1001,3,0
1,1002,4,0
2,1005,14,0
3,1007,14,0
4,1008,14,0


#### Helper functions

In [None]:
def get_zip_code_features(zip_features_df, zip_code, num_clusters=None, num_divisions=None):
    """One-hot encode the cluster and division of a zip code.

    Args:
        zip_features_df: DataFrame with 'zip_code', 'cluster' and 'division' columns.
            The lookup compares ``zip_code`` as an int, so the 'zip_code' column is
            assumed to be numeric -- TODO confirm for other data files.
        zip_code: Zip code to look up; anything accepted by ``int()``.
        num_clusters: Number of cluster classes. Defaults to the notebook-level
            ``num_cluster_classes``.
        num_divisions: Number of division classes. Defaults to the notebook-level
            ``num_division_classes``.

    Returns:
        1-D integer array of length ``num_clusters + num_divisions``: the cluster
        one-hot followed by the division one-hot. All zeros when the zip code has
        no matching row. If several rows match, the first one is used.

    Raises:
        ValueError, TypeError: If ``zip_code`` cannot be converted by ``int()``.
        IndexError: If the matched cluster or division lies outside its class range.
    """
    # Resolve the class counts at call time so the notebook globals stay the default
    if num_clusters is None:
        num_clusters = num_cluster_classes
    if num_divisions is None:
        num_divisions = num_division_classes

    # All-zero encoding, returned as-is when no match is found
    result = np.zeros(num_clusters + num_divisions, dtype=int)

    # Get the zip code features using a Dataframe query
    zip_features = zip_features_df[zip_features_df.zip_code == int(zip_code)]

    # Defensive coding: unknown zip code
    if len(zip_features) == 0:
        return result

    # Get matching feature values (first matching row)
    cluster = int(zip_features['cluster'].values[0])
    division = int(zip_features['division'].values[0])

    # Reject out-of-range values explicitly; a negative value would otherwise be
    # accepted by array indexing and silently encode the wrong class.
    if not 0 <= cluster < num_clusters:
        raise IndexError('cluster {} is outside the range [0, {})'.format(cluster, num_clusters))
    if not 0 <= division < num_divisions:
        raise IndexError('division {} is outside the range [0, {})'.format(division, num_divisions))

    # One-hot encode in place: cluster slots first, division slots after them
    result[cluster] = 1
    result[num_clusters + division] = 1

    # Return
    return result

In [None]:
def predict(sample, predictor=None, delay=0.5):
    """Send one sample to the inference endpoint and return the decoded response.

    Args:
        sample: Request payload (the notebook passes a comma-delimited feature
            string). An empty sample is skipped.
        predictor: Optional object exposing ``predict(data=...)``. When omitted, a
            ``Predictor`` for ``endpoint_name`` with a ``CSVSerializer`` is created.
        delay: Seconds to pause after the request (``time.sleep`` takes seconds).

    Returns:
        The response decoded as UTF-8 text, or ``None`` when ``sample`` is empty.
    """
    # Defensive coding: nothing to send
    if len(sample) == 0:
        return None

    # Resolve the predictor lazily. A default of `predictor=predictor` would be
    # evaluated when the function is defined, before any predictor exists.
    if predictor is None:
        predictor = Predictor(endpoint_name=endpoint_name,
                              sagemaker_session=None,
                              serializer=CSVSerializer())

    # Invoke the model's inference endpoint
    response = predictor.predict(data=sample)

    # Decode bytes to string (a caller-supplied predictor may already return text)
    if isinstance(response, (bytes, bytearray)):
        response = response.decode('utf-8')

    # Suspend execution for `delay` seconds between requests
    time.sleep(delay)

    # Return
    return response

#### Create baseline training dataset

In [None]:
# Column labels for the one-hot encoded zip features: zip_feature_0 - zip_feature_24
encoded_zip_feature_cols = np.array([f'zip_feature_{i}' for i in range(num_encoded_zip_classes)])

# One encoded feature vector per buyer, looked up from the buyer's zip code
encoded_zip_features = [
    get_zip_code_features(zip_features_df, zipcode)
    for zipcode in basic_zip_df['zip_features']
]

# Wrap the encoded vectors in a DataFrame, one labelled column per class
encoded_zip_features_df = pd.DataFrame(data=encoded_zip_features, columns=encoded_zip_feature_cols)

# Place the first three home-buyer columns side by side with the encoded zip features
df = pd.concat(
    [basic_zip_df.loc[:, ['gender', 'age_range', 'household_income_range']], encoded_zip_features_df],
    axis='columns',
)

# Insert a placeholder 'target' column as the first column.
# NOTE: without it the monitoring job reports an "extra_column_check" violation.
# No documentation was found explaining why a DataQuality monitoring job needs the
# additional column (it would make sense for a ModelQuality monitoring job) -- to be
# followed up with the SageMaker Model Monitor developers. Seemingly, an error should
# be raised when the baseline has fewer or more columns than the inference data,
# not when the column counts are equal.
df.insert(0, 'target', 0.00)

In [None]:
# The dataset is relatively small, so serialize it to CSV entirely in memory
# (header row included, index column omitted)
data_stream = io.StringIO()
df.to_csv(path_or_buf=data_stream, index=False, sep=',', encoding='utf-8')

# Pull the CSV text back out of the buffer
train_csv = data_stream.getvalue()

In [None]:
# Write the training CSV to <prefix>/<training_data_file> in the S3 bucket
key = f'{prefix}/{training_data_file}'
s3_client.put_object(Bucket=bucket,
                     Key=key,
                     Body=train_csv,
                     ContentType='text/csv')

#### Preprocess data for inference endpoint

In [None]:
# Drop the placeholder 'target' column for inference. Dropping by name (rather than
# slicing off the first column) keeps this cell idempotent: re-running it does not
# strip a real feature column.
df = df.drop(columns=['target'], errors='ignore')

# Convert the buyer feature rows to a list of lists
samples = df.values.tolist()

# Convert the known zip codes to a list
zip_samples = zip_features_df['zip_code'].values.tolist()

#### Generate baseline data to trigger 'No Issues'

In [None]:
# Number of training rows replayed as sample inference data
batch_size = 10

# Invoke the real-time inference endpoint with the first `batch_size` baseline rows
for index, sample in enumerate(samples[:batch_size]):

    # Render the row as "v1, v2, ..." -- the text of str(list) without its brackets
    sample = ', '.join(repr(value) for value in sample)

    # Get inference response
    response = predict(sample)

    # Display the model's prediction probability
    print('Sample {0} >> Input: {1}: >> Prediction: {2}'.format(index, sample, response))
    

Sample 0 >> Input: 0, 5, 10, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0: >> Prediction: 0.4805711507797241
Sample 1 >> Input: 1, 7, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0: >> Prediction: 0.46936097741127014
Sample 2 >> Input: 0, 4, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0: >> Prediction: 0.5250639915466309
Sample 3 >> Input: 1, 7, 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0: >> Prediction: 0.46936097741127014
Sample 4 >> Input: 0, 3, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0: >> Prediction: 0.5310230255126953
Sample 5 >> Input: 1, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0: >> Prediction: 0.46936097741127014
Sample 6 >> Input: 1, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0: >> Prediction: 0.49675580859184265
Sample 7 >> Input: 1, 5, 8, 0, 0, 

#### Generate data to induce data quality constraint violations

In [None]:
# Number of samples generated for the inference endpoint
batch_size = 10

# Inclusive min/max ranges for 'gender', 'age_range', 'household_income_range'
# (random.randint includes both end points)
gender_min, gender_max = 0, 1
age_range_min, age_range_max = 1, 6
household_income_range_min, household_income_range_max = 0, 24

# Let's create a 'data_type_check' data quality constraint violation by sending negative fractional values
# instead of the expected positive integer values
noise_factor = -0.5

# Invoke real-time inference endpoint
for index in range(batch_size):

    # Assign random values to each feature; multiplying by the noise factor turns
    # the integers into non-positive floats
    gender = random.randint(gender_min, gender_max) * noise_factor
    age_range = random.randint(age_range_min, age_range_max) * noise_factor
    household_income_range = random.randint(household_income_range_min, household_income_range_max) * noise_factor

    # One-hot encode a randomly chosen zip code. random.choice picks one element
    # directly instead of shuffling the whole shared list on every iteration.
    zip_features = get_zip_code_features(zip_features_df, random.choice(zip_samples)) # * noise_factor
    zip_features = ",".join(zip_features.astype(str))

    # Format request data as comma-delimited string
    sample = f'{gender},{age_range},{household_income_range},{zip_features}'

    # Get inference response
    response = predict(sample)

    # Display the model's prediction probability
    print('Sample {0} >> Input: {1}: >> Prediction: {2}'.format(index, sample, response))

Sample 0 >> Input: -0.0,-0.5,-5.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0: >> Prediction: 0.5188708305358887
Sample 1 >> Input: -0.0,-1.0,-8.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0: >> Prediction: 0.5188708305358887
Sample 2 >> Input: -0.0,-2.0,-9.5,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0: >> Prediction: 0.5188708305358887
Sample 3 >> Input: -0.0,-1.0,-2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0: >> Prediction: 0.5188708305358887
Sample 4 >> Input: -0.5,-3.0,-0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0: >> Prediction: 0.5188708305358887
Sample 5 >> Input: -0.5,-2.5,-2.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0: >> Prediction: 0.5188708305358887
Sample 6 >> Input: -0.0,-0.5,-3.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1: >> Prediction: 0.5188708305358887
Sample 7 >> Input: -0.5,-1.0,-7.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1: >> Prediction: 0.5188708305358887
Sample 8 >> Input: -0.0,-1.5,-9.5,0,0,0,0,0,0,0,

In [None]:
# Manually induce a completeness_check constraint violation: each sample below
# leaves exactly one of the first three fields empty (first, second and third
# field respectively).
completeness_check_gender = ',5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0'
completeness_check_age = '0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0'
completeness_check_hir = '0,1,,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0'

# Choose the sample to send: uncomment one assignment and run the cell once per feature
# sample = completeness_check_gender
# sample = completeness_check_age
sample = completeness_check_hir

# Send the incomplete sample to the endpoint
response = predict(sample)

# Display the model's prediction probability
print('Sample >> Input: {0}: >> Prediction: {1}'.format(sample, response))

#### Monitoring Schedule management

In [None]:
# !aws sagemaker describe-monitoring-schedule --monitoring-schedule-name 'third-party-model-data-quality-schedule'
# !aws sagemaker list-monitoring-executions --monitoring-schedule-name 'third-party-model-data-quality-schedule'

#### Resource Cleanup

In [None]:
# Step 1.
# print('Stopping monitoring schedule...')
# !aws sagemaker stop-monitoring-schedule --monitoring-schedule-name 'third-party-model-data-quality-schedule'
# time.sleep(30) # allow time for processing
# !aws sagemaker list-monitoring-schedules --endpoint-name 'third-party-model-endpoint'

# Step 2.
# print('Deleting monitoring schedule...')
# !aws sagemaker delete-monitoring-schedule --monitoring-schedule-name 'third-party-model-data-quality-schedule'
# time.sleep(30) # allow time for processing
# !aws sagemaker list-monitoring-schedules --endpoint-name 'third-party-model-endpoint'

# Step 3.
# print('Deleting model endpoint...')
# !aws sagemaker delete-endpoint --endpoint-name 'third-party-model-endpoint'
# time.sleep(30) # allow time for processing
# !aws sagemaker list-endpoints --name-contains 'third-party-model-endpoint'

# Step 4.
# print('Deleting model endpoint config...')
# !aws sagemaker delete-endpoint-config --endpoint-config-name 'third-party-model-endpoint-config'
# time.sleep(30) # allow time for processing
# !aws sagemaker list-endpoint-configs --name-contains 'third-party-model-endpoint-config'

# Step 5.
# print('Deleting model...')
# !aws sagemaker delete-model --model-name 'third-party-model'
# time.sleep(30) # allow time for processing
# !aws sagemaker list-models --name-contains 'third-party-model'