In [1]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   

# Define IAM role
role = get_execution_role()
prefix = 'sagemaker/DEMO-xgboost-dm'
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'} # each region has its XGBoost container
my_region = boto3.session.Session().region_name # set the region of the instance
print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " 
      + containers[my_region] + " container for your SageMaker endpoint.")

Success - the MySageMakerInstance is in the us-east-2 region. You will use the 825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [7]:
!pip install s3fs

[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [15]:
train = pd.read_csv('s3://aws-trends-sample-s3/training/x_cv.csv')
train.name = 'train'

In [29]:
train_target = pd.read_csv('s3://aws-trends-sample-s3/training/y_cv.csv')

In [20]:
train.head()

Unnamed: 0,buying__high,buying__low,buying__med,buying__vhigh,maint__high,maint__low,maint__med,maint__vhigh,doors__2,doors__3,...,doors__5more,persons__2,persons__4,persons__more,lug_boot__big,lug_boot__med,lug_boot__small,safety__high,safety__low,safety__med
0,0,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,...,1,0,1,0,0,1,0,0,0,1
2,0,0,0,1,0,0,1,0,0,0,...,1,1,0,0,0,0,1,1,0,0
3,0,0,1,0,0,0,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0
4,0,1,0,0,0,1,0,0,0,1,...,0,0,1,0,0,1,0,0,1,0


In [147]:
test_u = pd.read_csv('s3://aws-trends-sample-s3/testing/x_hold.csv')
test_u.name = 'test_u'
test_m = pd.read_csv('s3://aws-trends-sample-s3/testing/x_hold_modified.csv')
test_m.name = 'test_m'

In [27]:
test_target = pd.read_csv('s3://aws-trends-sample-s3/testing/y_hold.csv')

In [212]:
psi_columns = list(train.columns)
psi_columns = psi_columns  + ['test_dataset']
psi_data = pd.DataFrame(columns = psi_columns)

In [187]:
def psi_calculate_onehot(train,test):
    tracking = []
    for i in train.columns:
        col_train = train[str(i)].value_counts(normalize=True)
        col_test = test[str(i)].value_counts(normalize=True)
        psi = 0
        for j in list(col_train.index.sort_values()):
            try:
                col_test[j]
            except KeyError:
                col_test[j] = 0.0000001
            psi += ((col_test[j] - col_train[j])*math.log(col_test[j]/col_train[j]))
        if psi > 0.25:
            print(i+': Has a very different distribution')
        tracking.append(psi)
    tracking = tracking + [str(test.name)]
    return tracking

In [190]:
bucket_name = 'aws-trends-sample-s3'
prefix = 'sagemaker/DEMO-xgboost-dm'

sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(containers[my_region],role, train_instance_count=1, train_instance_type='ml.m4.xlarge',output_path='s3://{}/{}/output'.format(bucket_name, prefix),sagemaker_session=sess)
xgb.set_hyperparameters(max_depth=5,eta=0.2,gamma=4,min_child_weight=6,subsample=0.8,silent=0,objective= 'multi:softmax',num_class=4,num_round=100)

In [191]:
pd.concat([train_target['decision'], train], axis=1).to_csv('train.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [192]:
xgb.fit({'train': s3_input_train})

2019-12-08 20:45:45 Starting - Starting the training job...
2019-12-08 20:45:46 Starting - Launching requested ML instances...
2019-12-08 20:46:40 Starting - Preparing the instances for training......
2019-12-08 20:47:37 Downloading - Downloading input data...
2019-12-08 20:47:59 Training - Downloading the training image.[34mArguments: train[0m
[34m[2019-12-08:20:48:19:INFO] Running standalone xgboost training.[0m
[34m[2019-12-08:20:48:19:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2019-12-08:20:48:19:INFO] File size need to be processed in the node: 0.06mb. Available memory size in the node: 8528.13mb[0m
[34m[2019-12-08:20:48:19:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:48:19] S3DistributionType set as FullyReplicated[0m
[34m[20:48:19] 1382x21 matrix with 29022 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[20:48:19] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 16 extra nodes, 0


2019-12-08 20:48:31 Uploading - Uploading generated training model
2019-12-08 20:48:31 Completed - Training job completed
Training seconds: 54
Billable seconds: 54


In [194]:
xgb_predictor = xgb.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

-------------------------------------------------------------------------------------!

In [211]:
accuracy = list()

In [209]:
def classification_report(test_target,predictions_array):
    cm = pd.crosstab(index=test_target['decision'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
    tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
    print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
    print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
    print("Observed")
    print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
    print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))
    return p

In [213]:
test_data_array = test_u.values #load the data into an array
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array

accuracy.append(['test_u',classification_report(test_target,predictions_array)])

psi_data = psi_data.append(pd.Series(psi_calculate_onehot(train,test_u), index=psi_columns), ignore_index=True)


Overall Classification Rate: 95.6%

Predicted      No Purchase    Purchase
Observed
No Purchase    98% (250)    16% (10)
Purchase        2% (4)     84% (53) 



In [197]:
test_data_array = test_m.values #load the data into an array
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array

classification_report(test_target,predictions_array)

accuracy.append(['test_m',p])

psi_data = psi_data.append(pd.Series(psi_calculate_onehot(train,test_m), index=psi_columns), ignore_index=True)


Overall Classification Rate: 69.0%

Predicted      No Purchase    Purchase
Observed
No Purchase    80% (216)    92% (44)
Purchase        20% (55)      8% (4) 



In [176]:
test_d1 = pd.read_csv('s3://aws-trends-sample-s3/testing/x_hold - D1.csv')
test_d1.name = 'test_d1'

In [177]:
test_d2 = pd.read_csv('s3://aws-trends-sample-s3/testing/x_hold - D2.csv')
test_d3 = pd.read_csv('s3://aws-trends-sample-s3/testing/x_hold - D3.csv')
test_d4 = pd.read_csv('s3://aws-trends-sample-s3/testing/x_hold - D4.csv')
test_d5 = pd.read_csv('s3://aws-trends-sample-s3/testing/x_hold - D5.csv')
test_d2.name = 'test_d2'
test_d3.name = 'test_d3'
test_d4.name = 'test_d4'
test_d5.name = 'test_d5'

In [220]:
test_data_array = test_d1.values #load the data into an array
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array

accuracy.append(['test_d1',classification_report(test_target,predictions_array)])

psi_data = psi_data.append(pd.Series(psi_calculate_onehot(train,test_d1), index=psi_columns), ignore_index=True)


Overall Classification Rate: 57.9%

Predicted      No Purchase    Purchase
Observed
No Purchase    82% (147)    81% (89)
Purchase        18% (33)     19% (21) 



In [215]:
test_data_array = test_d2.values #load the data into an array
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array

accuracy.append(['test_d2',classification_report(test_target,predictions_array)])

psi_data = psi_data.append(pd.Series(psi_calculate_onehot(train,test_d2), index=psi_columns), ignore_index=True)


Overall Classification Rate: 56.2%

Predicted      No Purchase    Purchase
Observed
No Purchase    83% (140)    81% (99)
Purchase        17% (28)     19% (23) 



In [216]:
test_data_array = test_d3.values #load the data into an array
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array

accuracy.append(['test_d3',classification_report(test_target,predictions_array)])

psi_data = psi_data.append(pd.Series(psi_calculate_onehot(train,test_d3), index=psi_columns), ignore_index=True)


Overall Classification Rate: 66.9%

Predicted      No Purchase    Purchase
Observed
No Purchase    82% (182)    79% (59)
Purchase        18% (39)     21% (16) 



In [217]:
test_data_array = test_d4.values #load the data into an array
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array

accuracy.append(['test_d4',classification_report(test_target,predictions_array)])

psi_data = psi_data.append(pd.Series(psi_calculate_onehot(train,test_d4), index=psi_columns), ignore_index=True)


Overall Classification Rate: 65.7%

Predicted      No Purchase    Purchase
Observed
No Purchase    81% (181)    79% (61)
Purchase        19% (42)     21% (16) 



In [218]:
test_data_array = test_d5.values #load the data into an array
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array

accuracy.append(['test_d5',classification_report(test_target,predictions_array)])

psi_data = psi_data.append(pd.Series(psi_calculate_onehot(train,test_d5), index=psi_columns), ignore_index=True)


Overall Classification Rate: 63.4%

Predicted      No Purchase    Purchase
Observed
No Purchase    80% (179)    86% (64)
Purchase        20% (45)     14% (10) 



In [221]:
accuracy

[['test_u', 95.58359621451105],
 ['test_d2', 56.20689655172414],
 ['test_d3', 66.8918918918919],
 ['test_d4', 65.66666666666666],
 ['test_d5', 63.42281879194631],
 ['test_d1', 57.931034482758626]]

In [222]:
psi_data

Unnamed: 0,buying__high,buying__low,buying__med,buying__vhigh,maint__high,maint__low,maint__med,maint__vhigh,doors__2,doors__3,...,persons__2,persons__4,persons__more,lug_boot__big,lug_boot__med,lug_boot__small,safety__high,safety__low,safety__med,test_dataset
0,0.005167,0.001393,1.7e-05,0.001393,0.000432,0.000439,1.7e-05,1.7e-05,0.004009,0.003839,...,0.004357,0.027883,0.009256,2.6e-05,0.001271,0.001687,0.018243,0.014107,0.000163,test_u
1,0.005167,0.001393,1.7e-05,0.001393,0.000432,0.000439,1.7e-05,1.7e-05,0.004009,0.003839,...,0.004357,0.027883,0.009256,2.6e-05,0.001271,0.001687,0.375349,1.069595,0.000163,test_d2
2,0.005167,0.001393,1.7e-05,0.001393,0.000432,0.000439,1.7e-05,1.7e-05,0.004009,0.003839,...,0.004357,0.027883,0.009256,2.6e-05,0.001271,0.001687,0.286071,0.014107,1.202105,test_d3
3,0.005167,0.001393,0.267752,0.721593,0.000432,0.000439,1.7e-05,1.7e-05,0.004009,0.003839,...,0.004357,0.027883,0.009256,2.6e-05,0.001271,0.001687,0.018243,0.014107,0.000163,test_d4
4,0.005167,0.001393,1.7e-05,0.001393,0.000432,0.229788,1.7e-05,0.800421,0.004009,0.003839,...,0.004357,0.027883,0.009256,2.6e-05,0.001271,0.001687,0.018243,0.014107,0.000163,test_d5
5,0.005167,0.001393,1.7e-05,0.001393,0.000432,0.000439,1.7e-05,1.7e-05,0.004009,0.003839,...,0.969153,0.027883,0.622273,2.6e-05,0.001271,0.001687,0.018243,0.014107,0.000163,test_d1


In [223]:
pd.DataFrame(accuracy,columns=['data','accuracy']).to_csv('accuracy.csv')
key = "accuracy"
url = 's3://{}/{}'.format(bucket_name, key)
boto3.Session().resource('s3').Bucket(bucket_name).Object(key).upload_file('accuracy.csv')
print('Done writing to {}'.format(url))

Done writing to s3://aws-trends-sample-s3/accuracy


In [224]:
psi_data.to_csv('psi.csv')
key = "psi"
url = 's3://{}/{}'.format(bucket_name, key)
boto3.Session().resource('s3').Bucket(bucket_name).Object(key).upload_file('psi.csv')
print('Done writing to {}'.format(url))

Done writing to s3://aws-trends-sample-s3/psi


In [225]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': '5F62E5BB4F1D0454',
   'HostId': '0EPEhYR+3+rqy10dhKk7VA1NadJAbRMb7bX0kBt+ddfhyMOsE+uK8TO7E2aEpXdc+5YK44j39a0=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': '0EPEhYR+3+rqy10dhKk7VA1NadJAbRMb7bX0kBt+ddfhyMOsE+uK8TO7E2aEpXdc+5YK44j39a0=',
    'x-amz-request-id': '5F62E5BB4F1D0454',
    'date': 'Sun, 08 Dec 2019 21:20:33 GMT',
    'connection': 'close',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'sagemaker/DEMO-xgboost-dm/train/train.csv'},
   {'Key': 'accuracy'},
   {'Key': 'psi'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2019-12-08-20-45-45-356/output/model.tar.gz'}]}]