In [2]:
import numpy as np
import pandas as pd

# AWS SDK (boto3) and SageMaker SDK; get_execution_role is used later to obtain the IAM role
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html

## Upload Data to S3

In [4]:
# Bucket and object keys for the three telco churn CSV files.
bucket_name = 'arch-ml-bucket-us-east-1'
training_file_key = 'telco_customer_churn/telco_train.csv'
validation_file_key = 'telco_customer_churn/telco_validation.csv'
test_file_key = 'telco_customer_churn/telco_test.csv'

# Full s3:// URIs: the prefix for model artifacts, then one URI per data file.
s3_model_output_location = f's3://{bucket_name}/telco_customer_churn/model'
s3_training_file_location = f's3://{bucket_name}/{training_file_key}'
s3_validation_file_location = f's3://{bucket_name}/{validation_file_key}'
s3_test_file_location = f's3://{bucket_name}/{test_file_key}'

In [5]:
# Echo the resolved S3 URIs, one per line, as a sanity check.
print(
    s3_model_output_location,
    s3_training_file_location,
    s3_validation_file_location,
    s3_test_file_location,
    sep='\n',
)

s3://arch-ml-bucket-us-east-1/telco_customer_churn/model
s3://arch-ml-bucket-us-east-1/telco_customer_churn/telco_train.csv
s3://arch-ml-bucket-us-east-1/telco_customer_churn/telco_validation.csv
s3://arch-ml-bucket-us-east-1/telco_customer_churn/telco_test.csv


In [6]:
# Writing to and reading from S3 is straightforward.
# Files are referred to as objects in S3.
# A file's name is referred to as its key in S3.
# Files stored in S3 are automatically replicated across 3 different availability zones 
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    """Upload a local file to S3.

    Args:
        filename: Path of the local file to upload; it is opened in binary mode.
        bucket: Name of the destination S3 bucket.
        key: Object key to store the file under.

    Returns:
        Whatever boto3's ``upload_fileobj`` returns for the upload.
    """
    with open(filename, 'rb') as local_file:
        # A fresh boto3 session is created for every call.
        s3 = boto3.Session().resource('s3')
        target = s3.Bucket(bucket).Object(key)
        return target.upload_fileobj(local_file)

In [7]:
# Upload each local CSV to its object key in the bucket: train, then validation, then test.
for local_csv, object_key in (
    ('telco_train.csv', training_file_key),
    ('telco_validation.csv', validation_file_key),
    ('telco_test.csv', test_file_key),
):
    write_to_s3(local_csv, bucket_name, object_key)

## Get Training Algorithm Docker Image

In [11]:
# ECR registry paths of the XGBoost image provided by SageMaker, keyed by AWS region.
# Only the regions listed here can be used; see the reference for others:
#  https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
containers = {
    'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
    'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
    'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
    'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest',
}

In [8]:
# Resolve the execution role; it is handed to the Estimator when the training job is configured.
role = get_execution_role()

In [9]:
# This role carries the permissions needed to train and deploy models;
# the SageMaker service is trusted to assume it.
# NOTE(review): the printed ARN includes the AWS account ID — clear this output before sharing the notebook.
print(role)

arn:aws:iam::497576017422:role/service-role/AmazonSageMaker-ExecutionRole-20190426T115051


## Build Model

In [10]:
# SageMaker session object; passed to the Estimator as `sagemaker_session`.
sess = sagemaker.Session()

In [12]:
# Configure the training job:
#   * the container image is looked up by the region of the default boto3 session
#     (a region missing from `containers` raises KeyError)
#   * one ml.m4.xlarge instance is used for training
#   * trained model artifacts are stored under s3_model_output_location
#   * base_job_name becomes the prefix of the generated training-job name
#   Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html
estimator = sagemaker.estimator.Estimator(
    containers[boto3.Session().region_name],
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name='xgboost-telco-v1',
)

In [13]:
# Specify hyperparameters that are appropriate for the training algorithm.
# XGBoost Training Parameter Reference:
#   https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

# Same values as those used in local mode.
estimator.set_hyperparameters(
    max_depth=3,
    objective="binary:logistic",
    eta=0.3,
    subsample=1,
    num_round=100,
)

In [14]:
# Display the hyperparameters currently set on the estimator to confirm the values above.
estimator.hyperparameters()

{'max_depth': 3,
 'objective': 'binary:logistic',
 'eta': 0.3,
 'subsample': 1,
 'num_round': 100}

## Train

In [15]:
# XGBoost accepts libsvm or csv content; both channels here point at CSV objects in S3.
training_input_config = sagemaker.session.s3_input(
    s3_data=s3_training_file_location,
    content_type="csv",
)
validation_input_config = sagemaker.session.s3_input(
    s3_data=s3_validation_file_location,
    content_type="csv",
)

In [16]:
# Show the channel configuration dictionaries that will be sent with the training request.
print(
    training_input_config.config,
    validation_input_config.config,
    sep='\n',
)

{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': 's3://arch-ml-bucket-us-east-1/telco_customer_churn/telco_train.csv'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': 's3://arch-ml-bucket-us-east-1/telco_customer_churn/telco_validation.csv'}}, 'ContentType': 'csv'}


In [17]:
# XGBoost supports the "train" and "validation" channels; one input config is supplied for each.
# Reference: Supported channels by algorithm
#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
# The training job's log is streamed into the cell output through to job completion.
estimator.fit({'train':training_input_config, 'validation':validation_input_config})

INFO:sagemaker:Creating training-job with name: xgboost-telco-v1-2019-04-29-10-47-37-280


2019-04-29 10:47:37 Starting - Starting the training job...
2019-04-29 10:47:39 Starting - Launching requested ML instances......
2019-04-29 10:48:49 Starting - Preparing the instances for training......
2019-04-29 10:50:05 Downloading - Downloading input data...
2019-04-29 10:50:37 Training - Training image download completed. Training in progress.
2019-04-29 10:50:37 Uploading - Uploading generated training model.
[31mArguments: train[0m
[31m[2019-04-29:10:50:36:INFO] Running standalone xgboost training.[0m
[31m[2019-04-29:10:50:36:INFO] File size need to be processed in the node: 0.43mb. Available memory size in the node: 8423.55mb[0m
[31m[2019-04-29:10:50:36:INFO] Determined delimiter of CSV input is ','[0m
[31m[10:50:36] S3DistributionType set as FullyReplicated[0m
[31m[10:50:36] 3451x40 matrix with 138040 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-04-29:10:50:36:INFO] Determined delimiter of CSV input is ','[0m
[


2019-04-29 10:50:44 Completed - Training job completed
Billable seconds: 40


## Deploy the model

In [18]:
# Host the trained model behind an endpoint named 'xgboost-telco-v1' on one ml.t2.medium instance.
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
# NOTE(review): no cleanup cell is visible in this section — presumably the endpoint keeps
# running until it is deleted explicitly; confirm and add a teardown step if needed.
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.t2.medium',
                             endpoint_name = 'xgboost-telco-v1')

INFO:sagemaker:Creating model with name: xgboost-2019-04-29-10-51-33-711
INFO:sagemaker:Creating endpoint with name xgboost-telco-v1


----------------------------------------------------------------------------------------!

## Run Prediction

In [19]:
# Attach a predictor to the endpoint by name. This rebinds `predictor`, so the
# prediction cells do not depend on the object returned by deploy().
endpoint_name = 'xgboost-telco-v1'
predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)

In [20]:
# NOTE(review): this import sits mid-notebook rather than in the top import cell, and
# json_deserializer is not used in the visible cells.
from sagemaker.predictor import csv_serializer, json_deserializer

# Send requests as text/csv, serialized by csv_serializer.
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
# No deserializer: predict() hands back the raw response bytes (e.g. b'0.351302921772').
predictor.deserializer = None

In [3]:
# Load the test split from the local CSV and preview the first rows.
# NOTE(review): this cell's execution count (In [3]) is out of order with its position;
# it must be run before the prediction cells below on a fresh kernel.
df_test = pd.read_csv('telco_test.csv')
df_test.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,1,5,0,0,34.25,163.55,0,...,1,0,0,1,0,0,0,0,1,0
1,1,0,0,1,70,1,0,64.95,4551.5,0,...,1,0,0,0,1,0,1,0,0,0
2,1,0,0,0,4,1,1,74.8,321.9,1,...,1,0,0,1,0,0,1,0,0,0
3,1,0,0,0,19,1,1,84.75,1651.95,0,...,1,0,0,1,0,0,0,0,1,0
4,0,0,1,1,72,1,1,115.5,8425.15,0,...,0,0,1,0,0,1,0,0,1,0


In [26]:
# Render the first test row as one comma-separated string of its values.
# Presumably this mirrors the text/csv payload sent to the endpoint — TODO confirm.
arr = df_test.iloc[0].values
','.join(arr.astype('str'))

'0.0,0.0,1.0,1.0,5.0,0.0,0.0,34.25,163.55,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0'

In [43]:
# Inspect the first test row together with its column labels.
df_test.iloc[0]

gender                                       0.00
SeniorCitizen                                0.00
Partner                                      1.00
Dependents                                   1.00
tenure                                       5.00
PhoneService                                 0.00
PaperlessBilling                             0.00
MonthlyCharges                              34.25
TotalCharges                               163.55
MultipleLines_No                             0.00
MultipleLines_No phone service               1.00
MultipleLines_Yes                            0.00
InternetService_DSL                          1.00
InternetService_Fiber optic                  0.00
InternetService_No                           0.00
OnlineSecurity_No                            1.00
OnlineSecurity_No internet service           0.00
OnlineSecurity_Yes                           0.00
OnlineBackup_No                              0.00
OnlineBackup_No internet service             0.00


In [27]:
# List the column names of the test frame.
df_test.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges',
       'MultipleLines_No', 'MultipleLines_No phone service',
       'MultipleLines_Yes', 'InternetService_DSL',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No', 'OnlineSecurity_No internet service',
       'OnlineSecurity_Yes', 'OnlineBackup_No',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No', 'DeviceProtection_No internet service',
       'DeviceProtection_Yes', 'TechSupport_No',
       'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit 

In [22]:
# Load the Churn labels for the test rows from a separate local CSV and preview them.
df_test_truth = pd.read_csv('telco_test_truth.csv')
df_test_truth.head()

Unnamed: 0,Churn
0,0
1,0
2,1
3,0
4,0


In [23]:
# Score the first test row against the endpoint. The result is raw bytes because no
# deserializer is set; presumably a churn probability given objective binary:logistic — TODO confirm.
predictor.predict(df_test.iloc[0])

b'0.351302921772'

In [24]:
# Actual Churn label for the same row, for comparison with the prediction.
df_test_truth.iloc[0]

Churn    0
Name: 0, dtype: int64