In [1]:
import sagemaker
from sagemaker import get_execution_role

import boto3

# Session-wide configuration shared by the training and batch-transform cells.
sess = sagemaker.Session()
role = get_execution_role()
bucket = "mastering-ml-aws"
prefix = "chapter4/sagemaker"

# Resolve the built-in XGBoost image for the region this session runs in
# instead of a hard-coded 'us-east-1': the training job and its algorithm image
# have to live in the same region, so a fixed value breaks the notebook
# everywhere else.
container = sagemaker.amazon.amazon_estimator.get_image_uri(sess.boto_region_name, "xgboost", "latest")

# Passed as `output_path` to both the estimator and the batch transformer.
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)


In [2]:
# Training-job description: a single ml.c4.4xlarge instance, input delivered in
# 'File' mode, model artifact written under s3_output_location.
training_job_settings = dict(
    train_instance_count=1,
    train_instance_type='ml.c4.4xlarge',
    train_volume_size=30,
    train_max_run=360000,
    input_mode='File',
    output_path=s3_output_location,
    sagemaker_session=sess,
)

sagemaker_model = sagemaker.estimator.Estimator(container, role, **training_job_settings)


In [19]:
# S3 prefixes of the chapter's CSV datasets.
# Fed to the training job as the 'train' channel.
s3_train_data = 's3://mastering-ml-aws/chapter4/training-vector-csv/'
# Fed to the training job as the 'validation' channel.
s3_validation_data = 's3://mastering-ml-aws/chapter4/test-vector-csv/'
# Input of the batch-transform job (per its name, the test vectors without the label).
s3_test_data = 's3://mastering-ml-aws/chapter4/test-vector-csv-no-label/'


In [8]:
# XGBoost settings for a binary classifier that outputs probabilities.
xgb_hyperparameters = dict(
    objective='binary:logistic',
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.7,
    silent=0,
    num_round=50,
)
sagemaker_model.set_hyperparameters(**xgb_hyperparameters)

# Both channels are CSV data under an S3 prefix, fully replicated to the
# training instance, so they share the same channel options.
csv_channel_options = dict(
    distribution='FullyReplicated',
    content_type='text/csv',
    s3_data_type='S3Prefix',
)
train_data = sagemaker.session.s3_input(s3_train_data, **csv_channel_options)
validation_data = sagemaker.session.s3_input(s3_validation_data, **csv_channel_options)
data_channels = dict(train=train_data, validation=validation_data)

# Launch the training job and stream its logs into the cell output.
sagemaker_model.fit(inputs=data_channels, logs=True)


INFO:sagemaker:Creating training-job with name: xgboost-2019-02-09-13-52-55-230


2019-02-09 13:52:55 Starting - Starting the training job...
2019-02-09 13:52:56 Starting - Launching requested ML instances......
2019-02-09 13:54:03 Starting - Preparing the instances for training......
2019-02-09 13:55:24 Downloading - Downloading input data
2019-02-09 13:55:24 Training - Downloading the training image..
[31mArguments: train[0m
[31m[2019-02-09:13:55:29:INFO] Running standalone xgboost training.[0m
[31m[2019-02-09:13:55:29:INFO] File size need to be processed in the node: 1550.03mb. Available memory size in the node: 22264.79mb[0m
[31m[2019-02-09:13:55:29:INFO] Determined delimiter of CSV input is ','[0m
[31m[13:55:29] S3DistributionType set as FullyReplicated[0m
[31m[13:55:36] 3233720x100 matrix with 323372000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-02-09:13:55:36:INFO] Determined delimiter of CSV input is ','[0m
[31m[13:55:36] S3DistributionType set as FullyReplicated[0m
[31m[13:55:38] 809374x1

[31m[13:56:46] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[43]#011train-error:0.169796#011validation-error:0.169888[0m
[31m[13:56:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[44]#011train-error:0.169796#011validation-error:0.169888[0m
[31m[13:56:49] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 2 pruned nodes, max_depth=5[0m
[31m[45]#011train-error:0.169796#011validation-error:0.169888[0m
[31m[13:56:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 2 pruned nodes, max_depth=5[0m
[31m[46]#011train-error:0.169796#011validation-error:0.169888[0m
[31m[13:56:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=5[0m
[31m[47]#011train-error:0.169796#011validation-error:0.169888[0m
[31m[13:56:54] src/tree/updater_prune.cc:74: tree pruning e

In [24]:
# Batch-score the CSV files under s3_test_data with the trained model; results
# are written under s3_output_location.
batch_job_settings = dict(
    instance_count=1,
    instance_type='ml.m4.2xlarge',
    output_path=s3_output_location,
    strategy='SingleRecord',
)
transformer = sagemaker_model.transformer(**batch_job_settings)

transformer.transform(s3_test_data, content_type='text/csv')
# Wait for the transform job before reading its output in the next cells.
transformer.wait()


INFO:sagemaker:Creating model with name: xgboost-2019-02-09-13-52-55-230
INFO:sagemaker:Creating transform job with name: xgboost-2019-02-09-14-55-57-076


............................................!


In [25]:
# Show the S3 location the transformer was configured to write its results to.
transformer.output_path


's3://mastering-ml-aws/chapter4/sagemaker/output'

In [33]:
!aws s3 ls s3://mastering-ml-aws/chapter4/sagemaker/output/ | head

                           PRE xgboost-2019-02-09-13-52-55-230/
2019-02-09 14:59:25          0 _SUCCESS.out
2019-02-09 14:59:27     121773 part-00000-cee24b45-7f40-4309-a808-8120c23d9088-c000.csv.out
2019-02-09 14:59:27     121772 part-00001-cee24b45-7f40-4309-a808-8120c23d9088-c000.csv.out
2019-02-09 14:59:27     121771 part-00002-cee24b45-7f40-4309-a808-8120c23d9088-c000.csv.out
2019-02-09 14:59:27     121772 part-00003-cee24b45-7f40-4309-a808-8120c23d9088-c000.csv.out
2019-02-09 14:59:27     121772 part-00004-cee24b45-7f40-4309-a808-8120c23d9088-c000.csv.out
2019-02-09 14:59:27     121772 part-00005-cee24b45-7f40-4309-a808-8120c23d9088-c000.csv.out
2019-02-09 14:59:27     121772 part-00006-cee24b45-7f40-4309-a808-8120c23d9088-c000.csv.out
2019-02-09 14:59:27     121771 part-00007-cee24b45-7f40-4309-a808-8120c23d9088-c000.csv.out

[Errno 32] Broken pipe
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'>
BrokenPipeError: [Errno 32] Broken 

In [8]:
import pandas as pd

# Read one part file of the batch-transform output. The file is parsed without
# a header row, and its column is named 'score' explicitly.
scores_part_uri = 's3://mastering-ml-aws/chapter4/sagemaker/output/part-00000-cee24b45-7f40-4309-a808-8120c23d9088-c000.csv.out'
scores_df = pd.read_csv(scores_part_uri, names=['score'], header=None)


In [42]:
# Preview the first five scores returned by the batch transform.
scores_df.head(5)

Unnamed: 0,score
0,0.12479
1,0.16786
2,0.16786
3,0.16786
4,0.16786


In [48]:
# Recursively list the SageMaker prefix and keep only keys containing "model",
# to find where the training job stored its model artifact.
!aws s3 ls --recursive s3://mastering-ml-aws/chapter4/sagemaker/ | grep model

2019-02-09 13:57:04      12982 chapter4/sagemaker/output/xgboost-2019-02-09-13-52-55-230/output/model.tar.gz


In [5]:
!tar xvf /tmp/model.tar.gz

xgboost-model


In [4]:
# Copy the training job's model artifact from S3 to /tmp/model.tar.gz.
!aws s3 cp s3://mastering-ml-aws/chapter4/sagemaker/output/xgboost-2019-02-09-13-52-55-230/output/model.tar.gz /tmp/model.tar.gz

Completed 12.7 KiB/12.7 KiB (130.2 KiB/s) with 1 file(s) remainingdownload: s3://mastering-ml-aws/chapter4/sagemaker/output/xgboost-2019-02-09-13-52-55-230/output/model.tar.gz to ../../../../../tmp/model.tar.gz


In [2]:
!pip install xgboost

Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/54/21/8b2ec99862903a6d3aed62ce156d21d114b8666e669c46d9e54041df9496/xgboost-0.81-py2.py3-none-manylinux1_x86_64.whl (16.6MB)
[K    100% |████████████████████████████████| 16.6MB 2.7MB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-0.81
[33mYou are using pip version 10.0.1, however version 19.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [6]:
import xgboost
import pickle as pkl

# Load the extracted model file, which is read here as a pickle.
# NOTE(review): `pickle.load` can execute arbitrary code; only load artifacts
# produced by your own training jobs. Presumably the `import xgboost` above is
# what lets the pickled model class be resolved - confirm.
with open('xgboost-model', 'rb') as model_file:  # closes the handle even if loading fails
    model_local = pkl.load(model_file)


In [9]:
# List the part files available under the test-vector-csv/ prefix.
!aws s3 ls s3://mastering-ml-aws/chapter4/test-vector-csv/


2019-02-08 15:34:54          0 _SUCCESS
2019-02-08 15:34:45  163201548 part-00000-93d44a8d-421b-4cba-8292-4caca4ec1764-c000.csv
2019-02-08 15:34:44  162166800 part-00001-93d44a8d-421b-4cba-8292-4caca4ec1764-c000.csv


In [20]:
# Column layout applied to the header-less CSV: 'click' first, then f0 .. f99.
feature_names = ['f{}'.format(idx) for idx in range(100)]
column_names = ['click'] + feature_names
# Read one part file of the test vectors straight from S3. The file is parsed
# without a header row, so the names in `column_names` are applied to it.
validation_part_uri = 's3://mastering-ml-aws/chapter4/test-vector-csv/part-00000-93d44a8d-421b-4cba-8292-4caca4ec1764-c000.csv'
validation_df = pd.read_csv(validation_part_uri, names=column_names, header=None)


In [21]:
# Keep only the first 3000 rows for the local scoring below. `.copy()` detaches
# the subset from the full frame, so adding the 'score' column afterwards does
# not write into a slice (the pattern behind pandas' SettingWithCopyWarning).
validation_df = validation_df[:3000].copy()


In [22]:
# Score the rows locally with the unpickled model. Every column after the
# leading 'click' entry of `column_names` is passed to the model as a feature.
feature_columns = column_names[1:]
matrix = xgboost.DMatrix(validation_df[feature_columns])
validation_df['score'] = model_local.predict(matrix)


In [23]:
# Preview the first rows, now including the locally computed 'score' column.
validation_df.head()


Unnamed: 0,click,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,score
0,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.226555
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16786
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.512123
3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.512123
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141251


In [70]:
# Feature columns as a NumPy array. `DataFrame.as_matrix()` was deprecated in
# pandas 0.23 and removed in 1.0; `.values` yields the same array and is
# available on every pandas version.
validation_df[column_names[1:]].values


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the locally computed scores against the 'click' column.
roc_auc_score(y_true=validation_df['click'], y_score=validation_df['score'])


0.6509825765238708