In [411]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import pickle
import sagemaker
from sagemaker.xgboost import XGBoost
from sagemaker.mxnet import MXNet
from sagemaker import get_execution_role
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri

sns.set()

%matplotlib inline

# Exploratory Data Analysis

In [412]:
# Load both raw CSV splits from the notebook's working directory.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Preview the first rows of the training frame, then report how many values
# are missing in each of its columns.
for overview in (train_data.head(), train_data.isnull().sum()):
    print(overview)

def plot_bar_graph(train_data, feature):
    """Plot value counts of `feature`, split by survival outcome.

    Draws a non-stacked bar chart with one bar group for rows where
    'Survived' == 1 and one for rows where 'Survived' == 0; each bar within a
    group is the count of one distinct value of `feature`.

    Args:
        train_data: DataFrame containing a 'Survived' column and `feature`.
        feature: name of the column whose value counts are compared.

    Returns:
        None. The chart is drawn as a side effect.
    """
    outcome = train_data['Survived']
    counts_by_outcome = pd.DataFrame([
        train_data[outcome == 1][feature].value_counts(),
        train_data[outcome == 0][feature].value_counts(),
    ])
    # Row order matches the list above: survivors first, then non-survivors.
    counts_by_outcome.index = ['Survived', 'Dead']
    counts_by_outcome.plot(kind='bar', stacked=False, figsize=(10, 5))
    
def plot_feature_count(train_data, feature):
    """Print and plot how often each distinct value of `feature` occurs.

    Args:
        train_data: DataFrame containing the column `feature`.
        feature: name of the column to summarise.

    Returns:
        None. The counts are printed and a bar chart is drawn as side effects.
    """
    frequencies = train_data[feature].value_counts()
    print(frequencies)

    # A single-row frame labelled with the feature name, so every distinct
    # value becomes one bar within the same group.
    frequency_row = pd.DataFrame([frequencies])
    frequency_row.index = [feature]
    frequency_row.plot(kind='bar')

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
Pa

# Feature Engineering

In [413]:
# Reload the raw data so this cell can be re-run on its own without depending
# on frames left behind by an earlier execution.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# SageMaker XGBoost requires the target label in the first column, so
# 'Survived' is pulled out and re-inserted at position 0. 'Name',
# 'PassengerId' and 'Ticket' are dropped as irrelevant features.
survived = train_data['Survived']
train_data = train_data.drop(columns=['Survived', 'Name', 'PassengerId', 'Ticket'])
train_data.insert(0, 'Survived', survived)

# Drop rows with no 'Embarked' value. The explicit copy() makes the result an
# independent frame so the column assignments below cannot trigger pandas'
# SettingWithCopyWarning.
train_data = train_data.dropna(subset=['Embarked']).copy()

# Fill missing ages with the mean age, then truncate to whole numbers.
# astype(int) truncates toward zero exactly like int() (e.g. 0.83 -> 0);
# it does not round to nearest.
mean_age = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(mean_age).astype(int)

# Truncate the fare to a whole number in the same way.
train_data['Fare'] = train_data['Fare'].astype(int)

# Keep only the first character of the cabin value and fill missing cabins
# with 'C'. Using .str[0] + fillna keeps real values and missing values apart,
# instead of relying on the 'n' that str(nan)[0] happens to produce.
# NOTE(review): 'C' is presumably the most frequent known value - confirm
# with plot_feature_count(train_data, 'Cabin').
train_data['Cabin'] = train_data['Cabin'].str[0].fillna('C')

# Label encode sex, cabin and embarked. Each column gets its own fitted
# encoder, kept in `label_encoders`, so the same string -> integer mapping can
# be re-applied to other data later. `label_encoder` ends up bound to the
# 'Embarked' encoder, the same final state as refitting one shared encoder.
label_encoders = {}
for column in ('Sex', 'Cabin', 'Embarked'):
    label_encoder = LabelEncoder()
    train_data[column] = label_encoder.fit_transform(train_data[column])
    label_encoders[column] = label_encoder

bucket = 'sagemaker-us-east-1-756448110530'
prefix = 'dataset'

print(train_data.head(80))

# Shuffle once with a fixed seed, then cut into 70% train / 20% validation /
# 10% test. Positional slicing yields the same three frames as np.split with
# these cut points, without routing a DataFrame through numpy.
shuffled = train_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(train_data))
validation_end = int(0.9 * len(train_data))
train_xgboost = shuffled.iloc[:train_end]
validation_xgboost = shuffled.iloc[train_end:validation_end]
test_xgboost = shuffled.iloc[validation_end:]
assert len(train_xgboost) + len(validation_xgboost) + len(test_xgboost) == len(train_data)

# remove header as it is not required by XGBoost
train_xgboost.to_csv('train_xgboost.csv', header=False, index=False)
validation_xgboost.to_csv('validation_xgboost.csv', header=False, index=False)
test_xgboost.to_csv('test_xgboost.csv', header=False, index=False)

    Survived  Pclass  Sex  Age  SibSp  Parch  Fare  Cabin  Embarked
0          0       3    1   22      1      0     7      2         2
1          1       1    0   38      1      0    71      2         0
2          1       3    0   26      0      0     7      2         2
3          1       1    0   35      1      0    53      2         2
4          0       3    1   35      0      0     8      2         2
..       ...     ...  ...  ...    ...    ...   ...    ...       ...
76         0       3    1   29      0      0     7      2         2
77         0       3    1   29      0      0     8      2         2
78         1       2    1    0      0      2    29      2         2
79         1       3    0   30      0      0    12      2         2
80         0       3    1   22      0      0     9      2         2

[80 rows x 9 columns]


In [414]:
sagemaker_session = sagemaker.Session()

# Upload each local file under its S3 key prefix; upload_data returns the
# resulting S3 location, which later cells use as a training channel.
# NOTE(review): the third upload is the raw 'test.csv', not the
# 'test_xgboost.csv' hold-out split written by the feature-engineering cell,
# which is never uploaded - confirm this is intended.
train_path, validation_path, test_path = (
    sagemaker_session.upload_data(path=local_file, key_prefix=s3_prefix)
    for local_file, s3_prefix in (
        ('train_xgboost.csv', 'dataset'),
        ('validation_xgboost.csv', 'dataset'),
        ('test.csv', 'test'),
    )
)
print(validation_path)

s3://sagemaker-us-east-1-756448110530/dataset/validation_xgboost.csv


In [415]:
# Look up the XGBoost container image (repo version 0.90-1) for the region of
# the current boto3 session.
aws_region = boto3.Session().region_name
container = get_image_uri(aws_region, 'xgboost', repo_version='0.90-1')

In [416]:
# Wrap each uploaded S3 location as a CSV input channel.
s3_input_train, s3_input_validation, s3_input_test = (
    sagemaker.s3_input(s3_data=s3_uri, content_type='csv')
    for s3_uri in (train_path, validation_path, test_path)
)

# Train Model

In [417]:
# Training-job settings: one ml.m4.2xlarge instance, spot capacity enabled,
# model artifacts written under s3://<bucket>/<prefix>/output.
# NOTE(review): train_max_run / train_max_wait are presumably in seconds
# (SDK v1) - confirm against the SDK documentation.
training_job_settings = dict(
    train_instance_count=1,
    train_instance_type='ml.m4.2xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sagemaker_session,
    train_use_spot_instances=True,
    train_max_run=120,
    train_max_wait=180,
)
estimator = sagemaker.estimator.Estimator(
    container,
    'AmazonSageMaker-ExecutionRole-20190815T111389',
    **training_job_settings
)

# Hyperparameters passed to the XGBoost container for a binary classifier.
xgboost_hyperparameters = {
    'eta': 0.1,
    'objective': 'binary:logistic',
    'num_round': 25,
    'eval_metric': 'accuracy',
}
estimator.set_hyperparameters(**xgboost_hyperparameters)

In [418]:
# Disabled alternative: build the estimator with the XGBoost framework class
# and an entry-point script ('program.py') on a 'local' instance type instead
# of the generic Estimator above. Kept for reference only; nothing in this
# cell executes.
# estimator = XGBoost(entry_point='program.py',
#                     role='AmazonSageMaker-ExecutionRole-20190815T111389',
#                     train_instance_count=1,
#                     train_instance_type='local',
#                     framework_version='0.90-2',
#                     hyperparameters = {
#                         'max_depth':2,
#                         'eta':0.2,
#                         'objective':'binary:logistic',
#                         'silent':0,
#                         'num_round':100,
#                         'eval_metric':'auc'
#                     }
#                  )

In [409]:
# Start the training job with the train and validation channels; the job's
# progress log is streamed into the cell output.
training_channels = {}
training_channels['train'] = s3_input_train
training_channels['validation'] = s3_input_validation
estimator.fit(training_channels)

2020-04-01 20:42:24 Starting - Starting the training job...
2020-04-01 20:42:26 Starting - Launching requested ML instances...
2020-04-01 20:43:32 Starting - Preparing the instances for training......
2020-04-01 20:44:40 Downloading - Downloading input data
2020-04-01 20:44:40 Training - Downloading the training image...
2020-04-01 20:45:16 Uploading - Uploading generated training model
2020-04-01 20:45:16 Completed - Training job completed
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value accuracy to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mI