In [84]:
import os
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session
from sagemaker.predictor import csv_serializer, json_deserializer
#Use instead of image uri
#from sagemaker.xgboost.estimator import XGBoost

In [71]:
#You will get the current AWS role you are working on
role = get_execution_role()
#The region you are working on
region = boto3.Session().region_name
bucket = "sagemaker-us-east-1"
prefix = "tryouts"

In [161]:
import pandas as pd
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Check capstone ipynb for more information on EDA and cleaning.
In a nutshell below how to transform it

In [162]:
def find_title(string):
    start = string.find(",")
    end = string.find(".")
    return(string[start +2:end].strip())
df['title'] = df.Name.apply(find_title)
df.title.replace({
    "Capt": "Crew",
    "Col": "Crew",
    "Major": "Crew",
    "Jonkheer": "Royal",
    "Don": "Royal",
    "Sir" : "Royal",
    "Dr": "Crew",
    "Rev": "Crew",
    "the Countess":"Royal",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royal"}, inplace = True)

df['Cabin'] = df.Cabin.str[0]
df['young'] = (df.Age<=15).astype(int)
df["family"] = df.SibSp + df.Parch
#Make sure target variable is your first column
df = df[["Survived", "Pclass", "Sex", "Age", "family", "young", "Embarked", "title"]]
#our filling shall be done ater train-val split, however, for simplicty we will perform before
df.fillna(value={"Embarked": df.Embarked.mode(), "Cabin": "other", "Age": df.Age.median()}, inplace = True)
df = pd.get_dummies(df, columns=["Sex", "Embarked", "Pclass", "title"], drop_first=True)
train = df.sample(frac = 0.8, random_state = 12)
test = df[~df.index.isin(train.index)]

train.to_csv("train_transformation.csv", index = False, header = False)
test.to_csv("validation_transformation.csv", index = False, header = False)

#you do not have to create the buckets prior performing this
train_loc = boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', 'train.csv')).upload_file('train_transformation.csv')
test_loc = boto3.Session().resource("s3").Bucket(bucket).Object(os.path.join(prefix, "validation", "test.csv")).upload_file('validation_transformation.csv')

In [168]:
#The above functions does not provide string location, so you have to do it manually.
#Remember to create the bucket before the above and below operations
train_loc = s3_input('s3://{}/{}/{}/'.format(bucket, prefix, 'train'), content_type="csv")
test_loc = s3_input("s3://{}/{}/{}/".format(bucket, prefix, "validation"), content_type="csv")

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [165]:
train.columns

Index(['Survived', 'Age', 'family', 'young', 'Sex_male', 'Embarked_Q',
       'Embarked_S', 'Pclass_2', 'Pclass_3', 'title_Master', 'title_Miss',
       'title_Mr', 'title_Mrs', 'title_Royal'],
      dtype='object')

In [166]:
output_path = 's3://{}/{}/model'.format(bucket, prefix)
container = get_image_uri(region, 'xgboost', repo_version='1.0-1')

#https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html
hyperparameters = {"max_depth":"5", 
                   "eta":"0.2",
                   "gamma":"4", 
                   "min_child_weight":"6", 
                   "subsample":"0.7", 
                   "eval_metric": "error",
                   "objective":"reg:logistic", 
                   "num_round":"50"}

#https://aws.amazon.com/sagemaker/pricing/instance-types/
estimator = sagemaker.estimator.Estimator(image_name=container, 
                                          hyperparameters=hyperparameters,
                                          role=role,
                                          train_instance_count=1, 
                                          train_instance_type='ml.m5.large', 
                                          output_path=output_path)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [169]:
estimator.fit({"train":train_loc, "validation":test_loc})

2020-08-22 18:09:23 Starting - Starting the training job...
2020-08-22 18:09:25 Starting - Launching requested ML instances......
2020-08-22 18:10:38 Starting - Preparing the instances for training......
2020-08-22 18:11:28 Downloading - Downloading input data...
2020-08-22 18:12:24 Training - Training image download completed. Training in progress.
2020-08-22 18:12:24 Uploading - Uploading generated training model.[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value error to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of

In [170]:
predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.m5.xlarge")

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-------------!

In [185]:
test_kaggle = pd.read_csv("test.csv")
passenger = test_kaggle.PassengerId
test_kaggle['title'] = test_kaggle.Name.apply(find_title)
test_kaggle.title.replace({
    "Capt": "Crew",
    "Col": "Crew",
    "Major": "Crew",
    "Jonkheer": "Royal",
    "Don": "Royal",
    "Sir" : "Royal",
    "Dr": "Crew",
    "Rev": "Crew",
    "the Countess":"Royal",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royal"}, inplace = True)
test_kaggle['Cabin'] = test_kaggle.Cabin.str[0]
test_kaggle['young'] = (test_kaggle.Age<=15).astype(int)
test_kaggle["family"] = test_kaggle.SibSp + test_kaggle.Parch
#Make sure target variable is your first column
test_kaggle = test_kaggle[["Pclass", "Sex", "Age", "family", "young", "Embarked", "title"]]
#our filling shall be done ater train-val split, however, for simplicty we will perform before
test_kaggle.fillna(value={"Embarked": test_kaggle.Embarked.mode(), "Cabin": "other", "Age": test_kaggle.Age.median()}, inplace = True)
test_kaggle = pd.get_dummies(test_kaggle, columns=["Sex", "Embarked", "Pclass", "title"], drop_first=True)
test_kaggle["title_Royal"] = test_kaggle.title_Dona
test_kaggle = test_kaggle.drop("title_Dona", axis =1)

In [187]:
predictor.content_type = "text/csv"
predictor.serializer = csv_serializer
predictor.deserializer = None

scores = []
for obs in range(test_kaggle.shape[0]):
    if obs % 100 == 0:
        print("we are in observation number: {}".format(obs))
    else:
        pass
    prediction = predictor.predict([list(test_kaggle.iloc[obs])])
    scores.append(float(prediction))
scores = [1 if score > 0.5 else 0 for score in scores]

we are in observation number: 0
we are in observation number: 100
we are in observation number: 200
we are in observation number: 300
we are in observation number: 400


In [188]:
final_output = passenger.to_frame()
final_output["Survived"] = scores
final_output.to_csv("submission1.csv", index = False, header = True)

In [189]:
estimator.delete_endpoint()

estimator.delete_endpoint() will be deprecated in SageMaker Python SDK v2. Please use the delete_endpoint() function on your predictor instead.
