
Titanic Survival Prediction with SageMaker
This notebook walks through the steps to:
1. Read and split Titanic data into train/test sets.
2. Train an XGBoost model using SageMaker.
3. Make predictions on the test set and calculate accuracy.
4. Monitor the training and predictions with CloudWatch.


In [4]:
import boto3
import pandas as pd
from sklearn.model_selection import train_test_split
import sagemaker
from sagemaker import Session
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sklearn.metrics import accuracy_score

In [7]:
# --- Notebook-wide configuration ---------------------------------------------
# S3 bucket referenced by the cells below for the input data, the train/test
# splits and the training output.
bucket_name = 'test-bucket-hamady'

# IAM role and SageMaker session handed to the estimator further down.
role = sagemaker.get_execution_role()
session = Session()

In [13]:

# --- Load the processed Titanic dataset from S3 -------------------------------
s3 = boto3.client('s3')
response = s3.get_object(Bucket=bucket_name, Key="processed/titanic.csv")
titanic_data = pd.read_csv(response['Body'])

# --- Split features / label into train and test sets --------------------------
X = titanic_data.drop("Survived", axis=1)
y = titanic_data["Survived"]
# Fixed random_state keeps the 80/20 split reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Build the training table -------------------------------------------------
# The label is placed in the first column: the SageMaker built-in XGBoost
# algorithm reads the target from column 0 of CSV training data.
train_data = X_train.copy()
train_data['Survived'] = y_train
train_data = train_data[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked_Q", "Embarked_S"]]
# NOTE(review): X_test keeps every column of X, while the training table is
# restricted to the explicit list above. Confirm both have the same feature
# columns in the same order before sending X_test to the model.

# --- Persist the splits locally and upload them to S3 -------------------------
# header=False: the built-in XGBoost algorithm expects CSV training input
# without a header record; a header line would be parsed as a data row.
train_data.to_csv("X_train_with_labels.csv", index=False, header=False)
# The test files keep their header row so they can be read back with
# pd.read_csv; strip the header before sending rows to an inference endpoint.
X_test.to_csv("X_test.csv", index=False)
y_test.to_csv("y_test.csv", index=False)
s3.upload_file("X_train_with_labels.csv", bucket_name, "splitData/X_train_with_labels.csv")
s3.upload_file("X_test.csv", bucket_name, "splitData/X_test.csv")
s3.upload_file("y_test.csv", bucket_name, "splitData/y_test.csv")

In [15]:
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
import sagemaker


# Look up the XGBoost 1.5-1 image URI for the region of the current session.
container = sagemaker.image_uris.retrieve(
    "xgboost",
    session.boto_region_name,
    version="1.5-1",
)

# Estimator for a single ml.m5.xlarge training instance; output is written
# under the xgboost-output/ prefix of the bucket.
xgb = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=f"s3://{bucket_name}/xgboost-output",
    sagemaker_session=session,
)

# Binary classification objective with 100 boosting rounds.
xgb_hyperparameters = {
    "objective": "binary:logistic",
    "num_round": 100,
    "max_depth": 5,
    "eta": 0.2,
    "subsample": 0.8,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

# The "train" channel points at the labelled training CSV uploaded earlier.
train_s3_uri = f"s3://{bucket_name}/splitData/X_train_with_labels.csv"
train_input = TrainingInput(train_s3_uri, content_type="csv", input_mode="File")

# Launch the training job with the single "train" channel.
xgb.fit({"train": train_input})


2025-01-04 05:14:02 Starting - Starting the training job...
2025-01-04 05:14:23 Starting - Preparing the instances for training...
2025-01-04 05:14:49 Downloading - Downloading input data...
2025-01-04 05:15:15 Downloading - Downloading the training image...
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-01-04 05:16:05.696 ip-10-0-187-160.eu-west-3.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-01-04 05:16:05.720 ip-10-0-187-160.eu-west-3.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-01-04:05:16:06:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-01-04:05:16:06:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-01-04:05:16:06:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-01-04:05:16:06:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2025-01-04:0

In [16]:
# Deploy the trained estimator on one ml.m5.large instance and keep the
# returned predictor handle for the prediction step.
# NOTE(review): make sure the endpoint is deleted once the notebook is done.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
)


------!