In [1]:
import boto3
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from sklearn.model_selection import train_test_split

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
# Resolve the SageMaker session, its default S3 bucket, and the execution role
# that the jobs launched later in this notebook run under.
session = sagemaker.Session()
bucket = session.default_bucket()
role = get_execution_role()

# Load processed data
df = pd.read_csv('processed_income.csv')

# Split into train and validation sets (80/20); the fixed random_state keeps
# the split identical across reruns.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Save splits locally with no header row and no index column
train_df.to_csv('train.csv', index=False, header=False)
val_df.to_csv('validation.csv', index=False, header=False)

# Upload both files to S3 under a shared key prefix and keep what
# upload_data returns for each (train first, then validation).
prefix = 'hyperparam-tuning'
s3_train_path, s3_val_path = (
    session.upload_data(local_file, bucket=bucket, key_prefix=prefix)
    for local_file in ('train.csv', 'validation.csv')
)

In [3]:
# Look up the XGBoost (version 1.5-1) training image URI for the session's region.
container = sagemaker.image_uris.retrieve("xgboost", session.boto_region_name, version="1.5-1")

# Gather the estimator settings in one place, then build the estimator from them.
estimator_settings = dict(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    volume_size=5,
    # Model artifacts are written under the same bucket/prefix as the input data.
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=session,
)
xgb = sagemaker.estimator.Estimator(**estimator_settings)

In [4]:
# Hyperparameters held fixed for every training job; the tuned ones are
# declared separately as ranges.
static_hyperparameters = {
    'objective': 'binary:logistic',
    'num_round': 100,
    'eval_metric': 'auc',
}
xgb.set_hyperparameters(**static_hyperparameters)

In [5]:
# Search space handed to the tuner: one range object per tuned hyperparameter.
hyperparameter_ranges = dict(
    eta=ContinuousParameter(0.01, 0.3),
    max_depth=IntegerParameter(3, 10),
    min_child_weight=ContinuousParameter(1, 10),
    subsample=ContinuousParameter(0.5, 1.0),
)

In [8]:
!head train.csv

-0.4092049461031111,1,9,1.1347387637961643,4,9,1,4,0,-0.1459204835588534,-0.2166595270325901,0.7744682102025865,38,1,0.4813157607053656,Adult
-0.1892671943743597,3,8,0.357339569890111,2,3,0,4,1,-0.1459204835588534,4.4662567751693,0.7744682102025865,38,1,0.4813157607053656,Adult
1.4236096516364842,5,6,-1.974858011828049,5,2,1,4,1,-0.1459204835588534,-0.2166595270325901,-0.0354294469727769,38,0,0.8854505393728745,Senior
-1.288955953018117,3,8,0.357339569890111,2,4,0,4,1,-0.1459204835588534,-0.2166595270325901,0.4505091473324411,38,0,0.5888136024594133,Young
-0.849080449560614,3,15,-0.0313600270629156,0,7,1,4,1,-0.1459204835588534,-0.2166595270325901,-0.0354294469727769,38,0,0.8854505393728745,Adult
0.3972334769023108,3,9,1.1347387637961643,2,2,0,4,1,-0.1459204835588534,-0.2166595270325901,-2.303142887063794,38,0,-0.6553997454304762,Middle
-0.4092049461031111,3,0,-1.5861584148750223,4,6,1,4,0,-0.1459204835588534,-0.2166595270325901,0.3695193816149048,38,0,0.6236344865992596,Adult
1.716859

In [9]:
!head validation.csv

-0.849080449560614,3,15,-0.0313600270629156,0,0,1,4,0,-0.1459204835588534,-0.2166595270325901,-0.1974089784078496,38,0,1.064152841813325,Adult
0.4705460608118946,6,11,-0.4200596240159423,2,3,5,4,0,-0.1459204835588534,-0.2166595270325901,-0.0354294469727769,38,0,0.8854505393728745,Middle
-0.7024552817414464,3,9,1.1347387637961643,2,3,0,2,1,-0.1459204835588534,-0.2166595270325901,1.1794170387902685,38,1,0.3918843898344584,Adult
-0.6291426978318626,3,9,1.1347387637961643,4,6,1,4,0,-0.1459204835588534,-0.2166595270325901,-0.0354294469727769,38,0,0.8854505393728745,Adult
-0.7024552817414464,5,15,-0.0313600270629156,0,2,1,4,1,0.15224427268102,-0.2166595270325901,0.7744682102025865,38,0,0.6493462469803678,Adult
0.9104215642693976,3,12,1.523438360749191,2,9,0,4,1,-0.1459204835588534,-0.2166595270325901,0.5314989130499774,38,1,0.557675561610585,Middle
1.4236096516364842,5,11,-0.4200596240159423,2,3,0,4,1,1.888424338306659,-0.2166595270325901,1.58436586737795,38,1,1.1176530284534367,Senior
-1.21

In [10]:
# Reorder the frame so the target column 'income >50K' comes first,
# followed by every other column in its existing order.
# NOTE(review): the `!head` output earlier in this notebook shows a string-valued
# last column (e.g. "Adult", "Senior"); confirm the training container accepts
# it or that it should be encoded/dropped before training.
target_col = 'income >50K'
remaining_cols = df.columns.tolist()
remaining_cols.remove(target_col)
cols = [target_col, *remaining_cols]
df = df[cols]

# Redo the 80/20 split with the same seed and overwrite the local CSV files
# (still no header row, no index column).
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
for split_df, csv_name in ((train_df, 'train.csv'), (val_df, 'validation.csv')):
    split_df.to_csv(csv_name, index=False, header=False)

In [11]:
# Push the rewritten split files to the same bucket/prefix (train first, then
# validation) and refresh the stored upload results.
s3_train_path, s3_val_path = (
    session.upload_data(local_file, bucket=bucket, key_prefix=prefix)
    for local_file in ('train.csv', 'validation.csv')
)

In [12]:
# Name of the objective metric and the regex used to extract its value.
auc_metric_definitions = [
    {'Name': 'validation:auc', 'Regex': 'validation.*auc:([0-9\\.]+)'}
]

# Bayesian search over `hyperparameter_ranges`: at most 5 training jobs in
# total, no more than 2 running at once.
tuner = HyperparameterTuner(
    estimator=xgb,
    hyperparameter_ranges=hyperparameter_ranges,
    objective_metric_name='validation:auc',
    metric_definitions=auc_metric_definitions,
    strategy='Bayesian',
    max_jobs=5,
    max_parallel_jobs=2,
)

In [13]:
# Wrap the uploaded split files as input channels for the tuning job.
# They are built here from the S3 upload results so this cell runs on a fresh
# kernel without relying on variables left over from cells no longer in the notebook.
# content_type is set explicitly because the files were written as header-less CSV
# (NOTE(review): assumes the container would not treat the data as CSV by default — confirm).
train_input = TrainingInput(s3_data=s3_train_path, content_type='text/csv')
validation_input = TrainingInput(s3_data=s3_val_path, content_type='text/csv')

# Launch the hyperparameter tuning job on the 'train' and 'validation' channels.
tuner.fit({'train': train_input, 'validation': validation_input})

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


............................................................!


In [14]:
# Ask the tuner which training job it considers best and report its name.
best_training_job = tuner.best_training_job()
print(f"Best training job: {best_training_job}")

# Re-attach to that job as an Estimator; the cell's displayed value is the
# hyperparameter dict of the winning job.
best_estimator = sagemaker.estimator.Estimator.attach(training_job_name=best_training_job)
best_estimator.hyperparameters()

Best training job: sagemaker-xgboost-251105-1337-004-ba85f011

2025-11-05 13:41:38 Starting - Found matching resource for reuse
2025-11-05 13:41:38 Downloading - Downloading the training image
2025-11-05 13:41:38 Training - Training image download completed. Training in progress.
2025-11-05 13:41:38 Uploading - Uploading generated training model
2025-11-05 13:41:38 Completed - Resource retained for reuse


{'_tuning_objective_metric': 'validation:auc',
 'eta': '0.16362045671726433',
 'eval_metric': 'auc',
 'max_depth': '4',
 'min_child_weight': '3.158673855611485',
 'num_round': '100',
 'objective': 'binary:logistic',
 'subsample': '0.6343212480484763'}