In [1]:
from pathlib import Path
import pandas as pd
import boto3
import sagemaker
from sagemaker import image_uris
from sagemaker.estimator import Estimator
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter
from sagemaker.inputs import TrainingInput
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
from sagemaker.model import Model
from sklearn.metrics import accuracy_score
import numpy as np
import tarfile
import shutil

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


## Train

Read data

In [2]:
DATA_PROCESSED = Path("../data/processed")
train_df = pd.read_csv(DATA_PROCESSED/"train.csv", header = None)
val_df   = pd.read_csv(DATA_PROCESSED/"validation.csv", header = None)
print(train_df.shape, val_df.shape)

(3492, 16) (748, 16)


In [3]:
prefix = 'xgboost'
bucket_name = 'excalibur-ai-model'
train_key = f'{prefix}/train/train.csv'
validation_key = f'{prefix}/validation/validation.csv'

s3 = boto3.client('s3')
s3.upload_file(DATA_PROCESSED/'train.csv', bucket_name, train_key)
s3.upload_file(DATA_PROCESSED/'validation.csv', bucket_name, validation_key)

s3_train_path = f's3://{bucket_name}/{train_key}'
s3_validation_path = f's3://{bucket_name}/{validation_key}'

print(f"Uploaded training data to: {s3_train_path}")
print(f"Uploaded training data to: {s3_validation_path}")

Uploaded training data to: s3://excalibur-ai-model/xgboost/train/train.csv
Uploaded training data to: s3://excalibur-ai-model/xgboost/validation/validation.csv


In [4]:
boto_session = boto3.Session(region_name = 'us-east-1')
session = sagemaker.Session(boto_session = boto_session)
role = sagemaker.get_execution_role()
region = session.boto_region_name

Instantiate xgboost container

In [5]:
container = image_uris.retrieve(
    framework = "xgboost",
    region = "us-east-1",
    version = "1.7-1"  
)
display(container)

'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1'

In [6]:
prefix = 'xgboost'
bucket_name = 'excalibur-ai-model'
xgb = Estimator(
    image_uri = container,
    role = role,
    instance_count = 1,
    instance_type = 'ml.m5.large',
    output_path = f's3://{bucket_name}/{prefix}/output',
    sagemaker_session = session
)

Set model parameters

In [7]:
xgb.set_hyperparameters(
    objective = "binary:logistic",
    num_round = 500, #number of boosting iterations
)

In [8]:
hyperparameter_ranges = {
    "max_depth": IntegerParameter(5, 10), 
    # Controls the maximum depth of each decision tree.
    # Larger values make the model more complex, but risk overfitting.
    
    "eta": ContinuousParameter(0.01, 0.25), 
    # Learning rate (shrinkage). Smaller values make learning slower but more precise.
    # Often paired with more boosting rounds.

    "gamma": ContinuousParameter(0.0, 5.0),
    # Minimum loss reduction required to make a further partition on a leaf node of the tree. 
    # The larger, the more conservative the algorithm is.
    
    "min_child_weight": IntegerParameter(1, 10), 
    # Minimum sum of instance weight (hessian) needed in a child.
    # Larger values can make the model more conservative (less complex splits).
    
    "subsample": ContinuousParameter(0.8, 1.0), 
    # Fraction of the training data used to grow each tree.
    # Prevents overfitting; lower values add randomness.

    "colsample_bytree": ContinuousParameter(0.2, 1.0)
    # Subsample ratio of columns when constructing each tree.
}

Tune model

In [9]:
tuner = HyperparameterTuner(
    estimator = xgb,
    objective_metric_name = "validation:auc", # How well the model separates the positive cases from the negative cases 
    hyperparameter_ranges = hyperparameter_ranges,
    metric_definitions = [{
        "Name": "validation:auc",
        "Regex": ".*\\[.*\\].*validation-auc:([0-9\\.]+)"
    }],
    max_jobs = 20, # Different training exercises that will be performed during the hyperparameter tuning.
    objective_type = "Maximize" # The highest value of the target metric
)

Train model

In [10]:
s3_train_path = 'excalibur-ai-model/xgboost/train/train.csv'
s3_validation_path = 'excalibur-ai-model/xgboost/validation/validation.csv'

tuner.fit({
    "train": TrainingInput(f's3://{s3_train_path}', content_type = "text/csv"),
    "validation": TrainingInput(f's3://{s3_validation_path}', content_type = "text/csv")
})

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


...............................................................................................................................................................................................................................................................................................................!


Show parameters set by best estimator

In [11]:
best_estimator = tuner.best_estimator()
best_estimator.hyperparameters()


2026-02-23 13:14:07 Starting - Found matching resource for reuse
2026-02-23 13:14:07 Downloading - Downloading the training image
2026-02-23 13:14:07 Training - Training image download completed. Training in progress.
2026-02-23 13:14:07 Uploading - Uploading generated training model
2026-02-23 13:14:07 Completed - Resource retained for reuse


{'_tuning_objective_metric': 'validation:auc',
 'colsample_bytree': '0.4405575686369554',
 'eta': '0.24021326137376825',
 'gamma': '4.063165316339813',
 'max_depth': '8',
 'min_child_weight': '1',
 'num_round': '500',
 'objective': 'binary:logistic',
 'subsample': '0.8254940017132779'}

## Test

Instantiate predictor

In [12]:
test_predictor = best_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer()
)

------!

In [13]:
test_predictor.serializer = CSVSerializer()
test_predictor.deserializer = JSONDeserializer()

Read test dataset

In [14]:
DATA_PROCESSED = Path("../data/processed")
X_test_df = pd.read_csv(DATA_PROCESSED/"X_test.csv")
y_test_df  = pd.read_csv(DATA_PROCESSED/"y_test.csv")

In [15]:
payload = X_test_df.to_csv(header=False, index=False)
test_result = test_predictor.predict(payload)

Get predicted labels

In [16]:
if "predictions" in test_result:
    test_predictions = np.array([p["score"] for p in test_result["predictions"]])
test_predicted_labels = (test_predictions >= 0.5).astype(int)

In [17]:
y_test = y_test_df.iloc[:, 0].values
print("Predicciones:", test_predicted_labels.shape)
print("Reales:", y_test.shape)

Predicciones: (749,)
Reales: (749,)


Compare and get accuracy

In [18]:
accuracy = accuracy_score(y_test, test_predicted_labels)
print("Accuracy:", accuracy)

Accuracy: 0.7770360480640854


## Endpoint

Download trained model

In [19]:
PROJECT_ROOT = Path("..").resolve()
MODEL_PATH = PROJECT_ROOT / "model"
local_model_path = MODEL_PATH / "tmp"

In [20]:
model_s3_uri = best_estimator.model_data

session.download_data(
    path = str(local_model_path),
    bucket = model_s3_uri.split("/")[2],
    key_prefix = "/".join(model_s3_uri.split("/")[3:])
)

['/home/sagemaker-user/model/tmp/model.tar.gz']

Extract model.tar.gz

In [21]:
with tarfile.open(local_model_path / "model.tar.gz") as tar:
    tar.extractall(path = local_model_path)

  tar.extractall(path = local_model_path)


Get encoder, sacler and columns from EDA

In [22]:
ARTIFACTS_PATH = Path("../model/artifacts")
for file in [
    "onehot_encoder.joblib",
    "scaler.joblib",
    "feature_columns.joblib"
]:
    shutil.copy(
        ARTIFACTS_PATH / file,
        local_model_path / file
    )

Create inference.py

In [30]:
%%writefile inference.py
import json
import os
import io
import joblib
import pandas as pd
import xgboost as xgb
import numpy as np

MODEL_PATH = "/opt/ml/model"

MODEL_FILE = os.path.join(MODEL_PATH, "xgboost-model")
ENCODER_FILE = os.path.join(MODEL_PATH, "onehot_encoder.joblib")
SCALER_FILE = os.path.join(MODEL_PATH, "scaler.joblib")
FEATURE_COLUMNS_FILE = os.path.join(MODEL_PATH, "feature_columns.joblib")

# Load model and atifacts
def model_fn(model_dir):
    booster = xgb.Booster()
    booster.load_model(MODEL_FILE)

    encoder = joblib.load(ENCODER_FILE)
    scaler = joblib.load(SCALER_FILE)
    feature_columns = joblib.load(FEATURE_COLUMNS_FILE)

   # Assure version compatibility
    
    # Case .sparse vs .sparse_output
    if hasattr(encoder, 'sparse_output') and not hasattr(encoder, 'sparse'):
        encoder.sparse = encoder.sparse_output
    elif hasattr(encoder, 'sparse') and not hasattr(encoder, 'sparse_output'):
        encoder.sparse_output = encoder.sparse

    encoder.sparse = False
    if hasattr(encoder, 'sparse_output'):
        encoder.sparse_output = False

    # Case get_feature_names_out (v1.0+) vs get_feature_names (v0.24-)
    if not hasattr(encoder, 'get_feature_names_out'):
        if hasattr(encoder, 'get_feature_names'):
            encoder.get_feature_names_out = encoder.get_feature_names
        else:
            encoder.get_feature_names_out = lambda: [f"cat_{i}" for i in range(encoder.categories_[0].size)]

    
    return {
        "model": booster,
        "encoder": encoder,
        "scaler": scaler,
        "feature_columns": feature_columns
    }

# Transform input
def input_fn(request_body, request_content_type):

    if request_content_type == 'application/json':
        data = json.loads(request_body)
        df = pd.DataFrame([data])
        df.columns = [
            "did_you_get_injured_byaslip_or_fall_accident",
            "did_you_have_an_accident_at_work",
            "how_you_were_involved",
            "days_since_accident",
            "state_accident_occur",
            "were_you_affected_by_possible_malpractice",
            "were_you_involved_in_an_automobile_accident"
        ]

        df = df.replace(r'^\s*$', np.nan, regex=True)
        df = df.replace('None', np.nan)

        return df

    else:
        raise ValueError(f"Unsupported content type: {request_content_type}")

# Predict
def predict_fn(input_data, model_artifacts):
    df = input_data.copy()
    print("Columns received:", df.columns)
    
    booster = model_artifacts["model"]
    encoder = model_artifacts["encoder"]
    scaler = model_artifacts["scaler"]
    feature_columns = model_artifacts["feature_columns"]

    # Scale numeric column
    df["days_since_accident"] = scaler.transform(
        df[["days_since_accident"]]
    )
    
    # Convert boolean to 0/1
    boolean_cols = [
        "did_you_get_injured_byaslip_or_fall_accident",
        "did_you_have_an_accident_at_work",
        "were_you_affected_by_possible_malpractice",
        "were_you_involved_in_an_automobile_accident"
    ]

    for col in boolean_cols:
        df[col] = df[col].map({True: 1, False: 0})
    
    # Validate state
    valid_states = ["New York", "New Jersey"]
    state_value = df.loc[0, "state_accident_occur"]
    
    if state_value not in valid_states:
        raise ValueError(
            f"Invalid state_accident_occur: '{state_value}'. "
            f"Allowed values are {valid_states}."
        )

    # Encode categorical columns
    df["how_you_were_involved"] = df["how_you_were_involved"].fillna("Not_involved")
    categorical_cols = ["how_you_were_involved", "state_accident_occur"]    
    encoded = encoder.transform(df[categorical_cols])
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=df.index
    )

    # Concat all columns
    final_df = pd.concat(
        [df[["days_since_accident"]], encoded_df],
        axis=1
    )

   # Align columns with feature_columns
    final_df = final_df.reindex(columns=feature_columns, fill_value=0)

    # Convert to DMatrix and predict
    dmatrix = xgb.DMatrix(final_df.values)
    predictions = booster.predict(dmatrix)
    return predictions

# Output
def output_fn(predictions, content_type):

    if content_type == "application/json":
        return json.dumps({
            "predictions": predictions.tolist()
        }), content_type

    else:
        raise ValueError(f"Unsupported content type: {content_type}")


Overwriting inference.py


Repackage

In [31]:
MODEL_BUILD_PATH = Path("../model/builds")
MODEL_BUILD_PATH.mkdir(parents = True, exist_ok = True)

new_model_tar = MODEL_BUILD_PATH / "model_complete.tar.gz"

with tarfile.open(new_model_tar, "w:gz") as tar:
    for file in local_model_path.iterdir():
        tar.add(file, arcname = file.name)

Upload model

In [32]:
model_complete_s3 = session.upload_data(
    path = str(new_model_tar),
    bucket = bucket_name,
    key_prefix = "xgboost/excalibur-aimodel"
)

print(model_complete_s3)

s3://excalibur-ai-model/xgboost/excalibur-aimodel/model_complete.tar.gz


Create endpoint

In [36]:
endpoint_name = "xgboost-excalibur"
region = "us-east-1"

sm_client = boto3.client("sagemaker", region_name=region)

existing_endpoints = sm_client.list_endpoints(NameContains = endpoint_name)["Endpoints"]
endpoint_names = [ep["EndpointName"] for ep in existing_endpoints]

if endpoint_name in endpoint_names:
    print(f"Endpoint '{endpoint_name}' already exists. Reusing it.")
    
    predictor = Predictor(
        endpoint_name = endpoint_name,
        sagemaker_session = session
    )

else:
    print(f"Creating endpoint '{endpoint_name}' using model_complete.tar.gz ...")

    model = Model(
        image_uri = best_estimator.image_uri,
        model_data = model_complete_s3,
        role = role,
        entry_point = "inference.py",
        sagemaker_session = session
    )

    predictor = model.deploy(
        initial_instance_count = 1,
        instance_type = "ml.m5.large",
        endpoint_name = endpoint_name
    )

Creating endpoint 'xgboost-excalibur' using model_complete.tar.gz ...
------!

WARNING: always delete endpoint and endpoint configuration after testing

In [35]:
sm_client = boto3.client("sagemaker", region_name="us-east-1")
#sm_client.delete_endpoint(EndpointName=endpoint_name)
#sm_client.delete_endpoint_config(EndpointConfigName="xgboost-excalibur")

{'ResponseMetadata': {'RequestId': 'ef2a5060-0e26-422d-a9ff-fb39b14e9be9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'ef2a5060-0e26-422d-a9ff-fb39b14e9be9',
   'strict-transport-security': 'max-age=47304000; includeSubDomains',
   'x-frame-options': 'DENY',
   'content-security-policy': "frame-ancestors 'none'",
   'cache-control': 'no-cache, no-store, must-revalidate',
   'x-content-type-options': 'nosniff',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Mon, 23 Feb 2026 22:56:27 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}