# Amazon SageMaker Batch Transform: Associate prediction results with their corresponding input records


In [1]:
# With no arguments, %store lists every variable currently persisted in the
# IPython store database together with its stored value.
%store

Stored variables and their in-db values:
account_id                               -> '607916531205'
baseline_model_logistic_path             -> 'baseline_model_logistic.pkl'
baseline_model_path                      -> 'baseline_model.pkl'
create_base_csv_athena_db                -> True
create_base_csv_athena_table             -> True
database_name                            -> 'db_airline_delay_cause'
dev_feature_group_name                   -> 'airline_delay_features_dev'
dev_feature_store_table                  -> 'airline_delay_features_dev_1739939876'
dev_s3_path                              -> 's3://sagemaker-us-east-1-607916531205/data/develo
dev_s3_uri                               -> 's3://sagemaker-us-east-1-607916531205/feature-sto
dev_table_name                           -> 'development_data'
packages_installed                       -> True
prod_feature_group_name                  -> 'airline_delay_features_prod'
prod_feature_store_table                 -> 'airline_delay_fe

In [2]:
import boto3
import sagemaker
from pyathena import connect
import pandas as pd
import numpy as np

# ✅ Retrieve stored variables
%store -r region
%store -r role

# ✅ Initialize AWS Session
session = boto3.session.Session()
sagemaker_session = sagemaker.Session()

# ✅ Use SageMaker's default bucket
bucket = sagemaker_session.default_bucket()
prefix = "flight-delay-prediction-xgboost"  




sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [4]:
import time

# ✅ Retrieve stored feature group names and table names
%store -r dev_feature_group_name
%store -r prod_feature_group_name
%store -r dev_feature_store_table
%store -r prod_feature_store_table

# ✅ Initialize SageMaker client
sagemaker_client = boto3.client("sagemaker")

def get_feature_store_table_name(feature_group_name, client=None, poll_interval=5, timeout=None):
    """Wait for a Feature Group to be created and return its offline-store table name.

    Polls ``describe_feature_group`` until the group reports ``Created``, then
    reads the Glue table name from the offline store's data catalog config.

    Args:
        feature_group_name: Name of the Feature Group to wait for.
        client: Object exposing ``describe_feature_group``. Defaults to the
            module-level ``sagemaker_client``.
        poll_interval: Seconds to sleep between status checks.
        timeout: Maximum seconds to wait, or ``None`` (default) to wait
            without a limit.

    Returns:
        The ``TableName`` from ``OfflineStoreConfig.DataCatalogConfig``.

    Raises:
        RuntimeError: The group is in a status that can never become
            ``Created`` (``CreateFailed``, ``Deleting``, ``DeleteFailed``).
        TimeoutError: ``timeout`` elapsed before the group was created.
        KeyError: The description carries no offline-store table name.
    """
    if client is None:
        client = sagemaker_client

    print(f"⏳ Waiting for Feature Group '{feature_group_name}' to be available in Glue...")

    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        response = client.describe_feature_group(FeatureGroupName=feature_group_name)
        status = response["FeatureGroupStatus"]
        if status == "Created":
            print(f"✅ Feature Group '{feature_group_name}' is now active!")
            break
        # These statuses never transition to "Created"; polling on would hang forever.
        if status in ("CreateFailed", "Deleting", "DeleteFailed"):
            reason = response.get("FailureReason", "no failure reason reported")
            raise RuntimeError(
                f"Feature Group '{feature_group_name}' is in status {status}: {reason}"
            )
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Feature Group '{feature_group_name}' still in status {status} after {timeout} seconds"
            )
        print(f"⏳ Current status: {status}, retrying in {poll_interval} seconds...")
        time.sleep(poll_interval)

    # Retrieve Glue Table Name; a group without an offline store has no such entry.
    try:
        table_name = response["OfflineStoreConfig"]["DataCatalogConfig"]["TableName"]
    except KeyError as exc:
        raise KeyError(
            f"Feature Group '{feature_group_name}' has no offline-store Glue table "
            f"({exc} missing from its description)"
        ) from exc
    print(f"✅ Feature Store table registered in Glue: {table_name}")

    return table_name

# ✅ Use the stored feature store table names
dev_feature_store_table = dev_feature_store_table if "dev_feature_store_table" in locals() else get_feature_store_table_name(dev_feature_group_name)
prod_feature_store_table = prod_feature_store_table if "prod_feature_store_table" in locals() else get_feature_store_table_name(prod_feature_group_name)

# ✅ Store feature store table names for later use (if they were retrieved or new)
%store dev_feature_store_table
%store prod_feature_store_table


Stored 'dev_feature_store_table' (str)
Stored 'prod_feature_store_table' (str)


In [7]:
# Load the development feature group's offline table into a pandas DataFrame via Athena.

# Restore the table name, the Athena S3 staging location, and the region saved with %store
%store -r dev_feature_store_table
%store -r s3_staging_dir
%store -r region

# Unfiltered SELECT * against the "sagemaker_featurestore" database: every column
# comes back, including record_id, write_time, api_invocation_time and is_deleted,
# which the following cell removes.
query = f"""
SELECT * FROM "sagemaker_featurestore"."{dev_feature_store_table}"
;
"""

# pyathena connection; query results are staged under s3_staging_dir
conn = connect(s3_staging_dir=s3_staging_dir, region_name=region)

# Run the query and materialize the full result set in memory.
# NOTE(review): the recorded output echoes this line the way a Python warning does —
# presumably pandas warning about a raw DB-API connection; confirm.
df_offline = pd.read_sql(query, conn)

# Preview the first rows of the loaded table
print(df_offline.head())

  df_offline = pd.read_sql(query, conn)


     event_time  year  month carrier airport  arr_flights  arr_del15  \
0  1.739940e+09  2015     10      UA     OGG          169         17   
1  1.739940e+09  2006      6      OO     CLE          211         76   
2  1.739940e+09  2011      6      DL     LEX           44         24   
3  1.739940e+09  2006      6      OO     CVG           53         18   
4  1.739940e+09  2011      6      DL     PDX          456         78   

   carrier_ct  weather_ct  nas_ct  ...  weather_delay  nas_delay  \
0          13           0       1  ...              0         68   
1          43           2       1  ...            158         34   
2           8           4       1  ...            405        154   
3          16           0       0  ...              0          0   
4          28           4      16  ...            466        701   

   security_delay  late_aircraft_delay  delay_rate  on_time  record_id  \
0               0                   84          10        0     128381   
1         

In [8]:
# Remove the record identifier and the write_time / api_invocation_time / is_deleted
# columns in a single pass. Rebinding instead of inplace=True, together with
# errors="ignore", keeps the cell idempotent: re-running it once the columns are
# already gone no longer raises KeyError.
NON_FEATURE_COLUMNS = ["record_id", "write_time", "api_invocation_time", "is_deleted"]
df_offline = df_offline.drop(columns=NON_FEATURE_COLUMNS, errors="ignore")

df_offline.head()

Unnamed: 0,event_time,year,month,carrier,airport,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,...,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,delay_rate,on_time
0,1739940000.0,2015,10,UA,OGG,169,17,13,0,1,...,0,0,1168,1016,0,68,0,84,10,0
1,1739940000.0,2006,6,OO,CLE,211,76,43,2,1,...,9,0,3564,1685,158,34,0,1687,36,0
2,1739940000.0,2011,6,DL,LEX,44,24,8,4,1,...,3,0,1607,337,405,154,0,711,54,0
3,1739940000.0,2006,6,OO,CVG,53,18,16,0,0,...,0,0,455,413,0,0,0,42,33,0
4,1739940000.0,2011,6,DL,PDX,456,78,28,4,16,...,1,0,5080,1693,466,701,0,2220,17,0


In [9]:
# Encode categorical features as integer codes
from sklearn.preprocessing import LabelEncoder

# Keep every fitted encoder, keyed by column name. Previously each encoder was
# overwritten on the next iteration, so the category -> code mapping was lost and
# new records could not be encoded (or codes decoded) consistently with training.
label_encoders = {}
for col in ['carrier', 'airport']:
    le = LabelEncoder()
    df_offline[col] = le.fit_transform(df_offline[col])
    label_encoders[col] = le


In [10]:
# Replace every missing value with 0
df_offline = df_offline.fillna(0)

In [11]:
# Hold out 20% of the feature table as a validation set
from sklearn.model_selection import train_test_split

y = df_offline["on_time"]
X = df_offline.drop(columns=["on_time"])

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Write headerless CSV files with the label as the first column
train_csv_path, val_csv_path = "train.csv", "validation.csv"

train_combined = pd.concat([y_train, X_train], axis=1)
train_combined.to_csv(train_csv_path, header=False, index=False)

val_combined = pd.concat([y_val, X_val], axis=1)
val_combined.to_csv(val_csv_path, header=False, index=False)

# Push both files to S3 under the notebook prefix
train_s3_path = sagemaker_session.upload_data(
    path=train_csv_path,
    bucket=bucket,
    key_prefix=f"{prefix}/train",
)
val_s3_path = sagemaker_session.upload_data(
    path=val_csv_path,
    bucket=bucket,
    key_prefix=f"{prefix}/validation",
)

---

## Training job and model creation

The cell below uses the [SageMaker Python SDK](https://github.com/aws/sagemaker-python-sdk) to kick off the training job using both our training set and validation set. Note that the objective is set to 'binary:logistic', which trains a model to output a probability between 0 and 1 (here, the probability that the `on_time` label is 1).

In [13]:
from sagemaker.image_uris import retrieve
from sagemaker.inputs import TrainingInput

# Container image for the built-in XGBoost algorithm, version 1.5-1
xgboost_image_uri = retrieve(framework="xgboost", region=region, version="1.5-1")

# Training data channels, both in CSV format
train_input = TrainingInput(s3_data=train_s3_path, content_type="text/csv")
val_input = TrainingInput(s3_data=val_s3_path, content_type="text/csv")

# Estimator: one ml.m5.xlarge instance, model artifacts written under the prefix
xgb = sagemaker.estimator.Estimator(
    image_uri=xgboost_image_uri,
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    output_path=f"s3://{bucket}/{prefix}/output",
    sagemaker_session=sagemaker_session,
)

# Binary classifier evaluated with AUC
xgb.set_hyperparameters(
    objective="binary:logistic",
    eval_metric="auc",
    num_round=100,
    max_depth=5,
    eta=0.2,
    subsample=0.8,
)

# Launch the training job on the train and validation channels
xgb.fit(inputs={"train": train_input, "validation": val_input})

2025-02-19 05:16:18 Starting - Starting the training job...
2025-02-19 05:16:31 Starting - Preparing the instances for training...
2025-02-19 05:17:16 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-02-19 05:18:12.707 ip-10-2-112-41.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-02-19 05:18:12.729 ip-10-2-112-41.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-02-19:05:18:13:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-02-19:05:18:13:INFO] Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34m[2025-02-19:05:18:13:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-02-19:05:18:13:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-02-19:05:18:13:INFO] Running XGBoost Sag

In [16]:
# Deploy the trained estimator to a real-time endpoint on one ml.m5.xlarge instance.
# NOTE(review): the endpoint name is fixed, so re-running this cell while an endpoint
# of that name still exists presumably fails — confirm, or delete the endpoint first.
endpoint_name_single_request = "flight-delay-xgboost-endpoint-single-request"
predictor_single_request = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
    endpoint_name=endpoint_name_single_request
)

# Persist the endpoint name with %store so it can be retrieved later (e.g. for cleanup).
# This runs only after deploy() has returned.
%store endpoint_name_single_request
print(f"✅ Stored SageMaker endpoint name: {endpoint_name_single_request}")

In [18]:
import io
import numpy as np

# Render the first validation row as one line of CSV text
csv_buffer = io.StringIO()
single_sample = X_val.iloc[0].to_numpy().reshape(1, -1)
np.savetxt(csv_buffer, single_sample, delimiter=",")
csv_text = csv_buffer.getvalue().strip()  # str

# Send the CSV text to the endpoint, declaring the payload as text/csv
response = predictor_single_request.predict(
    csv_text,
    initial_args={"ContentType": "text/csv"},
)
print("Response:", response)


Response: b'1.0493714398762677e-05\n'


---

## Batch Transform



In [20]:
# Batch-transform input: feature columns only (no label), written as headerless CSV.
# The sample is seeded so the uploaded file (and therefore the batch predictions) is
# the same on every run, and its size is capped at the number of available rows so
# tables with fewer than 500 records do not raise ValueError.
small_test_set = df_offline.drop('on_time', axis=1)
small_test_set = small_test_set.sample(n=min(500, len(small_test_set)), random_state=42)

small_test_csv_path = "small_test.csv"
small_test_set.to_csv(small_test_csv_path, index=False, header=False)

# Upload smaller dataset to S3
small_test_s3_path = sagemaker_session.upload_data(
    path=small_test_csv_path,
    bucket=bucket,
    key_prefix=prefix + "/small_test"
)


In [21]:
# Run a batch transform job over the small test file uploaded above.
# Input: the uploaded CSV; output: written under the batch-output prefix.
batch_input_s3 = small_test_s3_path
batch_output_s3 = f"s3://{bucket}/{prefix}/batch-output"

# Transformer built from the trained estimator, one ml.m5.xlarge instance
transformer = xgb.transformer(
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=batch_output_s3
)

# Input is CSV, split into records line by line.
# NOTE(review): the notebook title talks about associating predictions with their
# input records, but no join_source / input_filter / output_filter is passed here —
# confirm whether the output is meant to be joined with the input.
transformer.transform(
    data=batch_input_s3,
    content_type="text/csv",
    split_type="Line"
)
# Explicitly block until the transform job has finished
transformer.wait()

print("Batch transform job complete.")

  from pandas import MultiIndex, Int64Index[0m
[34m[2025-02-19:05:31:18:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-02-19:05:31:18:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-02-19:05:31:18:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;

In [22]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# Second real-time endpoint, deployed with an explicit serializer/deserializer pair.
# NOTE(review): despite "batch-transform" in its name, this is an endpoint created
# with xgb.deploy(); it is separate from the transformer job run earlier.
endpoint_name_batch_transform = "flight-delay-xgboost-endpoint-with-batch-transform"

predictor_batch_request = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
    endpoint_name=endpoint_name_batch_transform,
    serializer=CSVSerializer(),              # request payloads are sent as CSV
    deserializer=JSONDeserializer()          # responses are parsed as JSON (StringDeserializer() is the alternative for plain-text output)
)


------!

In [23]:
# Test inference with valid data (multiple samples)
import io

# Seeded so the same validation rows are scored on every run; capped at the number
# of available rows so a very small validation set does not raise.
test_samples = X_val.sample(n=min(5, len(X_val)), random_state=42).values
csv_buffer = io.StringIO()
np.savetxt(csv_buffer, test_samples, delimiter=",")
test_samples_csv = csv_buffer.getvalue().strip()

response = predictor_batch_request.predict(test_samples_csv)
# The model was trained with `on_time` as its label, so each score is the predicted
# probability that on_time == 1 (the earlier message called it "delay").
print("Predicted on_time probabilities for test samples:", response)

Predicted probabilities of delay for test samples: {'predictions': [{'score': 4.370238002593396e-06}, {'score': 4.370238002593396e-06}, {'score': 1.0493714398762677e-05}, {'score': 4.370238002593396e-06}, {'score': 4.370238002593396e-06}]}


In [None]:
# Cleanup: delete both real-time endpoints deployed in this notebook.
# NOTE(review): these calls are live (not commented out) — running this cell deletes
# the endpoints. If the first call raises, the second endpoint is left running.
predictor_single_request.delete_endpoint()
predictor_batch_request.delete_endpoint()