## 1Ô∏è‚É£ Setup & Imports

In [26]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.amazon.amazon_estimator import get_image_uri
import boto3
import pandas as pd
import numpy as np
import json


## 2Ô∏è‚É£ Define S3 paths

In [27]:
session = sagemaker.Session()

bucket = "my-cloud-ai-bucket"       # <-- your bucket
prefix = "electricity"              # folder
region = session.boto_region_name

role = get_execution_role()

print("Region:", region)
print("Role:", role)


Region: us-east-1
Role: arn:aws:iam::711696934160:role/LabRole


## 3Ô∏è‚É£ Upload your local training & test CSVs

In [28]:
train_s3 = session.upload_data(
    path="electricity_train.csv",
    bucket=bucket,
    key_prefix=f"{prefix}"
)

test_s3_original = session.upload_data(
    path="electricity_test.csv",
    bucket=bucket,
    key_prefix=f"{prefix}"
)

print("Train S3:", train_s3)
print("Test S3 :", test_s3_original)


Train S3: s3://my-cloud-ai-bucket/electricity/electricity_train.csv
Test S3 : s3://my-cloud-ai-bucket/electricity/electricity_test.csv


## 4Ô∏è‚É£ Prepare Batch Transform Safe Test File

In [29]:
# Load test
test_df = pd.read_csv("electricity_test.csv")

# Correct training feature order
feature_cols = [
    "hour", "day_of_week", "month", "week", "is_weekend",
    "lag_1", "lag_24", "lag_168", "roll_24", "roll_168"
]

# Keep numeric features only
test_clean = test_df[feature_cols]

# Save WITHOUT header / index (SageMaker requirement)
clean_test_path = "electricity_test_noml.csv"
test_clean.to_csv(clean_test_path, header=False, index=False)

print("Prepared batch transform CSV:")
print(test_clean.head())


Prepared batch transform CSV:
   hour  day_of_week  month  week  is_weekend    lag_1   lag_24  lag_168  \
0     0            0      1     1           0  19894.0  20527.0  17583.0   
1     1            0      1     1           0  19912.5  19851.5  17460.0   
2     2            0      1     1           0  19747.0  18983.0  16496.0   
3     3            0      1     1           0  18429.0  17948.5  15535.0   
4     4            0      1     1           0  17264.5  17436.5  15011.0   

        roll_24      roll_168  
0  23172.854167  22958.660714  
1  23147.250000  22972.526786  
2  23142.895833  22986.139881  
3  23119.812500  22997.645833  
4  23091.312500  23007.940476  


## 5Ô∏è‚É£ Upload Clean Test File

In [30]:
test_s3 = session.upload_data(
    path=clean_test_path,
    bucket=bucket,
    key_prefix=f"{prefix}"
)

print("Clean test S3:", test_s3)


Clean test S3: s3://my-cloud-ai-bucket/electricity/electricity_test_noml.csv


## 6Ô∏è‚É£ Train XGBoost

In [31]:
container = get_image_uri(region, "xgboost", "1.5-1")
container

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1'

In [32]:
from sagemaker.estimator import Estimator

xgb_estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=10,
    output_path=f"s3://{bucket}/{prefix}/output",
    sagemaker_session=session,
)

xgb_estimator.set_hyperparameters(
    objective="reg:squarederror",
    num_round=150,
    max_depth=8,
    eta=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    gamma=0,
)


In [33]:
train_input = TrainingInput(train_s3, content_type="text/csv")

xgb_estimator.fit({"train": train_input})

print("Training completed.")


INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-11-23-19-07-20-371


2025-11-23 19:07:21 Starting - Starting the training job...
2025-11-23 19:07:36 Starting - Preparing the instances for training...
2025-11-23 19:08:24 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-11-23 19:09:16.789 ip-10-0-70-181.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-11-23 19:09:16.810 ip-10-0-70-181.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-11-23:19:09:17:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-11-23:19:09:17:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2025-11-23:19:09:17:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-11-23:19:09:17:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2025-11-23:19:09:17:INFO] Determined 0 GPU(s) available on the instance.[0m
[34m[2025-11-23:19:0

## 7Ô∏è‚É£ Retrieve Model Artifacts

In [34]:
training_job = xgb_estimator.latest_training_job
desc = training_job.describe()

model_artifact = desc["ModelArtifacts"]["S3ModelArtifacts"]
print("Model artifacts:", model_artifact)


Model artifacts: s3://my-cloud-ai-bucket/electricity/output/sagemaker-xgboost-2025-11-23-19-07-20-371/output/model.tar.gz


## 8Ô∏è‚É£ Create SageMaker Model

In [35]:
from sagemaker.model import Model

model_name = "electricity-xgb-model"

model = Model(
    name=model_name,
    model_data=model_artifact,
    image_uri=container,
    role=role
)

model.create(instance_type="ml.m5.large")
print("Model created:", model_name)


INFO:sagemaker:Creating model with name: electricity-xgb-model


Model created: electricity-xgb-model


## 9Ô∏è‚É£ Batch Transform Job

In [36]:
from sagemaker.transformer import Transformer

transformer = Transformer(
    model_name=model_name,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{bucket}/{prefix}/batch_output"
)

print("Running batch transform...")

transformer.transform(
    data=test_s3,
    content_type="text/csv",
    split_type="Line"
)

transformer.wait()
print("Batch transform done.")


INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2025-11-23-19-10-39-321


Running batch transform...
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-11-23:19:16:14:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-11-23:19:16:14:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-11-23:19:16:14:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      

## üîü Download & Evaluate Predictions

In [37]:
# Download output
s3 = boto3.client("s3")

output_key = f"{prefix}/batch_output/{clean_test_path}.out"
local_out = "predictions.csv"

s3.download_file(bucket, output_key, local_out)

preds = pd.read_csv(local_out, header=None)
preds.columns = ["prediction"]

print(preds.head())


    prediction
0  1692.887939
1  1692.887939
2  1692.718262
3  1692.718262
4  1692.718262


In [38]:
# Full test includes actual values
test_full = pd.read_csv("electricity_test.csv")
y_true = test_full["demand_mw"]

rmse = np.sqrt(np.mean((preds["prediction"] - y_true) ** 2))
rmse


22730.11164378932