In [1]:
import sagemaker
from sagemaker.sklearn.model import SKLearnModel
from sagemaker import get_execution_role
import boto3
import pandas as pd

# SageMaker session and the execution role used by the model below.
session = sagemaker.Session()
role = get_execution_role()

# Look up the scikit-learn (version 1.2-1) container image URI for the
# region this session is bound to.
image_uri = sagemaker.image_uris.retrieve(
    region=session.boto_region_name,
    framework="sklearn",
    version="1.2-1",
)

# Model artifact archive in S3; `inference.py` is passed as the entry point
# script for the container.
model_artifact_uri = "s3://kaggle-housing-pipeline-data/models/xgb_pipeline.tar.gz"

model = SKLearnModel(
    entry_point="inference.py",
    image_uri=image_uri,
    model_data=model_artifact_uri,
    role=role,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [3]:
# S3 locations for the batch job: the CSV to score and the prefix that is
# handed to the transformer as its output path.
test_data_path = "s3://kaggle-housing-pipeline-data/test.csv"
predictions_prefix = "s3://kaggle-housing-pipeline-data/predictions/"

# Build a transformer from the model on a single ml.m5.large instance.
transformer = model.transformer(
    instance_type="ml.m5.large",
    instance_count=1,
    output_path=predictions_prefix,
)

# Submit test.csv as text/csv, split by line.
# NOTE(review): a later cell reads test.csv with a header row (it selects the
# "Id" column) — confirm that inference.py copes with the header line being
# part of the line-split input.
transformer.transform(
    data=test_data_path,
    split_type="Line",
    content_type="text/csv",
)

# Block until the transform job has finished.
transformer.wait()

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2025-07-10-17-17-06-799
INFO:sagemaker:Creating transform job with name: sagemaker-scikit-learn-2025-07-10-17-17-07-464


In [None]:
# Download the batch-transform predictions from S3, pair each prediction with
# the matching Id from test.csv, and upload the combined result.csv.
s3 = boto3.client("s3")
bucket_name = "kaggle-housing-pipeline-data"

# Key of the batch-transform output file. Presumably the input file name plus
# ".out" under the predictions/ prefix — verify in the S3 console and adjust
# if the job wrote it elsewhere.
output_key = "predictions/test.csv.out"
local_prediction_path = "predictions.csv"

s3.download_file(bucket_name, output_key, local_prediction_path)

# Read predictions: the file is read without a header row and its single
# column is named SalePrice.
preds = pd.read_csv(local_prediction_path, header=None, names=["SalePrice"])

# Download original test.csv so we can extract Id column
local_test_path = "test.csv"
s3.download_file(bucket_name, "test.csv", local_test_path)

test_df = pd.read_csv(local_test_path)

# Building a DataFrame from two Series aligns them on index, so a row-count
# mismatch would silently produce NaN Ids or NaN prices instead of failing.
# Stop here rather than publish a misaligned result.
if len(preds) != len(test_df):
    raise ValueError(
        f"Prediction count ({len(preds)}) does not match the number of rows "
        f"in test.csv ({len(test_df)}); refusing to build result.csv."
    )

result_df = pd.DataFrame({
    "Id": test_df["Id"],
    "SalePrice": preds["SalePrice"]
})

# Save result.csv locally
result_csv = "result.csv"
result_df.to_csv(result_csv, index=False)

# Upload result.csv to s3://kaggle-housing-pipeline-data/results/
s3.upload_file(result_csv, bucket_name, "results/result.csv")

print("✅ result.csv created and saved to s3://kaggle-housing-pipeline-data/results/result.csv")