### Create the inference script and model pickle from the training features

In [1]:
!pip install --upgrade pip

Collecting pip
  Using cached pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-25.1.1-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.0.1
    Uninstalling pip-25.0.1:
      Successfully uninstalled pip-25.0.1
Successfully installed pip-25.1.1


In [2]:
import pandas as pd
import sklearn
import joblib
import boto3
import sagemaker
import google.protobuf

# Report the version of every library this notebook depends on, so the
# environment the artifacts were built with is recorded next to the results.
version_report = (
    ("pandas:", pd),
    ("scikit-learn:", sklearn),
    ("joblib:", joblib),
    ("boto3:", boto3),
    ("sagemaker:", sagemaker),
    ("protobuf:", google.protobuf),
)
for label, module in version_report:
    print(label, module.__version__)

pandas: 1.0.5
scikit-learn: 0.23.2
joblib: 0.14.1
boto3: 1.17.106
sagemaker: 2.34.0
protobuf: 3.12.0


In [3]:
# Import Libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import pickle
import tarfile
import joblib
import boto3
import os
from sagemaker.sklearn.model import SKLearnModel
from sagemaker import Session
import sagemaker

In [4]:
# Read the labelled production split from S3 and preview its first rows.
dataset_uri = "s3://sagemaker-us-east-1-531690656306/cardio_data/cardio_prod_split40.csv"
df = pd.read_csv(dataset_uri)
print(df.head(5))

        age  height_ft  weight_lbs  systolic_bp  diastolic_bp  cholesterol  \
0 -0.860483  -0.323580   -0.914107    -0.410657     -0.147087    -0.538657   
1 -0.268438   1.211180    0.266123    -0.410657     -0.147087    -0.538657   
2  0.915651  -1.052592   -0.983384    -0.410657     -0.147087    -0.538657   
3 -0.712472  -0.822378   -1.191845    -0.410657     -0.147087    -0.538657   
4  0.323606  -0.170104   -0.358631    -0.410657     -0.147087    -0.538657   

       gluc     smoke     alco    active  ...  age_years  is_hypertensive  \
0 -0.390761 -0.312731 -0.23822  0.494625  ...  -0.860483        -0.607947   
1 -0.390761 -0.312731 -0.23822 -2.021734  ...  -0.268438        -0.607947   
2 -0.390761 -0.312731 -0.23822  0.494625  ...   0.915651        -0.607947   
3 -0.390761 -0.312731 -0.23822  0.494625  ...  -0.712472        -0.607947   
4 -0.390761 -0.312731 -0.23822  0.494625  ...   0.323606        -0.607947   

   age_gluc_interaction  lifestyle_score  gender  bp_category  bmi_c

In [5]:
# Generate_model_and_inference.py
#
# Trains a logistic-regression model on the labelled split and writes every
# local artifact the batch-transform job needs:
#   - cardio_prod_no_label.csv : features only, no header, no index
#   - logistic_model.pkl       : joblib dump of the fitted model
#   - inference.py             : SageMaker entry point (model_fn/input_fn/...)
#   - logistic_model.tar.gz    : archive holding the pickle and the entry point

# Read the dataset once; the feature frame and the label both derive from it.
DATA_URI = "s3://sagemaker-us-east-1-531690656306/cardio_data/cardio_prod_split40.csv"  # Thai's Path
# DATA_URI = "s3://sagemaker-us-east-1-381492296191/cardio_data/cardio_prod_split40.csv"  # Prema's Path
df_train = pd.read_csv(DATA_URI)

# Prepare features & labels
X = df_train.drop(columns=["cardio"])
y = df_train["cardio"]

# Column order used for training. The header-less inference CSV is written in
# this same order, so inference.py must label incoming columns with it.
feature_columns = X.columns.tolist()

# Save as no-label inference CSV (no header as expected by SageMaker)
X.to_csv("cardio_prod_no_label.csv", index=False, header=False)

# Train logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X, y)

# Save model with compatible pickle protocol
model_filename = "logistic_model.pkl"
joblib.dump(model, model_filename, protocol=4)

# Create inference.py file.
# FEATURE_COLUMNS is generated from the training frame rather than typed by
# hand: a hand-maintained list can drift from the real column order, which
# silently mislabels the positional columns of the header-less CSV.
inference_template = '''
import pandas as pd
from io import StringIO
import joblib
import os

# Feature columns in the exact order used for training and for the
# header-less inference CSV (generated by the notebook, do not edit by hand).
FEATURE_COLUMNS = __FEATURE_COLUMNS__

def model_fn(model_dir):
    """Load the fitted model from the extracted model archive."""
    return joblib.load(os.path.join(model_dir, "logistic_model.pkl"))

def input_fn(input_data, content_type):
    """Parse a header-less CSV payload into a DataFrame with named columns.

    Raises ValueError for any content type other than text/csv, or when the
    payload does not have one column per entry in FEATURE_COLUMNS.
    """
    if content_type == "text/csv":
        df = pd.read_csv(StringIO(input_data), header=None)
        if df.shape[1] != len(FEATURE_COLUMNS):
            raise ValueError(f"Column mismatch: expected {len(FEATURE_COLUMNS)} columns, got {df.shape[1]}")
        df.columns = FEATURE_COLUMNS
        return df
    else:
        raise ValueError(f"Unsupported content type: {content_type}")

def predict_fn(input_data, model):
    """Return the model's class prediction for every input row."""
    return model.predict(input_data)

def output_fn(prediction, content_type):
    """Serialize predictions as one value per line."""
    return '\\n'.join(str(x) for x in prediction)
'''
inference_code = inference_template.replace("__FEATURE_COLUMNS__", repr(feature_columns))

# Write inference.py
with open("inference.py", "w") as f:
    f.write(inference_code)

# Package model and inference code
tar_filename = 'logistic_model.tar.gz'
with tarfile.open(tar_filename, "w:gz") as tar:
    tar.add(model_filename)
    tar.add("inference.py")

print("logistic_model.tar.gz created successfully.")

logistic_model.tar.gz created successfully.


### Save files directly into S3 Bucket

In [6]:
# Define your bucket and prefix
bucket = 'sagemaker-us-east-1-531690656306' # Thai's Path
prefix = 'model'
# Prefix the batch-transform job reads its input CSV from.
transform_input_prefix = 'cardio_data'

# Your filenames (assuming you created these earlier)
model_filename = 'logistic_model.pkl'
tar_filename = 'logistic_model.tar.gz'
inference_csv_file = 'cardio_prod_no_label.csv'

# Upload all files to S3 (overwrite automatically)
s3_client = boto3.client('s3')

for local_file in (model_filename, "inference.py", tar_filename, inference_csv_file):
    s3_client.upload_file(local_file, bucket, f"{prefix}/{local_file}")

# The transform job reads s3://<bucket>/cardio_data/cardio_prod_no_label.csv,
# not the copy under model/. Publish the freshly generated CSV there as well so
# the job scores the data produced by this run instead of a stale object.
s3_client.upload_file(
    inference_csv_file, bucket, f"{transform_input_prefix}/{inference_csv_file}"
)

print("All files uploaded successfully!")

All files uploaded successfully!


### Compare Features in Training vs Inference Data

In [7]:
# Labelled training frame; its non-label columns define the expected schema.
# df_train = pd.read_csv("s3://sagemaker-us-east-1-381492296191/cardio_data/cardio_prod_split40.csv") # Prema's Path
df_train = pd.read_csv("s3://sagemaker-us-east-1-531690656306/cardio_data/cardio_prod_split40.csv") # Thai's path
train_features = df_train.columns.drop("cardio").tolist()

# Header-less inference file written earlier in this notebook.
df_infer = pd.read_csv("cardio_prod_no_label.csv", header=None)

# Column counts side by side.
print("Training Features Count:", len(train_features))
print("Inference Features Count:", df_infer.shape[1])

# The inference file carries no header, so the training names are applied
# positionally here. The name comparison below is therefore trivially true;
# only the dtypes are really being checked.
df_infer.columns = train_features

print("\nTraining Feature Types:")
print(df_train[train_features].dtypes)

print("\nInference Data Types:")
print(df_infer.dtypes)

# Collect (column, training dtype, inference dtype) for every dtype disagreement.
mismatch_columns = []
for col in train_features:
    train_type = df_train[col].dtype
    infer_type = df_infer[col].dtype
    if train_type != infer_type:
        mismatch_columns.append((col, train_type, infer_type))

if not mismatch_columns:
    print("\nAll feature columns match in name and dtype.")
else:
    print("\nMismatched Columns Found:")
    for col, train_type, infer_type in mismatch_columns:
        print(f"   - {col}: training type = {train_type}, inference type = {infer_type}")

Training Features Count: 23
Inference Features Count: 23

Training Feature Types:
age                     float64
height_ft               float64
weight_lbs              float64
systolic_bp             float64
diastolic_bp            float64
cholesterol             float64
gluc                    float64
smoke                   float64
alco                    float64
active                  float64
bmi                     float64
pulse_pressure          float64
chol_bmi_ratio          float64
height_in               float64
age_years               float64
is_hypertensive         float64
age_gluc_interaction    float64
lifestyle_score         float64
gender                  float64
bp_category             float64
bmi_category            float64
age_group               float64
cholesterol_label       float64
dtype: object

Inference Data Types:
age                     float64
height_ft               float64
weight_lbs              float64
systolic_bp             float64
diastolic_bp     

### Re-package the model archive (logistic_model.pkl + inference.py)

In [8]:
# Rebuild the local gzip archive from the pickled model and the entry point.
archive_members = ("logistic_model.pkl", "inference.py")
with tarfile.open("logistic_model.tar.gz", "w:gz") as archive:
    for member in archive_members:
        archive.add(member)

### SageMaker Batch Transform Job

In [9]:
# Get SageMaker session and role
session = Session()
role = sagemaker.get_execution_role()

# S3 locations, each defined once so the job and the log messages cannot drift.
# s3_root = "s3://sagemaker-us-east-1-381492296191"  # Prema's path (model archive was under cardio_data/ there)
s3_root = "s3://sagemaker-us-east-1-531690656306"  # Thai's path
model_data_s3_uri = f"{s3_root}/model/logistic_model.tar.gz"
predictions_s3_uri = f"{s3_root}/cardio_data/predictions/"
production_data_s3_uri = f"{s3_root}/cardio_data/cardio_prod_no_label.csv"

# Define the model. Bound to its own name so the fitted scikit-learn estimator
# held in `model` by the training cell is not replaced by a different type.
sklearn_model = SKLearnModel(
    model_data=model_data_s3_uri,
    role=role,
    entry_point="inference.py",
    framework_version="0.23-1",
    sagemaker_session=session
)

# Create a transformer object.
# assemble_with="Line" adds a newline after every transformed record. The
# entry point's output_fn returns predictions with no trailing newline, so
# without it the last prediction of one mini-batch is glued to the first
# prediction of the next in the assembled output file.
transformer = sklearn_model.transformer(
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=predictions_s3_uri,
    accept="text/csv",
    assemble_with="Line"
)

# Run batch transform; split_type="Line" treats each CSV row as one record.
transformer.transform(
    data=production_data_s3_uri,
    content_type="text/csv",
    split_type="Line"
)

print(f"SageMaker Batch Transform job started for data: {production_data_s3_uri}")
print(f"Output will be saved to: {predictions_s3_uri}")

# Wait for completion
# NOTE(review): in SageMaker SDK v2 `transform()` appears to block by default,
# in which case this call returns immediately — confirm for the pinned version.
transformer.wait()

............................[34m2025-06-11 02:02:58,464 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2025-06-11 02:02:58,468 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2025-06-11 02:02:58,469 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
  

In [10]:
# Copy this notebook file to the project prefix in S3 using the AWS CLI.
# NOTE(review): the CLI copies the file as it exists on disk — save the
# notebook before running this cell or unsaved edits will not be included.
!aws s3 cp cardio_inference_transform_job_v2.ipynb s3://sagemaker-us-east-1-531690656306/cardio_project/cardio_inference_transform_job_v2.ipynb

upload: ./cardio_inference_transform_job_v2.ipynb to s3://sagemaker-us-east-1-531690656306/cardio_project/cardio_inference_transform_job_v2.ipynb
