### Create inference and pkl for the training feature

In [5]:
# Generate_model_and_inference.py
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import pickle
import tarfile

# Step 1: Load full dataset for inference preparation
df = pd.read_csv("s3://sagemaker-us-east-1-381492296191/cardio_data/cardio_prod_split40.csv")
df.drop(columns=["cardio"], inplace=True)

# Save as no-label inference CSV (no header as expected by SageMaker)
df.to_csv("cardio_prod_no_label.csv", index=False, header=False)

# Step 2: Load full dataset for training
df_train = pd.read_csv("s3://sagemaker-us-east-1-381492296191/cardio_data/cardio_prod_split40.csv")
X = df_train.drop(columns=["cardio"])
y = df_train["cardio"]

# Step 3: Encode any object-type features if still present (shouldn't be anymore)
for col in X.select_dtypes(include="object").columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Step 4: Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X, y)

# Step 5: Save model
with open("logistic_model.pkl", "wb") as f:
    pickle.dump(model, f)

# Step 6: Create inference.py
inference_code = '''\
import pickle
import pandas as pd
from io import StringIO

# Define feature columns (excluding cholesterol_label)
FEATURE_COLUMNS = [
    'bmi', 'pulse_pressure', 'chol_bmi_ratio', 'age_gluc_interaction', 'age_years',
    'bp_category', 'bmi_category', 'age_group', 'age', 'gender',
    'systolic_bp', 'diastolic_bp', 'cholesterol', 'gluc', 'smoke',
    'alco', 'active', 'is_hypertensive', 'lifestyle_score'
]

def model_fn(model_dir):
    with open(f"{model_dir}/logistic_model.pkl", "rb") as f:
        return pickle.load(f)

def input_fn(input_data, content_type):
    df = pd.read_csv(StringIO(input_data), header=None)
    df.columns = FEATURE_COLUMNS
    return df

def predict_fn(input_data, model):
    return model.predict(input_data)

def output_fn(prediction, content_type):
    return '\\n'.join(str(x) for x in prediction)
'''

with open("inference.py", "w") as f:
    f.write(inference_code)

# Step 7: Package model and inference code
with tarfile.open("logistic_model.tar.gz", "w:gz") as tar:
    tar.add("logistic_model.pkl")
    tar.add("inference.py")

print("✅ logistic_model.tar.gz created successfully.")

✅ logistic_model.tar.gz created successfully.


import joblib
joblib.dump(model, 'logistic_model.pkl')

In [19]:
import joblib

# After training your model:
joblib.dump(model, "logistic_model.pkl")

['logistic_model.pkl']

### Compare Features in Training vs Inference Data

In [6]:
import pandas as pd

# Load the training data (with label)
df_train = pd.read_csv("s3://sagemaker-us-east-1-381492296191/cardio_data/cardio_prod_split40.csv")
train_features = df_train.drop(columns=["cardio"]).columns.tolist()

# Load the inference data (no label)
df_infer = pd.read_csv("cardio_prod_no_label.csv", header=None)

# Compare number of columns
print("✅ Training Features Count:", len(train_features))
print("✅ Inference Features Count:", df_infer.shape[1])

# Optional: Set column names on inference data to match training features for manual inspection
df_infer.columns = train_features

# Compare data types and column names
print("\n📌 Training Feature Types:")
print(df_train[train_features].dtypes)

print("\n📌 Inference Data Types:")
print(df_infer.dtypes)

# Identify mismatched columns (by type or order)
mismatch_columns = [
    (col, df_train[col].dtype, df_infer[col].dtype)
    for col in train_features
    if df_train[col].dtype != df_infer[col].dtype
]

if mismatch_columns:
    print("\n❌ Mismatched Columns Found:")
    for col, train_type, infer_type in mismatch_columns:
        print(f"   - {col}: training type = {train_type}, inference type = {infer_type}")
else:
    print("\n✅ All feature columns match in name and dtype.")

✅ Training Features Count: 19
✅ Inference Features Count: 19

📌 Training Feature Types:
bmi                     float64
pulse_pressure          float64
chol_bmi_ratio          float64
age_gluc_interaction    float64
age_years               float64
bp_category               int64
bmi_category              int64
age_group                 int64
age                       int64
gender                    int64
systolic_bp               int64
diastolic_bp              int64
cholesterol               int64
gluc                      int64
smoke                     int64
alco                      int64
active                    int64
is_hypertensive           int64
lifestyle_score           int64
dtype: object

📌 Inference Data Types:
bmi                     float64
pulse_pressure          float64
chol_bmi_ratio          float64
age_gluc_interaction    float64
age_years               float64
bp_category               int64
bmi_category              int64
age_group                 int64
age      

In [7]:
print("Training Features:\n", train_features)
print("Inference Columns:\n", df_infer.columns.tolist())

Training Features:
 ['bmi', 'pulse_pressure', 'chol_bmi_ratio', 'age_gluc_interaction', 'age_years', 'bp_category', 'bmi_category', 'age_group', 'age', 'gender', 'systolic_bp', 'diastolic_bp', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'is_hypertensive', 'lifestyle_score']
Inference Columns:
 ['bmi', 'pulse_pressure', 'chol_bmi_ratio', 'age_gluc_interaction', 'age_years', 'bp_category', 'bmi_category', 'age_group', 'age', 'gender', 'systolic_bp', 'diastolic_bp', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'is_hypertensive', 'lifestyle_score']


### Creating inference.py file

In [21]:
import tarfile

with tarfile.open("logistic_model.tar.gz", "w:gz") as tar:
    tar.add("logistic_model.pkl")
    tar.add("inference.py")

### Sagemaker Transform Job

In [8]:
from sagemaker.sklearn.model import SKLearnModel
from sagemaker import Session
import sagemaker

# Get SageMaker session and role
session = Session()
role = sagemaker.get_execution_role()

# Define the model
model = SKLearnModel(
    model_data="s3://sagemaker-us-east-1-381492296191/cardio_data/logistic_model.tar.gz",
    role=role,
    entry_point="inference.py",
    framework_version="0.23-1",
    sagemaker_session=session
)

# Create a transformer object
transformer = model.transformer(
    instance_count=1,
    instance_type="ml.m5.large",
    output_path="s3://sagemaker-us-east-1-381492296191/cardio_data/predictions/",
    accept="text/csv"
)

# Define input data location
production_data_s3_uri = "s3://sagemaker-us-east-1-381492296191/cardio_data/cardio_prod_no_label.csv"

# Run batch transform
transformer.transform(
    data=production_data_s3_uri,
    content_type="text/csv",
    split_type="Line"
)

print(f"SageMaker Batch Transform job started for data: {production_data_s3_uri}")
print(f"Output will be saved to: s3://sagemaker-us-east-1-381492296191/cardio_data/predictions/")

# Wait for completion
transformer.wait()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


INFO:sagemaker:Creating transform job with name: sagemaker-scikit-learn-2025-05-31-16-01-22-230


............................[34m2025-05-31 16:05:58,390 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2025-05-31 16:05:58,394 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2025-05-31 16:05:58,395 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
  