In [10]:
import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
import io

# Step 1: Setup
role = get_execution_role()
region = boto3.session.Session().region_name
session = sagemaker.Session()
bucket = "cardiovale-solutions-datascience-pipeline"
train_key = "xgb/manual/train/train.csv"

# Step 2: Load data with extreme validation
s3_client = boto3.client("s3")
source_key = "feature-store/cardio/cardio-feature-group-22-21-14-34/autopilot_input.csv"
response = s3_client.get_object(Bucket=bucket, Key=source_key)
df = pd.read_csv(response["Body"])

# Step 3: Nuclear option for label cleaning
print("Original label distribution:")
print(df["cardio"].value_counts())

# Convert to numeric, drop NA, then force binary
df["cardio"] = pd.to_numeric(df["cardio"], errors="coerce")
df = df.dropna(subset=["cardio"])
df["cardio"] = np.where(df["cardio"] > 0.5, 1, 0).astype(int) 

# Final validation
assert set(df["cardio"].unique()) == {0, 1}, f"Invalid labels found: {df['cardio'].unique()}"
print("Final label distribution:")
print(df["cardio"].value_counts())

# Step 4: Prepare features
df = df.select_dtypes(include=[np.number])
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# Step 5: Move label to last position
label_col = "cardio"
cols = [c for c in df.columns if c != label_col] + [label_col]
df = df[cols]

# Ensure label is the first column (required for CSV with no header)
label_col = "cardio"
cols = [label_col] + [col for col in df.columns if col != label_col]
df = df[cols]

# Step 6: Create train/test split
train_df = df.sample(frac=0.7, random_state=42)
test_df = df.drop(train_df.index)
print(f"Split complete - Train: {train_df.shape}, Test: {test_df.shape}")

# Step 7: Save to CSV with explicit formatting
csv_buffer = io.StringIO()
train_df.to_csv(csv_buffer, index=False, header=False, float_format="%.6f")
csv_content = csv_buffer.getvalue()

# Verify the last column
last_col_vals = [row.split(",")[-1].strip() for row in csv_content.split("\n") if row]
print(f"Last column sample values: {set(last_col_vals[:100])}")

# Upload with proper content type
s3_client.put_object(
    Bucket=bucket,
    Key=train_key,
    Body=csv_content,
    ContentType="text/csv"
)

# Step 8: Configure training
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-2")  

xgb_estimator = Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{bucket}/xgb/manual/output/",
    sagemaker_session=session,
    base_job_name="cardio-xgboost-final"
)

# Simplified hyperparameters
xgb_estimator.set_hyperparameters(
    objective="binary:logistic",
    num_round=50,
    eval_metric="error",
    verbosity=2  # Increased logging
)

# Step 9: Train with File mode
print("Starting training with verified data...")
xgb_estimator.fit(
    {"train": TrainingInput(
        f"s3://{bucket}/{train_key}",
        content_type="text/csv",
        input_mode="File",
        distribution="ShardedByS3Key"  # Better for large files
    )},
    wait=True,
    logs=True
)

Original label distribution:
cardio
0    35021
1    34979
Name: count, dtype: int64
Final label distribution:
cardio
0    35021
1    34979
Name: count, dtype: int64
Split complete - Train: (49000, 16), Test: (21000, 16)
Last column sample values: {'0.000000', '2.000000'}


INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: cardio-xgboost-final-2025-03-31-05-50-21-881


Starting training with verified data...
2025-03-31 05:50:24 Starting - Starting the training job...
2025-03-31 05:50:39 Starting - Preparing the instances for training...
2025-03-31 05:51:02 Downloading - Downloading input data...
2025-03-31 05:51:52 Downloading - Downloading the training image......
2025-03-31 05:52:54 Training - Training image download completed. Training in progress.
2025-03-31 05:52:54 Uploading - Uploading generated training model[34m[2025-03-31 05:52:45.462 ip-10-0-223-231.ec2.internal:8 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-03-31:05:52:45:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-03-31:05:52:45:INFO] Failed to parse hyperparameter eval_metric value error to Json.[0m
[34mReturning the value itself[0m
[34m[2025-03-31:05:52:45:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-03-31:05:52:45:INFO] No GPUs detected (nor

In [14]:
# Deploying The Predictor
xgb_predictor = xgb_estimator.deploy(initial_instance_count=1, instance_type='ml.m5.large')



INFO:sagemaker:Creating model with name: cardio-xgboost-final-2025-03-31-06-02-28-509


ClientError: An error occurred (AccessDeniedException) when calling the CreateModel operation: User: arn:aws:sts::786782285170:assumed-role/LabRole/SageMaker is not authorized to perform: sagemaker:CreateModel on resource: arn:aws:sagemaker:us-east-1:786782285170:model/cardio-xgboost-final-2025-03-31-06-02-28-509 with an explicit deny in an identity-based policy

In [16]:
from sagemaker.serializers import CSVSerializer

# Use test_df (or whatever variable holds your test data) instead of test
test_array = test_df.drop(['cardio'], axis=1).values

# Set the serializer for converting input to CSV format
xgb_predictor.serializer = CSVSerializer() 

# Make predictions
predictions = xgb_predictor.predict(test_array)

# If predictions come back as bytes, decode and clean them
if isinstance(predictions, bytes):
    predictions = predictions.decode('utf-8').strip()

# Convert the prediction string into a NumPy array
predictions_array = np.fromstring(predictions, sep=',')
print(predictions_array.shape)


NameError: name 'xgb_predictor' is not defined