In [1]:
# Cell 1: Load Data from S3
# This cell loads the 'WA_Fn-UseC_-Telco-Customer-Churn.csv' dataset
# directly from your specified S3 bucket into a Pandas DataFrame.

import pandas as pd
import sagemaker
import boto3

# Define your S3 bucket and file key
bucket_name = 'telco-churn-ml-nissoka' # Make sure this matches your bucket name exactly
file_key = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Construct the S3 URI
s3_uri = f's3://{bucket_name}/{file_key}'

print(f"Attempting to load data from: {s3_uri}")

# Read the CSV straight from S3 into a DataFrame.
try:
    df = pd.read_csv(s3_uri)
except Exception as e:
    # Report the failure, then re-raise. Every later cell needs `df`; swallowing
    # the error here would only resurface further down as an unrelated NameError,
    # or (worse) let later cells silently reuse a stale `df` left in the kernel.
    print(f"Error loading data: {e}")
    raise

print("Data loaded successfully!")
print(f"Shape of the dataset: {df.shape}")
print("\nFirst 5 rows of the dataset:")
print(df.head())



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Attempting to load data from: s3://telco-churn-ml-nissoka/WA_Fn-UseC_-Telco-Customer-Churn.csv


severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Data loaded successfully!
Shape of the dataset: (7043, 21)

First 5 rows of the dataset:
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No

In [2]:
# Cell 2: Initial Data Inspection
# A first look at the loaded DataFrame, printed in four parts:
#   1. df.info()           -> column dtypes and non-null counts
#   2. df.describe()       -> descriptive statistics
#   3. Churn value counts  -> how many rows fall in each target class
#   4. duplicated().sum()  -> number of fully duplicated rows

print("--- Dataset Info ---")
df.info()  # info() writes its report itself, so it is not wrapped in print()

print("\n--- Descriptive Statistics for Numerical Columns ---")
numeric_summary = df.describe()
print(numeric_summary)

print("\n--- Value Counts for 'Churn' Column ---")
churn_counts = df['Churn'].value_counts()
print(churn_counts)

print("\n--- Check for Duplicates ---")
duplicate_row_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_row_count}")

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  704

In [3]:
# Cell 3: Handle TotalCharges Column
# 'TotalCharges' arrives as an object (string) column because some entries are not
# parseable as numbers. This cell coerces those entries to NaN, reports how many
# there were, fills them with 0, and leaves the column as a float column.

print("Original 'TotalCharges' data type:", df['TotalCharges'].dtype)

# Convert to numeric; anything that cannot be parsed (e.g. a blank string) becomes NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Report how many values could not be parsed.
print(f"Number of NaN values introduced in 'TotalCharges': {df['TotalCharges'].isnull().sum()}")

# Fill the NaNs with 0.
# NOTE(review): this assumes the unparseable rows are new customers with no charges
# yet -- confirm against the data before relying on it.
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# selected column: that form is chained assignment, which emits a FutureWarning on
# pandas 2.x and does not update `df` at all once Copy-on-Write is enabled.
df['TotalCharges'] = df['TotalCharges'].fillna(0)

# Verify the data type and check descriptive statistics again
print("New 'TotalCharges' data type:", df['TotalCharges'].dtype)
print("\nDescriptive statistics for 'TotalCharges' after conversion:")
print(df['TotalCharges'].describe())

Original 'TotalCharges' data type: object
Number of NaN values introduced in 'TotalCharges': 11
New 'TotalCharges' data type: float64

Descriptive statistics for 'TotalCharges' after conversion:
count    7043.000000
mean     2279.734304
std      2266.794470
min         0.000000
25%       398.550000
50%      1394.550000
75%      3786.600000
max      8684.800000
Name: TotalCharges, dtype: float64


In [4]:
# Cell 4: Drop customerID and Encode Churn
# Two cleaning steps:
# 1. Drop 'customerID' (a per-row identifier, not a modeling feature).
# 2. Encode the 'Churn' target from 'Yes'/'No' to 1/0.
# Both steps are written so that re-running the cell is harmless.

print("Original number of columns:", df.shape[1])

# Drop 'customerID'. errors='ignore' plus reassignment (instead of inplace=True)
# means a second run of this cell does not raise KeyError once the column is gone.
df = df.drop(columns='customerID', errors='ignore')
print(f"'customerID' column dropped. New number of columns: {df.shape[1]}")

print("\nOriginal 'Churn' value counts:")
print(df['Churn'].value_counts())

# Encode 'Churn': 'Yes' -> 1, 'No' -> 0.
# Only map while the column still holds the raw labels: mapping an already-encoded
# column would turn every value into NaN, because 0/1 are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    churn_encoded = df['Churn'].map({'Yes': 1, 'No': 0})
    if churn_encoded.isnull().any():
        # A label other than 'Yes'/'No' would otherwise become NaN silently.
        unexpected_labels = df.loc[churn_encoded.isnull(), 'Churn'].unique().tolist()
        raise ValueError(f"Unexpected values in 'Churn': {unexpected_labels}")
    df['Churn'] = churn_encoded.astype('int64')

print("\nNew 'Churn' value counts after encoding:")
print(df['Churn'].value_counts())
print(f"New 'Churn' data type: {df['Churn'].dtype}")

# Display the first few rows to confirm changes
print("\nFirst 5 rows after dropping 'customerID' and encoding 'Churn':")
print(df.head())

Original number of columns: 21
'customerID' column dropped. New number of columns: 20

Original 'Churn' value counts:
No     5174
Yes    1869
Name: Churn, dtype: int64

New 'Churn' value counts after encoding:
0    5174
1    1869
Name: Churn, dtype: int64
New 'Churn' data type: int64

First 5 rows after dropping 'customerID' and encoding 'Churn':
   gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  Female              0     Yes         No       1           No   
1    Male              0      No         No      34          Yes   
2    Male              0      No         No       2          Yes   
3    Male              0      No         No      45           No   
4  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL      

In [5]:
# Cell 5: Identify and Inspect Categorical Features
# Collects every column that still has the object dtype (the remaining text /
# categorical features) and prints, for each one, the frequency of every value and
# the number of distinct values, so inconsistencies can be spotted before encoding.

# 'Churn' does not show up here: it was mapped to integers in Cell 4, so it is no
# longer an object column. The code itself performs no explicit exclusion.
categorical_cols = list(df.select_dtypes(include='object').columns)

print(f"Identified Categorical Columns ({len(categorical_cols)}):")
print(categorical_cols)

print("\n--- Unique values for each Categorical Column ---")
for col in categorical_cols:
    column_values = df[col]
    print(f"\nColumn '{col}':")
    print(column_values.value_counts())
    print(f"Number of unique values: {column_values.nunique()}")

Identified Categorical Columns (15):
['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

--- Unique values for each Categorical Column ---

Column 'gender':
Male      3555
Female    3488
Name: gender, dtype: int64
Number of unique values: 2

Column 'Partner':
No     3641
Yes    3402
Name: Partner, dtype: int64
Number of unique values: 2

Column 'Dependents':
No     4933
Yes    2110
Name: Dependents, dtype: int64
Number of unique values: 2

Column 'PhoneService':
Yes    6361
No      682
Name: PhoneService, dtype: int64
Number of unique values: 2

Column 'MultipleLines':
No                  3390
Yes                 2971
No phone service     682
Name: MultipleLines, dtype: int64
Number of unique values: 3

Column 'InternetService':
Fiber optic    3096
DSL            2421
No             1526
Name: Intern

In [6]:
# Cell 6: Consolidate Categories and Apply One-Hot Encoding
# First collapses two placeholder categories into a plain 'No':
#   'No internet service' -> 'No'   (in the six internet add-on columns)
#   'No phone service'    -> 'No'   (in 'MultipleLines')
# Then one-hot encodes every remaining object-dtype column with pandas get_dummies(),
# producing the all-numeric frame `df_encoded` used by the modeling cells.

# List of columns where 'No internet service' should be treated as 'No'
internet_service_cols = [
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies'
]

# Replace 'No internet service' with 'No' in relevant columns
for col in internet_service_cols:
    df[col] = df[col].replace('No internet service', 'No')
    print(f"Column '{col}' unique values after replacement:")
    print(df[col].value_counts()) # Verify change

# Replace 'No phone service' with 'No' in 'MultipleLines'
df['MultipleLines'] = df['MultipleLines'].replace('No phone service', 'No')
print("\nColumn 'MultipleLines' unique values after replacement:")
print(df['MultipleLines'].value_counts()) # Verify change


# Identify the columns to one-hot encode: everything that is still object dtype.
# 'Churn' is not picked up because it was already converted to integers earlier.
categorical_cols_for_ohe = df.select_dtypes(include='object').columns.tolist()

print(f"\nColumns selected for One-Hot Encoding ({len(categorical_cols_for_ohe)}):")
print(categorical_cols_for_ohe)

# Apply One-Hot Encoding
# drop_first=True drops the first level of each column so the dummies are not
# perfectly collinear (e.g. for 'gender' the 'Female' level is dropped and only
# 'gender_Male' is kept).
# dtype=int makes the new columns 0/1 integers instead of booleans.
df_encoded = pd.get_dummies(df, columns=categorical_cols_for_ohe, drop_first=True, dtype=int)

print(f"\nShape of DataFrame after One-Hot Encoding: {df_encoded.shape}")
print("\nFirst 5 rows of the encoded DataFrame:")
print(df_encoded.head())

print("\nData types of the encoded DataFrame:")
# info() prints its report itself and returns None; wrapping it in print() would
# append a stray "None" line to the output.
df_encoded.info()

Column 'OnlineSecurity' unique values after replacement:
No     5024
Yes    2019
Name: OnlineSecurity, dtype: int64
Column 'OnlineBackup' unique values after replacement:
No     4614
Yes    2429
Name: OnlineBackup, dtype: int64
Column 'DeviceProtection' unique values after replacement:
No     4621
Yes    2422
Name: DeviceProtection, dtype: int64
Column 'TechSupport' unique values after replacement:
No     4999
Yes    2044
Name: TechSupport, dtype: int64
Column 'StreamingTV' unique values after replacement:
No     4336
Yes    2707
Name: StreamingTV, dtype: int64
Column 'StreamingMovies' unique values after replacement:
No     4311
Yes    2732
Name: StreamingMovies, dtype: int64

Column 'MultipleLines' unique values after replacement:
No     4072
Yes    2971
Name: MultipleLines, dtype: int64

Columns selected for One-Hot Encoding (15):
['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport

In [9]:
# Cell 7: Prepare Data for Modeling (Feature-Target Split & Train-Test Split)
# This cell separates the features (X) from the target variable (y - 'Churn').
# It then splits the data into training, validation, and test sets using sklearn's train_test_split.
# A common split is 70% for training, 15% for validation, and 15% for testing.

from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns
y = df_encoded['Churn']               # target
X = df_encoded.drop(columns='Churn')  # every column except 'Churn'

print(f"Shape of X (features): {X.shape}")
print(f"Shape of y (target): {y.shape}")

# First split: 70% training, 30% temporary hold-out.
# stratify=y keeps the churn proportion the same on both sides of the split.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

# Second split: cut the 30% hold-out in half, giving roughly 15% validation and
# 15% test of the full dataset, again stratified on the target.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

print(f"\nShape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of X_val (validation): {X_val.shape}")
print(f"Shape of y_val (validation): {y_val.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")

# Fraction of rows labeled 1 (churned) in each split; the three values should
# match closely because both splits were stratified.
print("\n--- Churn distribution in splits ---")
print(f"Train Churn %: {y_train.value_counts(normalize=True)[1]:.2f}")
print(f"Validation Churn %: {y_val.value_counts(normalize=True)[1]:.2f}")
print(f"Test Churn %: {y_test.value_counts(normalize=True)[1]:.2f}")

Shape of X (features): (7043, 23)
Shape of y (target): (7043,)

Shape of X_train: (4930, 23)
Shape of y_train: (4930,)
Shape of X_val (validation): (1056, 23)
Shape of y_val (validation): (1056,)
Shape of X_test: (1057, 23)
Shape of y_test: (1057,)

--- Churn distribution in splits ---
Train Churn %: 0.27
Validation Churn %: 0.27
Test Churn %: 0.27


In [10]:
# Cell 8: Prepare Data for SageMaker Training and Upload to S3
# This cell prepares the training and validation datasets in the format expected by SageMaker's
# built-in XGBoost algorithm (CSV without headers, with the target variable as the first column).
# It then uploads these prepared datasets to the specified S3 bucket.

import os
import io # To write to string buffer for S3 upload
from sagemaker.session import Session # To get default sagemaker session

# Cast the target to float32 before writing it out.
# NOTE(review): presumably done so the label column is written as 0.0/1.0 for the
# built-in XGBoost container -- confirm it is actually required.
y_train_float = y_train.astype('float32')
y_val_float = y_val.astype('float32')

# Put the target in front of the features: the CSV layout used here is
# "label first, no header" (see the cell description above).
train_data = pd.concat([y_train_float, X_train], axis=1)
val_data = pd.concat([y_val_float, X_val], axis=1)

print(f"Shape of train_data for SageMaker: {train_data.shape}")
print(f"Shape of val_data for SageMaker: {val_data.shape}")
print("First 5 rows of train_data (target should be first column):")
print(train_data.head())

# Key prefix ("sub-folder") inside the bucket for this project's model data
s3_prefix = 'telco-churn-xgboost'

# SageMaker session used for the uploads
sagemaker_session = sagemaker.Session()
# The uploads go to the project bucket defined in Cell 1 ('telco-churn-ml-nissoka'),
# not to the session's own default bucket; the variable name is kept because later
# cells may refer to it.
default_bucket = bucket_name

# Serialize both frames to CSV text in memory.
# header=False: no header row is written
# index=False: the DataFrame index is not written as a column
train_csv_buffer = io.StringIO()
train_data.to_csv(train_csv_buffer, sep=',', header=False, index=False)
train_csv_string = train_csv_buffer.getvalue()

val_csv_buffer = io.StringIO()
val_data.to_csv(val_csv_buffer, sep=',', header=False, index=False)
val_csv_string = val_csv_buffer.getvalue()

# Build the S3 object keys.
# Keys are joined with '/' explicitly: S3 keys always use forward slashes, whereas
# os.path.join uses the local OS separator and would produce backslashes on Windows.
train_s3_path = f'{s3_prefix}/train/train.csv'
val_s3_path = f'{s3_prefix}/validation/validation.csv'

print(f"\nUploading training data to s3://{default_bucket}/{train_s3_path}")
sagemaker_session.upload_string_as_file_body(body=train_csv_string, bucket=default_bucket, key=train_s3_path)
print("Training data uploaded successfully.")

print(f"\nUploading validation data to s3://{default_bucket}/{val_s3_path}")
sagemaker_session.upload_string_as_file_body(body=val_csv_string, bucket=default_bucket, key=val_s3_path)
print("Validation data uploaded successfully.")

# Store S3 URIs for later use in model training
s3_train_data = f's3://{default_bucket}/{train_s3_path}'
s3_validation_data = f's3://{default_bucket}/{val_s3_path}'

print(f"\nS3 Training Data Path: {s3_train_data}")
print(f"S3 Validation Data Path: {s3_validation_data}")

Shape of train_data for SageMaker: (4930, 24)
Shape of val_data for SageMaker: (1056, 24)
First 5 rows of train_data (target should be first column):
      Churn  SeniorCitizen  tenure  MonthlyCharges  TotalCharges  gender_Male  \
5557    0.0              0       5           80.20        384.25            0   
2270    1.0              1       3           86.85        220.95            0   
6930    1.0              0       3           75.15        216.75            0   
2257    0.0              0      60           80.55       4847.05            0   
898     1.0              0      12           98.90       1120.95            0   

      Partner_Yes  Dependents_Yes  PhoneService_Yes  MultipleLines_Yes  ...  \
5557            0               0                 1                  0  ...   
2270            0               0                 1                  0  ...   
6930            1               0                 1                  1  ...   
2257            0               0              


Uploading training data to s3://telco-churn-ml-nissoka/telco-churn-xgboost/train/train.csv
Training data uploaded successfully.

Uploading validation data to s3://telco-churn-ml-nissoka/telco-churn-xgboost/validation/validation.csv
Validation data uploaded successfully.

S3 Training Data Path: s3://telco-churn-ml-nissoka/telco-churn-xgboost/train/train.csv
S3 Validation Data Path: s3://telco-churn-ml-nissoka/telco-churn-xgboost/validation/validation.csv


In [11]:
# Cell 9: Train XGBoost Model with SageMaker
# This cell configures and trains an XGBoost model using SageMaker's built-in algorithm.
# It defines the estimator, specifies hyperparameters, and runs the training job
# using the data uploaded to S3 in the previous step.

import sagemaker

from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput

# SageMaker session used for the training job
sagemaker_session = sagemaker.Session()

# Resolve the IAM role the training job will run as.
# On a notebook instance this is the role attached to the instance (visible in the
# SageMaker console under Notebook Instances -> your_notebook_name -> Permissions
# and encryption). The role must be able to access the S3 bucket and to create and
# manage SageMaker training jobs and endpoints.
try:
    sagemaker_role = sagemaker.get_execution_role()
    print(f"Using SageMaker Execution Role: {sagemaker_role}")
except ValueError:
    print("Could not automatically retrieve SageMaker execution role. Please manually provide it.")
    # Stop here instead of falling through: without a role, `sagemaker_role` is
    # undefined and the Estimator below would fail with an unrelated NameError.
    # To proceed outside a SageMaker notebook, replace the lookup above with e.g.:
    # sagemaker_role = 'arn:aws:iam::YOUR_ACCOUNT_ID:role/AmazonSageMaker-ExecutionRole-XXXXXXXXXXXX'
    raise

# Look up the XGBoost container image for the session's region
# (reference list: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-images.html)
region = sagemaker_session.boto_region_name
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "1.7-1")
print(f"XGBoost container image for {region}: {xgboost_container}")

# Define the output path for the model artifacts in S3
output_path = f's3://{bucket_name}/{s3_prefix}/output'
print(f"Model artifacts will be saved to: {output_path}")

# Configure the XGBoost Estimator
# image_uri: container image that runs the training
# role: the IAM role with permissions for SageMaker to perform actions
# instance_count: Number of instances to use for training
# instance_type: Type of EC2 instance for training
# output_path: Where the model artifacts will be stored in S3
# sagemaker_session: The SageMaker session
# base_job_name: Prefix for training job names
xgb_estimator = Estimator(
    image_uri=xgboost_container,
    role=sagemaker_role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=output_path,
    sagemaker_session=sagemaker_session,
    base_job_name='telco-churn-xgb-training'
)

# Set XGBoost hyperparameters
# objective: 'binary:logistic' for binary classification
# eval_metric: 'auc' is evaluated on the validation channel
# num_round: Number of boosting rounds (iterations)
# early_stopping_rounds: Stop training if validation metric doesn't improve for this many rounds
# eta (learning_rate): Step size shrinkage to prevent overfitting
# max_depth: Maximum depth of a tree
# subsample: Subsample ratio of the training instance
# colsample_bytree: Subsample ratio of columns when constructing each tree
# min_child_weight: Minimum sum of instance weight needed in a child
# gamma: Minimum loss reduction required to make a further partition
xgb_estimator.set_hyperparameters(
    objective='binary:logistic',
    eval_metric='auc',
    num_round=100,
    early_stopping_rounds=10,
    eta=0.2,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=1,
    gamma=0.1
)

# Define the training and validation data inputs
train_input = TrainingInput(s3_train_data, content_type='csv')
validation_input = TrainingInput(s3_validation_data, content_type='csv')

# Run the training job. fit() is called with its defaults, so it streams the job
# logs and only returns once the job has finished.
print("\nInitiating XGBoost model training...")
xgb_estimator.fit({'train': train_input, 'validation': validation_input})
print("XGBoost model training job completed.")

# With the job finished, the estimator exposes the S3 location of the model artifact.
print(f"Model data S3 URI: {xgb_estimator.model_data}")

Using SageMaker Execution Role: arn:aws:iam::687563221599:role/service-role/AmazonSageMaker-ExecutionRole-20250526T210337


XGBoost container image for us-east-1: 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1
Model artifacts will be saved to: s3://telco-churn-ml-nissoka/telco-churn-xgboost/output



Initiating XGBoost model training...


2025-05-27 03:35:38 Starting - Starting the training job...
2025-05-27 03:36:13 Downloading - Downloading input data...
2025-05-27 03:36:38 Downloading - Downloading the training image......
2025-05-27 03:37:40 Training - Training image download completed. Training in progress.
2025-05-27 03:37:40 Uploading - Uploading generated training model.[34m[2025-05-27 03:37:35.846 ip-10-0-219-147.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-05-27 03:37:35.868 ip-10-0-219-147.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-05-27:03:37:36:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-05-27:03:37:36:INFO] Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34m[2025-05-27:03:37:36:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-05-27:03:37:36:INFO] No G

In [12]:
# Cell 10: Deploy XGBoost Model as a Real-time Endpoint
# This cell deploys the trained XGBoost model as a real-time HTTPS endpoint in SageMaker.
# This endpoint allows us to send new data and get predictions instantly.

from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer # To send CSV data to the endpoint
from sagemaker.deserializers import CSVDeserializer # To receive CSV predictions from the endpoint

print("Deploying the trained XGBoost model to a real-time endpoint...")

# Estimator.deploy() creates the endpoint and returns a predictor bound to it.
# initial_instance_count / instance_type size the endpoint; a small instance such
# as 'ml.t2.medium' is enough for evaluation and testing.
# CSVSerializer / CSVDeserializer make the predictor send and receive CSV.
try:
    xgb_predictor = xgb_estimator.deploy(
        initial_instance_count=1,
        instance_type='ml.t2.medium',
        serializer=CSVSerializer(), # Data sent to the endpoint will be CSV
        deserializer=CSVDeserializer() # Data received from the endpoint will be CSV
    )
except Exception as e:
    # Report and re-raise: continuing would either fail below with a NameError on
    # `xgb_predictor` or, worse, silently pick up a predictor left over from an
    # earlier run that points at a different (possibly deleted) endpoint.
    print(f"Error deploying model: {e}")
    raise

print("XGBoost model deployed successfully!")

# Store the endpoint name for future reference (only reached on success)
endpoint_name = xgb_predictor.endpoint_name
print(f"Endpoint name: {endpoint_name}")

Deploying the trained XGBoost model to a real-time endpoint...


---------------!XGBoost model deployed successfully!
Endpoint name: telco-churn-xgb-training-2025-05-27-03-39-12-964


In [12]:
# Cell 11: Make Predictions and Evaluate Model Performance
# This cell uses the deployed SageMaker endpoint to get predictions on the test set.
# It then converts the predicted probabilities into binary class labels (0 or 1)
# and calculates key classification metrics (confusion matrix, accuracy, precision, recall, F1-score)
# to assess the model's performance.

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

print("Making predictions on the test set using the deployed endpoint...")

# Features only, as plain nested lists, so the predictor's CSV serializer can send them
test_features_list = X_test.values.tolist()

# Requests are sent in batches to keep each payload small
# (endpoints limit the size of a single request).
batch_size = 200
predictions = []
total_batches = (len(test_features_list) + batch_size - 1) // batch_size  # ceiling division

for i in range(0, len(test_features_list), batch_size):
    batch = test_features_list[i : i + batch_size]
    # predict() sends the batch; the CSV deserializer returns the parsed rows
    response = xgb_predictor.predict(batch)
    # Each row is a list of strings (e.g. ['0.75']); take the first field as a float
    probabilities = [float(item[0]) for item in response]
    predictions.extend(probabilities)
    print(f"Processed batch {i // batch_size + 1}/{total_batches}") # Progress update

# All predicted probabilities as one NumPy array
y_pred_proba = np.array(predictions)

print("\nPredictions obtained from the endpoint.")
print(f"Sample probabilities: {y_pred_proba[:10]}")

# Turn probabilities into 0/1 labels with a fixed cut-off.
# 0.5 is only a starting point; other thresholds may suit an imbalanced target better.
threshold = 0.5
y_pred = (y_pred_proba >= threshold).astype(int)

print(f"\nSample binary predictions (threshold={threshold}): {y_pred[:10]}")

# Evaluate the model
print("\n--- Model Evaluation on Test Set ---")

# Confusion matrix (rows = actual class, columns = predicted class):
# [[True Negatives (TN), False Positives (FP)]
#  [False Negatives (FN), True Positives (TP)]]
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

# Threshold-based metrics use the 0/1 labels; ROC AUC uses the raw probabilities
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"\nAccuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC AUC Score: {auc_score:.4f}")

# IMPORTANT: the endpoint keeps incurring cost while it exists.
# If evaluation is all you need, it can be deleted right here with:
# xgb_predictor.delete_endpoint()
# print("Endpoint deleted.")



Making predictions on the test set using the deployed endpoint...
Processed batch 1/6
Processed batch 2/6
Processed batch 3/6
Processed batch 4/6
Processed batch 5/6
Processed batch 6/6

Predictions obtained from the endpoint.
Sample probabilities: [0.34361413 0.07854402 0.0351665  0.18758756 0.03396553 0.12085247
 0.02656176 0.09240169 0.11652198 0.80386442]

Sample binary predictions (threshold=0.5): [0 0 0 0 0 0 0 0 0 1]

--- Model Evaluation on Test Set ---

Confusion Matrix:
[[701  75]
 [148 133]]

Accuracy: 0.7890
Precision: 0.6394
Recall: 0.4733
F1-Score: 0.5440
ROC AUC Score: 0.8361


In [17]:
# Cell 12: Test Endpoint with Sample Inputs via Boto3 and Clean Up
# This cell demonstrates how to send sample data to the deployed SageMaker endpoint
# for real-time predictions using Boto3.
# It also includes the crucial step of deleting the SageMaker endpoint to stop incurring costs.

import boto3
import json
import numpy as np

# `endpoint_name` comes from Cell 10.
# After a kernel restart, either re-run Cell 10 or set it by hand, e.g.:
# endpoint_name = '<your-endpoint-name>'

print(f"Testing endpoint: {endpoint_name}")

# Low-level runtime client, created in the same region as the SageMaker session
runtime_sm_client = boto3.client(
    'sagemaker-runtime',
    region_name=sagemaker_session.boto_region_name,
)

# Use the first row of X_test (features only) as the sample request
sample_input = X_test.iloc[0].values.tolist()

# Serialize the row as one comma-separated line; the request is sent as text/csv
sample_input_csv = ','.join(str(value) for value in sample_input)

print(f"Sample input features (first row of X_test):\n{X_test.iloc[0]}")
print(f"\nSample input as CSV string: {sample_input_csv}")

# Invoke the endpoint. Any failure is reported and execution continues, so the
# clean-up section below still runs.
try:
    response = runtime_sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='text/csv',
        Body=sample_input_csv,
    )

    # The body is a byte stream: read it, decode it, and parse the number it contains
    raw_body = response['Body'].read()
    result = float(raw_body.decode('utf-8'))

    print(f"\nPrediction for sample input (probability): {result:.4f}")

    # Apply the same 0.5 cut-off to get a 0/1 label
    sample_binary_prediction = int(result >= 0.5)
    print(f"Binary prediction (threshold=0.5): {sample_binary_prediction}")
    print(f"Actual Churn label for this sample: {y_test.iloc[0]}")

except Exception as e:
    print(f"Error invoking endpoint: {e}")

print("\n--- Cleaning Up ---")
# IMPORTANT: delete the endpoint so it stops incurring cost.
# Deletion goes through the predictor object created in Cell 10.
try:
    xgb_predictor.delete_endpoint()
    print(f"Endpoint '{endpoint_name}' deletion initiated successfully.")
    print("It may take a few minutes for the endpoint to fully disappear from your SageMaker console.")
except Exception as e:
    print(f"Error deleting endpoint: {e}")
    print("You may need to manually delete the endpoint from the SageMaker console.")

Testing endpoint: telco-churn-xgb-training-2025-05-27-00-33-08-179
Sample input features (first row of X_test):
SeniorCitizen                              1.00
tenure                                    17.00
MonthlyCharges                            45.05
TotalCharges                             770.60
gender_Male                                1.00
Partner_Yes                                0.00
Dependents_Yes                             0.00
PhoneService_Yes                           1.00
MultipleLines_Yes                          0.00
InternetService_Fiber optic                0.00
InternetService_No                         0.00
OnlineSecurity_Yes                         0.00
OnlineBackup_Yes                           0.00
DeviceProtection_Yes                       0.00
TechSupport_Yes                            0.00
StreamingTV_Yes                            0.00
StreamingMovies_Yes                        0.00
Contract_One year                          0.00
Contract_Two year       

Endpoint 'telco-churn-xgb-training-2025-05-27-00-33-08-179' deletion initiated successfully.
It may take a few minutes for the endpoint to fully disappear from your SageMaker console.
