In [6]:
pip install --upgrade sagemaker pydantic

Collecting sagemaker
  Downloading sagemaker-2.240.0-py3-none-any.whl.metadata (16 kB)
Downloading sagemaker-2.240.0-py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m78.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.239.2
    Uninstalling sagemaker-2.239.2:
      Successfully uninstalled sagemaker-2.239.2
Successfully installed sagemaker-2.240.0
Note: you may need to restart the kernel to use updated packages.


In [93]:
pip install --upgrade sagemaker

Note: you may need to restart the kernel to use updated packages.


In [94]:
import sagemaker
import boto3
from sagemaker import image_uris
from sagemaker.session import s3_input, Session

'''We are importing the boto3 library, which allows us to automate AWS services like S3,
SageMaker, EC2, DynamoDB, Lambda, and many more.

🔹 Why is boto3 Important in AWS SageMaker?
For a data scientist working with AWS SageMaker, boto3 is essential for:
✅ Managing SageMaker training jobs (starting, stopping, monitoring)
✅ Deploying machine learning models to SageMaker endpoints
✅ Automating data pipeline workflows (loading data from S3, triggering Lambda functions)
✅ Controlling AWS resources (managing instances, setting up permissions)'''

In [89]:
# Name of the S3 bucket used for the datasets and the model artifacts.
bucket_name = 'bank-application-nishi-24'

# Region configured for the default boto3 session.
boto_session = boto3.session.Session()
my_region = boto_session.region_name
print(my_region)

eu-north-1


In [88]:
# Create the project bucket in the session's region.
s3 = boto3.client('s3')

try:
    if my_region == 'us-east-1':
        # us-east-1 is the default location; S3 rejects an explicit
        # LocationConstraint for it, so the configuration is omitted.
        s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': my_region}  # Specify region explicitly
        )
    print('S3 bucket created successfully')
except s3.exceptions.BucketAlreadyOwnedByYou:
    # Re-running the notebook is expected; an existing bucket we own is not an error.
    print('S3 bucket already exists and is owned by you:', bucket_name)
except Exception as e:
    print('S3 error:', e)

S3 error: An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


In [76]:
# S3 location under which the training job writes the trained model artifacts.
prefix = 'xgboost-as-a-built-in-algo'
output_path = f's3://{bucket_name}/{prefix}/output'
print(output_path)

s3://bank-application-nishi-24/xgboost-as-a-built-in-algo/output


In [77]:
import pandas as pd
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` is loaded.
import urllib.request

# Download the cleaned bank-marketing dataset to the notebook's working directory.
try:
    urllib.request.urlretrieve("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
    print('Sucess:downloaded bank_clean.csv')
except Exception as e:
    print('Data load error: ',e)

# Load the CSV into a DataFrame, using the first column as the index.
try:
    model_data = pd.read_csv('./bank_clean.csv',index_col=0)
    print('Sucess: Data loaded into dataframe')
except Exception as e:
    print('data load error: ',e)

Sucess:downloaded bank_clean.csv
Sucess: Data loaded into dataframe


In [78]:
# Train/test split: shuffle all rows with a fixed seed, then cut at 70%.

import numpy as np

# Slice with .iloc instead of np.split: calling np.split on a DataFrame goes
# through a deprecated pandas code path, while .iloc yields the same two frames.
shuffled_data = model_data.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(model_data))
train_data, test_data = shuffled_data.iloc[:split_at], shuffled_data.iloc[split_at:]
print(train_data.shape, test_data.shape)


(28831, 61) (12357, 61)


Downloading the dataset and storing it in S3

In [79]:
## Saving Train And Test Into Buckets
## We start with Train Data
from sagemaker.inputs import TrainingInput
import os

# Write the label ('y_yes') as the first column, followed by the features,
# with no header row and no index column.
train_features = train_data.drop(['y_no', 'y_yes'], axis=1)
pd.concat([train_data['y_yes'], train_features], axis=1).to_csv('train.csv', index=False, header=False)

# S3 keys always use '/', so build the key explicitly instead of with the
# OS-dependent os.path.join.
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')
s3_input_train = TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [80]:
# Same layout as the training file: label ('y_yes') first, then the features,
# with no header row and no index column.
test_features = test_data.drop(['y_no', 'y_yes'], axis=1)
pd.concat([test_data['y_yes'], test_features], axis=1).to_csv('test.csv', index=False, header=False)

# S3 keys always use '/', so build the key explicitly instead of with the
# OS-dependent os.path.join.
test_key = '{}/test/test.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(test_key).upload_file('test.csv')
s3_input_test = TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

Building and Deploying a Model Using the Built-in XGBoost Algorithm

In [95]:
# Look up the URI of the pre-built XGBoost image for the current region.
# Nothing is built here: the call only returns the image URI as a value.
# NOTE(review): the instance type passed here differs from the one used by the
# training job ('ml.m5.large') - confirm this is intended.
xgboost_image_args = dict(
    framework='xgboost',
    region=boto3.session.Session().region_name,
    instance_type='ml.t3.medium',
    version='1.5-1',
)
container = image_uris.retrieve(**xgboost_image_args)

Purpose: This code fetches the location (URI) of a pre-built Docker image for XGBoost, a popular machine learning algorithm, tailored to your specified settings.

framework='xgboost': Indicates that you're interested in using the XGBoost algorithm.

region=boto3.session.Session().region_name: Automatically detects and uses your current AWS region to ensure the Docker image is sourced from the correct regional repository.

instance_type='ml.t3.medium': Specifies the instance type used to look up a compatible Docker image. Note that the training job itself runs on the instance type set on the estimator below ('ml.m5.large'), not on this one.

version='1.5-1': Selects the specific version of the XGBoost algorithm you wish to utilize.

In summary, this code helps SageMaker identify the exact Docker image to use for training your model with XGBoost, based on your region, instance type, and desired algorithm version.

In [96]:
# Hyperparameters handed to the XGBoost training job.
# All values are strings except num_round, which is an integer.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round=50,
)

max_depth: Specifies the maximum depth of each tree. Deeper trees can model more complex patterns but may lead to overfitting. In this case, it's set to 5, indicating that each tree can have up to 5 levels.

eta (also known as learning rate): Controls the contribution of each tree to the final model. Lower values make the model more robust to overfitting but require more boosting rounds. Here, it's set to 0.2.

gamma: Defines the minimum loss reduction required to make a further partition on a leaf node. Higher values lead to more conservative models. A value of 4 means a node will split only if it results in a loss reduction of at least 4.

min_child_weight: Determines the minimum sum of instance weights needed in a child node. Higher values prevent the model from learning relations that might be highly specific to the particular sample selected for a tree. It's set to 6 here.

subsample: Denotes the fraction of the training data to be randomly sampled for each tree. Using a subset can prevent overfitting. A value of 0.7 means each tree is built using 70% of the data.

objective: Specifies the learning task and the corresponding loss function. 'binary:logistic' indicates a binary classification problem with logistic regression applied to output probabilities.

num_round: Indicates the number of boosting iterations, i.e., the number of trees to be added. Here, it's set to 50.

In [101]:
# Build the SageMaker estimator that runs the XGBoost image retrieved earlier.
from sagemaker.estimator import Estimator
import boto3

# S3 location for the trained model artifacts.
output_path = f's3://{bucket_name}/{prefix}/output'

# IAM role of the current SageMaker execution environment.
execution_role = sagemaker.get_execution_role()

estimator = Estimator(
    image_uri=container,
    hyperparameters=hyperparameters,
    role=execution_role,
    instance_count=1,
    instance_type='ml.m5.large',
    volume_size=5,            # 5 GB
    output_path=output_path,
    use_spot_instances=True,  # spot training, bounded by the two limits below
    max_run=300,
    max_wait=600,
)

You can change the instance type for your Amazon SageMaker training job independently of the instance type used for your SageMaker notebook. The instance type specified in your notebook determines the computational resources for the notebook environment, while the instance type defined in your training job configuration determines the resources allocated for that specific training task.

To modify the instance type for your training job, adjust the instance_type parameter in your estimator configuration. For example, if you previously set it to 'ml.t3.medium', you can change it to a different instance type, such as 'ml.m5.large', as shown in the estimator cell above.

In [102]:
# Start the training job: the uploaded training CSV feeds the 'train' channel
# and the uploaded test CSV feeds the 'validation' channel.
data_channels = {'train': s3_input_train, 'validation': s3_input_test}
estimator.fit(data_channels)

2025-03-03 17:31:50 Starting - Starting the training job...
.....03-03 17:32:22 Downloading - Downloading input data.
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-03-03 17:33:59.253 ip-10-0-246-42.eu-north-1.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-03-03 17:33:59.273 ip-10-0-246-42.eu-north-1.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-03-03:17:33:59:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-03-03:17:33:59:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-03-03:17:33:59:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-03-03:17:33:59:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2025-03-03:17:33:59:INFO] Determined 0 GPU(s) available on the instance.[0m
[34m[2025-03-03:17:33:59:INFO] Determined delimiter of CSV input is ','[0m

Deploying the ML model

In [104]:
# Deploy the trained model to an endpoint backed by a single ml.m5.large instance.
xgb_predictor = estimator.deploy(
    instance_type='ml.m5.large',
    initial_instance_count=1,
)

------!

Prediction of the Test Data

In [115]:
from sagemaker.serializers import CSVSerializer

# Features only: drop both label columns and convert the frame to a NumPy array.
test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values

# Attach a CSV serializer to the predictor. In SageMaker Python SDK v2 the
# request content type is derived from the serializer, so the predictor's
# content_type is not assigned separately.
csv_serializer = CSVSerializer()
xgb_predictor.serializer = csv_serializer

# Invoke the endpoint; the response is decoded from bytes into text.
predictions = xgb_predictor.predict(test_data_array).decode('utf-8')

# Parse the text response (split into lines, comma-delimited) into a NumPy array.
predictions_array = np.loadtxt(predictions.splitlines(), delimiter=',')
print(predictions_array.shape)

(12357,)


Imports the CSVSerializer class from the sagemaker.serializers module.
Why it’s written this way:
SageMaker requires data to be serialized (converted into a specific format) before sending it to the model for inference.
CSVSerializer is used to serialize the input data into CSV format, which is a common format for tabular data and is supported by SageMaker.

Drops the columns y_no and y_yes from the test_data DataFrame.
Converts the remaining data into a NumPy array using .values.
Why it’s written this way:
y_no and y_yes are likely the target variables (labels) that the model is trying to predict. Since we’re preparing the input data for inference, we don’t need these columns.
.values converts the DataFrame into a NumPy array, which is the required input format for the CSVSerializer.

Why it’s written this way:
SageMaker needs to know the format of the input data being sent to the model. By setting content_type to text/csv, we tell SageMaker that the input data is in CSV format.
This ensures that the model can correctly interpret the incoming data.


Why it’s written this way:
The CSVSerializer is responsible for converting the input data (NumPy array) into a CSV-formatted string that SageMaker can process.
This step ensures that the data is properly serialized before being sent to the model.



Why it’s written this way:
The xgb_predictor is the object used to make predictions with the deployed SageMaker model. By setting the serializer, we ensure that the input data is automatically serialized into CSV format whenever predict is called.


What it does:
Sends the test_data_array to the deployed SageMaker model for inference.
The model returns the predictions as a byte string, which is then decoded into a UTF-8 string using .decode('utf-8').
Why it’s written this way:
The predict method sends the serialized data to the model and retrieves the predictions.
The predictions are returned as a byte string, so we decode it into a regular string for easier processing.


What it does:
Splits the predictions string into lines (since each line represents a prediction).
Uses np.loadtxt to parse the lines into a NumPy array, treating commas (',') as delimiters.
Why it’s written this way:
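The predictions are returned as CSV-formatted text containing one predicted probability per input row (e.g., "0.052\n0.057\n..."; the values may be separated by commas or newlines depending on the container version).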
splitlines() splits the string into individual lines, and np.loadtxt parses each line into numerical values.
This converts the predictions into a NumPy array, which is easier to work with for further analysis or evaluation.



Why it’s written this way:

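The shape of the array tells us how many predictions were made and the format of the output (e.g., (n_samples, n_classes) for multi-class probabilities or (n_samples,) when the model returns a single value per row). Here the binary:logistic objective returns one probability per row, so the shape is (12357,), matching the number of test rows.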
This is useful for verifying that the predictions are in the expected format.

Summary of the Workflow
Prepare the data: Remove unnecessary columns and convert it into a NumPy array.
Set up the predictor: Configure the predictor to accept CSV-formatted input.
Serialize the data: Use CSVSerializer to convert the data into CSV format.
Make predictions: Send the data to the model and retrieve the predictions.
Parse the predictions: Convert the predictions from a string into a NumPy array.
Verify the output: Check the shape of the predictions to ensure they are in the expected format.

This approach ensures that the data is properly formatted and processed at every step, making it compatible with SageMaker’s requirements.



In [116]:
# Display the predicted probabilities; NumPy abbreviates the long array in its repr.
predictions_array

array([0.05214286, 0.05660191, 0.05096195, ..., 0.03436061, 0.02942475,
       0.03715819])

In [117]:
# Confusion matrix: rows are the observed label, columns are the predicted
# probability rounded to 0 or 1.
predicted_labels = np.round(predictions_array)
cm = pd.crosstab(
    index=test_data['y_yes'],
    columns=predicted_labels,
    rownames=['Observed'],
    colnames=['Predicted'],
)

# Unpack the four cells (row = observed, column = predicted).
tn = cm.iloc[0, 0]
fn = cm.iloc[1, 0]
tp = cm.iloc[1, 1]
fp = cm.iloc[0, 1]

# Overall accuracy in percent.
p = (tp+tn)/(tp+tn+fp+fn)*100

# Percentages in the table are normalised per predicted column.
predicted_no = tn + fn
predicted_yes = tp + fp

print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format(
    "No Purchase", tn/predicted_no*100, tn, fp/predicted_yes*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format(
    "Purchase", fn/predicted_no*100, fn, tp/predicted_yes*100, tp))


Overall Classification Rate: 89.7%

Predicted      No Purchase    Purchase
Observed
No Purchase    91% (10785)    34% (151)
Purchase        9% (1124)     66% (297) 



In [121]:
import boto3
import sagemaker

# Create a SageMaker session and an S3 resource handle for the cleanup steps.
# s3_resource is read as a global by delete_all_objects_in_bucket below;
# sagemaker_session is not referenced again in this cell.
sagemaker_session = sagemaker.Session()
s3_resource = boto3.resource('s3')

# Step 1: Delete all SageMaker endpoints
def delete_all_endpoints():
    """Request deletion of every SageMaker endpoint returned by list_endpoints.

    WARNING: this is not limited to the endpoint created in this notebook;
    delete_endpoint is called for every endpoint the client lists.

    The listing is paginated, so a paginator is used to walk every page
    rather than only the first one.
    """
    sagemaker_client = boto3.client('sagemaker')
    paginator = sagemaker_client.get_paginator('list_endpoints')

    for page in paginator.paginate():
        for endpoint in page['Endpoints']:
            endpoint_name = endpoint['EndpointName']
            print(f"Deleting endpoint: {endpoint_name}")
            sagemaker_client.delete_endpoint(EndpointName=endpoint_name)
    print("All endpoints deleted.")

# Step 2: Delete all objects in an S3 bucket
def delete_all_objects_in_bucket(bucket_name):
    """Delete every object in the given S3 bucket.

    Uses the module-level ``s3_resource``; the bucket itself is not removed.

    Args:
        bucket_name: Name of the bucket to empty.
    """
    bucket_objects = s3_resource.Bucket(bucket_name).objects
    print(f"Deleting all objects in bucket: {bucket_name}")
    bucket_objects.all().delete()
    print(f"All objects in bucket '{bucket_name}' deleted.")

# Step 3: Main cleanup function
def cleanup_resources(bucket_name):
    """Run the full cleanup: endpoints first, then the bucket contents.

    Args:
        bucket_name: Name of the S3 bucket whose objects are deleted.
    """
    delete_all_endpoints()                       # remove the SageMaker endpoints
    delete_all_objects_in_bucket(bucket_name)    # then empty the S3 bucket

# Bucket to empty during cleanup (replace with your own bucket name).
bucket_name = 'bank-application-nishi-24'

# Delete the endpoints and the bucket contents.
cleanup_resources(bucket_name=bucket_name)

All endpoints deleted.
Deleting all objects in bucket: bank-application-nishi-24
All objects in bucket 'bank-application-nishi-24' deleted.
