# Steps to be followed
1. Import necessary libraries
2. create s3 bucket ( for training data, testing data and to save model)
3. Mapping train & test data in s3
4. Mapping the path of models in s3

In [1]:
import sagemaker
import boto3 # with boto3 you can read public s3 bucket from your local system
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session

  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
my_region = boto3.session.Session().region_name # set the region of the instance
# because based on region name 
print(my_region)

us-east-1


In [3]:
# create bucket
bucket_name = 'zohaibsnotebookbucket'

s3 = boto3.resource('s3')

try:
    if my_region == 'us-east-1':
        s3.create_bucket(Bucket = bucket_name)
    print('S3 bucket created successfully')
    
except Exception as e:
    print('S3 error:',e)

S3 bucket created successfully


In [4]:
# set an output path where the trained model will be saved
prefix = 'models'
output_path = 's3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

s3://zohaibsnotebookbucket/models/output


## Downloading The Dataset And Storing in S3( DATA INGESTION)


In [5]:
import pandas as pd
import urllib
try:
    urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    print('Data load error: ',e)

try:
    model_data = pd.read_csv('./bank_clean.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: downloaded bank_clean.csv.
Success: Data loaded into dataframe.


In [6]:
import numpy as np
train_data, test_data = np.split(
    model_data.sample(frac=1, random_state=1729), 
    [int(0.7 * len(model_data))]
)
                                

print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


  return bound(*args, **kwds)


In [7]:
"""In SageMaker, when using the Linear Learner algorithm, the first column in your dataset should be the 
dependent variable, which is the target variable for prediction. """

'In SageMaker, when using the Linear Learner algorithm, the first column in your dataset should be the \ndependent variable, which is the target variable for prediction. '

In [8]:
### Saving Train And Test Into Buckets

import os
import pandas as pd

train_combined = pd.concat(
    [train_data['y_yes'], 
    train_data.drop(['y_no', 'y_yes'], 
                    axis=1)], 
    axis=1)

# 2. Save the combined DataFrame as a CSV file without index and header
train_combined.to_csv(
    'train.csv', 
    index=False, 
    header=False)


boto3.resource('s3').Object(
    bucket_name,
    os.path.join( prefix, 'train/train.csv')
).upload_file('train.csv')



s3_input_train = sagemaker.TrainingInput(
    s3_data='s3://{}/{}/train'.format(bucket_name, prefix), 
    content_type='csv'
)

s3_input_train

<sagemaker.inputs.TrainingInput at 0x7ff4d2790fd0>

In [9]:
# Test Data Into Buckets

pd.concat(
    [test_data['y_yes'], 
    test_data.drop(
        ['y_no', 'y_yes'], 
        axis=1
    )], 
    axis=1

).to_csv('test.csv',
         index=False,
         header=False)



boto3.resource('s3').Object(
    bucket_name,
    os.path.join(prefix, 'test/test.csv')
).upload_file('test.csv')




s3_input_test = sagemaker.TrainingInput(
    s3_data='s3://{}/{}/test'.format(bucket_name, prefix), 
    content_type='csv')
s3_input_test

<sagemaker.inputs.TrainingInput at 0x7ff4d1e77b20>

In [10]:
# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.
container = sagemaker.image_uris.retrieve(
    region = boto3.Session().region_name,                          
    framework ='xgboost',
    version='1.0-1'
)
print(container)

683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3


In [11]:
# sagemaker.image_uris.retrieve(
#     region = boto3.Session().region_name,                          
#     version='1.0-1'
# )

In [12]:
hyperparameters = {
#     "max-depth" : 5,
    "eta":"0.2",
    "gamma":"4",
    "min_child_weight":"6",
    "subsample":"0.7",
    "objective":"binary:logistic",
    
#     "max-depth":"16",
    "num_round":50
}

In [13]:
# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(), #IAM role
                                          instance_count=1, 
                                          instance_type='ml.m5.2xlarge', 
                                          volume_size=5, # 5 GB 
                                          output_path=output_path,
                                          # to reduce billing price, use followng 3
                                          use_spot_instances=True,
                                          max_run=300,
                                          max_wait=600)


# shift+tab
estimator

<sagemaker.estimator.Estimator at 0x7ff4d192bbe0>

In [14]:
estimator.fit({
    'train': s3_input_train,
    'validation': s3_input_test
})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-03-01-14-49-51-825


2024-03-01 14:49:51 Starting - Starting the training job...
2024-03-01 14:50:07 Starting - Preparing the instances for training...
2024-03-01 14:50:37 Downloading - Downloading input data...
2024-03-01 14:51:18 Training - Training image download completed. Training in progress....
2024-03-01 14:51:44 Uploading - Uploading generated training model[34m[2024-03-01 14:51:39.101 ip-10-2-195-220.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[

## Deploy ML model

In [15]:
xgb_predictor = estimator.deploy(initial_instance_count=1,
                                instance_type='ml.m4.xlarge')
xgb_predictor

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-03-01-14-52-33-903
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-03-01-14-52-33-903
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-03-01-14-52-33-903


------!

<sagemaker.base_predictor.Predictor at 0x7ff4d1e3ee60>

# Prediction of test data

In [18]:
from sagemaker.serializers import CSVSerializer
test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values #load the data into an array
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = CSVSerializer() # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array
print(predictions_array.shape)

(12357,)


In [19]:
predictions_array

array([0.04345566, 0.05905825, 0.0395886 , ..., 0.04037751, 0.03186692,
       0.06086195])

In [20]:
cm = pd.crosstab(index=test_data['y_yes'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))


Overall Classification Rate: 89.7%

Predicted      No Purchase    Purchase
Observed
No Purchase    91% (10771)    35% (165)
Purchase        9% (1111)     65% (310) 



In [21]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2024-03-01-14-52-33-903


[{'ResponseMetadata': {'RequestId': 'TES8D4114RFDSCJ2',
   'HostId': 'XLZduwPavgBDNoR2281GaHg99uyZNuSch+y8plenuJSEg1kHARECEpIiUT0OKUuFJIqo149pLJc=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'XLZduwPavgBDNoR2281GaHg99uyZNuSch+y8plenuJSEg1kHARECEpIiUT0OKUuFJIqo149pLJc=',
    'x-amz-request-id': 'TES8D4114RFDSCJ2',
    'date': 'Fri, 01 Mar 2024 15:31:04 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'models/output/sagemaker-xgboost-2024-03-01-14-49-51-825/debug-output/events/000000000000/000000000000_worker_0.tfevents'},
   {'Key': 'models/output/sagemaker-xgboost-2024-03-01-14-49-51-825/debug-output/events/000000000010/000000000010_worker_0.tfevents'},
   {'Key': 'models/output/sagemaker-xgboost-2024-03-01-14-49-51-825/debug-output/index/000000000/000000000030_worker_0.json'},
   {'Key': 'models/output/sagemaker-xgboost-2024-03-01-14-49-5