In [31]:
import os
import numpy as np
import pandas as pd
import urllib
import sagemaker, boto3
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report

# Create S3 bucket

In [2]:
region=boto3.session.Session().region_name
region

'eu-central-1'

In [6]:
s3 = boto3.resource('s3')
bucket_name = "marzieh-bank-project"  

# Create an S3 bucket with the specified region
s3.create_bucket(
    Bucket=bucket_name,
    CreateBucketConfiguration={'LocationConstraint': region})

s3.Bucket(name='marzieh-bank-project')

# Downloading dataset inside the instance

In [7]:
urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv",
                            "bank_clean.csv")

model_data = pd.read_csv('./bank_clean.csv',index_col=0)
train,test=train_test_split(model_data,test_size=0.3,random_state=1729)
print(model_data.shape,train.shape,test.shape)

(41188, 61) (28831, 61) (12357, 61)


# Store train/test data in S3 bucket

In [8]:
#make a data format corresponding to sagemaker
pd.concat([train['y_yes'], train.drop(['y_no', 'y_yes'], 
                                                axis=1)], 
                                                axis=1).to_csv('train.csv', index=False, header=False)

pd.concat([test['y_yes'], test.drop(['y_no', 'y_yes'], 
                                                axis=1)], 
                                                axis=1).to_csv('test.csv', index=False, header=False)

prefix = 'xgboost'
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')


s3_input_train = TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')
s3_input_test = TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

In [10]:
output_path ='s3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

s3://marzieh-bank-project/xgboost/output


# Building model of Xgboot

In [16]:
#make a image of model for xgboost machine learning method
container = get_image_uri(boto3.Session().region_name,'xgboost')   

hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":50
        }   

estimator = sagemaker.estimator.Estimator(image_uri=container, 
                                          hyperparameters=hyperparameters,
                                          output_path=output_path,
                                          role=sagemaker.get_execution_role(),
                                          train_instance_count=1, 
                                          train_instance_type='ml.m5.2xlarge', 
                                          train_volume_size=5, # 5 GB                                           
                                          train_use_spot_instances=True,
                                          train_max_run=300,
                                          train_max_wait=600)

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_use_spot_instances has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_wait has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


# Traning the model

In [17]:
estimator.fit({'train': s3_input_train,'validation': s3_input_test})

INFO:sagemaker:Creating training-job with name: xgboost-2024-08-20-09-28-50-065


2024-08-20 09:28:50 Starting - Starting the training job...
2024-08-20 09:29:20 Starting - Preparing the instances for training...
2024-08-20 09:29:39 Downloading - Downloading input data...
2024-08-20 09:29:55 Downloading - Downloading the training image...
2024-08-20 09:30:30 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2024-08-20:09:30:48:INFO] Running standalone xgboost training.[0m
[34m[2024-08-20:09:30:48:INFO] File size need to be processed in the node: 4.83mb. Available memory size in the node: 23862.5mb[0m
[34m[2024-08-20:09:30:48:INFO] Determined delimiter of CSV input is ','[0m
[34m[09:30:48] S3DistributionType set as FullyReplicated[0m
[34m[09:30:48] 28831x59 matrix with 1701029 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-08-20:09:30:48:INFO] Determined delimiter of CSV input is ','[0m
[34m[09:30:48] S3DistributionType set as FullyReplicated[0m
[34m[09:30

# Model deployment

In [18]:
xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: xgboost-2024-08-20-09-33-49-628
INFO:sagemaker:Creating endpoint-config with name xgboost-2024-08-20-09-33-49-628
INFO:sagemaker:Creating endpoint with name xgboost-2024-08-20-09-33-49-628


------!

# Model prediction for test

In [24]:
# change the test data to array
test_array = test.drop(['y_no', 'y_yes'], axis=1).values 
xgb_predictor.content_type = 'text/csv' 
xgb_predictor.serializer = CSVSerializer()

predictions = xgb_predictor.predict(test_array).decode('utf-8') 
predictions_array = np.fromstring(predictions[1:], sep=',') 
print(round(predictions_array))

[0.24883506 0.05253069 0.09926574 ... 0.06243566 0.02652985 0.12547438]


In [37]:
print(confusion_matrix(test['y_yes'],np.round(predictions_array)))

[[10843   189]
 [ 1023   302]]


In [41]:
print(classification_report(test['y_yes'],y_pred=np.round(predictions_array)))

              precision    recall  f1-score   support

           0       0.91      0.98      0.95     11032
           1       0.62      0.23      0.33      1325

    accuracy                           0.90     12357
   macro avg       0.76      0.61      0.64     12357
weighted avg       0.88      0.90      0.88     12357



# Download S3 bucket project before deleting it

In [None]:
"""
in aws, user, security credintial, create access key
inside your local machine:
pip install awscli
aws configure
File association not found for extension .py
AWS Access Key ID: #######
AWS Secret Access Key : ######
Default region name : eu-central-1
Default output format: json
aws s3 sync s3://marzieh-bank-project C:\Users\micha\OneDrive\Skrivbord\marzieh\13-aws
"""

# How to delete the deployed model

In [42]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: xgboost-2024-08-20-09-33-49-628


[{'ResponseMetadata': {'RequestId': '5TK84S1C7T8FR3A1',
   'HostId': 'kvxEYpZSF4SSLZQWTPi88vPu+JZR9AB92LCxbRjFgQblrfGUx/W1ikdFfJdLuZKX+uGSiCbXM9SSbBS7bKCLQQ==',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'kvxEYpZSF4SSLZQWTPi88vPu+JZR9AB92LCxbRjFgQblrfGUx/W1ikdFfJdLuZKX+uGSiCbXM9SSbBS7bKCLQQ==',
    'x-amz-request-id': '5TK84S1C7T8FR3A1',
    'date': 'Tue, 20 Aug 2024 10:53:13 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost/output/xgboost-2024-08-20-09-28-50-065/profiler-output/framework/training_job_end.ts'},
   {'Key': 'xgboost/output/xgboost-2024-08-20-09-28-50-065/output/model.tar.gz'},
   {'Key': 'xgboost/output/xgboost-2024-08-20-09-28-50-065/debug-output/training_job_end.ts'},
   {'Key': 'xgboost/test/test.csv'},
   {'Key': 'xgboost/output/xgboost-2024-08-20-09-28-50-065/profiler-output/system/training_job_end.ts'},
   {'Key