In [2]:
# --- Imports: standard library ---
import io
import os
import re

# --- Imports: third-party ---
import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker
import sklearn
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# --- Configuration: S3 bucket, key prefix for generated artifacts, raw dataset key ---
bucket = 'catboostbucket'
prefix = 'dataset'
key = 'dataset/credit_card_transactions-ibm_v2.csv'

# S3 client used to read the raw dataset.
s3 = boto3.client('s3')

# IAM execution role; passed to the SageMaker estimator further down.
role = get_execution_role()

In [3]:
# Read the raw transactions CSV straight from the S3 bucket.
# Only the first 1,000,000 rows are loaded.
obj = s3.get_object(Key=key, Bucket=bucket)
df = pd.read_csv(filepath_or_buffer=obj.get("Body"), nrows=1_000_000)
df.head()

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No


In [4]:
# Preprocessing: convert the raw transaction columns into an all-numeric frame
# with the target ('Is Fraud?') as the first column.
#
# NOTE: this cell consumes the raw `df` (it drops 'Time', 'Merchant Name' and
# 'User'), so re-run the loading cell before re-running this one.
# NOTE: the order in which new columns are added is the feature order the model
# is trained on -- do not reorder the assignments below.

# Target: 'No' -> 0, 'Yes' -> 1. Fail loudly on anything else instead of
# letting an unmapped label slip through to training.
fraud_labels = df['Is Fraud?'].map({'No': 0, 'Yes': 1})
if fraud_labels.isna().any():
    raise ValueError("Unexpected value in 'Is Fraud?' (expected 'Yes' or 'No')")
df['Is Fraud?'] = fraud_labels.astype(int)

# 'Errors?' becomes a flag: 1 when NO error was recorded (value missing), else 0.
df['Errors?'] = df['Errors?'].isna().astype(int)

# Transaction channel: a dedicated online flag plus an integer code per channel.
df['is online'] = (df['Use Chip'] == 'Online Transaction').astype(int)
chip_codes = df['Use Chip'].map(
    {'Swipe Transaction': 0, 'Online Transaction': 1, 'Chip Transaction': 2}
)
if chip_codes.isna().any():
    raise ValueError("Unexpected value in 'Use Chip'")
df['Use Chip'] = chip_codes.astype(int)

# Missing zip codes are imputed with the mean zip code.
df['Zip'] = df['Zip'].fillna(df['Zip'].mean())

# '$134.09' -> 134.09 (the sign, if any, follows the dollar sign and is kept).
df['Amount'] = df['Amount'].str.replace('$', '', regex=False).astype(float)

# 'HH:MM' -> separate integer hour and minute columns.
time_parts = df['Time'].str.split(':', expand=True)
df['Hour'] = time_parts[0].astype(int)
df['Minutes'] = time_parts[1].astype(int)

# Flag negative amounts and keep the magnitude as its own feature.
# NOTE(review): "vozmes" presumably means refund -- confirm with the author.
df['is vozmes'] = (df['Amount'] < 0).astype(int)
df['abs_amount'] = df['Amount'].abs()

# Label-encode merchant location; missing values get their own 'NAN' category.
# The fitted encoders are kept so that inputs at inference time can be encoded
# the same way. `le` is left bound to the last encoder (Merchant City).
label_encoders = {}
for column in ['Merchant State', 'Merchant City']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column].fillna('NAN'))
    label_encoders[column] = le

# Drop columns that are not used as features.
df = df.drop(columns=['Time', 'Merchant Name', 'User'])

# Move the target to the first column; all other columns keep their order.
df = df[['Is Fraud?'] + [column for column in df.columns if column != 'Is Fraud?']]

In [5]:
# SageMaker's built-in algorithm takes a single CSV per channel with the target
# in the first column (no separate X/y), so whole rows are split 70/20/10 into
# train/validation/test after a seeded shuffle.
# Positional slicing is used instead of np.split: np.split on a DataFrame goes
# through DataFrame.swapaxes, which pandas has deprecated.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.7 * len(df))
validation_end = int(0.9 * len(df))

train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

# Written without header or index, so the first CSV column is the target.
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)
test_data.to_csv('test.csv', header=False, index=False)

In [6]:
# Upload the train/validation CSVs under <prefix>/<channel>/ in the bucket and
# describe them as CSV training channels.
# S3 object keys always use '/', so they are built explicitly rather than with
# os.path.join, which uses the local OS path separator.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for channel in ('train', 'validation'):
    s3_bucket.Object('{}/{}/{}.csv'.format(prefix, channel, channel)).upload_file('{}.csv'.format(channel))

# Both prefixes end with '/' so that each channel only matches its own folder.
s3_input_train = TrainingInput(s3_data='s3://{}/{}/train/'.format(bucket, prefix), content_type='csv')
s3_input_validation = TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')

In [7]:
# Train with the built-in XGBoost algorithm: pick the container image for the
# current region, configure an estimator around it, and fit on the S3 channels.
# NOTE(review): these are ':latest' image tags pinned per region; newer SageMaker
# SDKs can resolve image URIs programmatically -- confirm before switching, since
# that may change the container version.
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}

# Fail with an actionable message when the session's region is unset or not in
# the table above, instead of a bare KeyError on the lookup.
region = boto3.Session().region_name
if region not in containers:
    raise KeyError(
        'No XGBoost container image configured for region {!r}; known regions: {}'.format(
            region, ', '.join(sorted(containers))
        )
    )

sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(containers[region],
                                    role,
                                    instance_count=1,
                                    instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)
# Binary classifier that outputs a probability; 25 boosting rounds at learning rate 0.1.
xgb.set_hyperparameters(eta=0.1, objective='binary:logistic', num_round=25)

# training the model
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

2022-04-18 22:37:11 Starting - Starting the training job...ProfilerReport-1650321431: InProgress
............
2022-04-18 22:39:37 Starting - Preparing the instances for training..............................
2022-04-18 22:44:38 Downloading - Downloading input data...
2022-04-18 22:45:08 Training - Downloading the training image..........[34mArguments: train[0m
[34m[2022-04-18:22:46:48:INFO] Running standalone xgboost training.[0m
[34m[2022-04-18:22:46:48:INFO] File size need to be processed in the node: 52.82mb. Available memory size in the node: 8467.79mb[0m
[34m[2022-04-18:22:46:48:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:46:48] S3DistributionType set as FullyReplicated[0m
[34m[22:46:49] 700000x16 matrix with 11200000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-04-18:22:46:49:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:46:49] S3DistributionType set as FullyReplicated[0m
[34m[22:46:49] 

In [8]:
# Deploy the trained model to a real-time endpoint (one ml.m4.xlarge instance).
# Requests are serialized as CSV.
# This is the last organized cell; the cells below are exploratory, and the
# commented-out ones can be ignored.
xgb_predictor = xgb.deploy(
    serializer=CSVSerializer(),
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

----------!

In [30]:
# def predict(data, rows=500):
#     split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
#     predictions = ''
#     for array in split_array:
#         predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])
        

#     return np.fromstring(predictions[1:], sep=',')

# predictions = predict(test_data.to_numpy()[:,1:])
# predictions

In [24]:
# quick test to see if we did everything correctly
def predict(data, rows=500):
    """Score ``data`` on the deployed endpoint in batches and return rounded predictions.

    Args:
        data: NumPy array of feature rows (one row per sample, no target
            column); it is split along its first axis.
        rows: approximate number of rows sent per endpoint request.

    Returns:
        1-D float NumPy array with one prediction per input row, rounded to
        the nearest integer. Empty when ``data`` has no rows.
    """
    if data.shape[0] == 0:
        # Nothing to score; do not send an empty request to the endpoint.
        return np.array([])

    # Same batch count as before: enough sections that none exceeds `rows`.
    n_batches = int(data.shape[0] / float(rows) + 1)

    # One request per non-empty batch (array_split can produce empty sections
    # when there are more sections than rows). Each response is decoded to a
    # comma-separated string of scores.
    batch_outputs = [
        xgb_predictor.predict(batch).decode('utf-8')
        for batch in np.array_split(data, n_batches)
        if batch.shape[0] > 0
    ]

    # Parse and round once, after all batches have been collected, rather than
    # re-parsing the growing string on every iteration.
    scores = np.fromstring(','.join(batch_outputs), sep=',')
    return np.round(scores)

# Score the held-out test rows; column 0 is the target, so it is sliced off.
predictions = predict(data=test_data.to_numpy()[:, 1:])
predictions

array([0., 0., 0., ..., 0., 0., 0.])

In [25]:
# a = np.argmax(predictions)
# print(a)

In [26]:
# x = np.round(predictions)
# x

In [27]:
# np.unique(x)

In [28]:
# count = (x == 1).sum()
# print('Total occurences of "1" in array: ', count)

In [29]:
# count = (x == 0).sum()
# print('Total occurences of "1" in array: ', count)

In [None]:
# deletes endpoint
# xgb_predictor.delete_endpoint()

In [11]:
# This is the first version of the Lambda function I wrote (it has since been improved). ENDPOINT_NAME is the name of the deployed model endpoint; given input_json, the handler returns the predictions.
import boto3
import numpy as np
# Name of the deployed endpoint: paste the value printed at deployment time
# (or the name of an already-deployed endpoint) here.
ENDPOINT_NAME = 'xgboost-2022-04-18-22-50-13-637'

# Client used by the handler to invoke the endpoint.
runtime = boto3.client(service_name='runtime.sagemaker')

def lambda_handler(event, context):
    """Invoke the endpoint once per feature row in ``event['data']``.

    Args:
        event: mapping whose ``'data'`` entry is an iterable of feature rows;
            each row is an iterable of values.
        context: not used by this handler.

    Returns:
        List with one decoded response body (a string) per input row, in order.
    """

    def _score_row(features):
        # Each row is sent as a single CSV line.
        csv_line = ','.join(str(value) for value in features)
        reply = runtime.invoke_endpoint(
            EndpointName=ENDPOINT_NAME,
            ContentType='text/csv',
            Body=csv_line,
        )
        return reply['Body'].read().decode()

    return [_score_row(row) for row in event['data']]

# Sample payload: four identical feature rows, in the same column order as the
# training CSV without the target column.
input_json = { "data":
        [[0,2017,10,10,4.02,2,12095,146,29693.0,5814,1,0,11,19,0,4.02],
         [0,2017,10,10,4.02,2,12095,146,29693.0,5814,1,0,11,19,0,4.02],
         [0,2017,10,10,4.02,2,12095,146,29693.0,5814,1,0,11,19,0,4.02],
         [0,2017,10,10,4.02,2,12095,146,29693.0,5814,1,0,11,19,0,4.02]]
}

# Local smoke test of the handler. The handler does not use its context
# argument, so pass None explicitly instead of IPython's `_` (the last cell
# output), which depends on what happened to be executed before.
result = lambda_handler(input_json, None)
result

['0.040032703429460526',
 '0.040032703429460526',
 '0.040032703429460526',
 '0.040032703429460526']