# Online Fraud Detection using AWS SageMaker

## 1. Setup and Installation

In [None]:
!pip install pandas scikit-learn boto3 sagemaker



## 2. Import Libraries

In [1]:

import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker.sklearn import SKLearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix




sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## 3. Setup AWS and S3 Configurations

In [2]:

bucket_name = 'mydataset-710'  # S3 bucket name
file_name = 'onlinefraud.csv'  # Dataset file name in S3
region_name = 'us-east-1'

s3 = boto3.client('s3', region_name=region_name)
s3_resource = boto3.resource('s3', region_name=region_name)

# SageMaker session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()


## 4. Load Dataset from S3

In [3]:

s3_path = f's3://{bucket_name}/{file_name}'
df = pd.read_csv(s3_path)
df.head()


severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## 5. Data Preprocessing

In [4]:

le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])  

X = df.drop(['isFraud', 'nameOrig', 'nameDest'], axis=1)
y = df['isFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


## 6. Save Preprocessed Data to S3

In [5]:

train_data = pd.DataFrame(X_train_scaled)
train_data['target'] = y_train.values
test_data = pd.DataFrame(X_test_scaled)
test_data['target'] = y_test.values

train_data.to_csv('train.csv', index=False)
test_data.to_csv('test.csv', index=False)

train_s3_path = f's3://{bucket_name}/train/train.csv'
test_s3_path = f's3://{bucket_name}/test/test.csv'

s3.upload_file('train.csv', bucket_name, 'train/train.csv')
s3.upload_file('test.csv', bucket_name, 'test/test.csv')


## 7. Create train_script.py

## 8. Train Model on SageMaker

In [6]:

sklearn_estimator = SKLearn(
    entry_point='train_script.py',  
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    framework_version='1.2-1',
    py_version='py3',
    hyperparameters={
        'n_estimators': 100,
        'random_state': 42
    }
)

input_location = f's3://{bucket_name}/train'
sklearn_estimator.fit({'train': input_location})


2025-02-18 02:00:16 Starting - Starting the training job...
..25-02-18 02:00:32 Starting - Preparing the instances for training.
.....02-18 02:01:03 Downloading - Downloading input data.
..25-02-18 02:01:49 Downloading - Downloading the training image.
2025-02-18 02:02:30 Training - Training image download completed. Training in progress.[34m2025-02-18 02:02:33,498 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-02-18 02:02:33,502 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-02-18 02:02:33,505 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-02-18 02:02:33,526 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-02-18 02:02:33,757 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-02-18 02:02:33,760 sagemaker-training-toolkit INFO     No Neurons d

## 9. Deploy Model as Endpoint

In [7]:

predictor = sklearn_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge'
)


------!

## 10. Test the Endpoint

In [8]:

test_sample = X_test_scaled[0].reshape(1, -1)
prediction = predictor.predict(test_sample)
print("Prediction:", prediction)


Prediction: [0]


## 11. Model Evaluation

In [11]:
batch_size = 100  # Adjust this based on your data size and endpoint limits
y_pred = []

for i in range(0, len(X_test_scaled), batch_size):
    batch = X_test_scaled[i:i+batch_size]
    preds = predictor.predict(batch)
    y_pred.extend(preds)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9996961419457184

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.98      0.78      0.87      2435

    accuracy                           1.00   1908786
   macro avg       0.99      0.89      0.93   1908786
weighted avg       1.00      1.00      1.00   1908786


Confusion Matrix:
 [[1906305      46]
 [    534    1901]]


## 12. Clean Up Resources

In [None]:

predictor.delete_endpoint()


## 13. Notes and Tips


- Ensure SageMaker role has S3 and SageMaker Full Access.
- Adjust instance type as per dataset size.
- Always clean up resources to avoid charges.
