In [180]:
import boto3
import json
from io import StringIO
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [181]:
# S3 Bucket and data location
S3_BUCKET = "fraud-detection-storage-mk"
S3_KEY = "synthetic-transaction-data/transactions.json"

# Load data from S3
s3 = boto3.client("s3")
obj = s3.get_object(Bucket=S3_BUCKET, Key=S3_KEY)
data = obj["Body"].read().decode("utf-8")
df = pd.read_json(StringIO(data))
df.head()

Unnamed: 0,transaction_id,user_id,timestamp,amount,device_type,location,is_vpn,card_type,status,is_fraud
0,T4048,U83310,2025-01-01 12:00:00+00:00,2917.55,desktop,C,False,credit,approved,0
1,T2663,U17909,2025-01-01 12:00:00+00:00,284.67,mobile,C,False,credit,approved,0
2,T7547,U68247,2025-01-01 12:00:00+00:00,1193.47,desktop,A,False,credit,approved,0
3,T4500,U23022,2025-01-01 12:00:00+00:00,3227.2,mobile,D,True,credit,approved,0
4,T5000,U13432,2025-01-01 12:00:00+00:00,49.61,mobile,F,True,debit,approved,0


In [None]:
# Feature Extraction
label_encoder = LabelEncoder()
ohe = OneHotEncoder( drop='first')
scaler = MinMaxScaler()
# Feature Engineering
## One-Hot Encoding for card_type
df_card_encoded = pd.DataFrame(ohe.fit_transform(df["card_type"]), columns=ohe.get_feature_names_out(["card_type"]))
df = pd.concat([df, df_card_encoded], axis=1)
df.drop(columns=["card_type"], inplace=True)  # Drop original categorical column

## Frequency Encoding for location
location_counts = df["location"].value_counts().to_dict()
df["location_freq"] = df["location"].map(location_counts)
df.drop(columns=["location"], inplace=True)  # Drop original column

## Scale amount feature
df["scaled_amount"] = scaler.fit_transform(df[["amount"]])
df.drop(columns=["amount"], inplace=True)

## df["high_value"] = df["amount"].apply(lambda x: 1 if x > 1000 else 0)
df["is_vpn"] = df["is_vpn"].astype(int)
# Encode categorical feature
# df["card_type"] = label_encoder.fit_transform(df["card_type"])  
df["device_type"] = label_encoder.fit_transform(df["device_type"]) 
# df["location"] = label_encoder.fit_transform(df["location"]) 
df["status"] = label_encoder.fit_transform(df["status"])

# Drop non-numeric columns that are not needed for training
df.drop(columns=["transaction_id", "user_id", "timestamp","amount"], inplace=True, errors='ignore')
df.head()


KeyError: 'card_type'

In [177]:
df.head()

Unnamed: 0,device_type,location,is_vpn,card_type,status,is_fraud,high_value
0,0,2,0,0,0,0,1
1,0,2,0,0,0,0,0
2,0,0,0,0,0,0,1
3,0,3,1,0,0,0,1
4,1,5,1,1,0,0,0


In [178]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the splits
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)
print(df.dtypes)

Training data shape: (1600, 6)
Testing data shape: (400, 6)
device_type    int64
location       int64
is_vpn         int64
card_type      int64
status         int64
is_fraud       int64
high_value     int64
dtype: object


In [None]:
# Initialize the model
model = RandomForestClassifier(
    random_state=42
)

# Train the model
model.fit(X_train, y_train)


# Make Predictions
y_pred = model.predict(X_test)

# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)


In [179]:
xgb_model = XGBClassifier(
    n_estimators=100, 
    max_depth=5, 
    learning_rate=0.1, 
    subsample=0.8, 
    colsample_bytree=0.8, 
    random_state=42
)
xgb_model.fit(X_train, y_train)

# Make Predictions
y_pred = xgb_model.predict(X_test)

# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

#Cross-validation
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy Scores:", scores)
print("Mean Accuracy:", scores.mean())

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model Accuracy: 0.8175
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        73
           1       0.82      1.00      0.90       327

    accuracy                           0.82       400
   macro avg       0.41      0.50      0.45       400
weighted avg       0.67      0.82      0.74       400

Cross-Validation Accuracy Scores: [0.8275 0.79   0.8275 0.8275 0.8275]
Mean Accuracy: 0.8200000000000001
