In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.preprocessing import LabelEncoder
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', '{:.2f}'.format)

import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler

In [7]:
import joblib
from joblib import load

df = joblib.load(r'C:\Users\user\Desktop\ML & DL projects\Anti- Money Laundering classification\dataset_joblib_files\dataset_autoencoder.joblib')

In [12]:
df.head(10)

Unnamed: 0,from_bank,account,to_bank,account_1,amount_received,amount_paid,is_laundering,day,receiving_currency_Others,receiving_currency_Swiss Franc,receiving_currency_US Dollar,receiving_currency_Yuan,payment_currency_Others,payment_currency_Swiss Franc,payment_currency_US Dollar,payment_currency_Yuan,payment_format_Cheque,payment_format_Credit Card,payment_format_Others
0,10,8000EBD30,10,8000EBD30,3697.34,3697.34,0,1,False,False,True,False,False,False,True,False,False,False,True
1,3208,8000F4580,1,8000F5340,0.01,0.01,0,1,False,False,True,False,False,False,True,False,True,False,False
2,3209,8000F4670,3209,8000F4670,14675.57,14675.57,0,1,False,False,True,False,False,False,True,False,False,False,True
3,12,8000F5030,12,8000F5030,2806.97,2806.97,0,1,False,False,True,False,False,False,True,False,False,False,True
4,10,8000F5200,10,8000F5200,36682.97,36682.97,0,1,False,False,True,False,False,False,True,False,False,False,True
5,1,8000F5AD0,1,8000F5AD0,6162.44,6162.44,0,1,False,False,True,False,False,False,True,False,False,False,True
6,1,8000EBAC0,1,8000EBAC0,14.26,14.26,0,1,False,False,True,False,False,False,True,False,False,False,True
7,1,8000EC1E0,1,8000EC1E0,11.86,11.86,0,1,False,False,True,False,False,False,True,False,False,False,True
8,12,8000EC280,2439,8017BF800,7.66,7.66,0,1,False,False,True,False,False,False,True,False,False,True,False
9,1,8000EDEC0,211050,80AEF5310,383.71,383.71,0,1,False,False,True,False,False,False,True,False,False,True,False


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5078345 entries, 0 to 5078344
Data columns (total 19 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   from_bank                       int64  
 1   account                         object 
 2   to_bank                         int64  
 3   account_1                       object 
 4   amount_received                 float64
 5   amount_paid                     float64
 6   is_laundering                   int64  
 7   day                             int32  
 8   receiving_currency_Others       bool   
 9   receiving_currency_Swiss Franc  bool   
 10  receiving_currency_US Dollar    bool   
 11  receiving_currency_Yuan         bool   
 12  payment_currency_Others         bool   
 13  payment_currency_Swiss Franc    bool   
 14  payment_currency_US Dollar      bool   
 15  payment_currency_Yuan           bool   
 16  payment_format_Cheque           bool   
 17  payment_format_Credit Card 

In [37]:
df1 = df.copy()

all_accounts = pd.concat([df1['account'], df1['account_1']]).unique()

# Create mapping
account_to_id = {acc: idx for idx, acc in enumerate(all_accounts)}

# Map to numeric node IDs
df1['sender_id'] = df1['account'].map(account_to_id)
df1['receiver_id'] = df1['account_1'].map(account_to_id)

df1[['account','sender_id','account_1','receiver_id']].head()

Unnamed: 0,account,sender_id,account_1,receiver_id
0,8000EBD30,0,8000EBD30,0
1,8000F4580,1,8000F5340,399658
2,8000F4670,2,8000F4670,2
3,8000F5030,3,8000F5030,3
4,8000F5200,4,8000F5200,4


In [44]:
num_nodes = len(account_to_id)

In [38]:
df1.drop(columns=['account', 'account_1'], axis=1, inplace=True)

In [21]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5078345 entries, 0 to 5078344
Data columns (total 19 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   from_bank                       int64  
 1   to_bank                         int64  
 2   amount_received                 float64
 3   amount_paid                     float64
 4   is_laundering                   int64  
 5   day                             int32  
 6   receiving_currency_Others       bool   
 7   receiving_currency_Swiss Franc  bool   
 8   receiving_currency_US Dollar    bool   
 9   receiving_currency_Yuan         bool   
 10  payment_currency_Others         bool   
 11  payment_currency_Swiss Franc    bool   
 12  payment_currency_US Dollar      bool   
 13  payment_currency_Yuan           bool   
 14  payment_format_Cheque           bool   
 15  payment_format_Credit Card      bool   
 16  payment_format_Others           bool   
 17  sender_id                  

In [40]:
# 4. GRAPH STRUCTURE
# ------------------------------------------------------------
edge_index = torch.tensor(
df1[['sender_id', 'receiver_id']].values.T,
dtype=torch.long
)

In [42]:
# 5. EDGE FEATURES
# ------------------------------------------------------------
feature_cols = df1.columns.drop([
'sender_id', 'receiver_id', 'is_laundering'
])


edge_attr = torch.tensor(df[feature_cols].astype(float).values, dtype=torch.float)


y = torch.tensor(df['is_laundering'].values, dtype=torch.long)

In [43]:
# 6. TRAIN / VAL / TEST SPLIT (EDGE LEVEL)
# ------------------------------------------------------------
indices = torch.arange(len(y))


train_idx, temp_idx = train_test_split(indices, test_size=0.3, stratify=y)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, stratify=y[temp_idx])

In [45]:
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score

In [46]:
# ------------------------------------------------------------
# 7. PYTORCH GEOMETRIC DATA OBJECT
# ------------------------------------------------------------
data = Data(
edge_index=edge_index,
edge_attr=edge_attr,
y=y,
num_nodes=num_nodes
)

In [47]:
# 8. CLASS IMBALANCE HANDLING (WEIGHTED LOSS)
# ------------------------------------------------------------
fraud_ratio = y.sum().item() / len(y)
class_weights = torch.tensor([fraud_ratio, 1 - fraud_ratio])


loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

In [48]:
# 9. GNN MODEL WITH TEMPORAL AWARENESS
# ------------------------------------------------------------
class TemporalFraudGNN(torch.nn.Module):
    def __init__(self, edge_dim):
        super().__init__()
        self.conv1 = GCNConv(1, 64)
        self.conv2 = GCNConv(64, 32)


        self.edge_mlp = torch.nn.Sequential(
        torch.nn.Linear(32 * 2 + edge_dim, 64),
        torch.nn.ReLU(),
        torch.nn.Linear(64, 2)
        )


    def forward(self, data):
        x = torch.ones((data.num_nodes, 1))


        x = self.conv1(x, data.edge_index)
        x = F.relu(x)
        x = self.conv2(x, data.edge_index)


        src, dst = data.edge_index


        edge_input = torch.cat([
        x[src],
        x[dst],
        data.edge_attr
        ], dim=1)


        return self.edge_mlp(edge_input)


model = TemporalFraudGNN(edge_attr.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [49]:
# 10. TRAINING LOOP
# ------------------------------------------------------------
for epoch in range(30):
    model.train()
    optimizer.zero_grad()


    out = model(data)
    loss = loss_fn(out[train_idx], y[train_idx])


    loss.backward()
    optimizer.step()


    print(f"Epoch {epoch+1:02d} | Train Loss: {loss.item():.4f}")

Epoch 01 | Train Loss: 458813.8125
Epoch 02 | Train Loss: 344503.0938
Epoch 03 | Train Loss: 232172.4062
Epoch 04 | Train Loss: 120404.6406
Epoch 05 | Train Loss: 9234.0215
Epoch 06 | Train Loss: 15741.1338
Epoch 07 | Train Loss: 29255.7207
Epoch 08 | Train Loss: 40387.2227
Epoch 09 | Train Loss: 49444.9531
Epoch 10 | Train Loss: 56763.2969
Epoch 11 | Train Loss: 62340.5000
Epoch 12 | Train Loss: 66649.5781
Epoch 13 | Train Loss: 69755.6719
Epoch 14 | Train Loss: 71700.2344
Epoch 15 | Train Loss: 72654.5703
Epoch 16 | Train Loss: 72713.7266
Epoch 17 | Train Loss: 71976.8125
Epoch 18 | Train Loss: 70522.7891
Epoch 19 | Train Loss: 68417.2344
Epoch 20 | Train Loss: 65714.4375
Epoch 21 | Train Loss: 62462.2812
Epoch 22 | Train Loss: 58692.3242
Epoch 23 | Train Loss: 54405.6758
Epoch 24 | Train Loss: 49651.9141
Epoch 25 | Train Loss: 44467.8047
Epoch 26 | Train Loss: 38881.2266
Epoch 27 | Train Loss: 32913.5000
Epoch 28 | Train Loss: 26591.9004
Epoch 29 | Train Loss: 19926.4121
Epoch 30 | 

In [50]:
# 11. EVALUATION METRICS
# ------------------------------------------------------------
model.eval()
with torch.no_grad():
    logits = model(data)
    probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()


train_auc = roc_auc_score(y[train_idx], probs[train_idx])
val_auc = roc_auc_score(y[val_idx], probs[val_idx])
test_auc = roc_auc_score(y[test_idx], probs[test_idx])


train_pr = average_precision_score(y[train_idx], probs[train_idx])
val_pr = average_precision_score(y[val_idx], probs[val_idx])
test_pr = average_precision_score(y[test_idx], probs[test_idx])


print("\nROC-AUC Scores")
print("Train:", round(train_auc, 4))
print("Val:", round(val_auc, 4))
print("Test:", round(test_auc, 4))


print("\nPrecision-Recall (AP)")
print("Train:", round(train_pr, 4))
print("Val:", round(val_pr, 4))
print("Test:", round(test_pr, 4))


ROC-AUC Scores
Train: 0.4941
Val: 0.4804
Test: 0.4904

Precision-Recall (AP)
Train: 0.001
Val: 0.001
Test: 0.001


In [51]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

In [52]:
# ------------------------------------------------------------
# 12. CONFUSION MATRIX & CLASSIFICATION REPORT (TEST SET)
# ------------------------------------------------------------

# Convert probabilities to class predictions (0/1)
threshold = 0.5   # you can tune this later for fraud recall vs precision
y_pred_test = (probs[test_idx] >= threshold).astype(int)

y_true_test = y[test_idx].cpu().numpy()

# Confusion Matrix
cm = confusion_matrix(y_true_test, y_pred_test)

print("\nConfusion Matrix (Test Set):")
print(cm)

# Classification Report
print("\nClassification Report (Test Set):")
print(classification_report(
    y_true_test,
    y_pred_test,
    digits=4
))



Confusion Matrix (Test Set):
[[386348 374628]
 [   392    384]]

Classification Report (Test Set):
              precision    recall  f1-score   support

           0     0.9990    0.5077    0.6732    760976
           1     0.0010    0.4948    0.0020       776

    accuracy                         0.5077    761752
   macro avg     0.5000    0.5013    0.3376    761752
weighted avg     0.9980    0.5077    0.6726    761752



In [53]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

for t in np.linspace(0.01, 0.9, 10):
    preds = (probs[test_idx] >= t).astype(int)
    p, r, f, _ = precision_recall_fscore_support(
        y_true_test, preds, average='binary', zero_division=0
    )
    print(f"Threshold {t:.2f} | Precision {p:.4f} | Recall {r:.4f} | F1 {f:.4f}")


Threshold 0.01 | Precision 0.0010 | Recall 0.5387 | F1 0.0019
Threshold 0.11 | Precision 0.0010 | Recall 0.5129 | F1 0.0019
Threshold 0.21 | Precision 0.0010 | Recall 0.5064 | F1 0.0020
Threshold 0.31 | Precision 0.0010 | Recall 0.5013 | F1 0.0020
Threshold 0.41 | Precision 0.0010 | Recall 0.4961 | F1 0.0020
Threshold 0.50 | Precision 0.0010 | Recall 0.4948 | F1 0.0020
Threshold 0.60 | Precision 0.0010 | Recall 0.4948 | F1 0.0021
Threshold 0.70 | Precision 0.0010 | Recall 0.4871 | F1 0.0021
Threshold 0.80 | Precision 0.0010 | Recall 0.4820 | F1 0.0021
Threshold 0.90 | Precision 0.0010 | Recall 0.4742 | F1 0.0021
