In [1]:
import torch

# Choose the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
# The conditional expression short-circuits, so the MPS check only runs when CUDA is absent.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

print(device)

cuda


In [None]:
# Mac-only override. The platform check makes this cell safe to execute as part of
# "Restart & Run All" on any machine: off macOS it leaves `device` untouched.

import platform

import torch

# mps runs out of memory without batches, and neighbor batches aren't supported on mac yet,
# so fall back to the CPU whenever the notebook is running on macOS
if platform.system() == 'Darwin':
    device = torch.device('cpu')

In [3]:
import os

# Install the Kaggle API client
!pip install kaggle

# Create the .kaggle directory if it doesn't exist
os.makedirs('/root/.kaggle', exist_ok=True)



In [4]:
import getpass
import json
import os

# Kaggle API credentials are taken from the KAGGLE_USERNAME / KAGGLE_KEY environment
# variables, or prompted for interactively when those are not set.
# Never paste the key into the notebook itself: notebooks get shared and committed, and
# any key that has ever appeared in one must be treated as leaked and revoked on Kaggle.
kaggle_username = os.environ.get('KAGGLE_USERNAME') or input('Kaggle username: ')
kaggle_key = os.environ.get('KAGGLE_KEY') or getpass.getpass('Kaggle API key: ')
kaggle_credentials = {"username": kaggle_username, "key": kaggle_key}

# Make sure the target directory exists even if the install cell was skipped
kaggle_dir = '/root/.kaggle'
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json = os.path.join(kaggle_dir, 'kaggle.json')

# Write the credentials to kaggle.json
with open(kaggle_json, 'w') as f:
    json.dump(kaggle_credentials, f)

# Restrict the file to the owner (read/write only)
os.chmod(kaggle_json, 0o600)

print("Kaggle API key configured successfully!")

Kaggle API key configured successfully!


In [5]:
# Create the dataset directory if it doesn't exist
import os
os.makedirs('dataset', exist_ok=True)

# Download and unzip the dataset into the 'dataset' directory
!kaggle datasets download -d ealtman2019/ibm-transactions-for-anti-money-laundering-aml -p dataset --unzip

print("Dataset downloaded and unzipped successfully into 'dataset' directory!")

Dataset URL: https://www.kaggle.com/datasets/ealtman2019/ibm-transactions-for-anti-money-laundering-aml
License(s): Community Data License Agreement - Sharing - Version 1.0
Downloading ibm-transactions-for-anti-money-laundering-aml.zip to dataset
100% 7.59G/7.61G [00:19<00:00, 332MB/s]
100% 7.61G/7.61G [00:19<00:00, 429MB/s]
Dataset downloaded and unzipped successfully into 'dataset' directory!


In [6]:
import pandas as pd
from pathlib import Path

# Both tables are read from the local 'dataset' directory:
# one row per account (nodes) and one row per transaction (edges).
dataset_dir = Path("dataset")
node_path = dataset_dir.joinpath("HI-Small_accounts.csv")
edge_path = dataset_dir.joinpath("HI-Small_Trans.csv")

node_data = pd.read_csv(node_path)
edge_data = pd.read_csv(edge_path)

In [7]:
import numpy as np

# use indices from accounts dataset for node indices
accounts = node_data.reset_index()[['Account Number', 'index']]
num_nodes = accounts.shape[0]

# account number -> node index. Built in a single pass over the two columns instead of
# one Series lookup per row; if an account number repeats, the last row wins.
compact = dict(zip(accounts['Account Number'], accounts['index']))

# element-wise lookup over an array/Series of account numbers; raises KeyError for an
# account number that is not in the accounts table
to_node = np.vectorize(lambda x: compact[x])

In [9]:
!pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.7.0


In [10]:
from torch_geometric.data import Data

# Edge list in COO layout: row 0 holds the node index of each transaction's 'Account'
# column, row 1 the node index of its 'Account.1' column.
source = to_node(edge_data['Account'])
target = to_node(edge_data['Account.1'])
coo = np.vstack((source, target))
edge_index = torch.from_numpy(coo).to(device)

# one column per edge
num_edges = edge_index.size(1)

g = Data(num_nodes=num_nodes, edge_index=edge_index)

In [11]:
from torch_geometric.utils import degree
import torch.nn.functional as F
from sklearn.preprocessing import OneHotEncoder

# create bank frequency column: for every account, how many accounts share its bank
freq = node_data['Bank ID'].value_counts()
id_freq = np.vectorize(lambda x: freq[x])  # kept so any other cell that calls it still works
# Series.map does the same per-row lookup as id_freq without a Python call per account
node_data['Bank Frequency'] = node_data['Bank ID'].map(freq)
# Bucket the counts into 4 ordinal classes. The last bin is open-ended so a bank with more
# than 4000 accounts falls in class 3 instead of silently becoming NaN (which would add
# an extra one-hot column and change the node feature width).
node_data['Bank Frequency'] = pd.cut(node_data['Bank Frequency'], bins=[0, 2, 10, 100, np.inf], labels=[0, 1, 2, 3])

# use one hot encoding for categorical variables
bank_enc = OneHotEncoder(sparse_output=False)
bank_frequency = bank_enc.fit_transform(node_data['Bank Frequency'].to_numpy().reshape(-1, 1))

# NOTE(review): currency_sent (and amount_sent below) are computed but not included in
# edge_features — confirm whether leaving the sent-side features out is intentional.
paid_enc = OneHotEncoder(sparse_output=False)
currency_sent = paid_enc.fit_transform(edge_data['Payment Currency'].to_numpy().reshape(-1, 1))

received_enc = OneHotEncoder(sparse_output=False)
currency_received = received_enc.fit_transform(edge_data['Receiving Currency'].to_numpy().reshape(-1, 1))

format_enc = OneHotEncoder(sparse_output=False)
pay_format = format_enc.fit_transform(edge_data['Payment Format'].to_numpy().reshape(-1, 1))

# create numerical variables
# timestamps become seconds (int64 nanoseconds divided by 1e9)
time_trans = pd.to_datetime(edge_data['Timestamp']).astype('int64') / 1e9
amount_sent = edge_data['Amount Paid'].to_numpy()
amount_received = edge_data['Amount Received'].to_numpy()

# combine all edge features into one tensor with dtype = float32
# column order: time, amount received, one-hot receiving currency, one-hot payment format
edge_features = torch.from_numpy(np.column_stack([time_trans, amount_received,
                                                  currency_received, pay_format])).float().to(device)
edge_dim = edge_features.shape[1]

# edge label with 0 = not laundering, 1 = is laundering
label = torch.from_numpy(edge_data['Is Laundering'].to_numpy()).long().to(device)

g.x = torch.from_numpy(bank_frequency).float().to(device)
# NOTE(review): F.normalize with its default arguments rescales every ROW to unit L2 norm.
# With second-resolution timestamps and raw amounts in the same row as 0/1 indicators, the
# large-magnitude columns presumably dominate — confirm that per-row (rather than
# per-column) scaling is what is wanted here.
g.edge_attr = F.normalize(edge_features)
g.edge_label = label

In [12]:
# chronological 60/20/20 split
# NOTE(review): "chronological" presumably relies on the rows of edge_data already being in
# timestamp order — nothing in this cell sorts them; confirm.

train_end = int(0.6 * num_edges)
val_end = int(0.8 * num_edges)

# position of every edge in the edge list; comparing it against the two cut points yields
# three disjoint boolean masks of length num_edges
edge_pos = torch.arange(num_edges)

train_idx = edge_pos < train_end
val_idx = (edge_pos >= train_end) & (edge_pos < val_end)
test_idx = edge_pos >= val_end

#g.train_mask = train_idx
#g.val_mask = val_idx
#g.test_mask = test_idx

In [13]:
# hyperparameters

# --- optimisation ---
epochs = 20           # passes over the training loader
learn_rate = 0.005    # Adam learning rate
pos_weight = 10       # loss weight of the laundering class (class 0 has weight 1)

# --- model size / regularisation ---
embedding_dim = 64    # width of node and edge embeddings
hidden = 64           # width of the classifier's first hidden layer
dropout = 0.5         # dropout probability in the classifier

# --- neighbour sampling ---
num_neighbors = [20, 10]  # passed to the link loaders as num_neighbors
batch_size = 1024         # passed to the link loaders as batch_size

In [14]:
from torch_geometric.nn import GINEConv
import torch.nn as nn

# node embedding model
class GNN(nn.Module):
    """Edge classifier built from two GINEConv rounds with interleaved edge updates.

    Raw node and edge features are projected to a common width, refined by two rounds
    of "node convolution, then edge update", and each edge is finally scored from the
    concatenation of its two endpoint embeddings and its own embedding.

    Args:
        in_channels: width of the raw node feature vectors.
        hidden_channels: width of the classifier's first hidden layer.
        embedding_dim: common width of node and edge embeddings.
        edge_feat_dim: width of the raw edge feature vectors.
        dropout: dropout probability used inside the classifier.
    """

    def __init__(self, in_channels, hidden_channels, embedding_dim, edge_feat_dim, dropout):
        super().__init__()
        self.in_channels = in_channels
        self.hidden = hidden_channels
        self.dim = embedding_dim
        self.edge_dim = edge_feat_dim
        self.drop = dropout

        width = self.dim

        # linear projections of the raw node / edge features to the embedding width
        self.node_embedding = nn.Linear(self.in_channels, width)
        self.edge_embedding = nn.Linear(self.edge_dim, width)

        # two node convolutions, each wrapping its own two-layer MLP
        self.conv1 = GINEConv(self._conv_mlp(width), edge_dim=width)
        self.conv2 = GINEConv(self._conv_mlp(width), edge_dim=width)

        # edge update: maps (source, target, edge) embeddings back to the embedding width;
        # the same module is applied after both convolutions
        self.mlp_edge = nn.Sequential(
            nn.Linear(3 * width, width),
            nn.ReLU(),
            nn.Linear(width, width),
        )

        # classifier: (source, target, edge) embeddings -> 2 class logits
        half = self.hidden // 2
        self.mlp_classifier = nn.Sequential(
            nn.Linear(3 * width, self.hidden),
            nn.ReLU(),
            nn.Dropout(self.drop),
            nn.Linear(self.hidden, half),
            nn.ReLU(),
            nn.Dropout(self.drop),
            nn.Linear(half, 2),
        )

    @staticmethod
    def _conv_mlp(width):
        """Return the Linear -> ReLU -> Linear network wrapped by one convolution."""
        return nn.Sequential(
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Linear(width, width),
        )

    def forward(self, x, edge_index, edge_attr):
        """Return one row of 2 class logits for every column of ``edge_index``."""
        src, dst = edge_index[0], edge_index[1]
        h = self.node_embedding(x)
        e = self.edge_embedding(edge_attr)

        for conv in (self.conv1, self.conv2):
            # node update: average the previous state with the activated convolution output
            h = 0.5 * (h + F.relu(conv(h, edge_index, e)))
            # edge update: residual step computed from the refreshed endpoint embeddings
            e = e + 0.5 * self.mlp_edge(torch.cat([h[src], h[dst], e], dim=-1))

        # combine embedding of source, target, and edge features
        endpoints = torch.cat([h[src], h[dst]], dim=-1).relu()
        return self.mlp_classifier(torch.cat([endpoints, e], dim=1))

In [15]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, precision_recall_curve, accuracy_score, precision_score, recall_score, f1_score

# evaluate performance
def evaluate(split_idx):
    """Score the global model ``gnn`` on the edges selected by ``split_idx``.

    The model is run once over the whole graph ``g`` in eval mode; predictions are the
    argmax over the two class logits.

    Args:
        split_idx: boolean mask over the edges of ``g`` selecting the split to score.

    Returns:
        ``[accuracy, precision, recall, f1]``; precision, recall and f1 are reported as
        0.0 when they are undefined.
    """
    gnn.eval()
    with torch.no_grad():
        logits = gnn(g.x, g.edge_index, g.edge_attr)

    # scikit-learn needs host-side arrays; tensors living on an accelerator cannot be
    # converted implicitly, so copy them to the CPU first (a no-op when already there)
    y = g.edge_label[split_idx].cpu().numpy()
    preds = logits[split_idx].argmax(dim=-1).cpu().numpy()

    accuracy = accuracy_score(y, preds)
    precision = precision_score(y, preds, zero_division=0.0)
    recall = recall_score(y, preds, zero_division=0.0)
    f1 = f1_score(y, preds, zero_division=0.0)
    return [accuracy, precision, recall, f1]

def plots(split_idx):
    """Draw the diagnostic figures for the edges selected by ``split_idx``.

    Produces three figures: the confusion matrix at the default (argmax) threshold, the
    per-epoch loss recorded in the global ``loss_values``, and the precision-recall curve.

    Args:
        split_idx: boolean mask over the edges of ``g`` selecting the split to plot.
    """
    gnn.eval()
    with torch.no_grad():
        logits = gnn(g.x, g.edge_index, g.edge_attr)[split_idx]

    # scikit-learn / matplotlib need host-side arrays (no-op copy when already on the CPU)
    y = g.edge_label[split_idx].cpu().numpy()
    preds = logits.argmax(dim=-1).cpu().numpy()
    # probability of the laundering class; unlike the raw class-1 logit it ranks edges
    # consistently with the argmax decision
    scores = torch.softmax(logits, dim=-1)[:, 1].cpu().numpy()

    # plot confusion matrix for default threshold
    # (draw on an explicit axis — otherwise the display opens a second, separate figure)
    fig, ax = plt.subplots()
    ConfusionMatrixDisplay(confusion_matrix(y, preds)).plot(ax=ax)
    ax.set_title('Confusion Matrix')
    plt.show()

    # plot loss curve; loss_values is filled from the training batches, one entry per epoch
    fig, ax = plt.subplots()
    ax.plot(loss_values)
    ax.set_xticks(range(len(loss_values)))
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title('Training Loss')
    plt.show()

    # plot precision-recall curve
    fig, ax = plt.subplots()
    precision, recall, thresholds = precision_recall_curve(y, scores)
    ax.plot(recall, precision, label='Precision-Recall Curve')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall Curve')
    plt.show()

In [16]:
from torch_geometric.loader import LinkNeighborLoader


def _make_link_loader(split_mask, shuffle):
    """Build a LinkNeighborLoader over graph ``g`` seeded with the edges in ``split_mask``.

    Args:
        split_mask: boolean mask over the edges selecting the seed edges of this split.
        shuffle: whether the seed edges are visited in random order.
    """
    return LinkNeighborLoader(g, num_neighbors=num_neighbors,
                              edge_label_index=edge_index[:, split_mask],
                              # labels live on the graph object; there is no standalone
                              # `edge_label` variable in this notebook
                              edge_label=g.edge_label[split_mask],
                              batch_size=batch_size,
                              shuffle=shuffle)


# only the training split benefits from a random visiting order
train_loader = _make_link_loader(train_idx, shuffle=True)
val_loader = _make_link_loader(val_idx, shuffle=False)
test_loader = _make_link_loader(test_idx, shuffle=False)

  neighbor_sampler = NeighborSampler(


In [30]:
# Environment repair cell: replaces the installed PyTorch / PyG packages with a pinned set.
#
# NOTE(review): this cell carries execution count 30 but sits between cells 16 and 17, so
#   it was run out of order; it presumably belongs at the very top of the notebook.
# NOTE(review): torch has already been imported by earlier cells. Swapping the package
#   under a live kernel presumably needs a kernel restart before the new build is used.
# NOTE(review): the saved output of this cell prints "2.9.0+cu126" and refers to
#   "torch-{TORCH}.html", neither of which the code below produces — the output appears
#   to come from an earlier revision of the cell and should be cleared.

# Uninstall existing PyTorch and PyG components to ensure a clean install
!pip uninstall torch torch-scatter torch-sparse torch-geometric torch-cluster torchvision --y

# Install PyTorch 2.1.0 with CUDA 12.1
!pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu121

# Set the PyTorch version for PyG dependencies
# (the values are literals; the import below is not used by anything in this cell)
import torch
TORCH_VERSION = "2.1.0"
CUDA_VERSION = "121"

# Install torch-scatter, torch-sparse, torch-cluster, and torch-geometric
# using pre-compiled wheels for PyTorch 2.1.0 + cu121
# ({TORCH_VERSION} / {CUDA_VERSION} are filled in from the Python variables above)
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{TORCH_VERSION}+cu{CUDA_VERSION}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{TORCH_VERSION}+cu{CUDA_VERSION}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-{TORCH_VERSION}+cu{CUDA_VERSION}.html
!pip install torch-geometric==2.5.0 -f https://data.pyg.org/whl/torch-{TORCH_VERSION}+cu{CUDA_VERSION}.html

# This message is printed unconditionally; it does not check that the installs succeeded.
print("PyTorch and PyTorch Geometric dependencies installed successfully!")

2.9.0+cu126
[0mFound existing installation: torch-geometric 2.7.0
Uninstalling torch-geometric-2.7.0:
  Successfully uninstalled torch-geometric-2.7.0
[0mLooking in links: https://data.pyg.org/whl/torch-{TORCH}.html
Collecting torch-scatter
  Downloading torch_scatter-2.1.2.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torch-scatter
  Building wheel for torch-scatter (setup.py) ... [?25l[?25hcanceled[31mERROR: Operation cancelled by user[0m[31m
[0mLooking in links: https://data.pyg.org/whl/torch-{TORCH}.html
Collecting torch-sparse
  Using cached torch_sparse-0.6.18.tar.gz (209 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torch-sparse
[31mERROR: Operation cancelled by user[0m[31m
[0m  Building wheel for torch-sparse (setup.py) ... [?25

In [31]:
# This cell is no longer needed as the full installation is handled in the previous cell.
# Keeping it here as a placeholder in case it was referenced elsewhere, but its content is cleared.
# If you still want to run this, you might uncomment its content, but it's redundant now.
# import torch
# TORCH_VERSION = torch.__version__
# CUDA_VERSION = torch.version.cuda.replace(".", "") if torch.version.cuda else 'cpu'
# print(f"Detected PyTorch version: {TORCH_VERSION}")
# print(f"Detected CUDA version: {CUDA_VERSION}")
# print(f"Attempting to install torch-sparse compatible with PyTorch {TORCH_VERSION} and CUDA {CUDA_VERSION}...")
# !pip install torch-sparse -f https://data.pyg.org/whl/torch-{TORCH_VERSION}+cu{CUDA_VERSION}.html

In [17]:
import torch.optim as optim

# instantiate model
# The node input width is read from the feature matrix rather than hard-coded, so the
# model stays in sync with however many node feature columns were produced.
gnn = GNN(g.x.shape[1], hidden, embedding_dim, edge_dim, dropout).to(device)
# class weights: weight 1 for class 0, pos_weight for class 1
weight = torch.tensor([1, pos_weight]).float().to(device)
optimizer = optim.Adam(gnn.parameters(), lr=learn_rate)
criterion = nn.CrossEntropyLoss(weight=weight).to(device)

# training loop
loss_values = []  # mean training loss of each epoch
for epoch in range(epochs):
    gnn.train()  # evaluate() leaves the model in eval mode, so switch back every epoch
    epoch_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        # NOTE(review): the model returns one row per column of batch.edge_index (every
        # edge of the sampled subgraph), while batch.edge_label presumably holds labels
        # for the seed edges only — confirm the two line up before trusting this loss.
        logits = gnn(batch.x, batch.edge_index, batch.edge_attr)
        loss = criterion(logits, batch.edge_label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        num_batches += 1

    # average over the epoch instead of reporting whichever batch happened to be last;
    # max(..., 1) keeps an empty loader from dividing by zero
    mean_loss = epoch_loss / max(num_batches, 1)
    loss_values.append(mean_loss)
    val_metrics = evaluate(val_idx)
    print(f'Epoch {epoch+1:02d} | loss {mean_loss:.4f} | accuracy {val_metrics[0]:.4f} | precision {val_metrics[1]:.4f}| recall {val_metrics[2]:.4f}| f1 {val_metrics[3]:.4f}')

# evaluate metrics on test data
test_metrics = evaluate(test_idx)
print('\n')
print(f'Test Accuracy: {test_metrics[0]:.4f}')
print(f'Test Precision: {test_metrics[1]:.4f}')
print(f'Test Recall: {test_metrics[2]:.4f}')
print(f'Test f1: {test_metrics[3]:.4f}')


plots(test_idx)

ImportError: 'NeighborSampler' requires either 'pyg-lib' or 'torch-sparse'