In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

!pip install torch_geometric
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
import torch_geometric.nn as gnn
from torch_geometric.utils import degree
import torch.nn.functional as F

from sklearn.metrics import roc_auc_score



In [2]:
# Input CSV; the path points into the Kaggle `paysim1` dataset mount.
input_path = "/kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv"
df = pd.read_csv(input_path)

# Fixed seed so the 60/40 row split (and every number derived from it) is the
# same after Restart Kernel -> Run All.
SPLIT_SEED = 42
train_df = df.sample(frac=0.6, random_state=SPLIT_SEED)
# Every row that was not drawn for training goes to the test split.
test_df = df.drop(train_df.index)

In [11]:
def df_to_data(df):
    """Turn a transaction table into a single torch_geometric graph.

    Every distinct value found in the ``nameOrig`` / ``nameDest`` columns
    becomes one node, and every row becomes one directed edge from its
    ``nameOrig`` node to its ``nameDest`` node.

    Args:
        df: DataFrame with ``nameOrig``, ``nameDest`` and ``isFraud`` columns.

    Returns:
        A ``Data`` object holding
        ``x``          - one uniform-random placeholder feature per node,
        ``y``          - int64 label per node: 1 if the node is the
                         ``nameOrig`` of at least one ``isFraud == 1`` row,
                         otherwise 0,
        ``edge_index`` - int64 tensor of shape (2, len(df)).
    """
    # One factorize pass yields both the node list (in order of first
    # appearance, the same order pd.unique produces) and the integer id of
    # every edge endpoint, instead of re-encoding each column separately.
    endpoints = df[["nameOrig", "nameDest"]].to_numpy().ravel()
    codes, cards = pd.factorize(endpoints)

    # ravel() interleaves (orig, dest) per row, so reshaping to (-1, 2) and
    # transposing gives row 0 = source ids and row 1 = target ids.
    # The dtype is pinned to int64 because NumPy's plain `int` is only 32 bits
    # on some platforms, while edge indices are expected to be torch.long.
    edge_index = torch.from_numpy(
        np.ascontiguousarray(codes.reshape(-1, 2).T, dtype=np.int64)
    )

    x = torch.rand((len(cards), 1))

    # Vectorised membership test instead of a Python loop over every node;
    # only the originating side of a fraudulent row is labelled positive.
    fraud_cards = df.loc[df["isFraud"] == 1, "nameOrig"].unique()
    is_fraud = pd.Index(cards).isin(fraud_cards)
    y = torch.from_numpy(is_fraud.astype(np.int64))

    return Data(x=x, y=y, edge_index=edge_index)

# Build one graph per split: training rows first, then the held-out rows.
train_data = df_to_data(df=train_df)
test_data = df_to_data(df=test_df)

In [12]:
# The torch_geometric DataLoader batches graph objects, so it must be given a
# collection of `Data` instances. Passing the raw DataFrame constructs fine but
# breaks on the first iteration, when the loader indexes it by integer position.
# The training split is a single graph, hence the one-element list.
train_loader = DataLoader([train_data], batch_size=32)

In [13]:
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network returning per-node log-probabilities.

    The input feature of every node is computed inside ``forward`` from
    ``edge_index`` (a degree count); features stored in ``data.x`` are not used.
    """

    def __init__(self, num_node_features, num_classes):
        """Create the two convolution layers.

        Args:
            num_node_features: width of the per-node input given to the first
                convolution. ``forward`` feeds a single degree column, so this
                has to be 1 for the shapes to line up.
            num_classes: number of output scores per node.
        """
        super().__init__()
        hidden_channels = 16
        self.conv1 = gnn.GCNConv(num_node_features, hidden_channels)
        self.conv2 = gnn.GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Score every node of ``data``; log-softmax is taken over dim 1."""
        edges = data.edge_index

        # Count how often each node occurs in row 0 of edge_index (the source
        # row) and use that count, as a column vector, as the only feature.
        node_degree = degree(edges[0], data.num_nodes).unsqueeze(-1)

        hidden = F.relu(self.conv1(node_degree, edges))
        # Dropout is active only while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        scores = self.conv2(hidden, edges)

        return F.log_softmax(scores, dim=1)

In [26]:
# Seed PyTorch before the model exists so weight initialisation and dropout
# masks, and with them the logged losses, repeat from run to run (some GPU
# kernels may still be nondeterministic).
TRAIN_SEED = 0
torch.manual_seed(TRAIN_SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One input feature per node, two output classes.
model = GCN(1, 2).to(device)
train_data = train_data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
# Per-class weights: class 1 is weighted far more heavily than class 0.
# NOTE: GCN.forward already returns log-softmax output. CrossEntropyLoss
# applies log-softmax again, which leaves normalised log-probabilities
# unchanged, so this is equivalent to a weighted negative log-likelihood.
loss_fn = torch.nn.CrossEntropyLoss(weight=torch.tensor([0.001, 0.769]).to(device))

model.train()
# Full-batch training: each epoch is one forward/backward pass over the whole
# training graph.
for epoch in range(200):
    optimizer.zero_grad()
    out = model(train_data)
    loss = loss_fn(out, train_data.y)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, loss {loss}")

Epoch 0, loss 0.7540538907051086
Epoch 10, loss 0.6808944344520569
Epoch 20, loss 0.6712062358856201
Epoch 30, loss 0.669948160648346
Epoch 40, loss 0.6680005192756653
Epoch 50, loss 0.6657681465148926
Epoch 60, loss 0.6666455268859863
Epoch 70, loss 0.6652311682701111
Epoch 80, loss 0.6643689274787903
Epoch 90, loss 0.6618225574493408
Epoch 100, loss 0.6584250330924988
Epoch 110, loss 0.6560139656066895
Epoch 120, loss 0.6477935314178467
Epoch 130, loss 0.6394631266593933
Epoch 140, loss 0.6315372586250305
Epoch 150, loss 0.6230282187461853
Epoch 160, loss 0.6177523136138916
Epoch 170, loss 0.6124471426010132
Epoch 180, loss 0.6077969074249268
Epoch 190, loss 0.6052228808403015


In [27]:
model.eval()
# Keep the reference returned by .to() so predictions and labels are
# guaranteed to live on the same device for the comparisons below.
test_data = test_data.to(device)
# Inference only: skip autograd bookkeeping for the whole test graph.
with torch.no_grad():
    pred = model(test_data).argmax(dim=1)

# Confusion-matrix counts, treating class 1 as the positive class.
tp = int(((pred == 1) & (test_data.y == 1)).sum())
fp = int(((pred == 1) & (test_data.y == 0)).sum())
tn = int(((pred == 0) & (test_data.y == 0)).sum())
fn = int(((pred == 0) & (test_data.y == 1)).sum())

print(f"{tp=} {fp=} {tn=} {fn=}")

# A metric whose denominator is empty (e.g. no positive predictions) is
# reported as nan instead of raising ZeroDivisionError.
total = tp + fp + tn + fn
accuracy = (tp + tn) / total if total else float("nan")
precision = tp / (tp + fp) if (tp + fp) else float("nan")
recall = tp / (tp + fn) if (tp + fn) else float("nan")

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

tp=4 fp=1464 tn=3850406 fn=3297
Accuracy: 0.9988
Precision: 0.0027
Recall: 0.0012
