# Installing PyTorch and PyTorch Geometric

In [1]:
# Mount Google Drive at /content/drive (Colab only) so that files stored on
# Drive can be read through the local filesystem by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np

**On Colab:** click *Restart runtime* if you are asked to after running the installation cell below.

In [3]:
# Enforce pytorch version 1.6.0
import torch
if torch.__version__ != '1.6.0':
  !pip uninstall torch -y
  !pip uninstall torchvision -y
  !pip install torch==1.6.0
  !pip install torchvision==0.7.0

# Check pytorch version and make sure you use a GPU Kernel
!python -c "import torch; print(torch.__version__)"
!python -c "import torch; print(torch.version.cuda)"
!python --version
!nvidia-smi

1.6.0
10.2
Python 3.7.12
NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [4]:
import torch
pytorch_version = f"torch-{torch.__version__}+cu{torch.version.cuda.replace('.', '')}.html"
!pip install --no-index torch-scatter -f https://pytorch-geometric.com/whl/$pytorch_version
!pip install --no-index torch-sparse -f https://pytorch-geometric.com/whl/$pytorch_version
!pip install --no-index torch-cluster -f https://pytorch-geometric.com/whl/$pytorch_version
!pip install --no-index torch-spline-conv -f https://pytorch-geometric.com/whl/$pytorch_version
!pip install torch-geometric

Looking in links: https://pytorch-geometric.com/whl/torch-1.6.0+cu102.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.6.0%2Bcu102/torch_scatter-2.0.6-cp37-cp37m-linux_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 2.1 MB/s 
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.0.6
Looking in links: https://pytorch-geometric.com/whl/torch-1.6.0+cu102.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.6.0%2Bcu102/torch_sparse-0.6.9-cp37-cp37m-linux_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 2.1 MB/s 
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.9
Looking in links: https://pytorch-geometric.com/whl/torch-1.6.0+cu102.html
Collecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-1.6.0%2Bcu102/torch_cluster-1.5.9-cp37-cp37m-linux_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 2.1 

# Loading the dataset into PyG

In [5]:
import torch
import torch_geometric
from torch_geometric.data import InMemoryDataset, Data
import os
import random
from math import floor

For custom datasets, you must subclass either `InMemoryDataset` or `Dataset` (for larger datasets).

In [8]:
import torch.nn.functional as F

class Transactions(InMemoryDataset):
    """Single-graph dataset built from account and transaction CSV files.

    Nodes are the rows of ``accounts.csv`` and edges are the distinct
    ``(orig_acct, bene_acct)`` pairs found in ``transactions.csv``.

    * Node feature: ``initial_deposit``, divided by its maximum.
    * Edge features: number of transactions, mean ``base_amt`` and maximum
      ``base_amt`` for the pair, each divided by its column maximum.
    * Labels: ``prior_sar_count`` mapped to {0, 1} and one-hot encoded.
    * Masks: a random ``TRAIN_FRACTION`` of the nodes is marked for training,
      the remaining nodes for testing.

    Args:
        root: Dataset root directory, forwarded to ``InMemoryDataset``.
        test: Currently unused; kept so existing callers keep working.
        transform: Forwarded to ``InMemoryDataset``.
        pre_transform: Forwarded to ``InMemoryDataset``.
    """

    # Share of nodes placed in the training mask (the rest form the test mask).
    TRAIN_FRACTION = 0.9

    def __init__(self, root, test=False, transform=None, pre_transform=None):
        super().__init__(root, transform, pre_transform)
        # NOTE: torch.load unpickles the file; only load processed files that
        # were produced by this class.
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Names of the raw CSV files read by :meth:`process`."""
        return ["accounts.csv", "transactions.csv"]

    @property
    def processed_file_names(self):
        """Name of the file the processed graph is saved to."""
        return ['graph.pt']

    def download(self):
        """No-op: the raw files are expected to be present already."""
        pass

    def process(self):
        """Build the graph from the raw CSV files and save it to disk."""
        # Read files from csv
        accounts = pd.read_csv(self.raw_paths[0],
                 index_col='acct_id',
                 usecols=["acct_id", "initial_deposit", "prior_sar_count"])
        transactions = pd.read_csv(self.raw_paths[1],
                 index_col='tran_id',
                 usecols=["tran_id", "orig_acct", "bene_acct", "base_amt"])

        # Process Edge Features
        ## Collapse multi edges between two nodes into one edge described by
        ## the transaction count and the mean / max transferred amount.
        grouped = transactions.groupby(['orig_acct', 'bene_acct'])
        features = (
            grouped.size().to_frame(name='counts')
            .join(grouped['base_amt'].agg(['mean', 'max']))
            .reset_index()
        )
        # NOTE(review): account ids are used directly as node indices, which
        # assumes acct_id values are 0..N-1 in the row order of accounts.csv;
        # confirm against the data.
        edge_index = torch.tensor(features[['orig_acct', 'bene_acct']].values)
        edge_attributes = torch.tensor(features[['counts', 'mean', 'max']].values, dtype=torch.float)
        edge_attributes = edge_attributes / edge_attributes.max(0, keepdim=True)[0] # Normalizing values

        # Process Node Features (one column: the initial deposit)
        x = torch.tensor(accounts['initial_deposit'].values, dtype=torch.float).unsqueeze(1)
        x = x / x.max(0, keepdim=True)[0] # Normalizing Values

        # Process Node Labels
        # Element-wise comparison on an array, so `is True` is not applicable here.
        labels = np.where(accounts['prior_sar_count'].values == True, 1, 0)  # noqa: E712
        # F.one_hot requires an integer (long) tensor; make the dtype explicit.
        y = F.one_hot(torch.tensor(labels, dtype=torch.long), num_classes=2)

        # Create Masks for training and test set, sized from the data itself
        # rather than from a hard-coded node count.
        num_nodes = x.size(0)
        num_train = int(self.TRAIN_FRACTION * num_nodes)
        train_mask = np.zeros(num_nodes, dtype=bool)
        train_mask[:num_train] = True
        np.random.shuffle(train_mask)
        train_mask = torch.tensor(train_mask, dtype=torch.bool)
        test_mask = ~train_mask  # every node not used for training

        data = Data(
            x=x,
            edge_index=edge_index.t().contiguous(),
            edge_attr=edge_attributes,
            train_mask=train_mask,
            test_mask=test_mask,
            y=y
        )
        # Save dataset
        data, slices = self.collate([data])
        torch.save((data, slices), self.processed_paths[0])

In [9]:
# Directory on the mounted Drive that is handed to the dataset as its root.
DATASET_ROOT = '/content/drive/MyDrive/BankProject/'
dataset = Transactions(root=DATASET_ROOT)

## Dataset Statistics

In [10]:
# Dataset-level summary.
dataset_summary = [
    f'Dataset: {dataset}:',
    '======================',
    f'Number of graphs: {len(dataset)}',
    f'Number of classes: {dataset.num_classes}',
    f'Number of node features: {dataset.num_node_features}',
    f'Number of edge features: {dataset.num_edge_features}',
]
for summary_line in dataset_summary:
    print(summary_line)
data = dataset[0]  # Get the first graph object.

print()
print(data)
print('===========================================================================================================')

# Gather some statistics about the graph.
graph_summary = [
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Number of training set: {data.train_mask.sum()}',
    f'Number of test set: {data.test_mask.sum()}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
]
for summary_line in graph_summary:
    print(summary_line)

Dataset: Transactions():
Number of graphs: 1
Number of classes: 2
Number of node features: 1
Number of edge features: 3

Data(x=[20000, 1], edge_index=[2, 189726], edge_attr=[189726, 3], y=[20000, 2], train_mask=[20000], test_mask=[20000])
Number of nodes: 20000
Number of edges: 189726
Number of training set: 18000
Number of test set: 2000
Average node degree: 9.49
Has isolated nodes: True
Has self-loops: False
Is undirected: False


# Defining Neural Network Architecture

In [11]:
import torch
from torch_geometric.nn import TransformerConv, Linear

class GCN(torch.nn.Module):
    """Node classifier: two ``TransformerConv`` layers and a two-layer linear head.

    The defaults reproduce the original fixed architecture
    (1 -> 3x64 -> 3x64 -> 32 -> 2 with 3 edge features).

    Args:
        in_channels: Number of input features per node.
        hidden_channels: Output channels per attention head of each conv layer.
        edge_dim: Number of features per edge.
        heads: Number of attention heads in each conv layer.
        mlp_channels: Width of the hidden linear layer.
        num_classes: Number of output logits per node.
    """

    def __init__(self, in_channels=1, hidden_channels=64, edge_dim=3, heads=3,
                 mlp_channels=32, num_classes=2):
        super().__init__()
        # The layers below are sized on the basis that each conv layer emits
        # hidden_channels * heads features per node.
        conv_out_channels = hidden_channels * heads
        self.conv1 = TransformerConv(in_channels, hidden_channels, edge_dim=edge_dim, heads=heads)
        self.conv2 = TransformerConv(conv_out_channels, hidden_channels, edge_dim=edge_dim, heads=heads)
        self.linear1 = torch.nn.Linear(conv_out_channels, mlp_channels)
        self.linear2 = torch.nn.Linear(mlp_channels, num_classes)

    def forward(self, data):
        """Return one row of unnormalised class scores (logits) per node.

        Args:
            data: Graph object providing ``x``, ``edge_index`` and ``edge_attr``.
        """
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        return x # A Softmax layer is further applied using CrossEntropy loss

In [12]:
# Seed the RNG before building the model so that weight initialisation is
# repeatable across runs of the notebook.
SEED = 42
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)

data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Switch to training mode; as the last expression this also displays the model.
model.train()

GCN(
  (conv1): TransformerConv(1, 64, heads=3)
  (conv2): TransformerConv(192, 64, heads=3)
  (linear1): Linear(in_features=192, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=2, bias=True)
)

In [13]:
# Total number of scalar weights that receive gradients.
num_trainable_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(f"number of parameters: {num_trainable_params}")

number of parameters: 157154


# Training

In [14]:
NUM_EPOCHS = 200
# Weight of each class in the loss: class 1 counts 25.5x as much as class 0.
# NOTE(review): presumably chosen to offset class imbalance - confirm the value.
imbalance = torch.tensor([1.0, 25.5], device=device)

# Loop-invariant pieces are built once instead of on every epoch.
criterion = torch.nn.CrossEntropyLoss(weight=imbalance)
train_targets = data.y[data.train_mask].argmax(-1)  # one-hot -> class index

# Make sure the model is in training mode even if an evaluation cell ran
# before this cell was (re-)executed.
model.train()
for epoch in range(NUM_EPOCHS):
    optimizer.zero_grad()
    out = model(data)
    # Full-batch training: the loss is computed on the training nodes only.
    loss = criterion(out[data.train_mask], train_targets)

    loss.backward()
    optimizer.step()
    print(f"epoch {epoch}: {loss.item()}")

epoch 0: 0.7033388018608093
epoch 1: 0.9092857837677002
epoch 2: 0.8066948652267456
epoch 3: 0.7400979995727539
epoch 4: 0.6893926858901978
epoch 5: 0.7126712203025818
epoch 6: 0.6984794735908508
epoch 7: 0.686921238899231
epoch 8: 0.6937553882598877
epoch 9: 0.6894543170928955
epoch 10: 0.680950939655304
epoch 11: 0.6817549467086792
epoch 12: 0.6775029301643372
epoch 13: 0.6718550324440002
epoch 14: 0.6683871746063232
epoch 15: 0.6593403816223145
epoch 16: 0.6518533229827881
epoch 17: 0.636893093585968
epoch 18: 0.6221458911895752
epoch 19: 0.5986725091934204
epoch 20: 0.5694953799247742
epoch 21: 0.5349553227424622
epoch 22: 0.5166204571723938
epoch 23: 0.6710770726203918
epoch 24: 0.5203755497932434
epoch 25: 0.5193316340446472
epoch 26: 0.4916556775569916
epoch 27: 0.47148799896240234
epoch 28: 0.44983989000320435
epoch 29: 0.45902150869369507
epoch 30: 0.40949997305870056
epoch 31: 0.43941161036491394
epoch 32: 0.3812395930290222
epoch 33: 0.4164711833000183
epoch 34: 0.3569839894

# Model Evaluation

## Test

In [15]:
from sklearn.metrics import confusion_matrix

model.eval()
# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    pred = model(data).argmax(dim=1)

# Passing labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking below
# still works when one of the classes is absent from the test split.
tn, fp, fn, tp = confusion_matrix(
    data.y[data.test_mask].argmax(-1).cpu(),  # one-hot labels -> class index
    pred[data.test_mask].cpu(),
    labels=[0, 1],
).ravel()

print(f'True Positive: {tp}')
print(f'False Positive: {fp}')
print(f'True Negative: {tn}')
print(f'False Negative: {fn}')

True Positive: 69
False Positive: 53
True Negative: 1876
False Negative: 2


In [17]:
# Precision = TP / (TP + FP), recall = TP / (TP + FN).
# A zero denominator (no predicted / no actual positives) is reported as 0.0
# instead of producing a division-by-zero result.
predicted_positives = tp + fp
actual_positives = tp + fn
precision = tp / predicted_positives if predicted_positives else 0.0
recall = tp / actual_positives if actual_positives else 0.0
print("precision: {:.3f}%".format(precision*100))
print("recall: {:.3f}%".format(recall*100))

precision: 56.557%
recall: 97.183%
