In [1]:
import os

os.environ["DGLBACKEND"] = "pytorch"

import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import dgl
import torch
from dgl.nn import GraphConv
from sklearn.metrics import f1_score, recall_score

import settings as s

In [2]:
data = pd.read_parquet(s.STAGED_DATA_LOCATION)
data.loc[:, "source"] = data["source"].str.slice(0, 8)
data.loc[:, "target"] = data["target"].str.slice(0, 8)

In [3]:
currency_rates = {
    "jpy": np.float32(0.009487665410827868),
    "cny": np.float32(0.14930721887033868),
    "cad": np.float32(0.7579775434031815),
    "sar": np.float32(0.2665884611958837),
    "aud": np.float32(0.7078143121927827),
    "ils": np.float32(0.29612081311363503),
    "chf": np.float32(1.0928961554056371),
    "usd": np.float32(1.0),
    "eur": np.float32(1.171783425225877),
    "rub": np.float32(0.012852809604990688),
    "gbp": np.float32(1.2916554735187644),
    "btc": np.float32(11879.132698717296),
    "inr": np.float32(0.013615817231245796),
    "mxn": np.float32(0.047296753463246695),
    "brl": np.float32(0.1771008654705292),
}

In [4]:
%%time

data.loc[:, "amount_usd"] = data.apply(lambda x: currency_rates[x["source_currency"]] * x["source_amount"], axis=1)
data.loc[:, "timestamp_trend"] = (data.loc[:, "timestamp"].astype(int) / 10**9) - (data.loc[:, "timestamp"].min().timestamp() - 1)
del data["timestamp"]

CPU times: user 18.9 s, sys: 218 ms, total: 19.1 s
Wall time: 19.1 s


In [5]:
def weighted_quantiles(values, weights, quantiles=0.5, interpolate=False):
    i = values.argsort()
    sorted_weights = weights[i]
    sorted_values = values[i]
    sorted_weights_cumsum = sorted_weights.cumsum()

    if interpolate:
        xp = (sorted_weights_cumsum - sorted_weights/2 ) / sorted_weights_cumsum[-1]
        return np.interp(quantiles, xp, sorted_values)
    else:
        return sorted_values[np.searchsorted(sorted_weights_cumsum, quantiles * sorted_weights_cumsum[-1])]


def weighted_std(values, weights):
    average = np.average(values, weights=weights)
    variance = np.average((values-average)**2, weights=weights)
    return np.sqrt(variance)

In [6]:
%%time

timestamp_weighted_std = data.groupby(["source", "target"]).apply(
    lambda x: weighted_std(x["timestamp_trend"], x["amount_usd"]),
    include_groups=False
).reset_index().rename(columns={0 : "timestamp_weighted_std"})

CPU times: user 1min 41s, sys: 3.51 s, total: 1min 44s
Wall time: 1min 41s


In [7]:
%%time

timestamp_weighted_mean = data.groupby(["source", "target"]).apply(
    lambda x: np.average(x["timestamp_trend"], weights=x["amount_usd"]),
    include_groups=False
).reset_index().rename(columns={0 : "timestamp_weighted_mean"})

CPU times: user 40.5 s, sys: 1.27 s, total: 41.8 s
Wall time: 40.7 s


In [8]:
%%time

timestamp_weighted_median = data.groupby(["source", "target"]).apply(
    lambda x: weighted_quantiles(
        x["timestamp_trend"].values, weights=x["amount_usd"].values, quantiles=0.5, interpolate=True
    ),
    include_groups=False
).reset_index().rename(columns={0 : "timestamp_weighted_median"})

CPU times: user 28.1 s, sys: 204 ms, total: 28.3 s
Wall time: 28.2 s


In [9]:
%%time

timestamp_weighted_90th = data.groupby(["source", "target"]).apply(
    lambda x: weighted_quantiles(
        x["timestamp_trend"].values, weights=x["amount_usd"].values, quantiles=0.9, interpolate=True
    ),
    include_groups=False
).reset_index().rename(columns={0 : "timestamp_weighted_90th"})

CPU times: user 27.9 s, sys: 188 ms, total: 28.1 s
Wall time: 28 s


In [10]:
def generate_features(grouped_data, key, source_or_target="source"):
    source, target = key
    row = {
        "source": source, "target": target, 
        "amount_usd": sum(grouped_data["amount_usd"]), 
        "is_laundering": max(grouped_data["is_laundering"])
    }
    turnover_currency = (
        grouped_data.groupby(
            f"{source_or_target}_currency"
        ).agg({f"{source_or_target}_amount": "sum"}).to_dict()[f"{source_or_target}_amount"]
    )
    # if len(turnover_currency) > 1:
    #     if "btc" not in turnover_currency:
    #         raise Exception(turnover_currency)
    row.update(turnover_currency)
    return row

In [11]:
%%time

features = []
for key_, group in data.groupby(["source", "target"]):
    features.append(generate_features(group, key_))
features = pd.DataFrame(features).fillna(0)

CPU times: user 6min 20s, sys: 9.28 s, total: 6min 29s
Wall time: 6min 22s


In [12]:
%%time

index = ["source", "target"]
to_join = [timestamp_weighted_std, timestamp_weighted_mean, timestamp_weighted_median, timestamp_weighted_90th]
features = features.set_index(index)
for join in to_join:
    features = features.join(join.set_index(index))
features = features.reset_index()

CPU times: user 4.68 s, sys: 221 ms, total: 4.9 s
Wall time: 4.9 s


In [13]:
features.loc[:, "id" ] = list(features.index)
left = features.loc[:, ["id", "source", "target", "timestamp_weighted_mean", "amount_usd"]].set_index("target")
right = features.loc[:, ["id", "source", "target", "timestamp_weighted_mean", "amount_usd"]].set_index("source")
edges = left.join(right, how="inner", lsuffix="_left").reset_index(drop=True)
edges = edges[
    ["id_left", "id", "timestamp_weighted_mean", "timestamp_weighted_mean_left", "amount_usd", "amount_usd_left"]
    ].rename(
    columns={"id_left": "source", "id": "target"}
)
edges.loc[:, "recency"] = 1 - abs(edges["timestamp_weighted_mean_left"] - edges["timestamp_weighted_mean"]) / (
    edges["timestamp_weighted_mean_left"] + edges["timestamp_weighted_mean"]
)
edges.loc[:, "flow"] = 1 - abs(edges["amount_usd_left"] - edges["amount_usd"]) / (
    edges["amount_usd_left"] + edges["amount_usd"]
)
features_test = features.sample(frac=0.3).copy(deep=True)
features.loc[:, "train_mask"] = True
features.loc[features_test.index, "train_mask"] = False
del edges["timestamp_weighted_mean_left"]
del edges["timestamp_weighted_mean"]
del edges["amount_usd_left"]
del edges["amount_usd"]

In [14]:
feature_columns = [
    'amount_usd', 'usd', 'eur', 'cny',
    'jpy', 'inr', 'rub', 'gbp', 'cad', 'aud', 'mxn', 'brl', 'chf', 'ils',
    'sar', 'btc', 'timestamp_weighted_std', 'timestamp_weighted_mean',
    'timestamp_weighted_median', 'timestamp_weighted_90th'
]

In [15]:
graph = dgl.graph((edges["source"], edges["target"]), num_nodes=features.shape[0])

# Node attributes
graph.ndata["feat"] = torch.from_numpy(np.array(features[feature_columns].values, dtype=np.float32))
graph.ndata["label"] = torch.from_numpy(np.array(features["is_laundering"].values, np.int64))
graph.ndata["train_mask"] = torch.from_numpy(features["train_mask"].values)
graph.ndata["test_mask"] = torch.from_numpy(~features["train_mask"].values)

# Edge attributes
# These are probably not supported in the model
edges_attributes = ["recency", "flow"]
graph.edata["recency"] = torch.from_numpy(np.array(edges[edges_attributes].values, dtype=np.float32))

In [16]:
class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats, allow_zero_in_degree=True)
        self.conv2 = GraphConv(h_feats, num_classes, allow_zero_in_degree=True)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h
    

def train(g, model):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    # best_val_acc = 0
    # best_test_acc = 0

    features = g.ndata["feat"]
    labels = g.ndata["label"]
    train_mask = g.ndata["train_mask"]
    # val_mask = g.ndata["val_mask"]
    test_mask = g.ndata["test_mask"]
    for e in range(100):
        # Forward
        logits = model(g, features)
        # Compute prediction
        pred = logits.argmax(1)

        # Compute loss
        # Note that you should only compute the losses of the nodes in the training set.
        loss = F.cross_entropy(logits[train_mask], labels[train_mask], weight=torch.tensor([1.0, 100.0]))

        y = pred[test_mask]
        y_true = labels[test_mask]
        # Compute accuracy on training/validation/test
        train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
        # val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
        test_acc = (y == y_true).float().mean()

        # # Save the best validation accuracy and the corresponding test accuracy.
        # if best_val_acc < val_acc:
        #     best_val_acc = val_acc
        #     best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if e % 5 == 0:
            # print(
            #     f"In epoch {e}, loss: {loss:.3f}, val acc: {val_acc:.3f} (best {best_val_acc:.3f}), test acc: {test_acc:.3f} (best {best_test_acc:.3f})"
            # )
            print(
                f"In epoch {e}, loss: {loss:.3f}, test acc: {test_acc:.3f}"
            )
    return y, y_true

In [17]:
hidden_features = 64
model = GCN(graph.ndata["feat"].shape[1], hidden_features, 2)
y, y_true = train(graph, model)
y_true = np.array(y_true.tolist())
y = np.array(y.tolist())

In epoch 0, loss: 1198976.875, test acc: 0.757
In epoch 5, loss: 177803.250, test acc: 0.955
In epoch 10, loss: 319607.156, test acc: 0.639
In epoch 15, loss: 119910.000, test acc: 0.688
In epoch 20, loss: 101918.680, test acc: 0.575
In epoch 25, loss: 87060.523, test acc: 0.899
In epoch 30, loss: 63340.883, test acc: 0.914
In epoch 35, loss: 46380.887, test acc: 0.277
In epoch 40, loss: 70854.039, test acc: 0.604
In epoch 45, loss: 55014.809, test acc: 0.967
In epoch 50, loss: 81124.547, test acc: 0.313
In epoch 55, loss: 101953.422, test acc: 0.869
In epoch 60, loss: 65272.984, test acc: 0.314
In epoch 65, loss: 59597.215, test acc: 0.974
In epoch 70, loss: 41365.840, test acc: 0.749
In epoch 75, loss: 65573.484, test acc: 0.651
In epoch 80, loss: 53056.926, test acc: 0.622
In epoch 85, loss: 40356.359, test acc: 0.903
In epoch 90, loss: 81222.039, test acc: 0.312
In epoch 95, loss: 58085.973, test acc: 0.810


In [18]:
print(round(recall_score(y_true, y), 3), round(f1_score(y_true, y), 3))

0.493 0.011


In [19]:
round(y.sum() / y.shape[0], 4)

0.4266

In [20]:
round(y_true.sum() / y_true.shape[0], 4)

0.005

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 16)
        self.conv2 = GCNConv(16, dataset.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)