### https://arxiv.org/pdf/2402.08593

In [1]:
import json
import random
import os
import pickle
import time
import shutil
import sys
import uuid
from collections import defaultdict
from datetime import timedelta
from glob import glob
from itertools import product
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, recall_score, RocCurveDisplay

from sklearn.preprocessing import normalize

import igraph as ig
import leidenalg as la
import numpy as np
import pandas as pd
import xgboost as xgb

import settings as s

assert s.FILE_SIZE == "Medium"
assert s.HIGH_ILLICIT == False

os.environ["EXT_DATA_TYPE_FOLDER"] = s.OUTPUT_POSTFIX.lstrip("-")

from common import get_weights, delete_large_vars, MULTI_PROC_STAGING_LOCATION
from communities import get_communities_spark
from features import (
    generate_features_spark, generate_features_udf_wrapper, get_edge_features_udf,
    SCHEMA_FEAT_UDF, CURRENCY_RATES
)

%load_ext autoreload
%autoreload 2

In [2]:
start_script = time.time()

In [3]:
# Refuse to run on anything but the exact interpreter version this notebook was tested with.
REQUIRED_PYTHON = (3, 11, 8)

if tuple(sys.version_info[:3]) != REQUIRED_PYTHON:
    raise EnvironmentError(
        "Only runs efficiently on Python 3.11.8 (Tested on: Conda 24.1.2 | Apple M3 Pro)"
    )

In [4]:
# Key/value settings handed to the Spark session created below.
SPARK_CONF = [
    ("spark.driver.memory", "32g"),
    ("spark.worker.memory", "32g"),
    ("spark.driver.maxResultSize", "32g"),
    ("spark.driver.bindAddress", "127.0.0.1"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.network.timeout", "600s"),
    ("spark.sql.autoBroadcastJoinThreshold", -1)
]

# Start from a clean "artifacts" folder; a missing folder is not an error.
shutil.rmtree("artifacts", ignore_errors=True)

spark_conf = SparkConf().setAll(SPARK_CONF)
spark = SparkSession.builder.appName("testing").config(conf=spark_conf).getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/01 20:26:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Fractions of the (time-ordered) transactions assigned to each split.
TRAIN_PERC = 0.6
VALIDATION_PERC = 0.2
TEST_PERC = 0.2

# Number of heaviest neighbours kept per node when building node neighbourhoods.
KEEP_TOP_N = 100

# Compare with a tolerance: fractions such as 0.7 + 0.2 + 0.1 do not sum to
# exactly 1.0 in binary floating point, which would trip an `== 1` check.
assert np.isclose(TRAIN_PERC + VALIDATION_PERC + TEST_PERC, 1.0), (
    "TRAIN_PERC + VALIDATION_PERC + TEST_PERC must add up to 1"
)

location_main = os.path.join("features", os.environ["EXT_DATA_TYPE_FOLDER"])
# shutil.rmtree(location_main, ignore_errors=True)


def _parquet_location(stem):
    """Return the path of `<stem>.parquet` directly under `location_main`."""
    return f"{location_main}{os.sep}{stem}.parquet"


# Communities
location_communities_leiden = _parquet_location("communities_leiden")

# Community / neighbourhood level features
location_features_leiden = _parquet_location("features_leiden")
location_features_ego = _parquet_location("features_ego")
location_features_2_hop = _parquet_location("features_2_hop")
location_features_2_hop_out = _parquet_location("features_2_hop_out")
location_features_2_hop_in = _parquet_location("features_2_hop_in")
location_features_2_hop_combined = _parquet_location("features_2_hop_combined")
location_features_source = _parquet_location("features_source")
location_features_target = _parquet_location("features_target")

# Temporal flow features (the file names themselves carry the "location_" prefix)
location_flow_dispense = _parquet_location("location_flow_dispense")
location_flow_passthrough = _parquet_location("location_flow_passthrough")
location_flow_sink = _parquet_location("location_flow_sink")

# Multi-hop flow features
location_comm_as_source_features = _parquet_location("comm_as_source_features")
location_comm_as_target_features = _parquet_location("comm_as_target_features")
location_comm_as_passthrough_features = _parquet_location("comm_as_passthrough_features")
location_comm_as_passthrough_features_reverse = _parquet_location("comm_as_passthrough_features_reverse")

# Combined node-level and edge-level features
location_features_node_level = _parquet_location("features_node_level")
location_features_edges = _parquet_location("features_edges")

# Edge features per split
location_features_edges_train = _parquet_location("features_edges_train")
location_features_edges_valid = _parquet_location("features_edges_valid")
location_features_edges_test = _parquet_location("features_edges_test")

# Transaction-level features per split
location_train_trx_features = _parquet_location("train_trx_features")
location_valid_trx_features = _parquet_location("valid_trx_features")
location_test_trx_features = _parquet_location("test_trx_features")

# Final model inputs per split
location_train_features = _parquet_location("train_features")
location_valid_features = _parquet_location("valid_features")
location_test_features = _parquet_location("test_features")

# Create the output folder if needed. Unlike a blanket `except FileExistsError`,
# this still fails loudly when the path exists but is a regular file.
os.makedirs(location_main, exist_ok=True)

In [6]:
# Load the staged transactions and make the label a proper boolean column.
data = (
    spark.read.parquet(s.STAGED_DATA_LOCATION)
    .withColumn("is_laundering", sf.col("is_laundering").cast("boolean"))
)
data_count_original = data.count()

In [7]:
%%time

# Chronological train / validation / test split.
# Ties on `timestamp` are broken by `transaction_id`; sorting on the timestamp
# alone leaves tied rows in an arbitrary order, so the split boundaries could
# move between runs.
trx_ids_sorted = (
    data.sort("timestamp", "transaction_id")
    .select("transaction_id")
    .toPandas()["transaction_id"]
    .values
)
trx_count = len(trx_ids_sorted)

last_train_index = int(np.floor(trx_count * TRAIN_PERC))
last_validation_index = last_train_index + int(np.floor(trx_count * VALIDATION_PERC))
train_indexes = trx_ids_sorted[:last_train_index]
validation_indexes = trx_ids_sorted[last_train_index:last_validation_index]
test_indexes = trx_ids_sorted[last_validation_index:]

# The id lists are round-tripped through parquet so that Spark reads them as
# files instead of receiving them from the driver.
train_indexes_loc = os.path.join(location_main, "temp_train_indexes.parquet")
validation_indexes_loc = os.path.join(location_main, "temp_validation_indexes.parquet")
test_indexes_loc = os.path.join(location_main, "temp_test_indexes.parquet")

pd.DataFrame(train_indexes, columns=["transaction_id"]).to_parquet(train_indexes_loc)
pd.DataFrame(validation_indexes, columns=["transaction_id"]).to_parquet(validation_indexes_loc)
pd.DataFrame(test_indexes, columns=["transaction_id"]).to_parquet(test_indexes_loc)

train_indexes = spark.read.parquet(train_indexes_loc)
validation_indexes = spark.read.parquet(validation_indexes_loc)
test_indexes = spark.read.parquet(test_indexes_loc)

train = train_indexes.join(
    data, on="transaction_id", how="left"
).persist(StorageLevel.DISK_ONLY)
validation = validation_indexes.join(
    data, on="transaction_id", how="left"
).persist(StorageLevel.DISK_ONLY)
test = test_indexes.join(
    data, on="transaction_id", how="left"
).persist(StorageLevel.DISK_ONLY)
train_count, validation_count, test_count = train.count(), validation.count(), test.count()
print()
print(trx_count, train_count, validation_count, test_count)
print()

# Write the splits while the temporary index files still exist: the frames are
# lazy, so any partition that has to be recomputed re-reads those files.
train.write.parquet("train-temp", mode="overwrite")
validation.write.parquet("validation-temp", mode="overwrite")
test.write.parquet("test-temp", mode="overwrite")

# Release the DISK_ONLY copies; the parquet output is the durable version.
for persisted_split in (train, validation, test):
    persisted_split.unpersist()

os.remove(train_indexes_loc)
os.remove(validation_indexes_loc)
os.remove(test_indexes_loc)

# Re-point the names at the written parquet so they never refer to frames whose
# inputs were just deleted.
train = spark.read.parquet("train-temp")
validation = spark.read.parquet("validation-temp")
test = spark.read.parquet("test-temp")

                                                                                


31223525 18734115 6244705 6244705





CPU times: user 388 ms, sys: 181 ms, total: 569 ms
Wall time: 43.9 s


                                                                                

In [8]:
# Read the three splits back from the parquet folders written by the split cell.
train, validation, test = (
    spark.read.parquet(split_folder)
    for split_folder in ("train-temp", "validation-temp", "test-temp")
)
test_count = test.count()

In [8]:
%%time

# One row per directed (source, target) pair with the summed amount.
edges = (
    data.groupby(["source", "target"])
    .agg(sf.sum("amount").alias("amount"))
    .toPandas()
)
weights = get_weights(edges)

# Attach the weights returned by `get_weights` to every edge.
edge_key = ["source", "target"]
edges_agg = (
    edges.set_index(edge_key)
    .join(weights.set_index(edge_key), how="left")
    .reset_index()
)

# Amount scaled by the edge weight relative to the largest weight.
edges_agg["amount_weighted"] = edges_agg["amount"] * (
    edges_agg["weight"] / edges_agg["weight"].max()
)

                                                                                

CPU times: user 34.2 s, sys: 920 ms, total: 35.1 s
Wall time: 40.7 s


In [9]:
# Later on, we will reset the variables (to free up memory), while still keeping these intact
# `%who_ls` returns the names currently defined in the interactive namespace;
# the cleanup cell before model training resets every name NOT in this snapshot.
to_keep = %who_ls
to_keep = list(to_keep)

In [10]:
%%time

# TOP_N is also used by the temporal-flows cell further down (top edges kept per node).
# NOTE(review): TOP_N, NUM_HOPS, data_input and the nodes_* sets are presumably
# read by generate_flow_features.ipynb — confirm against that notebook.
TOP_N = 50
NUM_HOPS = 5

data_input = spark.createDataFrame(edges_agg)
nodes_source = set(edges_agg["source"].unique())
nodes_target = set(edges_agg["target"].unique())
# Nodes that appear both as a source and as a target of some edge.
nodes_passthrough = nodes_source.intersection(nodes_target)

# Executes the other notebook inside this namespace; it is expected to define
# the four `comm_as_*` frames that are written out below.
%run generate_flow_features.ipynb

comm_as_source_features.to_parquet(location_comm_as_source_features)
comm_as_target_features.to_parquet(location_comm_as_target_features)
comm_as_passthrough_features.to_parquet(location_comm_as_passthrough_features)
comm_as_passthrough_features_reverse.to_parquet(location_comm_as_passthrough_features_reverse)

# The frames are on disk now; drop the in-memory copies.
del comm_as_source_features
del comm_as_target_features
del comm_as_passthrough_features
del comm_as_passthrough_features_reverse

                                                                                


Processing comm_as_source

Processed hop #1 | 2,842,770 | 1,333,420
Processed hop #2 | 4,358,267 | 847,080
Processed hop #3 | 6,468,686 | 661,588
Processed hop #4 | 8,492,595 | 593,472
Processed hop #5 | 10,225,029 | 556,876

Processing comm_as_target

Processed hop #1 | 2,967,961 | 1,169,331
Processed hop #2 | 11,502,196 | 963,178
Processed hop #3 | 15,539,582 | 919,379
Processed hop #4 | 20,255,707 | 894,218
Processed hop #5 | 23,799,812 | 882,995

Processing comm_as_passthrough

Processed hop #1 | 2,405,456 | 949,800
Processed hop #2 | 3,599,750 | 608,779
Processed hop #3 | 5,394,064 | 508,228
Processed hop #4 | 6,981,617 | 456,019
Processed hop #5 | 8,250,246 | 427,327

Processing comm_as_passthrough_reverse

Processed hop #1 | 2,811,504 | 1,108,680
Processed hop #2 | 10,885,590 | 912,988
Processed hop #3 | 14,752,721 | 870,883
Processed hop #4 | 19,343,674 | 846,994
Processed hop #5 | 22,862,561 | 836,186


comm_as_source_features

CPU times: user 1min 52s, sys: 886 ms, total: 1m

In [11]:
%%time

print("Constructing Leiden communities")

# Fixed seed: the Leiden optimiser is randomised, so without it every run
# yields a different partition (and therefore different downstream features).
LEIDEN_SEED = 42

graph = ig.Graph.DataFrame(edges_agg.loc[:, ["source", "target", "amount_weighted"]], use_vids=False, directed=True)
# igraph works on integer vertex ids; map them back to the original node names.
nodes_mapping = {x.index: x["name"] for x in graph.vs()}
communities_leiden = la.find_partition(
    graph, la.ModularityVertexPartition, n_iterations=100, weights="amount_weighted",
    seed=LEIDEN_SEED,
)
communities_leiden = [[nodes_mapping[_] for _ in x] for x in communities_leiden]
# Each community becomes (random uuid, set of member nodes); the ids themselves
# are still regenerated on every run.
communities_leiden = [(str(uuid.uuid4()), set(x)) for x in communities_leiden]
sizes_leiden = [len(x[1]) for x in communities_leiden]

# NOTE: despite the ".parquet" extension in the location, this file is a pickle.
with open(location_communities_leiden, "wb") as fl:
    pickle.dump(communities_leiden, fl)

Constructing Leiden communities
CPU times: user 51min 5s, sys: 28.6 s, total: 51min 34s
Wall time: 51min 29s


In [12]:
# with open(location_communities_leiden, "rb") as fl:
#     communities_leiden = pickle.load(fl)

In [13]:
%%time

data_agg_weights = get_weights(
    data.groupby(["source", "target"])
    .agg(
        sf.sum("amount").alias("amount")
    ).toPandas()
)

# Make the weighted edge list undirected: add every edge reversed and sum the
# weights of identical (source, target) pairs.
data_agg_weights_rev = data_agg_weights.rename(
    columns={"target": "source", "source": "target"}
).loc[:, ["source", "target", "weight"]]
data_agg_weights_ud = pd.concat([data_agg_weights, data_agg_weights_rev], ignore_index=True)
data_agg_weights_ud = data_agg_weights_ud.groupby(["source", "target"]).agg(weight=("weight", "sum")).reset_index()

# Keep only the KEEP_TOP_N heaviest neighbours of every node.
data_agg_weights_ud.sort_values("weight", ascending=False, inplace=True)
grouped_ud = data_agg_weights_ud.groupby("source").head(KEEP_TOP_N).reset_index(drop=True)
grouped_ud = grouped_ud.groupby("source").agg(targets=("target", set))

# A plain dict makes the per-neighbour lookup a hash access instead of a pandas
# `.loc` call, which dominates the runtime of this loop over millions of nodes.
targets_by_source = grouped_ud["targets"].to_dict()

total = len(targets_by_source)
nodes_neighborhoods = {}
for index, (source, targets) in enumerate(targets_by_source.items()):
    # The node itself, its kept neighbours and the kept neighbours of those.
    community_candidates = {source}
    for target in targets:
        community_candidates |= (targets_by_source[target] | {target})
    nodes_neighborhoods[source] = community_candidates
    if not (index % 250_000):
        print(index, total)

del data_agg_weights_rev
del data_agg_weights_ud
del grouped_ud
del targets_by_source

                                                                                

0 2070952
250000 2070952
500000 2070952
750000 2070952
1000000 2070952
1250000 2070952
1500000 2070952
1750000 2070952
2000000 2070952
CPU times: user 1min 46s, sys: 1.41 s, total: 1min 47s
Wall time: 1min 52s


In [14]:
%%time

print("Constructing 2-hop communities")

# The graph is rebuilt here from the same weighted edge list used for the
# Leiden step, so this cell does not depend on the `graph` variable of that cell.
# NOTE(review): the trailing positional arguments (os.cpu_count(), spark, 2,
# "all", 0.01, "amount_weighted") are defined by `get_communities_spark` in
# communities.py; their meaning is not visible here — confirm there.
communities_2_hop = get_communities_spark(
    nodes_neighborhoods,
    ig.Graph.DataFrame(edges_agg.loc[:, ["source", "target", "amount_weighted"]], use_vids=False, directed=True), 
    os.cpu_count(), spark, 2, "all", 0.01, "amount_weighted"
)
# Size of every community (second element of each returned item).
sizes_2_hop = [len(x[1]) for x in communities_2_hop]

Constructing 2-hop communities


                                                                                

CPU times: user 24.8 s, sys: 1.5 s, total: 26.3 s
Wall time: 5min 48s


In [15]:
%%time

# Reference point one minute before the first transaction; timestamps are
# stored relative to it.
ts_min = data.select(sf.min("timestamp").alias("x")).collect()[0]["x"] - timedelta(minutes=1)
data_graph_agg = data.groupby(["source", "target", "source_bank", "target_bank", "source_currency"]).agg(
    sf.count("source").alias("num_transactions"),
    sf.sum("amount").alias("amount"),
    sf.sum("source_amount").alias("source_amount"),
    sf.collect_list(sf.array((sf.col("timestamp") - ts_min).cast("long"), sf.col("amount"))).alias("timestamps_amounts"),
)
data_graph_agg_sdf = data_graph_agg.persist(StorageLevel.DISK_ONLY)
print(data_graph_agg_sdf.count())
data_graph_agg = data_graph_agg_sdf.toPandas()
# The pandas copy is what the rest of the notebook uses; release the cached
# Spark blocks instead of keeping them for the whole session.
data_graph_agg_sdf.unpersist()

# Attach the per-edge weight computed earlier (edges_agg) to every aggregated row.
# (A stray statement that built this lookup frame and discarded the result was removed.)
index = ["source", "target"]
data_graph_agg = data_graph_agg.set_index(index).join(
    edges_agg.loc[:, index + ["weight"]].set_index(index), how="left"
).reset_index()
data_graph_agg.loc[:, "amount_weighted"] = (
    data_graph_agg.loc[:, "amount"] * 
    (data_graph_agg.loc[:, "weight"] / data_graph_agg.loc[:, "weight"].max())
)

                                                                                

4476472


                                                                                

CPU times: user 37.2 s, sys: 3.48 s, total: 40.7 s
Wall time: 1min 23s


In [16]:
graph = ig.Graph.DataFrame(data_graph_agg, use_vids=False, directed=True)

In [17]:
%%time

print("Leiden communitites features creation")

features_leiden = generate_features_spark(communities_leiden, graph, spark)

# Features are computed per community id; swap the id for the set of member
# nodes and fan the row out so that every member node gets its own row.
communities_leiden_dict = dict(communities_leiden)
features_leiden = (
    features_leiden.rename(columns={"key": "key_fake"})
    .assign(
        key=lambda frame: frame["key_fake"].map(
            lambda community_id: communities_leiden_dict[community_id]
        )
    )
    .explode("key")
    .drop(columns="key_fake")
)
features_leiden.set_index("key").to_parquet(location_features_leiden)

Leiden communitites features creation


                                                                                

CPU times: user 4min 20s, sys: 23.7 s, total: 4min 44s
Wall time: 11min 52s


In [18]:
%%time

print("2-hop communitites features creation")

features_2_hop = generate_features_spark(communities_2_hop, graph, spark)
features_2_hop.set_index("key").to_parquet(location_features_2_hop)

2-hop communitites features creation


                                                                                

CPU times: user 23min 49s, sys: 16min 6s, total: 39min 56s
Wall time: 1h 10min 43s


In [19]:
del graph

In [20]:
%%time

print("Temporal flows features creation")

# NOTE: despite the alias, "amount" here is the NUMBER of transactions per edge;
# it is only used to rank edges.
edges_totals = data.select("source", "target", "amount").groupby(
    ["source", "target"]
).agg(sf.count("amount").alias("amount")).toPandas()
edges_totals = edges_totals.sort_values("amount", ascending=False).reset_index(drop=True)
# Busiest TOP_N incoming edges per target (left side) and outgoing edges per source (right side).
left_edges = spark.createDataFrame(edges_totals.groupby("target").head(TOP_N).loc[:, ["source", "target"]])
right_edges = spark.createDataFrame(edges_totals.groupby("source").head(TOP_N).loc[:, ["source", "target"]])

columns = ["source", "target", "timestamp", "amount"]

left = left_edges.select(sf.col("source").alias("src"), sf.col("target").alias("tgt")).join(
    data.select(*columns),
    on=(sf.col("src") == sf.col("source")) & (sf.col("tgt") == sf.col("target")),
    how="left"
).drop("src", "tgt")
select = [sf.col(column).alias(f"left_{column}") for column in left.columns]
# Persist AFTER the rename so that `left` is the cached frame itself. Persisting
# first and then rebinding `left` to a derived frame made the final
# `left.unpersist()` a no-op and leaked the cached data.
left = left.select(*select).persist(StorageLevel.DISK_ONLY)
right = right_edges.select(sf.col("source").alias("src"), sf.col("target").alias("tgt")).join(
    data.select(*columns),
    on=(sf.col("src") == sf.col("source")) & (sf.col("tgt") == sf.col("target")),
    how="left"
).drop("src", "tgt").persist(StorageLevel.DISK_ONLY)

print(left.count(), right.count())

# Chain an incoming transaction with every later-or-simultaneous outgoing
# transaction of the same node: dispense -> passthrough -> sink.
flows_temporal = left.join(
    right,
    (left["left_target"] == right["source"]) &
    (left["left_timestamp"] <= right["timestamp"]),
    how="inner"
).groupby(["left_source", "left_target", "source", "target"]).agg(
    sf.sum("left_amount").alias("left_amount"),
    sf.sum("amount").alias("amount"),
).drop("left_target").select(
    sf.col("left_source").alias("dispense"),
    sf.col("source").alias("passthrough"),
    sf.col("target").alias("sink"),
    sf.least("left_amount", "amount").alias("amount"),
)

aggregate = [
    sf.sum("amount").alias("amount_sum"),
    sf.mean("amount").alias("amount_mean"),
    sf.median("amount").alias("amount_median"),
    sf.max("amount").alias("amount_max"),
    sf.stddev("amount").alias("amount_std"),
    sf.countDistinct("dispense").alias("dispense_count"),
    sf.countDistinct("passthrough").alias("passthrough_count"),
    sf.countDistinct("sink").alias("sink_count"),
]
for flow_location, flow_type in [
    (location_flow_dispense, "dispense"), (location_flow_passthrough, "passthrough"), (location_flow_sink, "sink")
]:
    print(flow_type)
    flows_temporal_stats = flows_temporal.groupby(flow_type).agg(*aggregate).toPandas()
    # Same statistics restricted to flows that return to where they started.
    flows_temporal_cyclic_stats = flows_temporal.where(
        (sf.col("dispense") == sf.col("sink"))
    ).groupby(flow_type).agg(*aggregate).toPandas()
    flows_temporal_stats = flows_temporal_stats.set_index(flow_type).join(
        flows_temporal_cyclic_stats.set_index(flow_type),
        how="left", rsuffix="_cycle"
    )
    flows_temporal_stats.index.name = "key"
    flows_temporal_stats.to_parquet(flow_location)
    del flows_temporal_stats
    del flows_temporal_cyclic_stats

left.unpersist()
right.unpersist()

del edges_totals
del left_edges
del right_edges

Temporal flows features creation


                                                                                

31827307 28952152
dispense


                                                                                

passthrough


                                                                                

sink


                                                                                

CPU times: user 14.2 s, sys: 2.58 s, total: 16.7 s
Wall time: 9min 12s


In [21]:
%%time

print("1-hop-source features creation")

# Group the aggregated edges by their source node and compute the features per group.
features_source = (
    spark.createDataFrame(data_graph_agg)
    .withColumn("key", sf.col("source"))
    .repartition(os.cpu_count() * 5, "key")
    .groupby("key")
    .applyInPandas(generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF)
    .toPandas()
)
# Every row carries its features as a JSON document; expand them into columns.
features_source = pd.DataFrame(
    [json.loads(payload) for payload in features_source["features"]]
)
features_source.columns = [
    x if x == "key" else f"{s.G_1HOP_PREFIX}{x}" for x in features_source.columns
]
features_source.set_index("key").to_parquet(location_features_source)

1-hop-source features creation


25/07/31 03:20:02 WARN TaskSetManager: Stage 285 contains a task of very large size (2896 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1min 43s, sys: 12.7 s, total: 1min 56s
Wall time: 25min 3s


In [22]:
%%time

print("1-hop-target features creation")

# Group the aggregated edges by their target node and compute the features per group.
features_target = (
    spark.createDataFrame(data_graph_agg)
    .withColumn("key", sf.col("target"))
    .repartition(os.cpu_count() * 5, "key")
    .groupby("key")
    .applyInPandas(generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF)
    .toPandas()
)
# Every row carries its features as a JSON document; expand them into columns.
features_target = pd.DataFrame(
    [json.loads(payload) for payload in features_target["features"]]
)
features_target.columns = [
    x if x == "key" else f"{s.G_1HOP_PREFIX}{x}" for x in features_target.columns
]
features_target.set_index("key").to_parquet(location_features_target)

1-hop-target features creation


25/07/31 03:44:58 WARN TaskSetManager: Stage 288 contains a task of very large size (2896 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1min 41s, sys: 8.72 s, total: 1min 50s
Wall time: 20min 27s


In [23]:
del data_graph_agg

In [24]:
# (label, parquet location) for every node-level feature group that goes into
# the combined feature table. The label is only used as a column suffix when a
# group's column names clash with columns that were joined before it.
ENABLED_FEATURES = [
    ("leiden", location_features_leiden),
    ("2_hop", location_features_2_hop),
    ("as_source", location_features_source),
    ("as_target", location_features_target),
    ("comm_as_source_features", location_comm_as_source_features),
    ("comm_as_target_features", location_comm_as_target_features),
    ("comm_as_passthrough_features", location_comm_as_passthrough_features),
    ("comm_as_passthrough_features_reverse", location_comm_as_passthrough_features_reverse),
    ("flow_dispense", location_flow_dispense),
    ("flow_passthrough", location_flow_passthrough),
    ("flow_sink", location_flow_sink),
]

In [25]:
# Outer-join every enabled feature group on the node key, one group at a time.
all_features = pd.DataFrame()
all_features.index.name = "key"

for feature_group, location in ENABLED_FEATURES:
    group_features = pd.read_parquet(location)
    all_features = all_features.join(
        group_features, how="outer", rsuffix=f"_{feature_group}"
    )
    del group_features

all_features.to_parquet(location_features_node_level)
print("Features:", all_features.shape)
del all_features

Features: (2070952, 344)


In [8]:
all_features = pd.read_parquet(location_features_node_level)

In [9]:
all_features.shape

(2026126, 344)

In [10]:
# Columns with at most one distinct non-null value carry no information.
constants = [
    column
    for column in all_features.columns
    if all_features[column].nunique(dropna=True) <= 1
]
all_features.drop(columns=constants, inplace=True)
print(f"Deleted {len(constants)} constant columns")

Deleted 14 constant columns


In [11]:
# Per-column median that ignores NaNs; used later to fill missing values.
medians = {
    column: np.nanmedian(all_features[column])
    for column in all_features.columns
}

In [34]:
delta = round(time.time() - start_script)
print(f"Script executed in {timedelta(seconds=delta)}")

Script executed in 0:29:29


In [12]:
%%time

print("Training the anomaly detection model")

# Fixed seed: an unseeded IsolationForest gives different scores on every run.
ANOMALY_SEED = 42

# Fill missing values once and reuse the result for both fitting and scoring
# (it was previously rebuilt for each of the two calls).
features_filled = all_features.fillna(medians)

# Empty-column frame that only carries the node index.
anomalies = all_features.loc[:, []]
model_ad = IsolationForest(n_estimators=10_000, random_state=ANOMALY_SEED)
# decision_function is lower for more abnormal rows; negate so higher = more anomalous.
anomalies.loc[:, "anomaly_score"] = -model_ad.fit(
    features_filled
).decision_function(features_filled)
# Shift to strictly positive values, then scale so the maximum is 1.
anomalies.loc[:, "anomaly_score"] += abs(anomalies.loc[:, "anomaly_score"].min()) + 1e-10
anomalies.loc[:, "anomaly_score"] /= anomalies.loc[:, "anomaly_score"].max()

del features_filled

Training the anomaly detection model
CPU times: user 39.5 s, sys: 10.2 s, total: 49.7 s
Wall time: 53 s


In [32]:
n_components = 50
# Fixed seed: with this many rows PCA may pick its randomized solver, which
# makes the components differ from run to run unless seeded.
PCA_SEED = 42
pca = PCA(n_components=n_components, random_state=PCA_SEED)
all_features_dim_reduced = pd.DataFrame(
    # Rows are L1-normalised (after median imputation) before the projection.
    pca.fit_transform(normalize(all_features.fillna(medians), norm="l1", axis=1)),
    index=all_features.index
)
explained_variance_ratio = round(sum(pca.explained_variance_ratio_) * 100, 2)
# Guard: the kept components must explain more than 99% of the variance.
assert explained_variance_ratio > 99
print(n_components, explained_variance_ratio)
all_features_dim_reduced.columns = [
    f"pca_{x + 1}" for x in all_features_dim_reduced.columns
]
all_features_dim_reduced = all_features_dim_reduced.astype(np.float32)

50 99.99


In [33]:
%%time

print("Generating edge features")

to_select = ["source", "target", "format", "source_currency", "source_amount", "amount", "timestamp"]

# One row per (source, target, format, source_currency) with totals and the
# first / last transaction time as unix timestamps.
edges_features_input = data.select(to_select).groupby(
    ["source", "target", "format", "source_currency"]
).agg(
    sf.sum("source_amount").alias("source_amount"), 
    sf.sum("amount").alias("amount"),
    sf.unix_timestamp(sf.min("timestamp")).alias("min_ts"),
    sf.unix_timestamp(sf.max("timestamp")).alias("max_ts"),
).repartition(os.cpu_count() * 2, "source", "target").persist(StorageLevel.DISK_ONLY)
# Force materialisation of the persisted frame before the grouped UDF runs.
_ = edges_features_input.count()

edge_features = edges_features_input.groupby(["source", "target"]).applyInPandas(
    get_edge_features_udf, schema=SCHEMA_FEAT_UDF
).toPandas()
# The cached input is not needed once the result is on the driver; release it
# instead of leaving it persisted for the rest of the session.
edges_features_input.unpersist()
# Every row carries its features as a JSON document; expand them into columns.
edge_features = pd.DataFrame(edge_features["features"].apply(json.loads).tolist())

edge_features.to_parquet(location_features_edges)
del edge_features

Generating edge features


                                                                                

CPU times: user 15 s, sys: 2.83 s, total: 17.9 s
Wall time: 12min 9s


In [13]:
edge_features = pd.read_parquet(location_features_edges)

In [14]:
%%time

# Distinct (source, target) pairs present in each split.
train_edges = train.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)
valid_edges = validation.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)
test_edges = test.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)

# Index the edge features once and reuse the result for all three joins
# (re-indexing the full frame per split repeated the same work three times).
edge_features_by_pair = edge_features.set_index(["source", "target"])

train_features = train_edges.join(edge_features_by_pair, how="left").reset_index()
validation_features = valid_edges.join(edge_features_by_pair, how="left").reset_index()
test_features = test_edges.join(edge_features_by_pair, how="left").reset_index()

del edge_features_by_pair

                                                                                

CPU times: user 21.8 s, sys: 1.75 s, total: 23.5 s
Wall time: 1min 13s


In [14]:
all_features_spark = spark.createDataFrame(all_features.reset_index())
# Prefix every column with "node_" in one projection. Renaming column by column
# stacks one plan node per column (hundreds here), which bloats the query plan.
all_features_spark = all_features_spark.toDF(
    *[f"node_{col}" for col in all_features_spark.columns]
)

In [15]:
def save_edge_features(features_in, location):
    """Enrich per-edge features with node-level information and write them as parquet.

    Reads the notebook globals `anomalies` (pandas frame with an
    `anomaly_score` column, indexed by node), `all_features_spark` (Spark frame
    of node features whose key column is `node_key`) and `spark`.

    Parameters
    ----------
    features_in : pandas.DataFrame
        One row per edge; must contain `source` and `target` columns. Every
        column is prefixed with `edge_` before joining.
    location : str
        Output parquet location; existing output is overwritten.

    The written frame holds `source`, `target`, the `edge_*` columns, the
    source node's `node_*` columns, the target node's `node_*_target` columns,
    both anomaly scores and their diff / min / max / mean.
    """
    features_in = features_in.rename(
        columns={x: f"edge_{x}" for x in features_in.columns}
    )
    # First join adds the target's score as "anomaly_score"; the second adds the
    # source's score, which clashes and therefore becomes "anomaly_score_source".
    features_in = features_in.set_index("edge_target").join(
        anomalies, how="left"
    ).reset_index().set_index("edge_source").join(
        anomalies, how="left", rsuffix="_source"
    ).reset_index()

    features_in = spark.createDataFrame(features_in)
    features_in = features_in.withColumnRenamed("anomaly_score", "anomaly_score_target")

    # Node features of the edge's source.
    features_in = features_in.join(
        all_features_spark,
        features_in["edge_source"] == all_features_spark["node_key"],
        how="left"
    )

    # Node features of the edge's target: same frame with a "_target" suffix on
    # every column, produced by a single projection instead of one rename per column.
    all_features_spark_target = all_features_spark.toDF(
        *[f"{col}_target" for col in all_features_spark.columns]
    )

    features_in = features_in.join(
        all_features_spark_target,
        features_in["edge_target"] == all_features_spark_target["node_key_target"],
        how="left"
    )

    features_in = features_in.withColumnRenamed("edge_source", "source")
    features_in = features_in.withColumnRenamed("edge_target", "target").drop("node_key", "node_key_target")

    # Pairwise combinations of the two endpoint anomaly scores.
    features_in = features_in.withColumn(
        "anom_scores_diff", sf.col("anomaly_score_source") - sf.col("anomaly_score_target")
    )
    features_in = features_in.withColumn(
        "anom_scores_min", sf.least(sf.col("anomaly_score_source"), sf.col("anomaly_score_target"))
    )
    features_in = features_in.withColumn(
        "anom_scores_max", sf.greatest(sf.col("anomaly_score_source"), sf.col("anomaly_score_target"))
    )
    features_in = features_in.withColumn(
        "anom_scores_mean", (sf.col("anomaly_score_source") + sf.col("anomaly_score_target")) / 2
    )
    features_in.write.parquet(location, mode="overwrite")

In [21]:
%%time

save_edge_features(train_features, location_features_edges_train)

25/07/31 19:42:58 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/07/31 19:42:58 WARN TaskSetManager: Stage 68 contains a task of very large size (27383 KiB). The maximum recommended task size is 1000 KiB.
25/07/31 19:43:17 WARN TaskSetManager: Stage 69 contains a task of very large size (2303 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 6.71 s, sys: 1.51 s, total: 8.22 s
Wall time: 2min 55s


In [22]:
%%time

save_edge_features(validation_features, location_features_edges_valid)

25/07/31 19:45:49 WARN TaskSetManager: Stage 77 contains a task of very large size (27383 KiB). The maximum recommended task size is 1000 KiB.
25/07/31 19:46:10 WARN TaskSetManager: Stage 78 contains a task of very large size (2303 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 2.5 s, sys: 748 ms, total: 3.24 s
Wall time: 1min 32s


In [23]:
%%time

save_edge_features(test_features, location_features_edges_test)

25/07/31 19:47:23 WARN TaskSetManager: Stage 86 contains a task of very large size (27383 KiB). The maximum recommended task size is 1000 KiB.
25/07/31 19:47:42 WARN TaskSetManager: Stage 87 contains a task of very large size (2303 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 3.04 s, sys: 884 ms, total: 3.93 s
Wall time: 1min 36s


In [15]:
def save_trx_features(data_in, location):
    """Aggregate transactions per (source, target, currencies, format) and save them.

    For every group the total amount, the number of transactions and whether any
    transaction is flagged as laundering are kept, plus an `inter_currency`
    flag that is true when source and target currency differ. The result is
    written to `location` as parquet.
    """
    group_keys = ["source", "target", "source_currency", "target_currency", "format"]
    aggregations = [
        sf.sum("amount").alias("amount"),
        sf.count("amount").alias("trx_count"),
        sf.max("is_laundering").alias("is_laundering"),
    ]

    trx_features = data_in.groupby(group_keys).agg(*aggregations).toPandas()
    trx_features["inter_currency"] = trx_features["source_currency"].ne(
        trx_features["target_currency"]
    )
    trx_features.to_parquet(location)

In [16]:
%%time

save_trx_features(train, location_train_trx_features)
save_trx_features(validation, location_valid_trx_features)
save_trx_features(test, location_test_trx_features)

                                                                                

CPU times: user 5.36 s, sys: 722 ms, total: 6.08 s
Wall time: 59.1 s


In [42]:
# To free up memory for training

# Everything defined since the `to_keep` snapshot was taken gets reset here;
# names listed in `to_keep` (and `to_keep` itself) survive.
to_reset = %who_ls
to_reset = list(to_reset)
to_reset.remove("to_keep")
to_reset = set(to_reset) - set(to_keep)
for var_to_reset in list(to_reset):
    # %reset_selective takes a regex; anchor it so only the exact name matches.
    var_to_reset = f"^{var_to_reset}$"
    %reset_selective -f {var_to_reset}

# Project helper from common.py; judging by the recorded output it deletes
# remaining large DataFrames — see that module for the exact rules.
delete_large_vars(globals(), locals())

Deleted `global` DataFrame: edges
Deleted `global` DataFrame: weights
Deleted `global` DataFrame: edges_agg


True

In [39]:
def combine_features(location_features_trx, location_features_edges, location_features, is_test_ds=False):
    """Left-join the edge-level features onto the transaction-level features and save the result.

    Both inputs are parquet locations read with the global `spark` session. Rows
    are matched on (source, target). The endpoints are dropped from the output
    unless `is_test_ds` is true, in which case `source` and `target` are kept.
    The combined frame is written to `location_features`, overwriting any
    existing output.
    """
    edge_level = spark.read.parquet(location_features_edges)
    trx_level = (
        spark.read.parquet(location_features_trx)
        .withColumnRenamed("source", "source_trx")
        .withColumnRenamed("target", "target_trx")
    )

    same_edge = (
        (trx_level["source_trx"] == edge_level["source"])
        & (trx_level["target_trx"] == edge_level["target"])
    )

    if is_test_ds:
        to_drop = ["source_trx", "target_trx"]
    else:
        to_drop = ["source_trx", "target_trx", "source", "target"]

    combined = trx_level.join(edge_level, same_edge, how="left").drop(*to_drop)
    combined.write.parquet(location_features, mode="overwrite")

In [40]:
%%time

combine_features(location_train_trx_features, location_features_edges_train, location_train_features)
combine_features(location_valid_trx_features, location_features_edges_valid, location_valid_features)
combine_features(
    location_test_trx_features, location_features_edges_test, location_test_features,
    is_test_ds=True
)

25/07/31 22:41:55 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

CPU times: user 49.2 ms, sys: 91.2 ms, total: 140 ms
Wall time: 1min 38s


                                                                                

In [27]:
shutil.rmtree(MULTI_PROC_STAGING_LOCATION, ignore_errors=True)

In [9]:
category_features = ["source_currency", "target_currency", "format"]
train_features = pd.read_parquet(location_train_features)
validation_features = pd.read_parquet(location_valid_features)

# One shared categorical dtype per column, built from the values seen in train
# and validation together. Casting each frame to a plain "category" derives the
# integer codes from that frame alone, so the same value could get different
# codes in different frames. With a shared dtype the codes always agree, and
# values outside the known set become missing rather than being mis-coded.
categories_types = {
    k: pd.CategoricalDtype(
        sorted(
            set(train_features[k].dropna().unique())
            | set(validation_features[k].dropna().unique())
        )
    )
    for k in category_features
}
train_features = train_features.astype(categories_types, copy=False)
validation_features = validation_features.astype(categories_types, copy=False)

# Split the label off the feature matrices.
train_labels = train_features["is_laundering"].values
del train_features["is_laundering"]
validation_labels = validation_features["is_laundering"].values
del validation_features["is_laundering"]

In [12]:
def f1_eval(y, y_):
    """Custom evaluation metric: ``1 - F1``, so smaller values are better.

    Args:
        y: Ground-truth labels.
        y_: Model scores; they are rounded with ``np.round`` to obtain hard
            labels before the F1 score is computed.

    Returns:
        ``1 - f1_score(y, rounded predictions)``.
    """
    hard_labels = np.round(y_)
    f1_value = f1_score(y, hard_labels)
    return 1 - f1_value


# XGBoost classifier configuration.
# - scale_pos_weight=3: up-weights the positive (is_laundering == 1) class.
# - eval_metric=f1_eval with the default metric disabled: the only metric
#   tracked on the eval set is 1 - F1, which early stopping (patience of 10
#   rounds) then monitors.
# - num_parallel_tree=10: ten trees are grown per boosting round.
# - enable_categorical=True: pandas "category" columns are consumed directly.
model = xgb.XGBClassifier(
    scale_pos_weight=3, early_stopping_rounds=10,
    eval_metric=f1_eval, disable_default_eval_metric=True, 
    num_parallel_tree=10, max_depth=6,
    # Alternative without row subsampling (kept for reference): colsample_bytree=1, subsample=1,
    colsample_bytree=1, subsample=0.5,
    n_estimators=100, enable_categorical=True,
)

In [13]:
%%time

# Train on the training split; the validation split is the only eval_set, so
# it drives both the per-round log (verbose=True) and early stopping.
model.fit(
    train_features, train_labels, verbose=True, eval_set=[(validation_features, validation_labels)],
)

# Best value of the eval metric (f1_eval, i.e. 1 - F1) seen on the eval set.
print()
print(model.best_score)

[0]	validation_0-f1_eval:0.95046
[1]	validation_0-f1_eval:0.92075
[2]	validation_0-f1_eval:0.90474
[3]	validation_0-f1_eval:0.88211
[4]	validation_0-f1_eval:0.84809
[5]	validation_0-f1_eval:0.82665
[6]	validation_0-f1_eval:0.81598
[7]	validation_0-f1_eval:0.80384
[8]	validation_0-f1_eval:0.78690
[9]	validation_0-f1_eval:0.77960
[10]	validation_0-f1_eval:0.77169
[11]	validation_0-f1_eval:0.76488
[12]	validation_0-f1_eval:0.76065
[13]	validation_0-f1_eval:0.75594
[14]	validation_0-f1_eval:0.75159
[15]	validation_0-f1_eval:0.74831
[16]	validation_0-f1_eval:0.74536
[17]	validation_0-f1_eval:0.74395
[18]	validation_0-f1_eval:0.74211
[19]	validation_0-f1_eval:0.74169
[20]	validation_0-f1_eval:0.73960
[21]	validation_0-f1_eval:0.73845
[22]	validation_0-f1_eval:0.73711
[23]	validation_0-f1_eval:0.73493
[24]	validation_0-f1_eval:0.73565
[25]	validation_0-f1_eval:0.73456
[26]	validation_0-f1_eval:0.73317
[27]	validation_0-f1_eval:0.73183
[28]	validation_0-f1_eval:0.73111
[29]	validation_0-f1_eva

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,1
,device,
,early_stopping_rounds,10
,enable_categorical,True


In [14]:
# Show the best validation metric again in its own cell (same value that the
# training cell prints after fitting).
print()
print(model.best_score)


0.723115


In [15]:
# Drop the references to the train/validation frames so their memory can be
# reclaimed before the test split is loaded.
del train_features, validation_features

In [16]:
# Load the test split and cast the categorical columns with the same dtype
# mapping used for the other splits.
test_features = pd.read_parquet(location_test_features)
test_features = test_features.astype(categories_types, copy=False)

# Key columns used later to join predictions back onto the raw transactions.
join_columns = ["source", "target", "source_currency", "target_currency", "format"]

# Take an explicit copy: a prediction column is written into this frame
# later, and writing into a bare `.loc` slice of `test_features` is the
# ambiguous view-vs-copy pattern that triggers SettingWithCopyWarning.
test_predictions = test_features.loc[:, join_columns].copy()

# Separate the labels, then remove the columns the model was not trained on
# (`source`/`target` are only kept in the test features as join keys).
test_labels = test_features.pop("is_laundering").values
del test_features["source"]
del test_features["target"]

In [17]:
# Store the classifier's predictions for every test feature row next to its
# join keys, in a new `prediction` column.
test_predictions.loc[:, "prediction"] = model.predict(test_features)

In [18]:
# Spark view of the predictions, aliased so its key columns can be addressed
# unambiguously next to the identically named columns of `test`.
predictions_sdf = spark.createDataFrame(test_predictions).alias("pred")

# Attach a prediction to every raw test transaction. The left join keeps all
# rows of `test`; the result is persisted because it is consumed twice
# (row-count check, then conversion to pandas).
scored_test_sdf = test.join(
    predictions_sdf,
    (test["source"] == predictions_sdf["pred.source"]) &
    (test["target"] == predictions_sdf["pred.target"]) &
    (test["source_currency"] == predictions_sdf["pred.source_currency"]) &
    (test["target_currency"] == predictions_sdf["pred.target_currency"]) &
    (test["format"] == predictions_sdf["pred.format"]),
    how="left"
).select("transaction_id", "is_laundering", "prediction").persist(storageLevel=StorageLevel.DISK_ONLY)

try:
    # The join must neither duplicate nor lose transactions.
    assert scored_test_sdf.count() == test_count
    test_predictions_final = scored_test_sdf.toPandas()
finally:
    # Release the on-disk cache; previously the only handle to the persisted
    # DataFrame was overwritten, so it stayed cached for the whole session.
    scored_test_sdf.unpersist()

# A transaction without a matching prediction row would surface as a null
# here; fail with a clear message instead of an opaque error inside f1_score.
missing_predictions = int(test_predictions_final["prediction"].isna().sum())
assert missing_predictions == 0, (
    f"{missing_predictions} test transactions have no prediction after the join"
)

# Final F1 on the test split, expressed as a percentage.
f1_final = f1_score(test_predictions_final["is_laundering"], test_predictions_final["prediction"])
f1_final = round(f1_final * 100, 4)

print(f1_final)

                                                                                

45.9546


In [19]:
# Reference score and its spread, compared against f1_final below (same
# percentage scale).
# NOTE(review): presumably the best GFP F1 result and standard deviation
# reported in the paper linked at the top of the notebook — confirm the exact
# table/configuration these numbers come from.
gfp_best = 31.03
gfp_std = 0.22

In [20]:
# Report the reference (GFP) score together with its spread.
print(f"GFP best: {gfp_best} ± {gfp_std}")

GFP best: 31.03 ± 0.22


In [21]:
# Report this notebook's test F1 (percent, two decimals). No spread is shown:
# only the single `f1_final` value is computed here, so a standard deviation
# over several runs is not available.
print(f"{round(f1_final, 2)} ± NA")

45.95 ± NA


In [22]:
# Relative improvement of this notebook's F1 over the reference score,
# expressed in percent and rounded to two decimals.
relative_gain = (f1_final - gfp_best) / gfp_best
uplift = round(relative_gain * 100, 2)
print(f"Uplift of {uplift}%")

Uplift of 48.1%
