### https://arxiv.org/pdf/2402.08593

In [1]:
import json
import random
import os
import pickle
import time
import shutil
import sys
import uuid
from collections import defaultdict
from datetime import timedelta
from glob import glob
from itertools import product
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, recall_score, RocCurveDisplay

from sklearn.preprocessing import normalize

import igraph as ig
import leidenalg as la
import numpy as np
import pandas as pd
import xgboost as xgb

import settings as s

assert s.FILE_SIZE == "Large"
assert s.HIGH_ILLICIT == False

os.environ["EXT_DATA_TYPE_FOLDER"] = s.OUTPUT_POSTFIX.lstrip("-")

from common import get_weights, delete_large_vars, MULTI_PROC_STAGING_LOCATION
from communities import get_communities_spark
from features import (
    generate_features_spark, generate_features_udf_wrapper, get_edge_features_udf,
    SCHEMA_FEAT_UDF, CURRENCY_RATES
)

%load_ext autoreload
%autoreload 2

In [2]:
# Wall-clock reference point; a later cell subtracts it from time.time() to report the elapsed run time.
start_script = time.time()

In [3]:
# Refuse to run on anything other than Python 3.11.8 (major, minor and micro
# must all match).
if tuple(sys.version_info[:3]) != (3, 11, 8):
    raise EnvironmentError(
        "Only runs efficiently on Python 3.11.8 (Tested on: Conda 24.1.2 | Apple M3 Pro)"
    )

In [4]:
# Key/value pairs handed to SparkConf.setAll() when the session is built below.
# NOTE(review): the 127.0.0.1 bind address suggests a single-machine run — confirm.
SPARK_CONF = [
    ("spark.driver.memory", "32g"),
    ("spark.worker.memory", "32g"),
    ("spark.driver.maxResultSize", "32g"),
    ("spark.driver.bindAddress", "127.0.0.1"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.network.timeout", "600s"),
    # Per the Spark SQL configuration docs, -1 disables automatic broadcast joins.
    ("spark.sql.autoBroadcastJoinThreshold", -1)
]

# Remove a leftover "artifacts" directory (if any) before the session is created.
shutil.rmtree("artifacts", ignore_errors=True)
spark = (
    SparkSession.builder.appName("testing")
    .config(conf=SparkConf().setAll(SPARK_CONF))
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/01 13:39:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Fractions of the (timestamp-ordered) transactions assigned to each split.
TRAIN_PERC = 0.6
VALIDATION_PERC = 0.2
TEST_PERC = 0.2

# Maximum number of strongest neighbours kept per node when neighbourhoods are built.
KEEP_TOP_N = 100

# Compare with a tolerance: exact float equality rejects valid splits such as
# 0.7 / 0.2 / 0.1, whose floating-point sum is 0.9999999999999999.
assert np.isclose(sum([TRAIN_PERC, VALIDATION_PERC, TEST_PERC]), 1)

# Every artefact of this notebook lives under features/<EXT_DATA_TYPE_FOLDER>/.
location_main = os.path.join("features", os.environ["EXT_DATA_TYPE_FOLDER"])
# shutil.rmtree(location_main, ignore_errors=True)

location_communities_leiden = f"{location_main}{os.sep}communities_leiden.parquet"

# Node-level feature groups.
location_features_leiden = f"{location_main}{os.sep}features_leiden.parquet"
location_features_ego = f"{location_main}{os.sep}features_ego.parquet"
location_features_2_hop = f"{location_main}{os.sep}features_2_hop.parquet"
location_features_2_hop_out = f"{location_main}{os.sep}features_2_hop_out.parquet"
location_features_2_hop_in = f"{location_main}{os.sep}features_2_hop_in.parquet"
location_features_2_hop_combined = f"{location_main}{os.sep}features_2_hop_combined.parquet"
location_features_source = f"{location_main}{os.sep}features_source.parquet"
location_features_target = f"{location_main}{os.sep}features_target.parquet"

# Temporal flow statistics, one file per role in a dispense -> passthrough -> sink flow.
location_flow_dispense = f"{location_main}{os.sep}location_flow_dispense.parquet"
location_flow_passthrough = f"{location_main}{os.sep}location_flow_passthrough.parquet"
location_flow_sink = f"{location_main}{os.sep}location_flow_sink.parquet"

location_comm_as_source_features = f"{location_main}{os.sep}comm_as_source_features.parquet"
location_comm_as_target_features = f"{location_main}{os.sep}comm_as_target_features.parquet"
location_comm_as_passthrough_features = f"{location_main}{os.sep}comm_as_passthrough_features.parquet"
location_comm_as_passthrough_features_reverse = f"{location_main}{os.sep}comm_as_passthrough_features_reverse.parquet"

location_features_node_level = f"{location_main}{os.sep}features_node_level.parquet"
location_features_edges = f"{location_main}{os.sep}features_edges.parquet"

# Edge-level features per split.
location_features_edges_train = f"{location_main}{os.sep}features_edges_train.parquet"
location_features_edges_valid = f"{location_main}{os.sep}features_edges_valid.parquet"
location_features_edges_test = f"{location_main}{os.sep}features_edges_test.parquet"

# Transaction-level features per split.
location_train_trx_features = f"{location_main}{os.sep}train_trx_features.parquet"
location_valid_trx_features = f"{location_main}{os.sep}valid_trx_features.parquet"
location_test_trx_features = f"{location_main}{os.sep}test_trx_features.parquet"

# Final (transaction + edge) feature sets per split.
location_train_features = f"{location_main}{os.sep}train_features.parquet"
location_valid_features = f"{location_main}{os.sep}valid_features.parquet"
location_test_features = f"{location_main}{os.sep}test_features.parquet"

# Idempotent directory creation; unlike swallowing FileExistsError, this still
# fails loudly if the path exists but is not a directory.
os.makedirs(location_main, exist_ok=True)

In [6]:
# Load the staged transactions and expose the label column as a boolean.
data = spark.read.parquet(s.STAGED_DATA_LOCATION).withColumn(
    "is_laundering", sf.col("is_laundering").cast("boolean")
)
# Row count of the data as loaded.
data_count_original = data.count()

                                                                                

In [7]:
%%time

# Chronological split: order all transaction ids by timestamp, then cut the
# ordered ids into train / validation / test using the configured fractions.
trx_ids_sorted = data.sort("timestamp").select("transaction_id").toPandas()["transaction_id"].values
trx_count = len(trx_ids_sorted)

last_train_index = int(np.floor(trx_count * TRAIN_PERC))
last_validation_index = last_train_index + int(np.floor(trx_count * VALIDATION_PERC))
train_indexes = trx_ids_sorted[:last_train_index]
validation_indexes = trx_ids_sorted[last_train_index:last_validation_index]
# The test slice takes everything that is left, so flooring never drops a transaction.
test_indexes = trx_ids_sorted[last_validation_index:]

train_indexes_loc = os.path.join(location_main, "temp_train_indexes.parquet")
validation_indexes_loc = os.path.join(location_main, "temp_validation_indexes.parquet")
test_indexes_loc = os.path.join(location_main, "temp_test_indexes.parquet")

# Round-trip the id lists through temporary parquet files so that Spark can
# read them back as DataFrames.
pd.DataFrame(train_indexes, columns=["transaction_id"]).to_parquet(train_indexes_loc)
pd.DataFrame(validation_indexes, columns=["transaction_id"]).to_parquet(validation_indexes_loc)
pd.DataFrame(test_indexes, columns=["transaction_id"]).to_parquet(test_indexes_loc)

train_indexes = spark.read.parquet(train_indexes_loc)
validation_indexes = spark.read.parquet(validation_indexes_loc)
test_indexes = spark.read.parquet(test_indexes_loc)

# Attach the full transaction rows to each id list.
train = train_indexes.join(
    data, on="transaction_id", how="left"
).persist(StorageLevel.DISK_ONLY)
validation = validation_indexes.join(
    data, on="transaction_id", how="left"
).persist(StorageLevel.DISK_ONLY)
test = test_indexes.join(
    data, on="transaction_id", how="left"
).persist(StorageLevel.DISK_ONLY)
# The counts also materialise the persisted frames.
train_count, validation_count, test_count = train.count(), validation.count(), test.count()
print()
print(trx_count, train_count, validation_count, test_count)
print()

train.write.parquet("train-temp", mode="overwrite")
validation.write.parquet("validation-temp", mode="overwrite")
test.write.parquet("test-temp", mode="overwrite")

# Delete the temporary id files only after the splits have been written: the
# persisted frames still have these files in their lineage, so removing them
# earlier would make any recomputation (e.g. after a lost cache block) fail.
os.remove(train_indexes_loc)
os.remove(validation_indexes_loc)
os.remove(test_indexes_loc)

In [8]:
# Re-bind the three splits to the parquet copies written by the split cell,
# replacing the earlier persisted join results.
train = spark.read.parquet("train-temp")
validation = spark.read.parquet("validation-temp")
test = spark.read.parquet("test-temp")
# Row count of the test split as read back from disk.
test_count = test.count()

In [9]:
# %%time

# edges = data.groupby(["source", "target"]).agg(
#     sf.sum("amount").alias("amount")
# ).toPandas()
# weights = get_weights(edges)
# edges_agg = edges.set_index(["source", "target"]).join(
#     weights.set_index(["source", "target"]), how="left"
# ).reset_index()
# edges_agg.loc[:, "amount_weighted"] = (
#     edges_agg.loc[:, "amount"] * 
#     (edges_agg.loc[:, "weight"] / edges_agg.loc[:, "weight"].max())
# )

In [10]:
# Snapshot the names currently defined in the interactive namespace (%who_ls).
# A later cell resets every variable that is NOT in this snapshot in order to
# free up memory, so everything defined up to this point survives that reset.
to_keep = %who_ls
to_keep = list(to_keep)

In [10]:
%%time

# TOP_N is reused further down in this notebook to cap the number of edges kept
# per node; NUM_HOPS is presumably read by the notebook executed below — confirm.
TOP_N = 50
NUM_HOPS = 5

# NOTE(review): `edges_agg` is only built by a cell that is currently commented
# out in this notebook; that cell must have been executed (or `edges_agg`
# restored some other way) before this one, otherwise a NameError is raised here.
data_input = spark.createDataFrame(edges_agg)
nodes_source = set(edges_agg["source"].unique())
nodes_target = set(edges_agg["target"].unique())
# Nodes that appear both as a source and as a target of some edge.
nodes_passthrough = nodes_source.intersection(nodes_target)

# Executes the companion notebook; presumably it consumes the variables defined
# above and defines the four comm_as_* frames saved below — confirm.
%run generate_flow_features.ipynb

comm_as_source_features.to_parquet(location_comm_as_source_features)
comm_as_target_features.to_parquet(location_comm_as_target_features)
comm_as_passthrough_features.to_parquet(location_comm_as_passthrough_features)
comm_as_passthrough_features_reverse.to_parquet(location_comm_as_passthrough_features_reverse)

# The frames are on disk now; drop the in-memory references.
del comm_as_source_features
del comm_as_target_features
del comm_as_passthrough_features
del comm_as_passthrough_features_reverse

                                                                                


Processing comm_as_source

Processed hop #1 | 6,546,146 | 1,667,461
Processed hop #2 | 11,743,312 | 1,469,375
Processed hop #3 | 21,126,341 | 1,409,144
Processed hop #4 | 28,378,044 | 1,391,266
Processed hop #5 | 31,710,015 | 1,386,115

Processing comm_as_target

Processed hop #1 | 6,558,360 | 1,314,480
Processed hop #2 | 18,557,822 | 1,194,763
Processed hop #3 | 31,991,564 | 1,162,193
Processed hop #4 | 40,375,545 | 1,141,523
Processed hop #5 | 42,541,507 | 1,128,392

Processing comm_as_passthrough

Processed hop #1 | 6,037,558 | 1,296,091
Processed hop #2 | 10,075,945 | 1,116,547
Processed hop #3 | 17,959,726 | 1,070,066
Processed hop #4 | 22,473,287 | 1,055,391
Processed hop #5 | 24,581,094 | 1,051,132

Processing comm_as_passthrough_reverse

Processed hop #1 | 6,430,169 | 1,286,192
Processed hop #2 | 18,024,540 | 1,166,579
Processed hop #3 | 31,227,006 | 1,135,566
Processed hop #4 | 39,698,968 | 1,116,228
Processed hop #5 | 42,104,754 | 1,104,409


comm_as_source_features

CPU tim

In [11]:
%%time

print("Constructing Leiden communities")

# Directed graph built from the weighted edge list; vertices are addressed by
# name (use_vids=False) rather than by integer id.
graph = ig.Graph.DataFrame(
    edges_agg.loc[:, ["source", "target", "amount_weighted"]],
    use_vids=False,
    directed=True,
)
# Vertex index -> vertex name.
nodes_mapping = dict(enumerate(graph.vs["name"]))
# Each community becomes a (random uuid, set of member node names) pair.
communities_leiden = [
    (str(uuid.uuid4()), {nodes_mapping[vertex_id] for vertex_id in members})
    for members in la.find_partition(
        graph, la.ModularityVertexPartition, n_iterations=100, weights="amount_weighted"
    )
]
sizes_leiden = [len(members) for _, members in communities_leiden]

# NOTE: the target path ends in ".parquet", but the content is a pickle.
with open(location_communities_leiden, "wb") as fl:
    pickle.dump(communities_leiden, fl)

Constructing Leiden communities
CPU times: user 1h 7min 25s, sys: 46.7 s, total: 1h 8min 12s
Wall time: 1h 8min 3s


In [12]:
# with open(location_communities_leiden, "rb") as fl:
#     communities_leiden = pickle.load(fl)

In [13]:
%%time

# Edge weights computed by get_weights() from the total amount per (source, target).
data_agg_weights = get_weights(
    data.groupby(["source", "target"])
    .agg(
        sf.sum("amount").alias("amount")
    ).toPandas()
)

# Make the weighted edge list undirected: add every edge in the reverse
# direction and sum the weights of identical (source, target) pairs.
data_agg_weights_rev = data_agg_weights.rename(
    columns={"target": "source", "source": "target"}
).loc[:, ["source", "target", "weight"]]
data_agg_weights_ud = pd.concat([data_agg_weights, data_agg_weights_rev], ignore_index=True)
data_agg_weights_ud = data_agg_weights_ud.groupby(["source", "target"]).agg(weight=("weight", "sum")).reset_index()

# Keep only the KEEP_TOP_N heaviest neighbours of every node.
data_agg_weights_ud.sort_values("weight", ascending=False, inplace=True)
grouped_ud = data_agg_weights_ud.groupby("source").head(KEEP_TOP_N).reset_index(drop=True)
grouped_ud = grouped_ud.groupby("source").agg(targets=("target", set))

# Plain dict lookups instead of a `DataFrame.loc` call per neighbour: same
# result, far less overhead inside a loop that runs once per node.
neighbors_by_node = grouped_ud["targets"].to_dict()
total = len(neighbors_by_node)
nodes_neighborhoods = {}
for index, (source, targets) in enumerate(neighbors_by_node.items()):
    # Neighbourhood = the node itself, its neighbours and their neighbours.
    community_candidates = {source}
    for target in targets:
        community_candidates |= neighbors_by_node[target]
        community_candidates.add(target)
    nodes_neighborhoods[source] = community_candidates
    # Progress report every 250k nodes.
    if not (index % 250_000):
        print(index, total)

del neighbors_by_node
del data_agg_weights_rev
del data_agg_weights_ud
del grouped_ud

                                                                                

0 2047791
250000 2047791
500000 2047791
750000 2047791
1000000 2047791
1250000 2047791
1500000 2047791
1750000 2047791
2000000 2047791
CPU times: user 2min 58s, sys: 3.37 s, total: 3min 1s
Wall time: 3min 21s


In [14]:
%%time

print("Constructing 2-hop communities")

# The graph is rebuilt from the same weighted edge list used for the Leiden step.
# NOTE(review): the positional arguments after the graph (worker count, Spark
# session, 2, "all", 0.01, weight attribute name) are defined by
# communities.get_communities_spark — check its signature for their meaning.
communities_2_hop = get_communities_spark(
    nodes_neighborhoods,
    ig.Graph.DataFrame(edges_agg.loc[:, ["source", "target", "amount_weighted"]], use_vids=False, directed=True), 
    os.cpu_count(), spark, 2, "all", 0.01, "amount_weighted"
)
# Size of each community; each item's second element holds its members.
sizes_2_hop = [len(x[1]) for x in communities_2_hop]

Constructing 2-hop communities


                                                                                

CPU times: user 50.5 s, sys: 3.01 s, total: 53.5 s
Wall time: 6min 54s


In [15]:
%%time

# Reference point one minute before the earliest transaction; timestamps are
# stored below as offsets from it.
ts_min = data.select(sf.min("timestamp").alias("x")).collect()[0]["x"] - timedelta(minutes=1)
# One row per (source, target, banks, currency) with totals and the full list
# of (time offset, amount) pairs of the underlying transactions.
data_graph_agg = data.groupby(["source", "target", "source_bank", "target_bank", "source_currency"]).agg(
    sf.count("source").alias("num_transactions"),
    sf.sum("amount").alias("amount"),
    sf.sum("source_amount").alias("source_amount"),
    sf.collect_list(sf.array((sf.col("timestamp") - ts_min).cast("long"), sf.col("amount"))).alias("timestamps_amounts"),
)
# Persist so that count() and toPandas() do not both recompute the aggregation.
data_graph_agg_sdf = data_graph_agg.persist(StorageLevel.DISK_ONLY)
print(data_graph_agg_sdf.count())
data_graph_agg = data_graph_agg_sdf.toPandas()
# The pandas copy is all that is used from here on; release the Spark cache.
data_graph_agg_sdf.unpersist()
index = ["source", "target"]
# Attach the per-edge weight from edges_agg. (A stray expression statement that
# built this lookup frame and discarded it was removed.)
data_graph_agg = data_graph_agg.set_index(index).join(
    edges_agg.loc[:, index + ["weight"]].set_index(index), how="left"
).reset_index()
# Amount scaled by the weight relative to the largest weight.
data_graph_agg.loc[:, "amount_weighted"] = (
    data_graph_agg.loc[:, "amount"] *
    (data_graph_agg.loc[:, "weight"] / data_graph_agg.loc[:, "weight"].max())
)

                                                                                

8177437


                                                                                

CPU times: user 1min 24s, sys: 28.4 s, total: 1min 53s
Wall time: 4min 47s


In [16]:
# Directed graph over the aggregated edge table, vertices addressed by name.
# NOTE(review): the non-endpoint columns presumably become edge attributes (igraph convention) — confirm.
graph = ig.Graph.DataFrame(data_graph_agg, use_vids=False, directed=True)

In [17]:
%%time

print("Leiden communitites features creation")

# Community id -> set of member nodes.
communities_leiden_dict = dict(communities_leiden)
# Features are computed per community; the community id comes back in "key".
features_leiden = generate_features_spark(communities_leiden, graph, spark).rename(
    columns={"key": "key_fake"}
)
# Swap the community id for its members, then fan out to one row per member so
# that every node carries the features of its community.
features_leiden["key"] = features_leiden["key_fake"].map(communities_leiden_dict.__getitem__)
features_leiden = features_leiden.explode("key").drop(columns="key_fake")
features_leiden.set_index("key").to_parquet(location_features_leiden)

Leiden communitites features creation


                                                                                

CPU times: user 2min 59s, sys: 42 s, total: 3min 41s
Wall time: 10min 1s


In [18]:
%%time

print("2-hop communitites features creation")

# Same feature generator as for the Leiden communities; here the returned "key"
# is written out as the index without any remapping.
features_2_hop = generate_features_spark(communities_2_hop, graph, spark)
features_2_hop.set_index("key").to_parquet(location_features_2_hop)

2-hop communitites features creation


                                                                                

CPU times: user 28min 12s, sys: 13min 2s, total: 41min 14s
Wall time: 1h 16min 48s


In [19]:
# The graph is not used by the following cells; drop the reference.
del graph

In [20]:
%%time

print("Temporal flows features creation")

# Rank edges by their number of transactions (the aggregate is a count, even
# though it is aliased "amount").
edges_totals = data.select("source", "target", "amount").groupby(
    ["source", "target"]
).agg(sf.count("amount").alias("amount")).toPandas()
edges_totals = edges_totals.sort_values("amount", ascending=False).reset_index(drop=True)
# Busiest TOP_N incoming edges per target / outgoing edges per source.
left_edges = spark.createDataFrame(edges_totals.groupby("target").head(TOP_N).loc[:, ["source", "target"]])
right_edges = spark.createDataFrame(edges_totals.groupby("source").head(TOP_N).loc[:, ["source", "target"]])

columns = ["source", "target", "timestamp", "amount"]

# Transactions on the selected incoming edges.
left = left_edges.select(sf.col("source").alias("src"), sf.col("target").alias("tgt")).join(
    data.select(*columns),
    on=(sf.col("src") == sf.col("source")) & (sf.col("tgt") == sf.col("target")),
    how="left"
).drop("src", "tgt")
# Prefix the columns BEFORE persisting, so that `left` itself is the persisted
# frame and the `left.unpersist()` below really releases the cache. Persisting
# first and re-binding `left` to a derived select leaves the cache behind.
left = left.select(
    *[sf.col(column).alias(f"left_{column}") for column in left.columns]
).persist(StorageLevel.DISK_ONLY)
# Transactions on the selected outgoing edges.
right = right_edges.select(sf.col("source").alias("src"), sf.col("target").alias("tgt")).join(
    data.select(*columns),
    on=(sf.col("src") == sf.col("source")) & (sf.col("tgt") == sf.col("target")),
    how="left"
).drop("src", "tgt").persist(StorageLevel.DISK_ONLY)

print(left.count(), right.count())

# Pair every incoming transaction of a node with each outgoing transaction of
# the same node that does not precede it, giving dispense -> passthrough -> sink
# flows; the flow amount is the smaller of the summed in and out amounts.
flows_temporal = left.join(
    right,
    (left["left_target"] == right["source"]) &
    (left["left_timestamp"] <= right["timestamp"]),
    how="inner"
).groupby(["left_source", "left_target", "source", "target"]).agg(
    sf.sum("left_amount").alias("left_amount"),
    sf.sum("amount").alias("amount"),
).drop("left_target").select(
    sf.col("left_source").alias("dispense"),
    sf.col("source").alias("passthrough"),
    sf.col("target").alias("sink"),
    sf.least("left_amount", "amount").alias("amount"),
)

aggregate = [
    sf.sum("amount").alias("amount_sum"),
    sf.mean("amount").alias("amount_mean"),
    sf.median("amount").alias("amount_median"),
    sf.max("amount").alias("amount_max"),
    sf.stddev("amount").alias("amount_std"),
    sf.countDistinct("dispense").alias("dispense_count"),
    sf.countDistinct("passthrough").alias("passthrough_count"),
    sf.countDistinct("sink").alias("sink_count"),
]
# One statistics file per role a node can play in a flow.
for flow_location, flow_type in [
    (location_flow_dispense, "dispense"), (location_flow_passthrough, "passthrough"), (location_flow_sink, "sink")
]:
    print(flow_type)
    flows_temporal_stats = flows_temporal.groupby(flow_type).agg(*aggregate).toPandas()
    # Same statistics restricted to flows that return to their origin.
    flows_temporal_cyclic_stats = flows_temporal.where(
        (sf.col("dispense") == sf.col("sink"))
    ).groupby(flow_type).agg(*aggregate).toPandas()
    flows_temporal_stats = flows_temporal_stats.set_index(flow_type).join(
        flows_temporal_cyclic_stats.set_index(flow_type),
        how="left", rsuffix="_cycle"
    )
    flows_temporal_stats.index.name = "key"
    flows_temporal_stats.to_parquet(flow_location)
    del flows_temporal_stats
    del flows_temporal_cyclic_stats

left.unpersist()
right.unpersist()

del edges_totals
del left_edges
del right_edges

Temporal flows features creation


                                                                                

175660810 159434964
dispense


                                                                                

passthrough


                                                                                

sink


                                                                                

CPU times: user 19.8 s, sys: 3.77 s, total: 23.6 s
Wall time: 31min 5s


In [21]:
%%time

print("1-hop-source features creation")

# Group the aggregated edges by their source node and compute the features of
# every group inside a pandas UDF.
features_source = (
    spark.createDataFrame(data_graph_agg)
    .withColumn("key", sf.col("source"))
    .repartition(os.cpu_count() * 5, "key")
    .groupby("key")
    .applyInPandas(generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF)
    .toPandas()
)
# The UDF returns its features as JSON strings; expand them into columns.
features_source = pd.DataFrame([json.loads(payload) for payload in features_source["features"]])
# Prefix every feature column; "key" stays as-is because it becomes the index.
features_source.columns = [
    name if name == "key" else f"{s.G_1HOP_PREFIX}{name}" for name in features_source.columns
]
features_source.set_index("key").to_parquet(location_features_source)

1-hop-source features creation


25/08/01 04:16:59 WARN TaskSetManager: Stage 285 contains a task of very large size (8870 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 10min 38s, sys: 46.6 s, total: 11min 24s
Wall time: 44min 23s


In [22]:
%%time

print("1-hop-target features creation")

# Group the aggregated edges by their target node and compute the features of
# every group inside a pandas UDF.
features_target = (
    spark.createDataFrame(data_graph_agg)
    .withColumn("key", sf.col("target"))
    .repartition(os.cpu_count() * 5, "key")
    .groupby("key")
    .applyInPandas(generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF)
    .toPandas()
)
# The UDF returns its features as JSON strings; expand them into columns.
features_target = pd.DataFrame([json.loads(payload) for payload in features_target["features"]])
# Prefix every feature column; "key" stays as-is because it becomes the index.
features_target.columns = [
    name if name == "key" else f"{s.G_1HOP_PREFIX}{name}" for name in features_target.columns
]
features_target.set_index("key").to_parquet(location_features_target)

1-hop-target features creation


25/08/01 05:09:07 WARN TaskSetManager: Stage 288 contains a task of very large size (8870 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 10min 38s, sys: 4min 57s, total: 15min 35s
Wall time: 40min 26s


In [23]:
# The aggregated edge table is not used by the following cells; drop the reference.
del data_graph_agg

In [10]:
# (group name, parquet location) of every node-level feature group that is
# joined into the combined node-level feature table; the group name is used as
# the column suffix when names clash.
ENABLED_FEATURES = [
    ("leiden", location_features_leiden),
    ("2_hop", location_features_2_hop),
    ("as_source", location_features_source),
    ("as_target", location_features_target),
    ("comm_as_source_features", location_comm_as_source_features),
    ("comm_as_target_features", location_comm_as_target_features),
    ("comm_as_passthrough_features", location_comm_as_passthrough_features),
    ("comm_as_passthrough_features_reverse", location_comm_as_passthrough_features_reverse),
    ("flow_dispense", location_flow_dispense),
    ("flow_passthrough", location_flow_passthrough),
    ("flow_sink", location_flow_sink),   
]

In [11]:
# Start from an empty frame indexed by "key" and outer-join every enabled
# feature group onto it, so a node shows up if it appears in any group.
all_features = pd.DataFrame()
all_features.index.name = "key"

for feature_group, location in ENABLED_FEATURES:
    # Columns whose name clashes with an already-joined column get the group
    # name appended as a suffix.
    all_features = all_features.join(
        pd.read_parquet(location), how="outer", rsuffix=f"_{feature_group}"
    )

# Save the combined table, report its shape and drop the in-memory copy.
all_features.to_parquet(location_features_node_level)
print("Features:", all_features.shape)
del all_features

Features: (2047791, 344)


In [11]:
# Load the combined node-level feature table back from disk.
all_features = pd.read_parquet(location_features_node_level)

In [12]:
# Columns with at most one distinct non-null value carry no information.
constants = [
    column for column in all_features.columns
    if all_features[column].nunique(dropna=True) <= 1
]
all_features.drop(columns=constants, inplace=True)
print(f"Deleted {len(constants)} constant columns")

Deleted 14 constant columns


In [13]:
# NaN-ignoring median of every remaining column; used later as the fill value
# for missing entries.
medians = {
    column: np.nanmedian(all_features[column]) for column in all_features.columns
}

In [14]:
# Whole seconds elapsed since `start_script` was set, printed as H:MM:SS.
delta = round(time.time() - start_script)
print(f"Script executed in {timedelta(seconds=delta)}")

Script executed in 0:00:32


In [15]:
%%time

print("Training the anomaly detection model")

# Impute once and reuse the result for both fitting and scoring; the feature
# table is large, so building the filled copy twice is wasted time and memory.
features_imputed = all_features.fillna(medians)
# Frame with the same index as the features and no columns yet.
anomalies = all_features.loc[:, []]
model_ad = IsolationForest(n_estimators=10_000)
# decision_function is lower for more abnormal samples; negate it so that a
# higher score means "more anomalous".
anomalies.loc[:, "anomaly_score"] = -model_ad.fit(
    features_imputed
).decision_function(features_imputed)
del features_imputed
# Shift the scores so that they are strictly positive, then scale the maximum to 1.
anomalies.loc[:, "anomaly_score"] += abs(anomalies.loc[:, "anomaly_score"].min()) + 1e-10
anomalies.loc[:, "anomaly_score"] /= anomalies.loc[:, "anomaly_score"].max()

Training the anomaly detection model
CPU times: user 39.8 s, sys: 5.67 s, total: 45.5 s
Wall time: 45.5 s


In [16]:
# Compress the node-level features into a handful of principal components.
# Missing values are filled with 0 here and every row is L1-normalised first.
n_components = 10
pca = PCA(n_components=n_components)
all_features_dim_reduced = pd.DataFrame(
    pca.fit_transform(normalize(all_features.fillna(0), norm="l1", axis=1)),
    index=all_features.index,
    columns=[f"pca_{component}" for component in range(1, n_components + 1)],
).astype(np.float32)
# The components must retain more than 95% of the variance.
explained_variance_ratio = round(sum(pca.explained_variance_ratio_) * 100, 2)
assert explained_variance_ratio > 95
print(n_components, explained_variance_ratio)

10 98.4


In [17]:
# %%time

# print(f"Generating edge features")

# to_select = ["source", "target", "format", "source_currency", "source_amount", "amount", "timestamp"]

# edges_features_input = data.select(to_select).groupby(
#     ["source", "target", "format", "source_currency"]
# ).agg(
#     sf.sum("source_amount").alias("source_amount"), 
#     sf.sum("amount").alias("amount"),
#     sf.unix_timestamp(sf.min("timestamp")).alias("min_ts"),
#     sf.unix_timestamp(sf.max("timestamp")).alias("max_ts"),
# ).repartition(os.cpu_count() * 2, "source", "target").persist(StorageLevel.DISK_ONLY)
# _ = edges_features_input.count()

# edge_features = edges_features_input.groupby(["source", "target"]).applyInPandas(
#     get_edge_features_udf, schema=SCHEMA_FEAT_UDF
# ).toPandas()
# edge_features = pd.DataFrame(edge_features["features"].apply(json.loads).tolist())

# edge_features.to_parquet(location_features_edges)
# del edge_features

In [18]:
# Load the edge-level features from disk. The cell that generates this file is
# commented out in this notebook, so the file must already exist from an earlier run.
edge_features = pd.read_parquet(location_features_edges)

In [19]:
%%time

# Distinct (source, target) pairs that occur in each split.
train_edges = train.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)
valid_edges = validation.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)
test_edges = test.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)

# Index the edge features once instead of rebuilding the same index per split.
edge_features_indexed = edge_features.set_index(["source", "target"])

# Attach the edge-level features to the edges of every split.
train_features = train_edges.join(edge_features_indexed, how="left").reset_index()
validation_features = valid_edges.join(edge_features_indexed, how="left").reset_index()
test_features = test_edges.join(edge_features_indexed, how="left").reset_index()

del edge_features_indexed

                                                                                

CPU times: user 36.5 s, sys: 3.08 s, total: 39.6 s
Wall time: 1min 2s


In [23]:
# Disable some features for final training
# NOTE: PCA components for all features are already there!
# (`all_features_dim_reduced` was computed above from the full feature table,
# before `all_features` is overwritten in this cell.)

ENABLED_FEATURES_TRAINING = [
    # ("leiden", location_features_leiden),
    # ("2_hop", location_features_2_hop),
    ("as_source", location_features_source),
    ("as_target", location_features_target),
    # ("comm_as_source_features", location_comm_as_source_features),
    # ("comm_as_target_features", location_comm_as_target_features),
    # ("comm_as_passthrough_features", location_comm_as_passthrough_features),
    # ("comm_as_passthrough_features_reverse", location_comm_as_passthrough_features_reverse),
    # ("flow_dispense", location_flow_dispense),
    # ("flow_passthrough", location_flow_passthrough),
    # ("flow_sink", location_flow_sink),   
]

# Rebuild `all_features` from the reduced list of groups (outer join on "key").
all_features = pd.DataFrame()
all_features.index.name = "key"

for feature_group, location in ENABLED_FEATURES_TRAINING:
    all_features = all_features.join(
        pd.read_parquet(location), how="outer", rsuffix=f"_{feature_group}"
    )

# Drop columns with at most one distinct non-null value.
constants = []
for column in all_features.columns:
    if all_features[column].nunique(dropna=True) <= 1:
        del all_features[column]
        constants.append(column)
print(f"Deleted {len(constants)} constant columns")

# Effectively disabling the raw node features: only the FIRST remaining column
# is kept (not zero columns).
all_features = all_features.loc[:, list(all_features.columns[0:1])]
print("Features:", all_features.shape)

# Spark copy of the node features with every column (including "key") prefixed "node_".
all_features_spark = spark.createDataFrame(all_features.reset_index())
for col in all_features_spark.columns:
    all_features_spark = all_features_spark.withColumnRenamed(col, f"node_{col}")

Deleted 6 constant columns
Features: (2047791, 1)


In [24]:
def save_edge_features(features_in, location):
    """Enrich edge features with node-level information and write them as parquet.

    Relies on the notebook globals `anomalies`, `all_features_dim_reduced`,
    `all_features_spark` and `spark`.

    Args:
        features_in: pandas DataFrame of edge features; must contain "source"
            and "target" columns. Every column is prefixed with "edge_".
        location: output path; existing data there is overwritten.

    The output holds, per edge: the prefixed edge features, the anomaly score
    of both endpoints ("anomaly_score_source" / "anomaly_score_target"), the
    PCA components of both endpoints, the node features of both endpoints and
    four combinations of the two anomaly scores (diff, min, max, mean).
    """
    enriched = features_in.rename(
        columns={name: f"edge_{name}" for name in features_in.columns}
    )
    # (endpoint column to join on, node-level frame, suffix for clashing columns).
    # pandas applies the suffix only when a column name already exists, so the
    # first join of each frame keeps its column names unchanged.
    node_level_joins = (
        ("edge_target", anomalies, ""),
        ("edge_source", anomalies, "_source"),
        ("edge_target", all_features_dim_reduced, "_target"),
        ("edge_source", all_features_dim_reduced, "_source"),
    )
    for endpoint, node_frame, suffix in node_level_joins:
        enriched = (
            enriched.set_index(endpoint)
            .join(node_frame, how="left", rsuffix=suffix)
            .reset_index()
        )

    # Continue in Spark; the unsuffixed score belongs to the target endpoint.
    edges_sdf = spark.createDataFrame(enriched).withColumnRenamed(
        "anomaly_score", "anomaly_score_target"
    )

    # Node features of the source endpoint.
    edges_sdf = edges_sdf.join(
        all_features_spark,
        edges_sdf["edge_source"] == all_features_spark["node_key"],
        how="left",
    )

    # Node features of the target endpoint, every column suffixed "_target".
    node_columns = list(all_features_spark.columns)
    target_nodes = all_features_spark.select(*node_columns)
    for name in node_columns:
        target_nodes = target_nodes.withColumnRenamed(name, f"{name}_target")

    edges_sdf = edges_sdf.join(
        target_nodes,
        edges_sdf["edge_target"] == target_nodes["node_key_target"],
        how="left",
    )

    # Restore the plain endpoint names and drop the join keys.
    edges_sdf = (
        edges_sdf.withColumnRenamed("edge_source", "source")
        .withColumnRenamed("edge_target", "target")
        .drop("node_key", "node_key_target")
    )

    score_source = sf.col("anomaly_score_source")
    score_target = sf.col("anomaly_score_target")
    derived_columns = {
        "anom_scores_diff": score_source - score_target,
        "anom_scores_min": sf.least(score_source, score_target),
        "anom_scores_max": sf.greatest(score_source, score_target),
        "anom_scores_mean": (score_source + score_target) / 2,
    }
    for name, expression in derived_columns.items():
        edges_sdf = edges_sdf.withColumn(name, expression)

    edges_sdf.write.parquet(location, mode="overwrite")

In [25]:
%%time

# Enrich and store the edge features of the training split.
save_edge_features(train_features, location_features_edges_train)

25/08/01 13:43:02 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/08/01 13:43:03 WARN TaskSetManager: Stage 19 contains a task of very large size (3085 KiB). The maximum recommended task size is 1000 KiB.
25/08/01 13:43:41 WARN TaskSetManager: Stage 20 contains a task of very large size (8175 KiB). The maximum recommended task size is 1000 KiB.

CPU times: user 19.1 s, sys: 3.07 s, total: 22.2 s
Wall time: 1min 24s


                                                                                

In [26]:
%%time

# Enrich and store the edge features of the validation split.
save_edge_features(validation_features, location_features_edges_valid)

25/08/01 13:44:22 WARN TaskSetManager: Stage 28 contains a task of very large size (3085 KiB). The maximum recommended task size is 1000 KiB.
25/08/01 13:44:47 WARN TaskSetManager: Stage 29 contains a task of very large size (8175 KiB). The maximum recommended task size is 1000 KiB.

CPU times: user 14.1 s, sys: 2.66 s, total: 16.7 s
Wall time: 56.4 s


                                                                                

In [27]:
%%time

# Enrich and store the edge features of the test split.
save_edge_features(test_features, location_features_edges_test)

25/08/01 13:45:18 WARN TaskSetManager: Stage 37 contains a task of very large size (3085 KiB). The maximum recommended task size is 1000 KiB.
25/08/01 13:45:43 WARN TaskSetManager: Stage 38 contains a task of very large size (8175 KiB). The maximum recommended task size is 1000 KiB.

CPU times: user 14 s, sys: 2.63 s, total: 16.6 s
Wall time: 55.3 s


                                                                                

In [28]:
def save_trx_features(data_in, location):
    """Write the transaction-level features of one split to a parquet file.

    Transactions are kept one row each (no aggregation per edge).

    Args:
        data_in: Spark DataFrame holding the transactions of the split.
        location: path of the parquet file to write.
    """
    key_columns = ["source", "target", "source_currency", "target_currency", "format"]
    selected = [*key_columns, "amount", "is_laundering"]

    trx_features = data_in.select(*selected).toPandas()

    # True when the transaction converts between two different currencies.
    trx_features["inter_currency"] = trx_features["source_currency"].ne(
        trx_features["target_currency"]
    )

    trx_features.to_parquet(location)

In [29]:
# %%time

# Store the transaction-level features of each split.
save_trx_features(train, location_train_trx_features)
save_trx_features(validation, location_valid_trx_features)
save_trx_features(test, location_test_trx_features)

                                                                                

In [30]:
# To free up memory for training

# Names currently defined in the interactive namespace.
to_reset = %who_ls
to_reset = list(to_reset)
if "to_keep" in to_reset:
    to_reset.remove("to_keep")
# Everything that was defined after the `to_keep` snapshot was taken.
to_reset = set(to_reset) - set(to_keep)
for var_to_reset in list(to_reset):
    # Anchor the name so that the regex matches this variable only.
    var_to_reset = f"^{var_to_reset}$"
    %reset_selective -f {var_to_reset}

# Project helper from `common`; NOTE(review): presumably removes remaining large objects — confirm.
delete_large_vars(globals(), locals())

True

In [31]:
def combine_features(location_features_trx, location_features_edges, location_features, is_test_ds=False):
    """Join transaction features with edge features and write the result as parquet.

    The transaction table is the left side of a left join on ``(source, target)``,
    so every transaction row is kept even when no edge-feature row matches.

    Parameters
    ----------
    location_features_trx : parquet location of the transaction features
    location_features_edges : parquet location of the edge features
    location_features : output parquet location (overwritten if present)
    is_test_ds : when True the ``source`` / ``target`` columns are kept in the
        output; otherwise they are dropped together with the renamed join keys.

    NOTE(review): the retained ``source`` / ``target`` come from the edge-feature
    side of the join, so they would be null for unmatched transactions — confirm
    every (source, target) pair is present in the edge features.
    """
    edge_df = spark.read.parquet(location_features_edges)
    # Rename the join keys on the transaction side so both sides stay addressable
    # (and droppable) after the join.
    trx_df = (
        spark.read.parquet(location_features_trx)
        .withColumnRenamed("source", "source_trx")
        .withColumnRenamed("target", "target_trx")
    )

    same_source = trx_df["source_trx"] == edge_df["source"]
    same_target = trx_df["target_trx"] == edge_df["target"]

    if is_test_ds:
        obsolete = ["source_trx", "target_trx"]
    else:
        obsolete = ["source_trx", "target_trx", "source", "target"]

    combined = trx_df.join(edge_df, same_source & same_target, how="left").drop(*obsolete)
    combined.write.parquet(location_features, mode="overwrite")

In [32]:
%%time

# Build the combined (transaction + edge) feature tables for all three splits.
combine_features(location_train_trx_features, location_features_edges_train, location_train_features)
combine_features(location_valid_trx_features, location_features_edges_valid, location_valid_features)
# Only the test split keeps `source` / `target`: they are needed later to attach
# predictions back to transactions (see `join_columns` in the test cell).
combine_features(
    location_test_trx_features, location_features_edges_test, location_test_features,
    is_test_ds=True
)



CPU times: user 71.2 ms, sys: 35.3 ms, total: 106 ms
Wall time: 1min 49s


                                                                                

In [33]:
# Remove the staging directory exported by `common`; `ignore_errors=True` keeps
# this a no-op when the directory does not exist.
shutil.rmtree(MULTI_PROC_STAGING_LOCATION, ignore_errors=True)

In [34]:
# Integer codes for the categorical columns: each distinct value found in `data`
# is mapped to its rank in sorted order, so the encoding is deterministic and
# shared by the train / validation / test encodings below.
category_features = ["source_currency", "target_currency", "format"]
category_features_map = {}
for feat in category_features:
    ordered_values = data.select(feat).distinct().toPandas()[feat].sort_values()
    category_features_map[feat] = {value: code for code, value in enumerate(ordered_values)}

                                                                                

In [35]:
%%time

train_f = pd.read_parquet(location_train_features)
category_features_new = []
cat_data = train_f.loc[:, []].copy(deep=True)
for col in category_features:
    mapping = category_features_map[col]
    new_col = f"{col}_cat"
    cat_data.loc[:, new_col] = train_f.loc[:, col].apply(lambda x: mapping[x])
    del train_f[col]
    category_features_new.append(new_col)

train_f = pd.concat([train_f, cat_data], axis=1)
train_labels = train_f["is_laundering"]
del train_f["is_laundering"]

CPU times: user 1min 2s, sys: 54.7 s, total: 1min 57s
Wall time: 1min 37s


In [37]:
# Derive the XGBoost `feature_types` list (one entry per column of `train_f`):
#   "c"   -> integer-encoded categorical column
#   "i"   -> boolean indicator
#   "q"   -> floating point (quantitative)
#   "int" -> integer
feature_types = []
for col, dtype in zip(train_f.columns, train_f.dtypes):
    if col in category_features_new:
        # Must be checked first: the encoded columns are integers as well.
        feature_types.append("c")
    elif dtype == bool:
        feature_types.append("i")
    elif str(dtype).startswith("float"):
        feature_types.append("q")
    elif str(dtype).startswith("int"):
        feature_types.append("int")
    else:
        # A bare `raise` here would only produce "No active exception to
        # re-raise"; name the offending column and dtype instead.
        raise TypeError(f"Unsupported dtype {dtype} for feature column {col!r}")

In [38]:
%%time

valid_f = pd.read_parquet(location_valid_features)
cat_data = valid_f.loc[:, []].copy(deep=True)
for col in category_features:
    mapping = category_features_map[col]
    new_col = f"{col}_cat"
    cat_data.loc[:, new_col] = valid_f.loc[:, col].apply(lambda x: mapping[x])
    del valid_f[col]

valid_f = pd.concat([valid_f, cat_data], axis=1)
valid_labels = valid_f["is_laundering"]
del valid_f["is_laundering"]

CPU times: user 21.2 s, sys: 25.5 s, total: 46.6 s
Wall time: 31.1 s


In [39]:
def f1_eval(y, y_):
    """Return ``1 - F1`` for true labels ``y`` and predicted scores ``y_``.

    Scores are rounded to hard 0/1 labels before scoring. The value is expressed
    as a cost (lower is better) so it can be minimised as an evaluation metric.
    """
    hard_labels = np.round(y_)
    f1 = f1_score(y, hard_labels)
    return 1 - f1


model = xgb.XGBClassifier(
    # Ensemble size and tree shape.
    n_estimators=100,
    max_depth=6,
    num_parallel_tree=1,
    # Use every row and every column for each tree (no subsampling).
    subsample=1,
    colsample_bytree=1,
    # Give the positive (laundering) class extra weight.
    scale_pos_weight=5,
    # Native categorical handling; column kinds come from `feature_types`.
    enable_categorical=True,
    feature_types=feature_types,
    # Evaluate with the custom 1 - F1 cost only, and stop once it has not
    # improved for 10 consecutive rounds.
    eval_metric=f1_eval,
    disable_default_eval_metric=True,
    early_stopping_rounds=10,
)

In [40]:
%%time

# Train on the training split; the validation split is the evaluation set that
# the model's custom metric and early stopping are computed on.
model.fit(
    train_f, train_labels, verbose=True, eval_set=[(valid_f, valid_labels)],
)
# Best validation cost (1 - F1, lower is better) and the round it was reached.
print(model.best_score, model.best_iteration)

[0]	validation_0-f1_eval:0.93890
[1]	validation_0-f1_eval:0.93866
[2]	validation_0-f1_eval:0.93813
[3]	validation_0-f1_eval:0.93211
[4]	validation_0-f1_eval:0.92519
[5]	validation_0-f1_eval:0.91134
[6]	validation_0-f1_eval:0.91027
[7]	validation_0-f1_eval:0.90816
[8]	validation_0-f1_eval:0.88959
[9]	validation_0-f1_eval:0.88032
[10]	validation_0-f1_eval:0.87640
[11]	validation_0-f1_eval:0.87285
[12]	validation_0-f1_eval:0.86977
[13]	validation_0-f1_eval:0.86666
[14]	validation_0-f1_eval:0.85460
[15]	validation_0-f1_eval:0.85287
[16]	validation_0-f1_eval:0.85170
[17]	validation_0-f1_eval:0.84299
[18]	validation_0-f1_eval:0.84148
[19]	validation_0-f1_eval:0.83980
[20]	validation_0-f1_eval:0.83128
[21]	validation_0-f1_eval:0.83023
[22]	validation_0-f1_eval:0.82949
[23]	validation_0-f1_eval:0.82970
[24]	validation_0-f1_eval:0.82932
[25]	validation_0-f1_eval:0.82920
[26]	validation_0-f1_eval:0.82928
[27]	validation_0-f1_eval:0.82892
[28]	validation_0-f1_eval:0.82872
[29]	validation_0-f1_eva

In [42]:
# Display the best validation cost (1 - F1) and the boosting round that achieved it.
model.best_score, model.best_iteration

(0.790035, 89)

In [43]:
# Training is done: release the train / validation frames before loading the test set.
del train_f, valid_f

In [44]:
# Load the test features. The identifying columns are copied aside first so the
# predictions can later be attached back to transactions.
test_f = pd.read_parquet(location_test_features)
join_columns = ["source", "target", "source_currency", "target_currency", "format"]
test_predictions = test_f[join_columns].copy(deep=True)
# `source` / `target` are identifiers, not model features.
test_f.drop(columns=["source", "target"], inplace=True)

# Same categorical encoding as for the train / validation data.
cat_data = pd.DataFrame(index=test_f.index)
for col in category_features:
    mapping = category_features_map[col]
    new_col = f"{col}_cat"
    cat_data[new_col] = test_f.pop(col).apply(mapping.__getitem__)

test_f = pd.concat([test_f, cat_data], axis=1)
test_labels = test_f.pop("is_laundering")

In [45]:
# Store the model's hard class predictions next to the identifying columns.
test_predictions["prediction"] = model.predict(test_f)

In [56]:
# Test-set F1 and recall of the hard predictions, expressed in percent.
f1_final = f1_score(test_labels, test_predictions.loc[:, "prediction"]) * 100
recall = recall_score(test_labels, test_predictions.loc[:, "prediction"]) * 100
# Print the value assigned above; the previous `f1_score_final` was never
# defined in this notebook and raised NameError on a fresh kernel.
print(round(f1_final, 2), round(recall, 2))

31.64 20.65


In [65]:
# test_predictions_final = spark.createDataFrame(test_predictions).alias("pred")
# test_predictions_final = test.join(
#     test_predictions_final,
#     (test["source"] == test_predictions_final["pred.source"]) &
#     (test["target"] == test_predictions_final["pred.target"]) &
#     (test["source_currency"] == test_predictions_final["pred.source_currency"]) &
#     (test["target_currency"] == test_predictions_final["pred.target_currency"]) &
#     (test["format"] == test_predictions_final["pred.format"]),
#     how="left"
# ).select("transaction_id", "is_laundering", "prediction").persist(storageLevel=StorageLevel.DISK_ONLY)

# assert test_predictions_final.count() == test_count

# test_predictions_final = test_predictions_final.toPandas()
# f1_final = f1_score(test_predictions_final["is_laundering"], test_predictions_final["prediction"])
# f1_final = round(f1_final * 100, 4)
# recall = round(recall_score(test_predictions_final["is_laundering"], test_predictions_final["prediction"]) * 100, 4)
# print(f1_final, recall)

                                                                                

25.2725 33.4261


In [53]:
# Reference F1 score (in percent, same scale as `f1_final`) and its spread for the
# GFP baseline.
# NOTE(review): presumably taken from the paper linked at the top of the
# notebook — confirm the exact table / dataset variant.
gfp_best = 24.23
gfp_std = 0.12

In [54]:
# Report the baseline figure that the uplift below is measured against.
print(f"GFP best: {gfp_best} ± {gfp_std}")

GFP best: 24.23 ± 0.12


In [57]:
# Only a single run is scored here, so no standard deviation is available; the
# disabled line below is the multi-run variant (it needs a list `f1_scores`).
# print(f"{round(f1_final, 2)} ±{round(np.std(f1_scores), 2)}")
print(f"{round(f1_final, 2)} ± NA")

31.64 ± NA


In [58]:
# Relative improvement of this run's test F1 over the GFP baseline, in percent.
relative_gain = (f1_final - gfp_best) / gfp_best
uplift = round(relative_gain * 100, 2)
print(f"Uplift of {uplift}%")

Uplift of 30.6%
