In [1]:
import json
import os
import pickle
import random
import time
import shutil
import sys
import uuid
from collections import defaultdict
from datetime import timedelta
from glob import glob
from itertools import product
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, recall_score, RocCurveDisplay

import igraph as ig
import leidenalg as la
import numpy as np
import pandas as pd
import xgboost as xgb

import settings as s

# This notebook is pinned to one dataset variant; fail fast on any other settings.
assert s.FILE_SIZE == "Small"
assert s.HIGH_ILLICIT == False

# Exported before the project-local imports that follow; `location_main` is later built from it.
# NOTE(review): presumably `common` / `features` read this variable at import time — confirm.
os.environ["EXT_DATA_TYPE_FOLDER"] = s.OUTPUT_POSTFIX.lstrip("-")

from common import get_weights, delete_large_vars, MULTI_PROC_STAGING_LOCATION
from communities import get_communities_spark
from features import (
    generate_features_spark, generate_features_udf_wrapper, get_edge_features_udf,
    SCHEMA_FEAT_UDF, CURRENCY_RATES
)

%load_ext autoreload
%autoreload 2

In [2]:
# The seed can be overridden through the EXSTRAQT_SEED environment variable (default: 42).
SEED = int(os.getenv("EXSTRAQT_SEED", 42))
print(f"{SEED=}")
# Seed both the NumPy global generator and the stdlib generator.
np.random.seed(SEED)
random.seed(SEED)

SEED=42


In [3]:
# Refuse to continue unless the interpreter is exactly Python 3.11.8.
if tuple(sys.version_info[:3]) != (3, 11, 8):
    raise EnvironmentError("Only runs efficiently on Python 3.11.8")

In [4]:
# Key/value pairs handed to `SparkConf.setAll` for the session created at the end of this cell.
SPARK_CONF = [
    ("spark.driver.memory", "32g"),
    ("spark.worker.memory", "32g"),
    ("spark.driver.maxResultSize", "32g"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.network.timeout", "600s"),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
    ("spark.local.dir", f".{os.sep}temp-spark"),
]

# Only when a seed is supplied through the environment is Spark logging limited to errors.
if os.environ.get("EXSTRAQT_SEED") is not None:
    SPARK_CONF += [("spark.log.level", "ERROR")]

# Start from a clean slate: drop leftovers of earlier runs (a missing folder is not an error).
for _stale_dir in ("artifacts", "temp-spark"):
    shutil.rmtree(_stale_dir, ignore_errors=True)

_spark_conf = SparkConf().setAll(SPARK_CONF)
spark = SparkSession.builder.appName("testing").config(conf=_spark_conf).getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/28 19:17:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/28 19:17:56 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in standalone/kubernetes and LOCAL_DIRS in YARN).


In [4]:
# Fractions of the (chronologically ordered) transactions assigned to each split.
TRAIN_PERC = 0.64
VALIDATION_PERC = 0.19
TEST_PERC = 0.17

KEEP_TOP_N = 100

# Compare with a tolerance: an exact `== 1` on a sum of binary floating-point numbers
# can fail for perfectly valid fractions because of rounding.
assert np.isclose(TRAIN_PERC + VALIDATION_PERC + TEST_PERC, 1.0), (
    "TRAIN_PERC + VALIDATION_PERC + TEST_PERC must sum to 1"
)

# Root folder for every file written by this notebook; the sub-folder name is taken from
# the EXT_DATA_TYPE_FOLDER environment variable.
location_main = os.path.join("features", os.environ["EXT_DATA_TYPE_FOLDER"])
# shutil.rmtree(location_main, ignore_errors=True)

# Output locations: all of them are parquet paths directly under `location_main`.
location_communities_leiden = f"{location_main}{os.sep}communities_leiden.parquet"

location_features_leiden = f"{location_main}{os.sep}features_leiden.parquet"
location_features_ego = f"{location_main}{os.sep}features_ego.parquet"
location_features_2_hop = f"{location_main}{os.sep}features_2_hop.parquet"
location_features_2_hop_out = f"{location_main}{os.sep}features_2_hop_out.parquet"
location_features_2_hop_in = f"{location_main}{os.sep}features_2_hop_in.parquet"
location_features_2_hop_combined = f"{location_main}{os.sep}features_2_hop_combined.parquet"
location_features_source = f"{location_main}{os.sep}features_source.parquet"
location_features_target = f"{location_main}{os.sep}features_target.parquet"

# NOTE: unlike the other outputs, these three file names carry a `location_` prefix.
location_flow_dispense = f"{location_main}{os.sep}location_flow_dispense.parquet"
location_flow_passthrough = f"{location_main}{os.sep}location_flow_passthrough.parquet"
location_flow_sink = f"{location_main}{os.sep}location_flow_sink.parquet"

location_comm_as_source_features = f"{location_main}{os.sep}comm_as_source_features.parquet"
location_comm_as_target_features = f"{location_main}{os.sep}comm_as_target_features.parquet"
location_comm_as_passthrough_features = f"{location_main}{os.sep}comm_as_passthrough_features.parquet"
location_comm_as_passthrough_features_reverse = f"{location_main}{os.sep}comm_as_passthrough_features_reverse.parquet"

location_features_node_level = f"{location_main}{os.sep}features_node_level.parquet"
location_features_edges = f"{location_main}{os.sep}features_edges.parquet"

# Per-split outputs (train / validation / test).
location_features_edges_train = f"{location_main}{os.sep}features_edges_train.parquet"
location_features_edges_valid = f"{location_main}{os.sep}features_edges_valid.parquet"
location_features_edges_test = f"{location_main}{os.sep}features_edges_test.parquet"

location_train_trx_features = f"{location_main}{os.sep}train_trx_features.parquet"
location_valid_trx_features = f"{location_main}{os.sep}valid_trx_features.parquet"
location_test_trx_features = f"{location_main}{os.sep}test_trx_features.parquet"

location_train_features = f"{location_main}{os.sep}train_features.parquet"
location_valid_features = f"{location_main}{os.sep}valid_features.parquet"
location_test_features = f"{location_main}{os.sep}test_features.parquet"

# Create the output folder if needed. `exist_ok=True` tolerates an existing directory but,
# unlike a blanket `except FileExistsError: pass`, still fails when the path is a regular file.
os.makedirs(location_main, exist_ok=True)

In [6]:
# Read the staged transactions and cast the label column to boolean.
data = (
    spark.read.parquet(s.STAGED_DATA_LOCATION)
    .withColumn("is_laundering", sf.col("is_laundering").cast("boolean"))
)
data_count_original = data.count()

# Probably not used in the benchmarks
data = data.drop("source_entity", "target_entity")

                                                                                

In [7]:
# data = data.drop("source", "target")
# data = data.withColumnRenamed("source_entity", "source")
# data = data.withColumnRenamed("target_entity", "target")

In [8]:
%%time

# Chronological train / validation / test split.
# All transaction ids, ordered by timestamp, are collected to the driver as a NumPy array.
trx_ids_sorted = data.sort("timestamp").select("transaction_id").toPandas()["transaction_id"].values
trx_count = len(trx_ids_sorted)

# Split boundaries: floor(count * fraction) for train and validation; test takes the remainder.
last_train_index = int(np.floor(trx_count * TRAIN_PERC))
last_validation_index = last_train_index + int(np.floor(trx_count * VALIDATION_PERC))
train_indexes = trx_ids_sorted[:last_train_index]
validation_indexes = trx_ids_sorted[last_train_index:last_validation_index]
test_indexes = trx_ids_sorted[last_validation_index:]

# The id lists are handed to Spark through temporary parquet files under `location_main`.
train_indexes_loc = os.path.join(location_main, "temp_train_indexes.parquet")
validation_indexes_loc = os.path.join(location_main, "temp_validation_indexes.parquet")
test_indexes_loc = os.path.join(location_main, "temp_test_indexes.parquet")

pd.DataFrame(train_indexes, columns=["transaction_id"]).to_parquet(train_indexes_loc)
pd.DataFrame(validation_indexes, columns=["transaction_id"]).to_parquet(validation_indexes_loc)
pd.DataFrame(test_indexes, columns=["transaction_id"]).to_parquet(test_indexes_loc)

# From here on the `*_indexes` names hold Spark DataFrames instead of NumPy arrays.
train_indexes = spark.read.parquet(train_indexes_loc)
validation_indexes = spark.read.parquet(validation_indexes_loc)
test_indexes = spark.read.parquet(test_indexes_loc)

# Attach the full transaction rows to each id list; the results are persisted on disk only.
train = train_indexes.join(
    data, on="transaction_id", how="left"
).persist(StorageLevel.DISK_ONLY)
validation = validation_indexes.join(
    data, on="transaction_id", how="left"
).persist(StorageLevel.DISK_ONLY)
test = test_indexes.join(
    data, on="transaction_id", how="left"
).persist(StorageLevel.DISK_ONLY)
# `persist` is lazy: these counts are the actions that actually compute and store the splits.
train_count, validation_count, test_count = train.count(), validation.count(), test.count()
print()
print(trx_count, train_count, validation_count, test_count)
print()

# The temporary id files are deleted only after the counts above have run.
# NOTE(review): if Spark ever has to recompute a persisted split, these files will be missing — confirm this is acceptable.
os.remove(train_indexes_loc)
os.remove(validation_indexes_loc)
os.remove(test_indexes_loc)




6914124 4425039 1313683 1175402

CPU times: user 165 ms, sys: 118 ms, total: 283 ms
Wall time: 37.5 s


                                                                                

In [9]:
def generate_edge_features(input_data):
    """Aggregate transactions per edge and write the edge features to parquet.

    Transactions are first aggregated per (source, target, format, source_currency):
    amounts are summed and the first/last timestamps are converted to unix seconds.
    Every (source, target) group is then passed to `get_edge_features_udf`; the JSON
    strings in the `features` column of its output are decoded into one pandas frame,
    which is written to `location_features_edges`.

    Args:
        input_data: Spark DataFrame containing at least the columns `source`, `target`,
            `format`, `source_currency`, `source_amount`, `amount` and `timestamp`.

    Returns:
        None. The result is written to `location_features_edges`.
    """
    print("Generating edge features")
    to_select = ["source", "target", "format", "source_currency", "source_amount", "amount", "timestamp"]
    edges_features_input = (
        input_data.select(*to_select)
        .groupby(["source", "target", "format", "source_currency"])
        .agg(
            sf.sum("source_amount").alias("source_amount"),
            sf.sum("amount").alias("amount"),
            sf.unix_timestamp(sf.min("timestamp")).alias("min_ts"),
            sf.unix_timestamp(sf.max("timestamp")).alias("max_ts"),
        )
        .repartition(os.cpu_count() * 2, "source", "target")
        .persist(StorageLevel.DISK_ONLY)
    )
    try:
        # Materialise the persisted aggregation before the grouped UDF runs over it.
        _ = edges_features_input.count()
        edge_features = edges_features_input.groupby(["source", "target"]).applyInPandas(
            get_edge_features_udf, schema=SCHEMA_FEAT_UDF
        ).toPandas()
    finally:
        # This function is called once per split; release the on-disk copy every time
        # instead of letting the persisted frames pile up for the rest of the session.
        edges_features_input.unpersist()
    edge_features = pd.DataFrame(edge_features["features"].apply(json.loads).tolist())
    edge_features.to_parquet(location_features_edges)

In [10]:
def add_node_features_to_edges(features_in, location):
    """Join node-level features onto both ends of every edge and save the result.

    The frame stored at `location_features_node_level` is left-joined twice: first on
    the edge's `target`, then on its `source`. Columns of the second join that collide
    with existing ones receive the `_source` suffix, so `anomaly_score` refers to the
    target node and `anomaly_score_source` to the source node. Four columns combining
    the two scores (`anomaly_scores_diff`, `_min`, `_max`, `_mean`) are added before
    the frame is written to `location`.

    Args:
        features_in: pandas DataFrame with `source` and `target` columns.
            NOTE(review): assumes the node-level file is indexed by node id and
            provides an `anomaly_score` column — confirm.
        location: parquet path the enriched frame is written to.

    Returns:
        None. The result is written to `location`.
    """
    # Read the node-level features once and reuse them for both joins.
    node_features = pd.read_parquet(location_features_node_level)
    features_in = (
        features_in.set_index("target")
        .join(node_features, how="left", rsuffix="_target")
        .reset_index()
        .set_index("source")
        .join(node_features, how="left", rsuffix="_source")
        .reset_index()
    )

    target_scores = features_in.loc[:, "anomaly_score"]
    source_scores = features_in.loc[:, "anomaly_score_source"]
    features_in["anomaly_scores_diff"] = target_scores - source_scores

    # Stack both score vectors once; the element-wise statistics are taken across the pair.
    stacked_scores = np.array([target_scores.values, source_scores.values])
    features_in["anomaly_scores_min"] = stacked_scores.min(axis=0)
    features_in["anomaly_scores_max"] = stacked_scores.max(axis=0)
    features_in["anomaly_scores_mean"] = stacked_scores.mean(axis=0)

    features_in.to_parquet(location)

In [11]:
def save_trx_features(data_in, location):
    """Build per-transaction features for one split and write them to parquet.

    Selects a fixed set of transaction columns, adds the `source_balance_ratio`,
    `target_balance_ratio` and `inter_currency` columns, one-hot encodes
    `source_currency`, `target_currency` and `format`, and writes the frame to
    `location`. The one-hot columns depend on the categories present in `data_in`,
    so different splits can produce different column sets.

    Args:
        data_in: Spark DataFrame providing the columns listed in `columns`.
        location: parquet path the features are written to.

    Returns:
        None. The result is written to `location`.
    """
    columns = [
        "source", "target", "source_currency", "target_currency", "format", "amount",
        "source_dispensation",
        "target_accumulation",
        "source_positive_balance",
        "source_negative_balance",
        "target_positive_balance",
        "target_negative_balance",
        "source_active_for",
        "target_active_for",
        "is_laundering"
    ]
    trx_features = data_in.select(*columns).toPandas()
    for side in ("source", "target"):
        ratio = trx_features[f"{side}_positive_balance"] / trx_features[f"{side}_negative_balance"]
        # Undefined (0/0 -> NaN) and unbounded (x/0 -> +/-inf) ratios are all mapped to 0;
        # previously only +inf was replaced, so -inf could leak into the feature set.
        trx_features[f"{side}_balance_ratio"] = ratio.fillna(0).replace([np.inf, -np.inf], 0)
    trx_features["inter_currency"] = trx_features["source_currency"] != trx_features["target_currency"]
    trx_features = pd.get_dummies(trx_features, columns=["source_currency", "target_currency", "format"], drop_first=False)
    trx_features.to_parquet(location)

# [To prevent data leakage]

### As the `train`, `validation`, and `test` sets are split in chronological order:
* `train` features are constructed, based on a **graph** (containing data), up till the last training record
* `validation` features are constructed, ..., up till the last validation record
* `test` features are constructed, ..., up till the last test record

In [12]:
%%time

# Graph for the `train` features: training transactions only.
data = train.select("*")
print(f"Constructing node-level features for `train` data: {data.count():,}")

# NOTE(review): the executed notebook is not visible here; presumably it consumes the global
# `data` and writes the node-level features to `location_features_node_level` — confirm.
%run node_level_features.ipynb

# Writes the edge features of the current graph to `location_features_edges`.
generate_edge_features(data)
# Unique (source, target) pairs of the `train` split, used as the left side of the join.
train_edges = train.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)
train_features = train_edges.join(
    pd.read_parquet(location_features_edges).set_index(["source", "target"]), how="left"
).reset_index()
# Persist the edge-level and the transaction-level features of the `train` split.
add_node_features_to_edges(train_features, location_features_edges_train)
save_trx_features(train, location_train_trx_features)

Constructing node-level features for `train` data: 4,425,039


                                                                                

CPU times: user 8.4 s, sys: 229 ms, total: 8.63 s
Wall time: 38.2 s


                                                                                

FLOWS ET: 169


CPU times: user 7.3 s, sys: 2.14 s, total: 9.44 s
Wall time: 2min 59s
Constructing Leiden communities


IOStream.flush timed out
IOStream.flush timed out


CPU times: user 22min 16s, sys: 43.2 s, total: 22min 59s
Wall time: 25min 4s


                                                                                

0 700624
250000 700624
500000 700624
CPU times: user 31.2 s, sys: 759 ms, total: 32 s
Wall time: 1min 17s
Constructing 2-hop communities


                                                                                

CPU times: user 6.22 s, sys: 651 ms, total: 6.87 s
Wall time: 1min 25s


                                                                                

1285259


                                                                                

CPU times: user 9.28 s, sys: 1.31 s, total: 10.6 s
Wall time: 53.2 s
Leiden communitites features creation


25/10/28 19:52:36 WARN TaskSetManager: Stage 843 contains a task of very large size (1647 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 23.6 s, sys: 8.12 s, total: 31.7 s
Wall time: 8min 3s
2-hop communitites features creation


25/10/28 20:02:50 WARN TaskSetManager: Stage 854 contains a task of very large size (1647 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 36.8 s, sys: 24.2 s, total: 1min 1s
Wall time: 25min 20s
Temporal flows features creation


25/10/28 20:27:08 WARN TaskSetManager: Stage 873 contains a task of very large size (5947 KiB). The maximum recommended task size is 1000 KiB.
25/10/28 20:27:39 WARN TaskSetManager: Stage 893 contains a task of very large size (5729 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

4420818 4073341
dispense


                                                                                

passthrough


                                                                                

sink


                                                                                

CPU times: user 5.28 s, sys: 1.67 s, total: 6.95 s
Wall time: 11min 16s
1-hop-source features creation


25/10/28 20:36:18 WARN TaskSetManager: Stage 1138 contains a task of very large size (1647 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 19.1 s, sys: 4.18 s, total: 23.3 s
Wall time: 12min 29s
1-hop-target features creation


25/10/28 20:48:49 WARN TaskSetManager: Stage 1141 contains a task of very large size (1647 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 21.1 s, sys: 4.98 s, total: 26.1 s
Wall time: 12min 42s
Features: (700624, 388)
Deleted 14 constant columns
Training the anomaly detection model
CPU times: user 14.6 s, sys: 3.09 s, total: 17.7 s
Wall time: 18.5 s
Generating edge features


                                                                                

CPU times: user 26min 25s, sys: 2min 18s, total: 28min 44s
Wall time: 1h 53min 12s


In [13]:
%%time

# Graph for the `validation` features: training plus validation transactions.
data = train.union(validation).select("*")
print(f"Constructing node-level features for `validation` data: {data.count():,}")

# NOTE(review): the executed notebook is not visible here; presumably it consumes the global
# `data` and writes the node-level features to `location_features_node_level` — confirm.
%run node_level_features.ipynb

# Overwrites `location_features_edges` with the edge features of the current graph.
generate_edge_features(data)
# Unique (source, target) pairs of the `validation` split, used as the left side of the join.
validation_edges = validation.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)
validation_features = validation_edges.join(
    pd.read_parquet(location_features_edges).set_index(["source", "target"]), how="left"
).reset_index()
# Persist the edge-level and the transaction-level features of the `validation` split.
add_node_features_to_edges(validation_features, location_features_edges_valid)
save_trx_features(validation, location_valid_trx_features)

                                                                                

Constructing node-level features for `validation` data: 5,738,722


                                                                                

CPU times: user 10.3 s, sys: 508 ms, total: 10.8 s
Wall time: 1min 3s


                                                                                

FLOWS ET: 422


CPU times: user 12.1 s, sys: 5.36 s, total: 17.4 s
Wall time: 7min 14s
Constructing Leiden communities
CPU times: user 22min 43s, sys: 52.7 s, total: 23min 36s
Wall time: 25min 54s


                                                                                

0 702264
250000 702264
500000 702264
CPU times: user 32.2 s, sys: 2.2 s, total: 34.3 s
Wall time: 3min 12s
Constructing 2-hop communities


                                                                                

CPU times: user 7.54 s, sys: 1.44 s, total: 8.98 s
Wall time: 1min 13s


                                                                                

1336641


                                                                                

CPU times: user 15.3 s, sys: 6.52 s, total: 21.9 s
Wall time: 1min 43s
Leiden communitites features creation


25/10/28 21:54:37 WARN TaskSetManager: Stage 2004 contains a task of very large size (1836 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 29.3 s, sys: 17.6 s, total: 46.9 s
Wall time: 8min 29s
2-hop communitites features creation


25/10/28 22:05:42 WARN TaskSetManager: Stage 2015 contains a task of very large size (1836 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 37.5 s, sys: 11.5 s, total: 49 s
Wall time: 26min 16s
Temporal flows features creation


25/10/28 22:28:46 WARN TaskSetManager: Stage 2040 contains a task of very large size (6181 KiB). The maximum recommended task size is 1000 KiB.
25/10/28 22:29:48 WARN TaskSetManager: Stage 2068 contains a task of very large size (5967 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

5733476 5257730
dispense


                                                                                

passthrough


                                                                                

sink


                                                                                

CPU times: user 4.99 s, sys: 2.44 s, total: 7.43 s
Wall time: 10min 45s
1-hop-source features creation


25/10/28 22:38:43 WARN TaskSetManager: Stage 2367 contains a task of very large size (1836 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 25.5 s, sys: 7.77 s, total: 33.3 s
Wall time: 14min 15s
1-hop-target features creation


25/10/28 22:53:25 WARN TaskSetManager: Stage 2370 contains a task of very large size (1836 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 27.9 s, sys: 15.3 s, total: 43.2 s
Wall time: 16min 17s
Features: (702264, 388)
Deleted 14 constant columns
Training the anomaly detection model
CPU times: user 14.9 s, sys: 4.63 s, total: 19.5 s
Wall time: 21.6 s
Generating edge features


                                                                                

CPU times: user 27min 11s, sys: 2min 44s, total: 29min 56s
Wall time: 2h 9min 33s


In [14]:
%%time

# Graph for the `test` features: training, validation and test transactions.
data = train.union(validation).union(test).select("*")
print(f"Constructing node-level features for `test` data: {data.count():,}")

# NOTE(review): the executed notebook is not visible here; presumably it consumes the global
# `data` and writes the node-level features to `location_features_node_level` — confirm.
%run node_level_features.ipynb

# Overwrites `location_features_edges` with the edge features of the current graph.
generate_edge_features(data)
# Unique (source, target) pairs of the `test` split, used as the left side of the join.
test_edges = test.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)
test_features = test_edges.join(
    pd.read_parquet(location_features_edges).set_index(["source", "target"]), how="left"
).reset_index()
# Persist the edge-level and the transaction-level features of the `test` split.
add_node_features_to_edges(test_features, location_features_edges_test)
save_trx_features(test, location_test_trx_features)

                                                                                

Constructing node-level features for `test` data: 6,914,124


                                                                                

CPU times: user 10.9 s, sys: 609 ms, total: 11.5 s
Wall time: 1min 42s


                                                                                

FLOWS ET: 270


CPU times: user 13.9 s, sys: 7.08 s, total: 21 s
Wall time: 4min 48s
Constructing Leiden communities
CPU times: user 23min 35s, sys: 1min 4s, total: 24min 39s
Wall time: 27min 15s


                                                                                

0 703589
250000 703589
500000 703589
CPU times: user 44.3 s, sys: 3.66 s, total: 47.9 s
Wall time: 4min 7s
Constructing 2-hop communities


                                                                                

CPU times: user 14.8 s, sys: 9.91 s, total: 24.8 s
Wall time: 1min 54s


                                                                                

1384576


                                                                                

CPU times: user 10.9 s, sys: 2.54 s, total: 13.4 s
Wall time: 4min 9s
Leiden communitites features creation


25/10/29 00:07:34 WARN TaskSetManager: Stage 3271 contains a task of very large size (2001 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 29.1 s, sys: 11.1 s, total: 40.2 s
Wall time: 9min 17s
2-hop communitites features creation


25/10/29 00:19:46 WARN TaskSetManager: Stage 3282 contains a task of very large size (2001 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 48.1 s, sys: 32.3 s, total: 1min 20s
Wall time: 32min 44s
Temporal flows features creation


25/10/29 00:53:36 WARN TaskSetManager: Stage 3313 contains a task of very large size (6399 KiB). The maximum recommended task size is 1000 KiB.
25/10/29 00:54:59 WARN TaskSetManager: Stage 3349 contains a task of very large size (6190 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

6906470 6329177
dispense


                                                                                

passthrough


                                                                                

sink


                                                                                

CPU times: user 7.05 s, sys: 3.26 s, total: 10.3 s
Wall time: 17min 2s
1-hop-source features creation


25/10/29 01:05:22 WARN TaskSetManager: Stage 3702 contains a task of very large size (2001 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 29.8 s, sys: 20.2 s, total: 50 s
Wall time: 17min 49s
1-hop-target features creation


25/10/29 01:23:26 WARN TaskSetManager: Stage 3705 contains a task of very large size (2001 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 31.5 s, sys: 10.5 s, total: 42 s
Wall time: 13min 57s
Features: (703589, 388)
Deleted 14 constant columns
Training the anomaly detection model
CPU times: user 14.7 s, sys: 2.9 s, total: 17.6 s
Wall time: 18.5 s
Generating edge features


                                                                                

CPU times: user 28min 36s, sys: 3min 12s, total: 31min 48s
Wall time: 2h 27min 9s


In [15]:
# To free up memory for training

# `%who_ls` returns the names of the variables currently defined in the interactive namespace.
to_reset = %who_ls
to_reset = list(to_reset)
# NOTE(review): `to_keep` is not defined in this notebook's visible cells; presumably it is
# created by `node_level_features.ipynb` — confirm. `remove` raises ValueError if it is absent.
to_reset.remove("to_keep")
to_reset = set(to_reset) - set(to_keep)
for var_to_reset in list(to_reset):
    # Anchor the pattern so that only the exact name is matched, then drop it without prompting.
    var_to_reset = f"^{var_to_reset}$"
    %reset_selective -f {var_to_reset}

# Project helper imported from `common`; it is given the global and local namespaces.
delete_large_vars(globals(), locals())

Deleted `global` DataFrame: edges
Deleted `global` DataFrame: weights
Deleted `global` DataFrame: edges_agg
Deleted `global` large object: nodes_source
Deleted `global` large object: nodes_target
Deleted `global` large object: nodes_passthrough
Deleted `global` large object: totals_sent
Deleted `global` large object: totals_received
Deleted `global` large object: nodes_mapping
Deleted `global` large object: communities_leiden
Deleted `global` large object: sizes_leiden
Deleted `global` DataFrame: data_agg_weights
Deleted `global` large object: nodes_neighborhoods
Deleted `global` large object: communities_2_hop
Deleted `global` large object: sizes_2_hop
Deleted `global` DataFrame: features_leiden
Deleted `global` large object: communities_leiden_dict
Deleted `global` DataFrame: features_2_hop
Deleted `global` DataFrame: features_source
Deleted `global` DataFrame: features_target
Deleted `global` DataFrame: anomalies
Deleted `global` DataFrame: train_edges
Deleted `global` DataFrame: tr

True

In [16]:
def combine_features(location_features_trx, location_features_edges, location_features):
    """Left-join edge features onto transaction features and write the result.

    Both inputs are read with Spark, joined on (`source`, `target`) keeping every
    transaction row, stripped of the two key columns and written to
    `location_features`, overwriting existing output.
    """
    per_transaction = spark.read.parquet(location_features_trx)
    per_edge = spark.read.parquet(location_features_edges)
    combined = per_transaction.join(per_edge, on=["source", "target"], how="left")
    combined = combined.drop("source", "target")
    combined.write.parquet(location_features, mode="overwrite")

In [17]:
%%time

# Combine transaction-level and edge-level features for train, validation and test (in that order).
for _trx_loc, _edges_loc, _combined_loc in (
    (location_train_trx_features, location_features_edges_train, location_train_features),
    (location_valid_trx_features, location_features_edges_valid, location_valid_features),
    (location_test_trx_features, location_features_edges_test, location_test_features),
):
    combine_features(_trx_loc, _edges_loc, _combined_loc)

25/10/29 01:49:04 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

CPU times: user 82.7 ms, sys: 207 ms, total: 289 ms
Wall time: 4min 51s


In [18]:
# Remove the staging folder exported by `common`; errors (e.g. a missing folder) are ignored.
shutil.rmtree(MULTI_PROC_STAGING_LOCATION, ignore_errors=True)

In [5]:
# Load the combined feature sets of all three splits into pandas.
train_features, validation_features, test_features = (
    pd.read_parquet(_features_location)
    for _features_location in (
        location_train_features,
        location_valid_features,
        location_test_features,
    )
)

In [6]:
# Restrict the three feature frames to the columns they all share.
_column_sets = [
    set(train_features.columns),
    set(validation_features.columns),
    set(test_features.columns),
]
all_columns = set.union(*_column_sets)
_shared_columns = set.intersection(*_column_sets)

# Every column that is absent from at least one frame is dropped wherever it does exist.
for missing in all_columns - _shared_columns:
    if missing in train_features.columns:
        print(f"Deleting {missing} from train")
        del train_features[missing]
    if missing in validation_features.columns:
        print(f"Deleting {missing} from validation")
        del validation_features[missing]
    if missing in test_features.columns:
        print(f"Deleting {missing} from test")
        del test_features[missing]

# Validation and test adopt the column order of train.
_train_column_order = list(train_features.columns)
validation_features = validation_features.loc[:, _train_column_order]
test_features = test_features.loc[:, _train_column_order]

Deleting format_Reinvestment from train


In [7]:
# The feature frames must contain exactly as many rows as the splits counted earlier.
assert len(train_features) == train_count
assert len(validation_features) == validation_count
assert len(test_features) == test_count

In [7]:
# Move the label column out of every feature frame into its own single-column frame.
train_features_labels = train_features.pop("is_laundering").to_frame().copy(deep=True)

validation_features_labels = validation_features.pop("is_laundering").to_frame().copy(deep=True)

test_features_labels = test_features.pop("is_laundering").to_frame().copy(deep=True)

In [8]:
# CUDA is used only when torch can be imported and reports an available device.
try:
    import torch
except ImportError:
    cuda_available = False
else:
    cuda_available = torch.cuda.is_available()


# Constructor arguments for the classifier; `device` and `nthread` depend on CUDA availability.
xgb_args = {
    "early_stopping_rounds": 10,
    "scale_pos_weight": 5,
    "eval_metric": "aucpr",
    "disable_default_eval_metric": True,
    "num_parallel_tree": 10,
    "max_depth": 6,
    "colsample_bytree": 0.5,
    "subsample": 1,
    "eta": 0.05,
    "device": "cuda" if cuda_available else "cpu",
    "nthread": 2 if cuda_available else 10,
    "n_estimators": 500,
    "seed": SEED,
}

# Arguments for `fit`: the validation split is the evaluation set.
xgb_fit_args = dict(
    eval_set=[(validation_features, validation_features_labels["is_laundering"].values)],
    verbose=True,
)

In [24]:
%%time

# Fit on the training split (the validation split is passed through `xgb_fit_args`).
model = xgb.XGBClassifier(**xgb_args)
_train_labels = train_features_labels["is_laundering"].values
model.fit(train_features, _train_labels, **xgb_fit_args)

# Score the held-out test split; both metrics are reported as percentages.
y_test_predicted = model.predict(test_features)
_test_labels = test_features_labels["is_laundering"]
f1_test = f1_score(_test_labels, y_test_predicted) * 100
_recall_test = recall_score(_test_labels, y_test_predicted) * 100
print(
    f"{SEED=}",
    f"{model.best_iteration=}",
    f"f1={round(f1_test, 2)}",
    f"recall={round(_recall_test, 2)}",
)
print(f1_test)

SEED=42 model.best_iteration=163 f1=45.49 recall=32.04
45.48672566371682
CPU times: user 5h 9min 53s, sys: 10min 47s, total: 5h 20min 41s
Wall time: 36min 35s


In [None]:
# Second configuration: no early stopping, 5 estimators per `fit` call.
xgb_args = {
    "scale_pos_weight": 3,
    "eval_metric": "aucpr",
    "disable_default_eval_metric": True,
    "num_parallel_tree": 10,
    "max_depth": 6,
    "colsample_bytree": 0.5,
    "subsample": 1,
    "eta": 0.05,
    "device": "cpu",
    "nthread": 10,
    "n_estimators": 5,
    "seed": SEED,
}
model = xgb.XGBClassifier(**xgb_args)

# Start from scratch: drop a model left in the fit arguments by an earlier run of this cell.
if "xgb_model" in xgb_fit_args:
    xgb_fit_args.pop("xgb_model")
    print("Deleting `xgb_model`")

# Each pass calls `fit` with the previously fitted model supplied as `xgb_model`, then
# reports the test metrics next to the cumulative `n_estimators` count.
for _round_no in range(1, 201):
    model = model.fit(
        train_features,
        train_features_labels["is_laundering"].values,
        **xgb_fit_args,
    )
    xgb_fit_args["xgb_model"] = model

    y_test_predicted = model.predict(test_features)
    _test_labels = test_features_labels["is_laundering"]
    f1_test = f1_score(_test_labels, y_test_predicted) * 100
    _recall_test = recall_score(_test_labels, y_test_predicted) * 100
    print(
        _round_no * xgb_args["n_estimators"],
        f"f1={round(f1_test, 2)}",
        f"recall={round(_recall_test, 2)}",
    )
    print()

Deleting `xgb_model`
[0]	validation_0-aucpr:0.21038
[1]	validation_0-aucpr:0.21668
[2]	validation_0-aucpr:0.22498
[3]	validation_0-aucpr:0.23252
[4]	validation_0-aucpr:0.23682
5 f1=1.98 recall=1.0

[0]	validation_0-aucpr:0.23873
[1]	validation_0-aucpr:0.23862
[2]	validation_0-aucpr:0.24144
[3]	validation_0-aucpr:0.24429
[4]	validation_0-aucpr:0.24760
10 f1=26.0 recall=14.96

[0]	validation_0-aucpr:0.25057
[1]	validation_0-aucpr:0.25145
[2]	validation_0-aucpr:0.25289
[3]	validation_0-aucpr:0.25388
[4]	validation_0-aucpr:0.25477
15 f1=36.35 recall=22.32

[0]	validation_0-aucpr:0.25587
[1]	validation_0-aucpr:0.25765
[2]	validation_0-aucpr:0.25844
[3]	validation_0-aucpr:0.26037
[4]	validation_0-aucpr:0.26146
20 f1=40.2 recall=25.44

[0]	validation_0-aucpr:0.26280
[1]	validation_0-aucpr:0.26449
[2]	validation_0-aucpr:0.26591
[3]	validation_0-aucpr:0.26723
[4]	validation_0-aucpr:0.26902
25 f1=42.13 recall=27.18

[0]	validation_0-aucpr:0.26998
[1]	validation_0-aucpr:0.27075
[2]	validation_0-a

In [None]:
# 100 f1=45.7 recall=31.17
# 4