In [1]:
import json
import os
import pickle
import shutil
import sys
import time
from datetime import timedelta, datetime

import igraph as ig
import numpy as np
import pandas as pd
import xgboost as xgb
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from sklearn.metrics import f1_score, recall_score
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

import settings_small_hi as s

# Publish the dataset folder name (OUTPUT_POSTFIX with leading "-" characters stripped)
# through the environment before the project-local imports below run.
# NOTE(review): presumably common/communities/features read this at import time — confirm.
os.environ["EXT_DATA_TYPE_FOLDER"] = s.OUTPUT_POSTFIX.lstrip("-")

from common import get_weights
from communities import get_communities_spark
from features import (
    generate_features_spark, generate_features_udf_wrapper, get_edge_features_udf, 
    SCHEMA_FEAT_UDF, FEATURE_TYPES
)


%load_ext autoreload
%autoreload 2

In [2]:
# Remove any "artifacts" directory left from an earlier run; a missing directory is ignored.
shutil.rmtree("artifacts", ignore_errors=True)

# Spark properties for the session, as (key, value) pairs.
# NOTE(review): "spark.worker.memory" does not look like a documented application
# property (executors are sized with "spark.executor.memory") — confirm it has any effect.
config = [
    ("spark.driver.memory", "32g"),
    ("spark.worker.memory", "32g"),
    ("spark.driver.maxResultSize", "32g"),
    ("spark.driver.bindAddress", "127.0.0.1"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
]
spark_conf = SparkConf()
spark_conf.setAll(config)
spark = SparkSession.builder.appName("testing").config(conf=spark_conf).getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/05 18:08:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Wall-clock reference point; a later cell prints the elapsed minutes and resets it.
start = time.time()

In [4]:
# NOTE(review): WINDOW_SIZE is not referenced by any cell visible here — confirm it is
# still consumed (e.g. by a notebook executed through %run) before removing it.
WINDOW_SIZE = 7

# Fractions of the time-ordered transactions assigned to each split.
TRAIN_PERC = 0.6
VALIDATION_PERC = 0.2
TEST_PERC = 0.2

# Process count handed to the community construction step.
NUM_PROCS = 10

# The fractions are floats, so compare with a tolerance: an exact `== 1` check can fail
# for perfectly valid splits (e.g. 0.7 + 0.2 + 0.1) because of binary rounding.
assert np.isclose(TRAIN_PERC + VALIDATION_PERC + TEST_PERC, 1.0), (
    "TRAIN_PERC + VALIDATION_PERC + TEST_PERC must sum to 1"
)

In [5]:
# Root folder for every intermediate feature file of this dataset variant.
location_main = os.path.join("features", os.environ["EXT_DATA_TYPE_FOLDER"])
# shutil.rmtree(location_main, ignore_errors=True)

# Node-level temporal flow aggregates, one file per role of the node.
location_flow_dispense = os.path.join(location_main, "flow_dispense.parquet")
location_flow_passthrough = os.path.join(location_main, "flow_passthrough.parquet")
location_flow_sink = os.path.join(location_main, "flow_sink.parquet")

# Frames written after the generate_flow_features notebook has run.
location_comm_as_source_features = os.path.join(location_main, "comm_as_source_features.parquet")
location_comm_as_target_features = os.path.join(location_main, "comm_as_target_features.parquet")
location_comm_as_passthrough_features = os.path.join(
    location_main, "comm_as_passthrough_features.parquet"
)
location_comm_as_passthrough_features_reverse = os.path.join(
    location_main, "comm_as_passthrough_features_reverse.parquet"
)

# Joined node-level features and raw edge-level features.
location_features_node_level = os.path.join(location_main, "features_node_level.parquet")
location_features_edges = os.path.join(location_main, "features_edges.parquet")

# Edge-level features enriched per split.
location_features_edges_train = os.path.join(location_main, "features_edges_train.parquet")
location_features_edges_valid = os.path.join(location_main, "features_edges_valid.parquet")
location_features_edges_test = os.path.join(location_main, "features_edges_test.parquet")

# Transaction-level features per split.
location_train_trx_features = os.path.join(location_main, "train_trx_features")
location_valid_trx_features = os.path.join(location_main, "valid_trx_features")
location_test_trx_features = os.path.join(location_main, "test_trx_features")

# Create the folder if needed. exist_ok=True tolerates an existing directory but still
# fails loudly if the path is occupied by a regular file.
os.makedirs(location_main, exist_ok=True)

In [6]:
# Load the staged transactions from the location configured in the settings module.
data = spark.read.parquet(s.STAGED_DATA_LOCATION)
# Row count before any filtering; used later to report how much of the data the graph keeps.
data_count_original = data.count()

                                                                                

In [7]:
%%time

# Per source, and then per target, at most this many of the heaviest edges are kept.
KEEP_TOP_N = 100

# Total amount per (source, target) pair, weighted by the project helper and
# ordered from heaviest to lightest.
pair_amounts = (
    data.groupby(["source", "target"])
    .agg(sf.sum("amount").alias("amount"))
    .toPandas()
)
data_agg_weights = get_weights(pair_amounts).sort_values("weight", ascending=False)

# First cut: heaviest edges per source. Second cut: heaviest of those per target.
top_by_source = data_agg_weights.groupby("source").head(KEEP_TOP_N).reset_index(drop=True)
top_by_source = top_by_source.sort_values("weight", ascending=False)
top_by_target = top_by_source.groupby("target").head(KEEP_TOP_N).reset_index(drop=True)
edges_to_keep = spark.createDataFrame(
    top_by_target[["source", "target"]].drop_duplicates()
)

# Keep only transactions whose (source, target) pair survived the cuts.
kept_pairs = edges_to_keep.select(
    sf.col("source").alias("src"), sf.col("target").alias("dst")
)
same_edge = (sf.col("source") == sf.col("src")) & (sf.col("target") == sf.col("dst"))
data_graph = data.join(kept_pairs, same_edge).drop("src", "dst").persist(StorageLevel.DISK_ONLY)

data_count_graph = data_graph.count()
reduction = round((data_count_graph / data_count_original) * 100, 2)
print(f"\nReduced to {reduction}%\n")

25/07/05 18:08:41 WARN TaskSetManager: Stage 8 contains a task of very large size (4543 KiB). The maximum recommended task size is 1000 KiB.


Reduced to 91.5%

CPU times: user 6.76 s, sys: 160 ms, total: 6.92 s
Wall time: 18.1 s


                                                                                

In [8]:
%%time

def _aggregate_flows(flows, flow_type, prefix):
    """Aggregate two-hop flows per node acting in the role `flow_type`.

    Parameters
    ----------
    flows : pandas.DataFrame
        Columns "dispense", "passthrough", "sink" and "amount".
    flow_type : str
        Column whose values identify the node to aggregate for.
    prefix : str
        Prefix prepended to every produced column name.

    Returns
    -------
    pandas.DataFrame
        Indexed by "key" (the node id). Holds sum/mean/max of the amount and the
        number of distinct dispense/passthrough/sink nodes, plus the same amount
        statistics and the distinct passthrough count restricted to cycles
        (dispense == sink, with a different passthrough).
    """
    # Group by a renamed copy of the role column so the column itself stays
    # available for the nunique aggregation.
    overall = flows.groupby(flows[flow_type].rename("key")).agg(**{
        f"{prefix}amount_sum": ("amount", "sum"),
        f"{prefix}amount_mean": ("amount", "mean"),
        f"{prefix}amount_max": ("amount", "max"),
        f"{prefix}dispense_count": ("dispense", "nunique"),
        f"{prefix}passthrough_count": ("passthrough", "nunique"),
        f"{prefix}sink_count": ("sink", "nunique"),
    })

    is_cycle = (flows["dispense"] == flows["sink"]) & (flows["dispense"] != flows["passthrough"])
    cycles = flows[is_cycle]
    cycle_stats = cycles.groupby(cycles[flow_type].rename("key")).agg(**{
        f"{prefix}cycle_sum": ("amount", "sum"),
        f"{prefix}cycle_mean": ("amount", "mean"),
        f"{prefix}cycle_max": ("amount", "max"),
        f"{prefix}cycle_passthrough_count": ("passthrough", "nunique"),
    }).reindex(overall.index)

    # Nodes without any cycle: sum and count are 0, mean and max stay missing.
    sum_column = f"{prefix}cycle_sum"
    count_column = f"{prefix}cycle_passthrough_count"
    cycle_stats[sum_column] = cycle_stats[sum_column].fillna(0)
    cycle_stats[count_column] = cycle_stats[count_column].fillna(0).astype("int64")
    return overall.join(cycle_stats)


flow_columns = ["source", "target", "timestamp", "amount"]
left = data_graph.select(*[sf.col(name).alias(f"left_{name}") for name in flow_columns])
right = data_graph.select(*flow_columns)

# Pair every transaction A->B with each transaction B->C that happens at the same
# time or later, then total both legs per (A, B, C). The flow amount is the smaller leg.
flows_temporal_sdf = left.join(
    right,
    (left["left_target"] == right["source"]) &
    (left["left_timestamp"] <= right["timestamp"]),
    how="inner"
).groupby(["left_source", "left_target", "source", "target"]).agg(
    sf.sum("left_amount").alias("left_amount"),
    sf.sum("amount").alias("amount"),
).select(
    sf.col("left_source").alias("dispense"),
    sf.col("source").alias("passthrough"),
    sf.col("target").alias("sink"),
    sf.least("left_amount", "amount").alias("amount"),
).persist(StorageLevel.DISK_ONLY)
flows_temporal_sdf.count()
flows_temporal = flows_temporal_sdf.toPandas()
# The pandas copy is all that is needed from here on; release the on-disk cache.
flows_temporal_sdf.unpersist()

# One parquet file per role, each indexed by the node id ("key").
for flow_type, flow_location in [
    ("dispense", location_flow_dispense),
    ("passthrough", location_flow_passthrough),
    ("sink", location_flow_sink),
]:
    print(flow_type)
    _aggregate_flows(
        flows_temporal, flow_type, f"{s.G_FLOW_PREFIX}{flow_type}_"
    ).to_parquet(flow_location)

del flows_temporal

                                                                                

dispense
passthrough
sink
CPU times: user 5min 41s, sys: 16.9 s, total: 5min 58s
Wall time: 6min 10s


In [9]:
%%time

# Transaction ids ordered by timestamp, pulled to the driver as a NumPy array.
trx_ids_sorted = (
    data.sort("timestamp").select("transaction_id").toPandas()["transaction_id"].values
)
trx_count = len(trx_ids_sorted)
print(trx_count)

# Split boundaries: train is the earliest slice, validation follows, test is the rest.
last_train_index = int(np.floor(trx_count * TRAIN_PERC))
last_validation_index = last_train_index + int(np.floor(trx_count * VALIDATION_PERC))


def _ids_to_spark(transaction_ids):
    """Wrap an array of transaction ids in a disk-persisted single-column Spark frame."""
    ids_frame = pd.DataFrame(transaction_ids, columns=["transaction_id"])
    return spark.createDataFrame(ids_frame).persist(StorageLevel.DISK_ONLY)


train_indexes = _ids_to_spark(trx_ids_sorted[:last_train_index])
validation_indexes = _ids_to_spark(trx_ids_sorted[last_train_index:last_validation_index])
test_indexes = _ids_to_spark(trx_ids_sorted[last_validation_index:])

# Attach the full transaction rows to each id set.
train = train_indexes.join(data, on="transaction_id", how="left")
validation = validation_indexes.join(data, on="transaction_id", how="left")
test = test_indexes.join(data, on="transaction_id", how="left")
train_validation = train.union(validation)

                                                                                

5072693
CPU times: user 53.6 ms, sys: 30.4 ms, total: 84 ms
Wall time: 3.38 s


In [10]:
%%time

# Names prepared for the notebook executed below via %run.
# NOTE(review): presumably generate_flow_features.ipynb reads data_input and the
# nodes_* sets from this namespace and defines the four comm_as_* frames that are
# written out afterwards — confirm in that notebook.
data_input = data.select("*")
# Distinct node ids that appear as a sender / as a receiver.
nodes_source = set(data.select("source").distinct().toPandas()["source"])
nodes_target = set(data.select("target").distinct().toPandas()["target"])
# Node ids that appear both as a sender and as a receiver.
nodes_passthrough = nodes_source.intersection(nodes_target)

%run generate_flow_features.ipynb

# Persist each frame to its parquet location; they are read back when the
# node-level feature table is assembled.
comm_as_source_features.to_parquet(location_comm_as_source_features)
comm_as_target_features.to_parquet(location_comm_as_target_features)
comm_as_passthrough_features.to_parquet(location_comm_as_passthrough_features)
comm_as_passthrough_features_reverse.to_parquet(location_comm_as_passthrough_features_reverse)

# Drop the in-memory copies now that they are on disk.
del comm_as_source_features
del comm_as_target_features
del comm_as_passthrough_features
del comm_as_passthrough_features_reverse

                                                                                


Processing comm_as_source

Processed hop #1 | 609,479 | 304,896
Processed hop #2 | 878,611 | 182,007
Processed hop #3 | 1,204,789 | 136,313
Processed hop #4 | 1,484,369 | 117,205

Processing comm_as_target

Processed hop #1 | 642,727 | 283,656
Processed hop #2 | 2,584,739 | 228,416
Processed hop #3 | 3,335,450 | 204,753
Processed hop #4 | 4,138,999 | 190,491

Processing comm_as_passthrough

Processed hop #1 | 504,165 | 211,409
Processed hop #2 | 718,202 | 132,477
Processed hop #3 | 994,799 | 106,327
Processed hop #4 | 1,214,780 | 91,457

Processing comm_as_passthrough_reverse

Processed hop #1 | 603,456 | 266,303
Processed hop #2 | 2,430,787 | 214,752
Processed hop #3 | 3,138,668 | 192,637
Processed hop #4 | 3,894,875 | 179,453


comm_as_source_features

CPU times: user 19.1 s, sys: 345 ms, total: 19.5 s
Wall time: 19.4 s

comm_as_target_features

CPU times: user 23.2 s, sys: 329 ms, total: 23.5 s
Wall time: 23.5 s

comm_as_passthrough_features

CPU times: user 14 s, sys: 168 ms, tota

In [11]:
%%time

# Reference instant: one minute before the earliest timestamp in the filtered graph.
earliest_timestamp = data_graph.select(sf.min("timestamp").alias("x")).collect()[0]["x"]
ts_min = earliest_timestamp - timedelta(minutes=1)

# Per edge (and bank/currency combination): transaction count, amount totals and the
# list of [offset from ts_min cast to long, amount] pairs.
agg_keys = ["source", "target", "source_bank", "target_bank", "source_currency"]
offset_from_start = (sf.col("timestamp") - ts_min).cast("long")
data_graph_agg = data_graph.groupby(agg_keys).agg(
    sf.count("source").alias("num_transactions"),
    sf.sum("amount").alias("amount"),
    sf.sum("source_amount").alias("source_amount"),
    sf.collect_list(sf.array(offset_from_start, sf.col("amount"))).alias("timestamps_amounts"),
)

# Keep the Spark version (persisted, materialised by the count) for the 1-hop UDFs and
# a pandas version for graph construction.
data_graph_agg_sdf = data_graph_agg.persist(StorageLevel.DISK_ONLY)
_ = data_graph_agg_sdf.count()
data_graph_agg = data_graph_agg_sdf.toPandas().convert_dtypes()

                                                                                

CPU times: user 862 ms, sys: 287 ms, total: 1.15 s
Wall time: 4.81 s


In [12]:
%%time

print("Constructing communities")

# Every node that appears as a source or a target in the aggregated graph.
in_scope_nodes = list(set(data_graph_agg["source"].unique()).union(data_graph_agg["target"].unique()))
# Collapse the aggregation to one row per (source, target) with the summed amount,
# then let the project helper derive the edge weights.
window_edges = get_weights(
    data_graph_agg.groupby(["source", "target"]).agg(amount=("amount", "sum")).reset_index()
)
# Directed graph keyed by node name (use_vids=False), built from the weighted edges.
graph = ig.Graph.DataFrame(window_edges, use_vids=False, directed=True)
# NOTE(review): get_communities_spark is a project helper; its output shape is not
# visible here — see communities.py.
communities = get_communities_spark(in_scope_nodes, graph, NUM_PROCS, spark)

# Free the intermediates; only `communities` is needed by the next cell.
del in_scope_nodes
del window_edges
del graph

Constructing communities


                                                                                

CPU times: user 8.65 s, sys: 252 ms, total: 8.9 s
Wall time: 44.2 s


In [13]:
%%time

print("Communities features creation")

# Directed graph over the full aggregated edge table (all aggregate columns included).
graph = ig.Graph.DataFrame(data_graph_agg, use_vids=False, directed=True)
# NOTE(review): generate_features_spark is a project helper; the result is used below
# as a pandas frame with a "key" column — see features.py for details.
features = generate_features_spark(communities, graph, spark)
# Prefix every feature column with the community prefix, leaving "key" untouched.
features.columns = [f"{s.G_COMM_PREFIX}{x}" if x != "key" else x for x in features.columns]

# The pandas aggregation is no longer needed; the persisted Spark copy
# (data_graph_agg_sdf) is what the following cells use.
del graph
del communities
del data_graph_agg

Communities features creation


                                                                                

CPU times: user 3min 59s, sys: 25.3 s, total: 4min 24s
Wall time: 15min 17s


In [14]:
%%time

print("1-hop-source features creation")

# Group the aggregated edges by their source node and run the feature UDF per group.
keyed_by_source = data_graph_agg_sdf.withColumn("key", sf.col("source"))
features_source = (
    keyed_by_source.groupby("key")
    .applyInPandas(generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF)
    .toPandas()
)
# Each row carries a JSON document in "features"; expand the documents into columns.
features_source = pd.DataFrame([json.loads(doc) for doc in features_source["features"]])
# Apply the declared dtypes for the columns that are actually present.
types = {name: dtype for name, dtype in FEATURE_TYPES.items() if name in features_source.columns}
features_source = features_source.astype(types)
# Prefix every feature column with the 1-hop prefix, leaving "key" untouched.
features_source.columns = [
    x if x == "key" else f"{s.G_1HOP_PREFIX}{x}" for x in features_source.columns
]

1-hop-source features creation


                                                                                

CPU times: user 2.65 s, sys: 263 ms, total: 2.92 s
Wall time: 8min 5s


In [15]:
%%time

print("1-hop-target features creation")

# Group the aggregated edges by their target node and run the feature UDF per group.
keyed_by_target = data_graph_agg_sdf.withColumn("key", sf.col("target"))
features_target = (
    keyed_by_target.groupby("key")
    .applyInPandas(generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF)
    .toPandas()
)
# Each row carries a JSON document in "features"; expand the documents into columns.
features_target = pd.DataFrame([json.loads(doc) for doc in features_target["features"]])
# Apply the declared dtypes for the columns that are actually present.
types = {name: dtype for name, dtype in FEATURE_TYPES.items() if name in features_target.columns}
features_target = features_target.astype(types)
# Prefix every feature column with the 1-hop prefix, leaving "key" untouched.
features_target.columns = [
    x if x == "key" else f"{s.G_1HOP_PREFIX}{x}" for x in features_target.columns
]

1-hop-target features creation


                                                                                

CPU times: user 2.21 s, sys: 286 ms, total: 2.5 s
Wall time: 6min 8s


In [16]:
%%time

# Start from the community features and left-join the 1-hop features on the node key.
source_hop = features_source.set_index("key")
target_hop = features_target.set_index("key")

all_features = features.set_index("key").join(
    source_hop, how="left", rsuffix="_1_hop_as_source"
)
all_features.index.name = "key"
all_features = all_features.join(target_hop, how="left", rsuffix="_1_hop_as_target")

# Left-join every stored node-level block. The suffix disambiguates clashing column
# names; the flow blocks are joined without one, so a clash there raises.
stored_blocks = [
    (location_comm_as_source_features, "_dispense"),
    (location_comm_as_target_features, "_sink"),
    (location_comm_as_passthrough_features, "_passthrough"),
    (location_comm_as_passthrough_features_reverse, "_passthrough_rev"),
    (location_flow_dispense, ""),
    (location_flow_passthrough, ""),
    (location_flow_sink, ""),
]
for stored_location, stored_suffix in stored_blocks:
    all_features = all_features.join(
        pd.read_parquet(stored_location), how="left", rsuffix=stored_suffix
    )

all_features.to_parquet(location_features_node_level)
del all_features

CPU times: user 4.33 s, sys: 671 ms, total: 5 s
Wall time: 4.55 s


In [17]:
# Reload the node-level feature table written by the previous cell (indexed by node key).
all_features = pd.read_parquet(location_features_node_level)

In [18]:
# Fixed seed so the isolation forest (and every feature derived from its scores)
# is reproducible across kernel restarts.
ANOMALY_SEED = 42

# Missing features are treated as 0; fill once and reuse for both fit and scoring.
features_filled = all_features.fillna(0)
isolation_forest = IsolationForest(random_state=ANOMALY_SEED).fit(features_filled)

# One score per node, on the same index as the feature table.
anomalies = all_features.loc[:, []].copy()
anomalies["anomaly_score"] = isolation_forest.decision_function(features_filled)
# Shift by |min| so that a negative minimum maps to 0.
anomalies["anomaly_score"] += abs(anomalies["anomaly_score"].min())

del features_filled

In [19]:
# Number of principal components per dataset size; anything else falls back to 5.
n_components = {"Small": 50, "Medium": 20}.get(s.FILE_SIZE, 5)

pca = PCA(n_components=n_components)
# Rows are L1-normalised (missing values as 0) before the projection.
row_normalized = normalize(all_features.fillna(0), norm="l1", axis=1)
all_features_dim_reduced = pd.DataFrame(
    pca.fit_transform(row_normalized), index=all_features.index
)
print(n_components, round(sum(pca.explained_variance_ratio_) * 100, 2))

# Positional column labels 0..n-1 become pca_1..pca_n.
all_features_dim_reduced.columns = [
    f"pca_{position + 1}" for position in all_features_dim_reduced.columns
]
del row_normalized
del all_features

50 100.0


In [20]:
%%time

print("Generating edge features")

to_select = ["source", "target", "format", "source_currency", "source_amount", "amount"]

# Totals per (source, target, format, source_currency), persisted and materialised
# before the grouped UDF runs.
edges_features_input = (
    data.select(to_select)
    .groupby(["source", "target", "format", "source_currency"])
    .agg(
        sf.sum("source_amount").alias("source_amount"),
        sf.sum("amount").alias("amount"),
    )
    .persist(StorageLevel.DISK_ONLY)
)
_ = edges_features_input.count()

# One UDF call per (source, target) edge; each result row holds a JSON document in
# "features" that is expanded into columns.
edge_features = (
    edges_features_input.groupby(["source", "target"])
    .applyInPandas(get_edge_features_udf, schema=SCHEMA_FEAT_UDF)
    .toPandas()
)
edge_features = pd.DataFrame([json.loads(doc) for doc in edge_features["features"]])

edge_features.to_parquet(location_features_edges)
del edge_features

Generating edge features


                                                                                

CPU times: user 3.55 s, sys: 1.84 s, total: 5.39 s
Wall time: 3min 49s


In [21]:
# Reload the edge-level features written by the previous cell.
edge_features = pd.read_parquet(location_features_edges)

In [22]:
%%time

def _distinct_edges(split):
    """Return the distinct (source, target) pairs of a split as an empty pandas frame indexed by the pair."""
    pairs = split.select("source", "target").drop_duplicates().toPandas()
    return pairs.set_index(["source", "target"])


train_edges = _distinct_edges(train)
valid_edges = _distinct_edges(validation)
test_edges = _distinct_edges(test)

# Index the edge features by the pair once and reuse it for all three joins.
edge_features_by_pair = edge_features.set_index(["source", "target"])

train_features = train_edges.join(edge_features_by_pair, how="left").reset_index()
validation_features = valid_edges.join(edge_features_by_pair, how="left").reset_index()
test_features = test_edges.join(edge_features_by_pair, how="left").reset_index()

del edge_features_by_pair

25/07/05 18:52:12 WARN TaskSetManager: Stage 109 contains a task of very large size (8183 KiB). The maximum recommended task size is 1000 KiB.
25/07/05 18:52:13 WARN TaskSetManager: Stage 111 contains a task of very large size (8183 KiB). The maximum recommended task size is 1000 KiB.
25/07/05 18:52:18 WARN TaskSetManager: Stage 119 contains a task of very large size (2734 KiB). The maximum recommended task size is 1000 KiB.
25/07/05 18:52:18 WARN TaskSetManager: Stage 121 contains a task of very large size (2734 KiB). The maximum recommended task size is 1000 KiB.
25/07/05 18:52:21 WARN TaskSetManager: Stage 129 contains a task of very large size (2734 KiB). The maximum recommended task size is 1000 KiB.
25/07/05 18:52:22 WARN TaskSetManager: Stage 131 contains a task of very large size (2734 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 3.45 s, sys: 222 ms, total: 3.67 s
Wall time: 16.2 s


In [23]:
def save_edge_features(features_in, location):
    """Enrich edge features with node-level anomaly scores and PCA components, then save.

    For every edge, the anomaly score and PCA components of the target node are attached
    under their plain names and those of the source node with a "_source" suffix. Four
    columns combining the two anomaly scores (difference, min, max, mean) are added
    before the frame is written to `location` as parquet.

    Parameters
    ----------
    features_in : pandas.DataFrame
        Edge features with "source" and "target" columns.
    location : str
        Parquet path to write to.
    """

    def _attach(frame, node_column, node_features, suffix=""):
        # Left-join node-level features on the given endpoint, then restore a flat frame.
        keyed = frame.set_index(node_column)
        return keyed.join(node_features, how="left", rsuffix=suffix).reset_index()

    enriched = _attach(features_in, "target", anomalies)
    enriched = _attach(enriched, "source", anomalies, "_source")
    enriched = _attach(enriched, "target", all_features_dim_reduced)
    enriched = _attach(enriched, "source", all_features_dim_reduced, "_source")

    # Stack target-side and source-side scores so the reductions run across the pair.
    both_scores = np.array(
        [
            enriched["anomaly_score"].values,
            enriched["anomaly_score_source"].values,
        ]
    )
    enriched["anom_scores_diff"] = enriched["anomaly_score"] - enriched["anomaly_score_source"]
    enriched["anom_scores_min"] = both_scores.min(axis=0)
    enriched["anom_scores_max"] = both_scores.max(axis=0)
    enriched["anom_scores_mean"] = both_scores.mean(axis=0)
    enriched.to_parquet(location)

In [24]:
%%time

# Enrich and store the edge features of the training split.
save_edge_features(train_features, location_features_edges_train)

CPU times: user 3.9 s, sys: 724 ms, total: 4.63 s
Wall time: 4.44 s


In [25]:
%%time

# Enrich and store the edge features of the validation split.
save_edge_features(validation_features, location_features_edges_valid)

CPU times: user 1.47 s, sys: 176 ms, total: 1.65 s
Wall time: 1.58 s


In [26]:
%%time

# Enrich and store the edge features of the test split.
save_edge_features(test_features, location_features_edges_test)

CPU times: user 1.96 s, sys: 243 ms, total: 2.21 s
Wall time: 2.11 s


In [27]:
def save_trx_features(data_in, location):
    """Write the transaction-level columns of a split to parquet.

    Selects the endpoint, currency, format, amount and label columns, adds the
    boolean "inter_currency" flag (source currency differs from target currency)
    and writes the result to `location`.

    Parameters
    ----------
    data_in : Spark DataFrame
        Split to export; collected to pandas via ``toPandas()``.
    location : str
        Parquet path to write to.
    """
    wanted = [
        "source", "target", "source_currency", "target_currency",
        "format", "amount", "is_laundering",
    ]
    frame = data_in.select(*wanted).toPandas()
    frame["inter_currency"] = frame["source_currency"].ne(frame["target_currency"])
    frame.to_parquet(location)

In [28]:
%%time

# Export the transaction-level columns of each split to its own parquet location.
save_trx_features(train, location_train_trx_features)
save_trx_features(validation, location_valid_trx_features)
save_trx_features(test, location_test_trx_features)

25/07/05 18:52:37 WARN TaskSetManager: Stage 139 contains a task of very large size (8183 KiB). The maximum recommended task size is 1000 KiB.
25/07/05 18:52:40 WARN TaskSetManager: Stage 144 contains a task of very large size (2734 KiB). The maximum recommended task size is 1000 KiB.
25/07/05 18:52:42 WARN TaskSetManager: Stage 149 contains a task of very large size (2734 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 2.02 s, sys: 276 ms, total: 2.3 s
Wall time: 8.09 s


In [29]:
def combine_features(location_features_trx, location_features_edges):
    """Join transaction-level features with the edge-level features of their edge.

    Both inputs are read from parquet with Spark. Every transaction is left-joined to
    the edge row sharing its (source, target) pair, so the returned "source"/"target"
    columns come from the edge side.

    Parameters
    ----------
    location_features_trx : str
        Parquet location of the transaction-level features.
    location_features_edges : str
        Parquet location of the edge-level features.

    Returns
    -------
    pandas.DataFrame
        Combined features, with the currency/format columns cast to "category" and
        "is_laundering" cast to bool.
    """
    dtype_overrides = dict.fromkeys(
        ["source_currency", "target_currency", "format"], "category"
    )
    dtype_overrides["is_laundering"] = bool

    edge_level = spark.read.parquet(location_features_edges)
    # Rename the transaction-side endpoints so they do not clash with the edge-side ones.
    trx_level = (
        spark.read.parquet(location_features_trx)
        .withColumnRenamed("source", "source_left")
        .withColumnRenamed("target", "target_left")
    )

    same_edge = (
        (trx_level["source_left"] == edge_level["source"]) &
        (trx_level["target_left"] == edge_level["target"])
    )
    combined = trx_level.join(edge_level, same_edge, how="left").drop(
        "source_left", "target_left"
    )
    return combined.toPandas().astype(dtype_overrides)

In [30]:
%%time

# Build the final per-transaction feature frames for each split
# (transaction-level columns joined with the enriched edge-level features).
train_features = combine_features(location_train_trx_features, location_features_edges_train)
validation_features = combine_features(location_valid_trx_features, location_features_edges_valid)
test_features = combine_features(location_test_trx_features, location_features_edges_test)

25/07/05 18:52:45 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/07/05 18:52:52 ERROR TransportRequestHandler: Error sending result StreamResponse[streamId=1409698681157_0,byteCount=271462243,body=org.apache.spark.storage.BlockManagerManagedBuffer@33cf9e8] to /127.0.0.1:57520; closing connection
java.io.IOException: No buffer space available
	at java.base/sun.nio.ch.FileDispatcherImpl.write0(Native Method)
	at java.base/sun.nio.ch.SocketDispatcher.write(SocketDispatcher.java:62)
	at java.base/sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:132)
	at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:97)
	at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:53)
	at java.base/sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:532)
	at org.apache.spark.util.io.ChunkedByteBufferFileRegion.transferTo(ChunkedByteBufferFileRegion.scala:60)
	at org.apache.spark.networ

CPU times: user 3.25 s, sys: 7.18 s, total: 10.4 s
Wall time: 25.7 s


In [31]:
%%time

# Columns carrying the edge endpoints and the target label; they are split off from
# the model inputs below.
label_columns = ["source", "target", "is_laundering"]

# A column is "missing" when at least one split lacks it, i.e. it is in the union of
# the three column sets but not in their intersection.
split_frames = {
    "train": train_features,
    "validation": validation_features,
    "test": test_features,
}
column_sets = [set(frame.columns) for frame in split_frames.values()]
missing_columns = set.union(*column_sets) - set.intersection(*column_sets)

# Drop such columns from every split that has them. The membership test must use the
# loop variable `column`; testing an undefined name raised NameError on any mismatch.
for column in missing_columns:
    for split_name, frame in split_frames.items():
        if column in frame.columns:
            print(f"Deleting missing column from {split_name}: {column}")
            del frame[column]

# Separate the labels from the model inputs. The training frame defines the feature
# set and column order used for validation and test.
train_features_labels = train_features.loc[:, label_columns].copy(deep=True)
del train_features["is_laundering"]
del train_features["source"]
del train_features["target"]

validation_features_labels = validation_features.loc[:, label_columns].copy(deep=True)
validation_features = validation_features.loc[:, train_features.columns]

test_features_labels = test_features.loc[:, label_columns].copy(deep=True)
test_features = test_features.loc[:, train_features.columns]

CPU times: user 506 ms, sys: 263 ms, total: 768 ms
Wall time: 768 ms


In [32]:
# Whole minutes elapsed since `start` was last set (covers the feature-building cells).
print((time.time() - start) // 60)
# Restart the clock so the final report measures the modelling cells only.
start = time.time()

44.0


In [33]:
def f1_eval(y, y_):
    """Return 1 - F1 so that lower is better; used as the XGBoost evaluation metric.

    Parameters
    ----------
    y : array-like
        True labels.
    y_ : array-like
        Predicted scores; rounded to the nearest integer before scoring.
    """
    hard_predictions = np.round(y_)
    return 1 - f1_score(y, hard_predictions)


def train_model(x, y, x_, y_, cv=False):
    """Fit an XGBoost classifier with early stopping on the (x_, y_) evaluation set.

    Parameters
    ----------
    x, y :
        Training features and labels.
    x_, y_ :
        Evaluation features and labels used for early stopping.
    cv : bool, default False
        Selects the hyper-parameter set: True uses single trees without row/column
        subsampling (200 rounds, patience 50) and fits silently; False uses 20
        parallel trees with 50% row/column subsampling (100 rounds, patience 20)
        and logs every round.

    Returns
    -------
    xgb.XGBClassifier
        The fitted model; its best iteration is printed before returning.
    """
    shared_params = dict(
        eval_metric=f1_eval,
        disable_default_eval_metric=True,
        max_depth=6,
        enable_categorical=True,
    )
    if cv:
        mode_params = dict(
            early_stopping_rounds=50, scale_pos_weight=5,
            num_parallel_tree=1,
            colsample_bytree=1, subsample=1,
            n_estimators=200,
        )
    else:
        mode_params = dict(
            early_stopping_rounds=20, scale_pos_weight=7,
            num_parallel_tree=20,
            colsample_bytree=0.5, subsample=0.5,
            n_estimators=100,
        )
    model = xgb.XGBClassifier(**shared_params, **mode_params)
    model.fit(x, y, verbose=not cv, eval_set=[(x_, y_)])
    print(f"Best iteration: {model.best_iteration}\n")
    return model

In [34]:
%%time

# Fit on the full training split, early-stopping on the full validation split.
model = train_model(
    train_features, train_features_labels["is_laundering"].values,
    validation_features, validation_features_labels["is_laundering"].values,
)

# Score the held-out test split; F1 and recall are reported as percentages.
y_test_predicted = model.predict(test_features)
y_test_true = test_features_labels["is_laundering"]
f1_final = f1_score(y_test_true, y_test_predicted) * 100
recall_final = recall_score(y_test_true, y_test_predicted) * 100
print(round(f1_final, 2), round(recall_final, 2))
print()

[0]	validation_0-f1_eval:0.96367
[1]	validation_0-f1_eval:0.92692
[2]	validation_0-f1_eval:0.81088
[3]	validation_0-f1_eval:0.67253
[4]	validation_0-f1_eval:0.59094
[5]	validation_0-f1_eval:0.52533
[6]	validation_0-f1_eval:0.50032
[7]	validation_0-f1_eval:0.47466
[8]	validation_0-f1_eval:0.45476
[9]	validation_0-f1_eval:0.44889
[10]	validation_0-f1_eval:0.43423
[11]	validation_0-f1_eval:0.43335
[12]	validation_0-f1_eval:0.43086
[13]	validation_0-f1_eval:0.42486
[14]	validation_0-f1_eval:0.41975
[15]	validation_0-f1_eval:0.42087
[16]	validation_0-f1_eval:0.41838
[17]	validation_0-f1_eval:0.41477
[18]	validation_0-f1_eval:0.40987
[19]	validation_0-f1_eval:0.40720
[20]	validation_0-f1_eval:0.40741
[21]	validation_0-f1_eval:0.40451
[22]	validation_0-f1_eval:0.40529
[23]	validation_0-f1_eval:0.40243
[24]	validation_0-f1_eval:0.40419
[25]	validation_0-f1_eval:0.40493
[26]	validation_0-f1_eval:0.40517
[27]	validation_0-f1_eval:0.40220
[28]	validation_0-f1_eval:0.40197
[29]	validation_0-f1_eva

In [35]:
%%time

# Fraction of the training / validation rows drawn for every run, and number of runs.
CV_FOLD_PERC = 0.8
N_FOLDS = 5
# Base seed for the row sampling; each run uses CV_SEED + fold so the runs differ from
# one another but are identical across kernel restarts.
CV_SEED = 42

# Note: each "fold" is an independent random subsample of train and validation (not a
# k-fold partition); every run is scored on the same full test split.
f1_scores = []
for fold in range(N_FOLDS):
    print("Fold", fold + 1)
    fold_seed = CV_SEED + fold
    x_train = train_features.sample(frac=CV_FOLD_PERC, random_state=fold_seed)
    # Look the labels up by index so they stay aligned with the sampled rows.
    x_train_labels = x_train.loc[:, []].join(train_features_labels, how="left")
    x_validation = validation_features.sample(frac=CV_FOLD_PERC, random_state=fold_seed)
    x_validation_labels = x_validation.loc[:, []].join(validation_features_labels, how="left")
    model = train_model(
        x_train, x_train_labels["is_laundering"].values,
        x_validation, x_validation_labels["is_laundering"].values,
        cv=True
    )
    y_test_predicted = model.predict(test_features)
    f1_cv = f1_score(test_features_labels["is_laundering"], y_test_predicted) * 100
    print(
        round(f1_cv, 2),
        round(recall_score(test_features_labels["is_laundering"], y_test_predicted) * 100, 2)
    )
    f1_scores.append(f1_cv)

Fold 1
Best iteration: 69

62.61 51.31
Fold 2
Best iteration: 58

62.98 51.92
Fold 3
Best iteration: 64

63.21 52.2
Fold 4
Best iteration: 42

64.14 53.64
Fold 5
Best iteration: 71

62.69 51.53
CPU times: user 23min 11s, sys: 11.3 s, total: 23min 22s
Wall time: 3min 10s


In [38]:
# Hard-coded reference figure printed for comparison with the result below.
# NOTE(review): this cell carries execution count 38 but sits before cells 36 and 37,
# so the notebook was not last run strictly top to bottom.
print("GFP best: 64.77 ± 0.47")

GFP best: 64.77 ± 0.47


In [36]:
# Headline figure: test F1 of the single model fitted on the full train split (f1_final),
# with the standard deviation of the subsample runs' test F1 (f1_scores) as the spread.
# NOTE(review): the centre and the spread come from different model configurations.
print(f"{round(f1_final, 2)} ±{round(np.std(f1_scores), 2)}")

67.8 ±0.55


In [37]:
# Whole minutes elapsed since the clock was reset after feature engineering.
print((time.time() - start) // 60)

7.0
