In [1]:
import json
import os
import pickle
import random
import time
import shutil
import sys
import uuid
from collections import defaultdict
from datetime import timedelta
from glob import glob
from itertools import product
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, recall_score, RocCurveDisplay

import igraph as ig
import leidenalg as la
import numpy as np
import pandas as pd
import xgboost as xgb

import settings as s

# Seed Python's built-in RNG; this call does not seed numpy or any other library.
random.seed(42)

# This notebook is written for one specific dataset variant; stop early on any other.
assert s.FILE_SIZE == "Small"
assert s.HIGH_ILLICIT == True

# Set before the project imports that follow; presumably those modules read this
# variable at import time (TODO confirm in `common` / `features`).
os.environ["EXT_DATA_TYPE_FOLDER"] = s.OUTPUT_POSTFIX.lstrip("-")

from common import get_weights, delete_large_vars, MULTI_PROC_STAGING_LOCATION
from communities import get_communities_spark
from features import (
    generate_features_spark, generate_features_udf_wrapper, get_edge_features_udf,
    SCHEMA_FEAT_UDF, CURRENCY_RATES
)

%load_ext autoreload
%autoreload 2

In [2]:
# Wall-clock reference used by the "Script executed in ..." report further down.
start_script = time.time()

In [3]:
# Refuse to run on anything but the exact interpreter release this notebook targets.
if tuple(sys.version_info[:3]) != (3, 11, 8):
    raise EnvironmentError("Only runs efficiently on Python 3.11.8")

In [4]:
# Spark settings applied when the session is created below.
# NOTE(review): `spark.worker.memory` is not a property I recognise from Spark's
# configuration reference — confirm it has any effect.
SPARK_CONF = [
    ("spark.driver.memory", "32g"),
    ("spark.worker.memory", "32g"),
    ("spark.driver.maxResultSize", "32g"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.network.timeout", "600s"),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
    ("spark.local.dir", f".{os.sep}temp-spark"),
]

# Remove leftovers of a previous run before the session starts using these folders.
for stale_directory in ("artifacts", "temp-spark"):
    shutil.rmtree(stale_directory, ignore_errors=True)

spark_conf = SparkConf().setAll(SPARK_CONF)
spark = SparkSession.builder.appName("testing").config(conf=spark_conf).getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/16 21:28:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/16 21:28:22 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in standalone/kubernetes and LOCAL_DIRS in YARN).


In [4]:
# Chronological split fractions (train / validation / test).
TRAIN_PERC = 0.64
VALIDATION_PERC = 0.19
TEST_PERC = 0.17

# Maximum number of strongest neighbours kept per node when building neighbourhoods.
KEEP_TOP_N = 100

# The fractions are binary floats, so compare their sum to 1 with a tolerance instead
# of `==`; an exact comparison can fail for perfectly valid fractions.
assert abs((TRAIN_PERC + VALIDATION_PERC + TEST_PERC) - 1.0) < 1e-9, (
    "TRAIN_PERC + VALIDATION_PERC + TEST_PERC must add up to 1"
)

# Root folder for every artefact written by this notebook.
location_main = os.path.join("features", os.environ["EXT_DATA_TYPE_FOLDER"])
# shutil.rmtree(location_main, ignore_errors=True)

# Leiden community assignment (written with `pickle` further down, despite the extension).
location_communities_leiden = f"{location_main}{os.sep}communities_leiden.parquet"

# Community / neighbourhood level feature tables.
location_features_leiden = f"{location_main}{os.sep}features_leiden.parquet"
location_features_ego = f"{location_main}{os.sep}features_ego.parquet"
location_features_2_hop = f"{location_main}{os.sep}features_2_hop.parquet"
location_features_2_hop_out = f"{location_main}{os.sep}features_2_hop_out.parquet"
location_features_2_hop_in = f"{location_main}{os.sep}features_2_hop_in.parquet"
location_features_2_hop_combined = f"{location_main}{os.sep}features_2_hop_combined.parquet"
location_features_source = f"{location_main}{os.sep}features_source.parquet"
location_features_target = f"{location_main}{os.sep}features_target.parquet"

# Temporal flow statistics, one table per role in a dispense -> passthrough -> sink chain.
location_flow_dispense = f"{location_main}{os.sep}location_flow_dispense.parquet"
location_flow_passthrough = f"{location_main}{os.sep}location_flow_passthrough.parquet"
location_flow_sink = f"{location_main}{os.sep}location_flow_sink.parquet"

# Outputs of the flow-feature notebook.
location_comm_as_source_features = f"{location_main}{os.sep}comm_as_source_features.parquet"
location_comm_as_target_features = f"{location_main}{os.sep}comm_as_target_features.parquet"
location_comm_as_passthrough_features = f"{location_main}{os.sep}comm_as_passthrough_features.parquet"
location_comm_as_passthrough_features_reverse = f"{location_main}{os.sep}comm_as_passthrough_features_reverse.parquet"

# Joined node-level features and raw edge-level features.
location_features_node_level = f"{location_main}{os.sep}features_node_level.parquet"
location_features_edges = f"{location_main}{os.sep}features_edges.parquet"

# Edge features enriched with node features, per split.
location_features_edges_train = f"{location_main}{os.sep}features_edges_train.parquet"
location_features_edges_valid = f"{location_main}{os.sep}features_edges_valid.parquet"
location_features_edges_test = f"{location_main}{os.sep}features_edges_test.parquet"

# Transaction-level features, per split.
location_train_trx_features = f"{location_main}{os.sep}train_trx_features.parquet"
location_valid_trx_features = f"{location_main}{os.sep}valid_trx_features.parquet"
location_test_trx_features = f"{location_main}{os.sep}test_trx_features.parquet"

# Final model inputs, per split.
location_train_features = f"{location_main}{os.sep}train_features.parquet"
location_valid_features = f"{location_main}{os.sep}valid_features.parquet"
location_test_features = f"{location_main}{os.sep}test_features.parquet"

# `exist_ok=True` tolerates an existing directory but, unlike swallowing
# FileExistsError, still fails loudly when the path exists and is not a directory.
os.makedirs(location_main, exist_ok=True)

In [6]:
# Load the staged transactions and cast the label column to boolean.
data = spark.read.parquet(s.STAGED_DATA_LOCATION).withColumn(
    "is_laundering", sf.col("is_laundering").cast("boolean")
)
data_count_original = data.count()

# Probably not used in the benchmarks
data = data.drop("source_entity", "target_entity")

In [7]:
# data = data.drop("source", "target")
# data = data.withColumnRenamed("source_entity", "source")
# data = data.withColumnRenamed("target_entity", "target")

In [8]:
%%time

# Chronological split: order all transaction ids by timestamp and cut the ordered
# list into train / validation / test according to the configured fractions.
trx_ids_sorted = data.sort("timestamp").select("transaction_id").toPandas()["transaction_id"].values
trx_count = len(trx_ids_sorted)

last_train_index = int(np.floor(trx_count * TRAIN_PERC))
last_validation_index = last_train_index + int(np.floor(trx_count * VALIDATION_PERC))
train_indexes = trx_ids_sorted[:last_train_index]
validation_indexes = trx_ids_sorted[last_train_index:last_validation_index]
test_indexes = trx_ids_sorted[last_validation_index:]

# The id lists are handed back to Spark through temporary parquet files.
train_indexes_loc = os.path.join(location_main, "temp_train_indexes.parquet")
validation_indexes_loc = os.path.join(location_main, "temp_validation_indexes.parquet")
test_indexes_loc = os.path.join(location_main, "temp_test_indexes.parquet")

pd.DataFrame(train_indexes, columns=["transaction_id"]).to_parquet(train_indexes_loc)
pd.DataFrame(validation_indexes, columns=["transaction_id"]).to_parquet(validation_indexes_loc)
pd.DataFrame(test_indexes, columns=["transaction_id"]).to_parquet(test_indexes_loc)

train_indexes = spark.read.parquet(train_indexes_loc)
validation_indexes = spark.read.parquet(validation_indexes_loc)
test_indexes = spark.read.parquet(test_indexes_loc)

train = train_indexes.join(
    data, on="transaction_id", how="left"
).persist(StorageLevel.DISK_ONLY)
validation = validation_indexes.join(
    data, on="transaction_id", how="left"
).persist(StorageLevel.DISK_ONLY)
test = test_indexes.join(
    data, on="transaction_id", how="left"
).persist(StorageLevel.DISK_ONLY)
train_count, validation_count, test_count = train.count(), validation.count(), test.count()
print()
print(trx_count, train_count, validation_count, test_count)
print()

# Write the splits while the temporary index files they lazily depend on still exist,
# so the write never relies solely on the disk cache being complete.
train.write.parquet("temp-train", mode="overwrite")
validation.write.parquet("temp-validation", mode="overwrite")
test.write.parquet("temp-test", mode="overwrite")

# The cached copies are no longer needed once the splits are on disk; release them.
train.unpersist()
validation.unpersist()
test.unpersist()

os.remove(train_indexes_loc)
os.remove(validation_indexes_loc)
os.remove(test_indexes_loc)

# Rebind the names to the written files so they never reference the deleted index files.
train = spark.read.parquet("temp-train")
validation = spark.read.parquet("temp-validation")
test = spark.read.parquet("temp-test")

                                                                                


5072693 3246523 963811 862359





CPU times: user 112 ms, sys: 49.9 ms, total: 162 ms
Wall time: 23.2 s


                                                                                

In [9]:
# Re-open the three splits from the parquet folders written by the split cell.
train, validation, test = (
    spark.read.parquet(split_path)
    for split_path in ("temp-train", "temp-validation", "temp-test")
)
train_count, validation_count, test_count = (
    split.count() for split in (train, validation, test)
)

In [10]:
%%time

# One row per directed (source, target) pair with the summed amount.
edges = (
    data.groupby(["source", "target"])
    .agg(sf.sum("amount").alias("amount"))
    .toPandas()
)
weights = get_weights(edges)

# Attach the edge weights and scale each amount by weight / max(weight).
edges_agg = (
    edges.set_index(["source", "target"])
    .join(weights.set_index(["source", "target"]), how="left")
    .reset_index()
    .assign(
        amount_weighted=lambda frame: frame["amount"] * (frame["weight"] / frame["weight"].max())
    )
)

                                                                                

CPU times: user 6.06 s, sys: 141 ms, total: 6.2 s
Wall time: 8.46 s


In [11]:
# Snapshot the names defined so far: the memory-cleanup cell near the end resets every variable except these.
to_keep = %who_ls
to_keep = list(to_keep)

In [12]:
# Number of highest-ranked edges kept per node in the temporal-flows cell below.
TOP_N = 50
# Not referenced in this notebook's own cells; presumably consumed by
# `generate_flow_features.ipynb`, which is executed via %run (TODO confirm).
NUM_HOPS = 5

In [13]:
%%time

# Inputs prepared for the flow-feature notebook executed right below via %run
# (presumably it reads these names from the shared namespace — TODO confirm).
data_input = spark.createDataFrame(edges_agg)
nodes_source = set(edges_agg["source"].unique())
nodes_target = set(edges_agg["target"].unique())
# Nodes that occur both as a source and as a target of some edge.
nodes_passthrough = nodes_source.intersection(nodes_target)

%run generate_flow_features.ipynb

# The `comm_as_*` frames are not created by any cell of this notebook; presumably
# they are defined by the %run of `generate_flow_features.ipynb` above (TODO confirm).
comm_as_source_features.to_parquet(location_comm_as_source_features)
comm_as_target_features.to_parquet(location_comm_as_target_features)
comm_as_passthrough_features.to_parquet(location_comm_as_passthrough_features)
comm_as_passthrough_features_reverse.to_parquet(location_comm_as_passthrough_features_reverse)

# Drop the in-memory copies now that they are stored on disk.
del comm_as_source_features
del comm_as_target_features
del comm_as_passthrough_features
del comm_as_passthrough_features_reverse

25/10/16 20:12:02 WARN TaskSetManager: Stage 71 contains a task of very large size (5378 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

FLOWS ET: 146


CPU times: user 5.4 s, sys: 1.46 s, total: 6.86 s
Wall time: 2min 33s


In [14]:
%%time

# TODO: Use https://docs.rapids.ai/api/cugraph/legacy/api_docs/api/cugraph/cugraph.leiden/ ?

print("Constructing Leiden communities")

graph = ig.Graph.DataFrame(
    edges_agg.loc[:, ["source", "target", "amount_weighted"]], use_vids=False, directed=True
)
# igraph works with integer vertex ids; keep a lookup back to the original node names.
nodes_mapping = {vertex.index: vertex["name"] for vertex in graph.vs()}
# Each community becomes (random uuid key, set of member node names).
communities_leiden = [
    (str(uuid.uuid4()), {nodes_mapping[vertex_id] for vertex_id in members})
    for members in la.find_partition(
        graph, la.ModularityVertexPartition, n_iterations=100, weights="amount_weighted", seed=42,
    )
]
sizes_leiden = [len(members) for _, members in communities_leiden]

# NOTE: the payload is a pickle even though the target path ends in ".parquet".
with open(location_communities_leiden, "wb") as fl:
    pickle.dump(communities_leiden, fl)

Constructing Leiden communities
CPU times: user 10min 43s, sys: 5.83 s, total: 10min 49s
Wall time: 10min 48s


In [15]:
# Reload the communities pickled by the previous cell. `pickle.load` executes
# arbitrary code, so this file must only ever come from this notebook itself.
with open(location_communities_leiden, "rb") as fl:
    communities_leiden = pickle.load(fl)

In [16]:
%%time

# Weighted edge list, computed from the per-pair summed amounts.
data_agg_weights = get_weights(
    data.groupby(["source", "target"])
    .agg(
        sf.sum("amount").alias("amount")
    ).toPandas()
)

# Make the edge list undirected: add every edge reversed and sum the weights per pair.
data_agg_weights_rev = data_agg_weights.rename(
    columns={"target": "source", "source": "target"}
).loc[:, ["source", "target", "weight"]]
data_agg_weights_ud = pd.concat([data_agg_weights, data_agg_weights_rev], ignore_index=True)
data_agg_weights_ud = (
    data_agg_weights_ud.groupby(["source", "target"])
    .agg(weight=("weight", "sum"))
    .reset_index()
    .sort_values("weight", ascending=False)
)

# Keep, per node, only its KEEP_TOP_N heaviest neighbours.
grouped_ud = data_agg_weights_ud.groupby("source").head(KEEP_TOP_N).reset_index(drop=True)
grouped_ud = grouped_ud.groupby("source").agg(targets=("target", set))

# A plain dict makes the inner-loop lookups O(1); indexing the DataFrame with `.loc`
# once per neighbour dominated the runtime of this cell.
targets_by_source = grouped_ud["targets"].to_dict()

total = grouped_ud.index.nunique()
nodes_neighborhoods = {}
for index, (source, targets) in enumerate(targets_by_source.items()):
    # Neighbourhood = the node, its neighbours, and the neighbours of its neighbours.
    community_candidates = {source}
    for target in targets:
        community_candidates |= targets_by_source[target]
        community_candidates.add(target)
    nodes_neighborhoods[source] = community_candidates
    if not (index % 250_000):
        print(index, total)

del data_agg_weights_rev
del data_agg_weights_ud
del grouped_ud
del targets_by_source

                                                                                

0 513439
250000 513439
500000 513439
CPU times: user 21 s, sys: 274 ms, total: 21.3 s
Wall time: 23 s


In [17]:
%%time

print("Constructing 2-hop communities")

# The graph is rebuilt from the same weighted edge list used for the Leiden step.
# The remaining arguments are passed positionally; their meaning is defined by
# `communities.get_communities_spark`, which is not visible from this notebook.
communities_2_hop = get_communities_spark(
    nodes_neighborhoods,
    ig.Graph.DataFrame(edges_agg.loc[:, ["source", "target", "amount_weighted"]], use_vids=False, directed=True),
    os.cpu_count(), spark, 2, "all", 0.01, "amount_weighted"
)
# Each entry is indexed as x[1] for its members, i.e. the entries are (key, members) pairs.
sizes_2_hop = [len(x[1]) for x in communities_2_hop]

Constructing 2-hop communities


                                                                                

CPU times: user 4.97 s, sys: 174 ms, total: 5.15 s
Wall time: 24.8 s


In [18]:
%%time

# Reference point one minute before the earliest transaction; timestamps are stored
# relative to it in the aggregation below.
ts_min = data.select(sf.min("timestamp").alias("x")).collect()[0]["x"] - timedelta(minutes=1)
data_graph_agg = data.groupby(["source", "target", "source_bank", "target_bank", "source_currency"]).agg(
    sf.count("source").alias("num_transactions"),
    sf.sum("amount").alias("amount"),
    sf.sum("source_amount").alias("source_amount"),
    sf.collect_list(sf.array((sf.col("timestamp") - ts_min).cast("long"), sf.col("amount"))).alias("timestamps_amounts"),
)
data_graph_agg_sdf = data_graph_agg.persist(StorageLevel.DISK_ONLY)
print(data_graph_agg_sdf.count())
data_graph_agg = data_graph_agg_sdf.toPandas()
# The pandas copy is what the following cells use; release the Spark disk cache.
data_graph_agg_sdf.unpersist()

# Attach the per-edge weight computed earlier. (A stray expression that built the
# indexed weights frame and discarded the result was removed here.)
index = ["source", "target"]
data_graph_agg = data_graph_agg.set_index(index).join(
    edges_agg.loc[:, index + ["weight"]].set_index(index), how="left"
).reset_index()
data_graph_agg.loc[:, "amount_weighted"] = (
    data_graph_agg.loc[:, "amount"] *
    (data_graph_agg.loc[:, "weight"] / data_graph_agg.loc[:, "weight"].max())
)

                                                                                

1015572
CPU times: user 5.5 s, sys: 298 ms, total: 5.8 s
Wall time: 10.9 s


In [19]:
# NOTE(review): no cell in this notebook reads `graph` between here and the later
# `del graph` — this build looks like dead work; confirm before removing.
graph = ig.Graph.DataFrame(data_graph_agg, use_vids=False, directed=True)

In [20]:
%%time

print("Leiden communitites features creation")

# Features are computed once per community (keyed by its uuid) ...
communities_leiden_dict = dict(communities_leiden)
features_leiden = generate_features_spark(communities_leiden, data_graph_agg, spark).rename(
    columns={"key": "key_fake"}
)
# ... and then copied to every member node: swap the uuid for the member set and
# explode it so each node gets its own row.
features_leiden["key"] = features_leiden["key_fake"].apply(communities_leiden_dict.__getitem__)
features_leiden = features_leiden.explode("key").drop(columns="key_fake")
features_leiden.set_index("key").to_parquet(location_features_leiden)

Leiden communitites features creation


25/10/16 20:26:51 WARN TaskSetManager: Stage 835 contains a task of very large size (1917 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 14.8 s, sys: 704 ms, total: 15.5 s
Wall time: 2min 52s


In [21]:
%%time

print("2-hop communitites features creation")

# Same feature generator as for the Leiden communities; the result is stored with
# its "key" column as the index.
features_2_hop = generate_features_spark(communities_2_hop, data_graph_agg, spark)
features_2_hop.set_index("key").to_parquet(location_features_2_hop)

2-hop communitites features creation


25/10/16 20:30:31 WARN TaskSetManager: Stage 846 contains a task of very large size (1917 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 24 s, sys: 1.71 s, total: 25.7 s
Wall time: 11min 13s


In [22]:
# Release the igraph object; none of the cells below reference it.
del graph

In [23]:
%%time

print("Temporal flows features creation")

# Rank edges by their number of transactions (a count, hence the alias) and keep, per
# node, the TOP_N busiest incoming edges (`left_edges`) and outgoing edges (`right_edges`).
edges_totals = data.select("source", "target", "amount").groupby(
    ["source", "target"]
).agg(sf.count("amount").alias("num_transactions")).toPandas()
edges_totals = edges_totals.sort_values("num_transactions", ascending=False).reset_index(drop=True)
left_edges = spark.createDataFrame(edges_totals.groupby("target").head(TOP_N).loc[:, ["source", "target"]])
right_edges = spark.createDataFrame(edges_totals.groupby("source").head(TOP_N).loc[:, ["source", "target"]])

columns = ["source", "target", "timestamp", "amount"]

# Transactions on the selected incoming edges, with every column prefixed "left_".
# The renaming happens BEFORE persist(): `left` then refers to the cached frame itself,
# so the `left.unpersist()` at the end really releases it. (Previously the name was
# rebound to a derived select after persisting, and the cache was never freed.)
left = left_edges.select(sf.col("source").alias("src"), sf.col("target").alias("tgt")).join(
    data.select(*columns),
    on=(sf.col("src") == sf.col("source")) & (sf.col("tgt") == sf.col("target")),
    how="left"
).drop("src", "tgt")
left = left.select(
    *[sf.col(column).alias(f"left_{column}") for column in left.columns]
).persist(StorageLevel.DISK_ONLY)
# Transactions on the selected outgoing edges.
right = right_edges.select(sf.col("source").alias("src"), sf.col("target").alias("tgt")).join(
    data.select(*columns),
    on=(sf.col("src") == sf.col("source")) & (sf.col("tgt") == sf.col("target")),
    how="left"
).drop("src", "tgt").persist(StorageLevel.DISK_ONLY)

print(left.count(), right.count())

# Pair every incoming transaction of a node with its not-earlier outgoing transactions;
# the flow through the node is capped by the smaller of the two summed amounts.
flows_temporal = left.join(
    right,
    (left["left_target"] == right["source"]) &
    (left["left_timestamp"] <= right["timestamp"]),
    how="inner"
).groupby(["left_source", "left_target", "source", "target"]).agg(
    sf.sum("left_amount").alias("left_amount"),
    sf.sum("amount").alias("amount"),
).drop("left_target").select(
    sf.col("left_source").alias("dispense"),
    sf.col("source").alias("passthrough"),
    sf.col("target").alias("sink"),
    sf.least("left_amount", "amount").alias("amount"),
)

aggregate = [
    sf.sum("amount").alias("amount_sum"),
    sf.mean("amount").alias("amount_mean"),
    sf.median("amount").alias("amount_median"),
    sf.max("amount").alias("amount_max"),
    sf.stddev("amount").alias("amount_std"),
    sf.countDistinct("dispense").alias("dispense_count"),
    sf.countDistinct("passthrough").alias("passthrough_count"),
    sf.countDistinct("sink").alias("sink_count"),
]
for flow_location, flow_type in [
    (location_flow_dispense, "dispense"), (location_flow_passthrough, "passthrough"), (location_flow_sink, "sink")
]:
    print(flow_type)
    flows_temporal_stats = flows_temporal.groupby(flow_type).agg(*aggregate).toPandas()
    # Same statistics restricted to cycles, i.e. flows that return to where they started.
    flows_temporal_cyclic_stats = flows_temporal.where(
        (sf.col("dispense") == sf.col("sink"))
    ).groupby(flow_type).agg(*aggregate).toPandas()
    flows_temporal_stats = flows_temporal_stats.set_index(flow_type).join(
        flows_temporal_cyclic_stats.set_index(flow_type),
        how="left", rsuffix="_cycle"
    )
    flows_temporal_stats.index.name = "key"
    flows_temporal_stats.to_parquet(flow_location)
    del flows_temporal_stats
    del flows_temporal_cyclic_stats

left.unpersist()
right.unpersist()

del edges_totals
del left_edges
del right_edges

Temporal flows features creation


25/10/16 20:40:35 WARN TaskSetManager: Stage 859 contains a task of very large size (4696 KiB). The maximum recommended task size is 1000 KiB.
25/10/16 20:40:39 WARN TaskSetManager: Stage 871 contains a task of very large size (4543 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

5067362 4646909
dispense


                                                                                

passthrough


                                                                                

sink


                                                                                

CPU times: user 2.69 s, sys: 602 ms, total: 3.29 s
Wall time: 2min 52s


In [24]:
%%time

print("1-hop-source features creation")

# Group the aggregated edges by their source node and run the feature UDF per group.
features_source = (
    spark.createDataFrame(data_graph_agg)
    .withColumn("key", sf.col("source"))
    .repartition(os.cpu_count() * 5, "key")
    .groupby("key")
    .applyInPandas(generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF)
    .toPandas()
)
# Every row carries its features as a JSON document; expand them into columns.
features_source = pd.DataFrame([json.loads(payload) for payload in features_source["features"]])
features_source = features_source.rename(
    columns=lambda name: name if name == "key" else f"{s.G_1HOP_PREFIX}{name}"
)
features_source.set_index("key").to_parquet(location_features_source)

1-hop-source features creation


25/10/16 20:43:33 WARN TaskSetManager: Stage 1062 contains a task of very large size (1917 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 18 s, sys: 2.1 s, total: 20.1 s
Wall time: 6min 9s


In [25]:
%%time

print("1-hop-target features creation")

# Group the aggregated edges by their target node and run the feature UDF per group.
features_target = (
    spark.createDataFrame(data_graph_agg)
    .withColumn("key", sf.col("target"))
    .repartition(os.cpu_count() * 5, "key")
    .groupby("key")
    .applyInPandas(generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF)
    .toPandas()
)
# Every row carries its features as a JSON document; expand them into columns.
features_target = pd.DataFrame([json.loads(payload) for payload in features_target["features"]])
features_target = features_target.rename(
    columns=lambda name: name if name == "key" else f"{s.G_1HOP_PREFIX}{name}"
)
features_target.set_index("key").to_parquet(location_features_target)

1-hop-target features creation


25/10/16 20:49:42 WARN TaskSetManager: Stage 1065 contains a task of very large size (1917 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 16.9 s, sys: 1.14 s, total: 18 s
Wall time: 5min 2s


In [26]:
# The aggregated edge table is not referenced by any later cell; free its memory.
del data_graph_agg

In [10]:
# (feature group name, parquet location) pairs that are joined into the node-level
# feature table; the group name is used as the column suffix when names collide.
ENABLED_FEATURES = [
    ("leiden", location_features_leiden),
    ("2_hop", location_features_2_hop),
    ("as_source", location_features_source),
    ("as_target", location_features_target),
    ("comm_as_source_features", location_comm_as_source_features),
    ("comm_as_target_features", location_comm_as_target_features),
    ("comm_as_passthrough_features", location_comm_as_passthrough_features),
    ("comm_as_passthrough_features_reverse", location_comm_as_passthrough_features_reverse),
    ("flow_dispense", location_flow_dispense),
    ("flow_passthrough", location_flow_passthrough),
    ("flow_sink", location_flow_sink),
]

In [11]:
# Start from an empty frame and outer-join every enabled feature group on the index,
# so a node is kept even when only some groups have a row for it.
all_features = pd.DataFrame()
all_features.index.name = "key"

for feature_group, location in ENABLED_FEATURES:
    # `rsuffix` only applies to column names that already exist in `all_features`.
    all_features = all_features.join(
        pd.read_parquet(location), how="outer", rsuffix=f"_{feature_group}"
    )

all_features.to_parquet(location_features_node_level)
print("Features:", all_features.shape)
del all_features

Features: (513439, 388)


In [12]:
# Reload the node-level feature table written by the previous cell.
all_features = pd.read_parquet(location_features_node_level)

In [13]:
# A column with at most one distinct non-null value carries no signal; drop them all.
constants = [
    column
    for column in all_features.columns
    if all_features[column].nunique(dropna=True) <= 1
]
all_features = all_features.drop(columns=constants)
print(f"Deleted {len(constants)} constant columns")

Deleted 14 constant columns


In [14]:
# Whole seconds elapsed since `start_script` was set in the current kernel session.
delta = round(time.time() - start_script)
print(f"Script executed in {timedelta(seconds=delta)}")

Script executed in 0:00:59


In [15]:
%%time

print("Training the anomaly detection model")

# Index-only frame (no columns) that receives the per-node anomaly score.
anomalies = all_features.loc[:, []]
# Disabling
# anomalies.loc[:, "anomaly_score"] = 0

# Fill missing values once; the filled copy is needed for both fitting and scoring,
# and building it twice duplicates a very large frame for nothing.
features_filled = all_features.fillna(0)
model_ad = IsolationForest(n_estimators=1_000, random_state=42)
# `decision_function` is lower for more abnormal samples; negate so that higher = more anomalous.
anomalies.loc[:, "anomaly_score"] = -model_ad.fit(features_filled).decision_function(features_filled)
del features_filled

# Shift by |min| plus a small epsilon, then scale so the largest score equals 1.
anomalies.loc[:, "anomaly_score"] += abs(anomalies.loc[:, "anomaly_score"].min()) + 1e-10
anomalies.loc[:, "anomaly_score"] /= anomalies.loc[:, "anomaly_score"].max()

Training the anomaly detection model
CPU times: user 10.1 s, sys: 1.08 s, total: 11.2 s
Wall time: 11.2 s


In [16]:
%%time

print("Generating edge features")

to_select = ["source", "target", "format", "source_currency", "source_amount", "amount", "timestamp"]

# Pre-aggregate per (source, target, format, currency) and co-locate every edge's rows
# in a single partition for the grouped UDF below.
edges_features_input = data.select(to_select).groupby(
    ["source", "target", "format", "source_currency"]
).agg(
    sf.sum("source_amount").alias("source_amount"),
    sf.sum("amount").alias("amount"),
    sf.unix_timestamp(sf.min("timestamp")).alias("min_ts"),
    sf.unix_timestamp(sf.max("timestamp")).alias("max_ts"),
).repartition(os.cpu_count() * 2, "source", "target").persist(StorageLevel.DISK_ONLY)
# Force materialisation of the cache before the UDF stage runs.
_ = edges_features_input.count()

edge_features = edges_features_input.groupby(["source", "target"]).applyInPandas(
    get_edge_features_udf, schema=SCHEMA_FEAT_UDF
).toPandas()
# The cached input has served its purpose; release it instead of leaking it for the
# rest of the session.
edges_features_input.unpersist()
# Every row carries its features as a JSON document; expand them into columns.
edge_features = pd.DataFrame(edge_features["features"].apply(json.loads).tolist())

edge_features.to_parquet(location_features_edges)
del edge_features

Generating edge features


                                                                                

CPU times: user 3.53 s, sys: 465 ms, total: 3.99 s
Wall time: 2min 45s


In [17]:
# Reload the edge-level feature table written by the previous cell.
edge_features = pd.read_parquet(location_features_edges)

In [18]:
%%time

# Distinct (source, target) pairs that occur in each split.
train_edges = train.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)
valid_edges = validation.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)
test_edges = test.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)

# Index the edge features once and reuse the result for all three joins instead of
# rebuilding the same indexed copy per split.
edge_features_indexed = edge_features.set_index(["source", "target"])

train_features = train_edges.join(edge_features_indexed, how="left").reset_index()
validation_features = valid_edges.join(edge_features_indexed, how="left").reset_index()
test_features = test_edges.join(edge_features_indexed, how="left").reset_index()

del edge_features_indexed

                                                                                

CPU times: user 3.2 s, sys: 271 ms, total: 3.47 s
Wall time: 7.92 s


In [19]:
def save_edge_features(features_in, location):
    """Enrich an edge table with node features and anomaly scores, then store it.

    `features_in` needs "source" and "target" columns. The notebook-level
    `all_features` and `anomalies` frames are left-joined for the target endpoint
    first and the source endpoint second, four summary columns of the two
    endpoint anomaly scores are appended, and the result is written as parquet
    to `location`.
    """
    # Node-level features; a suffix is only applied when a column name collides.
    for endpoint, suffix in (("target", "_target"), ("source", "_source")):
        features_in = features_in.set_index(endpoint).join(
            all_features, how="left", rsuffix=suffix
        ).reset_index()

    # Anomaly scores: the target's score keeps its name, the source's gets "_source".
    features_in = features_in.set_index("target").join(anomalies, how="left").reset_index()
    features_in = features_in.set_index("source").join(
        anomalies, how="left", rsuffix="_source"
    ).reset_index()

    score_target = features_in["anomaly_score"]
    score_source = features_in["anomaly_score_source"]
    # Row 0 holds the target scores, row 1 the source scores; reduce across the rows.
    stacked_scores = np.array([score_target.values, score_source.values])

    features_in["anom_scores_diff"] = score_target - score_source
    features_in["anom_scores_min"] = stacked_scores.min(axis=0)
    features_in["anom_scores_max"] = stacked_scores.max(axis=0)
    features_in["anom_scores_mean"] = stacked_scores.mean(axis=0)

    features_in.to_parquet(location)

In [20]:
%%time

# Train split: enrich the edge table and write it to disk.
save_edge_features(train_features, location_features_edges_train)

CPU times: user 19.2 s, sys: 15 s, total: 34.2 s
Wall time: 34.1 s


In [21]:
%%time

# Validation split: enrich the edge table and write it to disk.
save_edge_features(validation_features, location_features_edges_valid)

CPU times: user 5.79 s, sys: 2.35 s, total: 8.14 s
Wall time: 7.61 s


In [22]:
%%time

# Test split: enrich the edge table and write it to disk.
save_edge_features(test_features, location_features_edges_test)

CPU times: user 8.12 s, sys: 3.27 s, total: 11.4 s
Wall time: 10.6 s


In [23]:
def save_trx_features(data_in, location):
    """Build the transaction-level feature table of one split and store it as parquet.

    Parameters
    ----------
    data_in
        Object exposing ``select(*columns).toPandas()`` (a Spark DataFrame in this
        notebook) that contains the columns listed below.
    location
        Destination passed to ``DataFrame.to_parquet``.

    On top of the selected columns the table gets ``source_balance_ratio`` and
    ``target_balance_ratio`` (positive / negative balance, with undefined and
    infinite results set to 0), the ``inter_currency`` flag, and one-hot encoded
    currency and format columns.
    """
    columns = [
        "source", "target", "source_currency", "target_currency", "format", "amount",
        "source_dispensation",
        "target_accumulation",
        "source_positive_balance",
        "source_negative_balance",
        "target_positive_balance",
        "target_negative_balance",
        "source_active_for",
        "target_active_for",
        "is_laundering"
    ]
    trx_features = data_in.select(*columns).toPandas()
    for side in ("source", "target"):
        ratio = trx_features[f"{side}_positive_balance"] / trx_features[f"{side}_negative_balance"]
        # A zero denominator yields NaN (0/0), +inf or -inf. All three are mapped to 0;
        # previously only NaN and +inf were handled, so -inf could leak into the features.
        trx_features[f"{side}_balance_ratio"] = ratio.replace([np.inf, -np.inf], 0).fillna(0)
    trx_features["inter_currency"] = trx_features["source_currency"] != trx_features["target_currency"]
    trx_features = pd.get_dummies(
        trx_features, columns=["source_currency", "target_currency", "format"], drop_first=False
    )
    trx_features.to_parquet(location)

In [24]:
%%time

# Transaction-level features are built and stored separately for each split.
save_trx_features(train, location_train_trx_features)
save_trx_features(validation, location_valid_trx_features)
save_trx_features(test, location_test_trx_features)

                                                                                

CPU times: user 4.09 s, sys: 739 ms, total: 4.83 s
Wall time: 6.11 s


In [26]:
# Free up memory for training: reset every variable created after the `to_keep` snapshot was taken.

to_reset = %who_ls
to_reset = list(to_reset)
to_reset.remove("to_keep")
to_reset = set(to_reset) - set(to_keep)
for var_to_reset in list(to_reset):
    var_to_reset = f"^{var_to_reset}$"
    %reset_selective -f {var_to_reset}

delete_large_vars(globals(), locals())

In [27]:
def combine_features(location_features_trx, location_features_edges, location_features):
    """Attach edge-level features to every transaction of a split and store the result.

    Both inputs are read as parquet through the notebook-level `spark` session.
    Transactions are left-joined to the edge features on ("source", "target"),
    the two key columns are dropped, and the outcome overwrites
    `location_features`.
    """
    edge_level = spark.read.parquet(location_features_edges)
    trx_level = spark.read.parquet(location_features_trx)
    combined = trx_level.join(edge_level, on=["source", "target"], how="left")
    combined.drop("source", "target").write.parquet(location_features, mode="overwrite")

In [28]:
%%time

# Build the final model input of every split: transaction features plus edge features.
combine_features(location_train_trx_features, location_features_edges_train, location_train_features)
combine_features(location_valid_trx_features, location_features_edges_valid, location_valid_features)
combine_features(location_test_trx_features, location_features_edges_test, location_test_features)

25/10/16 21:34:03 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

CPU times: user 37.1 ms, sys: 21.8 ms, total: 58.9 ms
Wall time: 2min 5s


In [None]:
# Best-effort removal of the staging folder exported by `common`; errors are ignored.
shutil.rmtree(MULTI_PROC_STAGING_LOCATION, ignore_errors=True)

In [5]:
# Load the combined per-split model inputs into pandas.
train_features = pd.read_parquet(location_train_features)
validation_features = pd.read_parquet(location_valid_features)
test_features = pd.read_parquet(location_test_features)

In [6]:
def _drop_if_present(frame, column, split_name):
    """Delete `column` from `frame` in place (and report it) when the frame has it."""
    if column in frame.columns:
        print(f"Deleting {column} from {split_name}")
        del frame[column]


# A column is only usable when all three splits contain it. Everything in the union
# that is missing from at least one split is removed wherever it still occurs.
all_columns = set(train_features.columns) | set(validation_features.columns) | set(test_features.columns)
shared_columns = set(train_features.columns) & set(validation_features.columns) & set(test_features.columns)

for missing in all_columns - shared_columns:
    _drop_if_present(train_features, missing, "train")
    _drop_if_present(validation_features, missing, "validation")
    _drop_if_present(test_features, missing, "test")

# Give validation and test exactly the column order of train.
train_column_order = list(train_features.columns)
validation_features = validation_features.loc[:, train_column_order]
test_features = test_features.loc[:, train_column_order]

Deleting format_Reinvestment from train


In [7]:
# assert train_features.shape[0] == train_count
# assert validation_features.shape[0] == validation_count
# assert test_features.shape[0] == test_count

In [8]:
# Move the target column out of each feature table into its own one-column frame.
train_features_labels = train_features.pop("is_laundering").to_frame()

validation_features_labels = validation_features.pop("is_laundering").to_frame()

test_features_labels = test_features.pop("is_laundering").to_frame()

In [9]:
len(train_features.columns)

826

In [10]:
# cols = [x for x in train_features.columns if "anom" not in x]
# len(cols)

In [11]:
# train_features = train_features.loc[:, cols]
# validation_features = validation_features.loc[:, cols]
# test_features = test_features.loc[:, cols]

In [12]:
# torch is optional: it is only used to find out whether a CUDA device can be used.
try:
    import torch
except ImportError:
    cuda_available = False
else:
    cuda_available = torch.cuda.is_available()


def f1_eval(y, y_):
    """Return ``1 - F1`` for true labels `y` and predicted scores `y_`.

    The scores are rounded to hard labels before the F1 score is computed.
    """
    hard_labels = np.round(y_)
    return 1 - f1_score(y, hard_labels)


# Constructor arguments for the XGBoost classifier.
xgb_args = {
    "early_stopping_rounds": 20,
    "scale_pos_weight": 5,
    # "eval_metric": f1_eval,
    "eval_metric": "aucpr",
    "disable_default_eval_metric": True,
    "num_parallel_tree": 10,
    "max_depth": 6,
    "colsample_bytree": 0.5,
    "subsample": 1,
    "eta": 0.05,
    "device": "cpu",
    "nthread": 10,
    "n_estimators": 500,
    "seed": 50,
}
# Switch to the GPU (with fewer host threads) when one is available.
if cuda_available:
    xgb_args.update(device="cuda", nthread=2)

# Arguments for `fit`: the validation split is the evaluation set that drives early stopping.
xgb_fit_args = {
    "eval_set": [(validation_features, validation_features_labels["is_laundering"].values)],
    "verbose": True,
}

In [13]:
%%time

# Reference run with the seed configured in `xgb_args`.
model = xgb.XGBClassifier(**xgb_args)
model.fit(train_features, train_features_labels["is_laundering"].values, **xgb_fit_args)

y_test_true = test_features_labels["is_laundering"]
y_test_predicted = model.predict(test_features)

# Both metrics are reported as percentages.
f1_first = f1_score(y_test_true, y_test_predicted) * 100
recall_first = recall_score(y_test_true, y_test_predicted) * 100
print(round(f1_first, 2), round(recall_first, 2))
print()

[0]	validation_0-aucpr:0.60022
[1]	validation_0-aucpr:0.60989
[2]	validation_0-aucpr:0.61243
[3]	validation_0-aucpr:0.61576
[4]	validation_0-aucpr:0.62089
[5]	validation_0-aucpr:0.61732
[6]	validation_0-aucpr:0.62626
[7]	validation_0-aucpr:0.62734
[8]	validation_0-aucpr:0.62781
[9]	validation_0-aucpr:0.62780
[10]	validation_0-aucpr:0.62954
[11]	validation_0-aucpr:0.63191
[12]	validation_0-aucpr:0.63377
[13]	validation_0-aucpr:0.63536
[14]	validation_0-aucpr:0.63568
[15]	validation_0-aucpr:0.63731
[16]	validation_0-aucpr:0.63888
[17]	validation_0-aucpr:0.64027
[18]	validation_0-aucpr:0.64103
[19]	validation_0-aucpr:0.64173
[20]	validation_0-aucpr:0.64235
[21]	validation_0-aucpr:0.64375
[22]	validation_0-aucpr:0.64437
[23]	validation_0-aucpr:0.64473
[24]	validation_0-aucpr:0.64576
[25]	validation_0-aucpr:0.64648
[26]	validation_0-aucpr:0.64708
[27]	validation_0-aucpr:0.64798
[28]	validation_0-aucpr:0.64913
[29]	validation_0-aucpr:0.64948
[30]	validation_0-aucpr:0.65015
[31]	validation_0-

In [None]:
%%time

# Retrain with several seeds to see how much the test F1 varies between runs.
xgb_fit_args["verbose"] = False
f1_scores = []
for seed in (0, 10, 20, 30, 40):
    xgb_args["seed"] = seed
    model = xgb.XGBClassifier(**xgb_args)
    model.fit(train_features, train_features_labels["is_laundering"].values, **xgb_fit_args)
    y_test_predicted = model.predict(test_features)
    seed_f1 = f1_score(test_features_labels["is_laundering"], y_test_predicted) * 100
    f1_scores.append(seed_f1)
    print(round(seed_f1, 2), "Seed", seed)

In [None]:
# Reports the F1 of the first run (`f1_first`) together with the standard deviation
# of the F1 scores collected in the seed sweep (`f1_scores`).
print(f"{round(f1_first, 2)} ±{round(np.std(f1_scores), 2)}")