In [1]:
import json
import os
import pickle
import shutil
import sys
import time
from datetime import timedelta, datetime

import igraph as ig
import numpy as np
import pandas as pd
import xgboost as xgb
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from sklearn.metrics import f1_score, recall_score
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

import settings_large_li as s

# Export the dataset folder name (OUTPUT_POSTFIX with leading "-" characters removed)
# before the project modules below are imported.
# NOTE(review): presumably `common` / `features` read this variable at import time — confirm.
os.environ["EXT_DATA_TYPE_FOLDER"] = s.OUTPUT_POSTFIX.lstrip("-")

from common import get_weights, delete_large_vars, MULTI_PROC_STAGING_LOCATION
from communities import get_communities_spark
from features import (
    generate_features_spark, generate_features_udf_wrapper, get_edge_features_udf, 
    SCHEMA_FEAT_UDF, FEATURE_TYPES
)

%load_ext autoreload
%autoreload 2

In [2]:
# Refuse to run on anything other than Python 3.11.8 (exact match on major,
# minor and micro version).
if tuple(sys.version_info[:3]) != (3, 11, 8):
    raise EnvironmentError(
        "Only runs efficiently on Python 3.11.8 (Tested on: Conda 24.1.2 | Apple M3 Pro)"
    )

In [3]:
# Spark settings: large driver heap / result size, loopback bind address,
# Arrow-based pandas conversion and a longer network timeout.
# NOTE(review): "spark.worker.memory" does not look like a standard Spark property
# (executor memory is usually "spark.executor.memory") — confirm it has any effect.
SPARK_CONF = [
    ("spark.driver.memory", "32g"),
    ("spark.worker.memory", "32g"),
    ("spark.driver.maxResultSize", "32g"),
    ("spark.driver.bindAddress", "127.0.0.1"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.network.timeout", "600s"),
]

# Remove any leftover "artifacts" directory (missing directory is not an error).
shutil.rmtree("artifacts", ignore_errors=True)
# getOrCreate() returns an already-running session if there is one.
spark = (
    SparkSession.builder.appName("testing")
    .config(conf=SparkConf().setAll(SPARK_CONF))
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/12 10:02:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Wall-clock reference; the elapsed minutes are printed once feature generation is done.
start = time.time()

In [5]:
# Fractions of the timestamp-ordered transactions used for train / validation / test.
TRAIN_PERC = 0.6
VALIDATION_PERC = 0.2
TEST_PERC = 0.2

# Number of top-ranked edges kept per node when pruning edges / neighbourhoods.
KEEP_TOP_N = 100

# Compare with a tolerance: an exact `== 1` on a float sum rejects valid splits
# such as 0.7 / 0.2 / 0.1 (which sums to 0.9999999999999999).
assert np.isclose(TRAIN_PERC + VALIDATION_PERC + TEST_PERC, 1.0), (
    "TRAIN_PERC + VALIDATION_PERC + TEST_PERC must sum to 1"
)

In [6]:
# Root folder for every artefact written by this notebook: features/<dataset folder>.
location_main = os.path.join("features", os.environ["EXT_DATA_TYPE_FOLDER"])
# shutil.rmtree(location_main, ignore_errors=True)

# Temporal flow statistics, one file per role of a node in a flow.
location_flows = f"{location_main}{os.sep}flows_input{os.sep}"
location_flow_dispense = f"{location_main}{os.sep}flow_dispense.parquet"
location_flow_passthrough = f"{location_main}{os.sep}flow_passthrough.parquet"
location_flow_sink = f"{location_main}{os.sep}flow_sink.parquet"

# Outputs of the flow-feature notebook.
location_comm_as_source_features = f"{location_main}{os.sep}comm_as_source_features.parquet"
location_comm_as_target_features = f"{location_main}{os.sep}comm_as_target_features.parquet"
location_comm_as_passthrough_features = f"{location_main}{os.sep}comm_as_passthrough_features.parquet"
location_comm_as_passthrough_features_reverse = f"{location_main}{os.sep}comm_as_passthrough_features_reverse.parquet"

# Node-level and edge-level feature tables.
location_features_node_level = f"{location_main}{os.sep}features_node_level.parquet"
location_features_edges = f"{location_main}{os.sep}features_edges.parquet"

# Edge features restricted to the edges of each split.
location_features_edges_train = f"{location_main}{os.sep}features_edges_train.parquet"
location_features_edges_valid = f"{location_main}{os.sep}features_edges_valid.parquet"
location_features_edges_test = f"{location_main}{os.sep}features_edges_test.parquet"

# Per-transaction features of each split.
location_train_trx_features = f"{location_main}{os.sep}train_trx_features.parquet"
location_valid_trx_features = f"{location_main}{os.sep}valid_trx_features.parquet"
location_test_trx_features = f"{location_main}{os.sep}test_trx_features.parquet"

# Final model inputs (transaction + edge features) of each split.
location_train_features = f"{location_main}{os.sep}train_features.parquet"
location_valid_features = f"{location_main}{os.sep}valid_features.parquet"
location_test_features = f"{location_main}{os.sep}test_features.parquet"

# Create the root folder if needed. `exist_ok=True` tolerates an existing directory
# but still raises when the path exists and is not a directory, instead of
# silently continuing and failing later on the first write.
os.makedirs(location_main, exist_ok=True)

In [7]:
# Read the staged transactions and cast the label column to boolean.
data = spark.read.parquet(s.STAGED_DATA_LOCATION).withColumn(
    "is_laundering", sf.col("is_laundering").cast("boolean")
)
# Row count of the loaded data (runs a Spark job).
data_count_original = data.count()

In [8]:
%%time

# Chronological split: pull all transaction ids to the driver ordered by timestamp,
# then cut the ordered array into train / validation / test by the configured fractions.
trx_ids_sorted = data.sort("timestamp").select("transaction_id").toPandas()["transaction_id"].values
trx_count = len(trx_ids_sorted)

# Slice boundaries; the test slice receives whatever remains after the floor() rounding.
last_train_index = int(np.floor(trx_count * TRAIN_PERC))
last_validation_index = last_train_index + int(np.floor(trx_count * VALIDATION_PERC))
train_indexes = trx_ids_sorted[:last_train_index]
validation_indexes = trx_ids_sorted[last_train_index:last_validation_index]
test_indexes = trx_ids_sorted[last_validation_index:]

# Send each id slice back to Spark as a one-column DataFrame, cached on disk.
train_indexes = spark.createDataFrame(
    pd.DataFrame(train_indexes, columns=["transaction_id"])
).persist(StorageLevel.DISK_ONLY)
validation_indexes = spark.createDataFrame(
    pd.DataFrame(validation_indexes, columns=["transaction_id"])
).persist(StorageLevel.DISK_ONLY)
test_indexes = spark.createDataFrame(
    pd.DataFrame(test_indexes, columns=["transaction_id"])
).persist(StorageLevel.DISK_ONLY)

# Left-join the ids with the full data to obtain the rows of each split.
train = train_indexes.join(
    data, on="transaction_id", how="left"
).persist(StorageLevel.DISK_ONLY)
validation = validation_indexes.join(
    data, on="transaction_id", how="left"
).persist(StorageLevel.DISK_ONLY)
test = test_indexes.join(
    data, on="transaction_id", how="left"
).persist(StorageLevel.DISK_ONLY)
print()
# The count() calls also materialise the persisted splits.
print(trx_count, train.count(), validation.count(), test.count())
print()

                                                                                






175887982 105532789 35177596 35177597

CPU times: user 3.07 s, sys: 1.46 s, total: 4.53 s
Wall time: 25min 7s


                                                                                

In [9]:
# Later on, we will reset the variables (to free up memory), while still keeping these intact
# `%who_ls` returns the names currently defined in the interactive namespace; this
# snapshot is subtracted from the set of names that get reset before model training.
to_keep = %who_ls
to_keep = list(to_keep)

In [10]:
%%time

edges_totals = data.select("source", "target", "amount").groupby(
    ["source", "target"]
).agg(sf.count("amount").alias("amount")).toPandas()
edges_totals = edges_totals.sort_values("amount", ascending=False).reset_index(drop=True)
left_edges = spark.createDataFrame(edges_totals.groupby("target").head(KEEP_TOP_N).loc[:, ["source", "target"]])
right_edges = spark.createDataFrame(edges_totals.groupby("source").head(KEEP_TOP_N).loc[:, ["source", "target"]])

columns = ["source", "target", "timestamp", "amount"]

left = left_edges.select(sf.col("source").alias("src"), sf.col("target").alias("tgt")).join(
    data.select(*columns),
    on=(sf.col("src") == sf.col("source")) & (sf.col("tgt") == sf.col("target")),
    how="left"
).drop("src", "tgt").persist(StorageLevel.DISK_ONLY)
select = []
for column in left.columns:
    select.append(sf.col(column).alias(f"left_{column}"))
left = left.select(*select)
right = right_edges.select(sf.col("source").alias("src"), sf.col("target").alias("tgt")).join(
    data.select(*columns),
    on=(sf.col("src") == sf.col("source")) & (sf.col("tgt") == sf.col("target")),
    how="left"
).drop("src", "tgt").persist(StorageLevel.DISK_ONLY)

print(left.count(), right.count())



175820031 159671706
CPU times: user 11.4 s, sys: 739 ms, total: 12.1 s
Wall time: 2min 30s


                                                                                

In [11]:
%%time

# Two-hop temporal flows: pair every incoming transaction of a node (left) with every
# outgoing transaction of the same node (right) that happens at the same time or later,
# then aggregate per (dispense, passthrough, sink) triple. The flow amount is the smaller
# of the summed incoming and summed outgoing amounts.
# NOTE(review): the sums run over the joined rows, so a transaction that matches several
# counterparts contributes once per match — confirm this is intended.
flows_temporal = left.join(
    right,
    (left["left_target"] == right["source"]) &
    (left["left_timestamp"] <= right["timestamp"]),
    how="inner"
).groupby(["left_source", "left_target", "source", "target"]).agg(
    sf.sum("left_amount").alias("left_amount"),
    sf.sum("amount").alias("amount"),
).drop("left_target").select(
    sf.col("left_source").alias("dispense"),
    sf.col("source").alias("passthrough"),
    sf.col("target").alias("sink"),
    sf.least("left_amount", "amount").alias("amount"),
)

# Statistics computed per node for each of the three roles.
aggregate = [
    sf.sum("amount").alias("amount_sum"),
    sf.mean("amount").alias("amount_mean"),
    sf.median("amount").alias("amount_median"),
    sf.max("amount").alias("amount_max"),
    sf.stddev("amount").alias("amount_std"),
    sf.countDistinct("dispense").alias("dispense_count"),
    sf.countDistinct("passthrough").alias("passthrough_count"),
    sf.countDistinct("sink").alias("sink_count"),
]
# NOTE(review): `flows_temporal` is not persisted, so each toPandas() below presumably
# re-runs the join — consider caching it if disk space allows.
for flow_location, flow_type in [
    (location_flow_dispense, "dispense"), (location_flow_passthrough, "passthrough"), (location_flow_sink, "sink")
]:
    print(flow_type)
    flows_temporal_stats = flows_temporal.groupby(flow_type).agg(*aggregate).toPandas()
    # Same statistics restricted to cyclic flows (money returns to the dispensing node).
    flows_temporal_cyclic_stats = flows_temporal.where(
        (sf.col("dispense") == sf.col("sink"))
    ).groupby(flow_type).agg(*aggregate).toPandas()
    # Cyclic columns get the "_cycle" suffix; nodes without cyclic flows get NaN there.
    flows_temporal_stats = flows_temporal_stats.set_index(flow_type).join(
        flows_temporal_cyclic_stats.set_index(flow_type),
        how="left", rsuffix="_cycle"
    )
    flows_temporal_stats.index.name = "key"
    flows_temporal_stats.to_parquet(flow_location)
    del flows_temporal_stats
    del flows_temporal_cyclic_stats

left.unpersist()
right.unpersist()

del edges_totals
del left_edges
del right_edges

dispense


                                                                                

passthrough


                                                                                

sink


                                                                                

CPU times: user 5.8 s, sys: 1.45 s, total: 7.26 s
Wall time: 19min 58s


In [12]:
%%time

# Variables prepared for the notebook executed below: a copy of the full data and the
# sets of nodes that appear as source, as target, and as both.
# NOTE(review): generate_flow_features.ipynb presumably reads these names and defines the
# four `comm_as_*_features` frames used afterwards — confirm against that notebook.
data_input = data.select("*")
nodes_source = set(data.select("source").distinct().toPandas()["source"])
nodes_target = set(data.select("target").distinct().toPandas()["target"])
nodes_passthrough = nodes_source.intersection(nodes_target)

%run generate_flow_features.ipynb

# Persist the four feature frames and drop them from memory.
comm_as_source_features.to_parquet(location_comm_as_source_features)
comm_as_target_features.to_parquet(location_comm_as_target_features)
comm_as_passthrough_features.to_parquet(location_comm_as_passthrough_features)
comm_as_passthrough_features_reverse.to_parquet(location_comm_as_passthrough_features_reverse)

del comm_as_source_features
del comm_as_target_features
del comm_as_passthrough_features
del comm_as_passthrough_features_reverse

                                                                                


Processing comm_as_source

Processed hop #1 | 6,546,146 | 1,667,461
Processed hop #2 | 11,743,314 | 1,469,375
Processed hop #3 | 21,126,347 | 1,409,144
Processed hop #4 | 28,378,197 | 1,391,266

Processing comm_as_target

Processed hop #1 | 6,558,360 | 1,314,480
Processed hop #2 | 18,557,822 | 1,194,763
Processed hop #3 | 31,991,627 | 1,162,194
Processed hop #4 | 40,376,550 | 1,141,529

Processing comm_as_passthrough

Processed hop #1 | 6,037,558 | 1,296,091
Processed hop #2 | 10,075,947 | 1,116,547
Processed hop #3 | 17,959,727 | 1,070,066
Processed hop #4 | 22,473,290 | 1,055,391

Processing comm_as_passthrough_reverse

Processed hop #1 | 6,430,169 | 1,286,192
Processed hop #2 | 18,024,540 | 1,166,579
Processed hop #3 | 31,226,605 | 1,135,556
Processed hop #4 | 39,696,655 | 1,116,244


comm_as_source_features

CPU times: user 2min 35s, sys: 9.71 s, total: 2min 45s
Wall time: 2min 38s

comm_as_target_features

CPU times: user 2min 16s, sys: 8.5 s, total: 2min 24s
Wall time: 2min 18s


In [13]:
%%time

# Total amount per directed edge, passed through the project helper `get_weights`.
# NOTE(review): `get_weights` presumably adds the "weight" column used below — confirm.
data_agg_weights = get_weights(
    data.groupby(["source", "target"])
    .agg(
        sf.sum("amount").alias("amount")
    ).toPandas()
)

# Undirected view: add every edge in the reverse direction and sum the weights of
# identical (source, target) pairs.
data_agg_weights_rev = data_agg_weights.rename(
    columns={"target": "source", "source": "target"}
).loc[:, ["source", "target", "weight"]]
data_agg_weights_ud = pd.concat([data_agg_weights, data_agg_weights_rev], ignore_index=True)
data_agg_weights_ud = data_agg_weights_ud.groupby(["source", "target"]).agg(weight=("weight", "sum")).reset_index()

# Keep the KEEP_TOP_N heaviest neighbours per node and collect them into a set per node.
data_agg_weights_ud.sort_values("weight", ascending=False, inplace=True)
grouped_ud = data_agg_weights_ud.groupby("source").head(KEEP_TOP_N).reset_index(drop=True)
grouped_ud = grouped_ud.groupby("source").agg(targets=("target", set))

# Neighbourhood of a node = the node itself, its retained neighbours and the
# retained neighbours of those neighbours.
total = grouped_ud.index.nunique()
nodes_neighborhoods = {}
for index, (source, targets) in enumerate(grouped_ud.iterrows()):
    community_candidates = {source}
    for target in targets["targets"]:
        community_candidates |= (grouped_ud.loc[target, "targets"] | {target})
    nodes_neighborhoods[source] = set(community_candidates)
    # Progress report every 250,000 nodes.
    if not (index % 250_000):
        print(index, total)

del data_agg_weights_rev
del data_agg_weights_ud
del grouped_ud

                                                                                

0 2047791
250000 2047791
500000 2047791
750000 2047791
1000000 2047791
1250000 2047791
1500000 2047791
1750000 2047791
2000000 2047791
CPU times: user 2min 50s, sys: 2.41 s, total: 2min 53s
Wall time: 3min 12s


In [14]:
%%time

print("Constructing communities")

# Directed igraph built from the weighted edge list (vertex names taken from the
# first two columns because use_vids=False).
graph = ig.Graph.DataFrame(data_agg_weights, use_vids=False, directed=True)
# Project helper; receives the neighbourhoods, the graph, the CPU count and the session.
# NOTE(review): return type of `get_communities_spark` is not visible here.
communities = get_communities_spark(nodes_neighborhoods, graph, os.cpu_count(), spark)

del graph
del data_agg_weights
del nodes_neighborhoods

Constructing communities


                                                                                

CPU times: user 51.2 s, sys: 4.07 s, total: 55.3 s
Wall time: 7min 9s


In [15]:
%%time

# Reference point one minute before the earliest transaction.
ts_min = data.select(sf.min("timestamp").alias("x")).collect()[0]["x"] - timedelta(minutes=1)
# One row per (source, target, banks, currency): transaction count, summed amounts and
# the list of [timestamp offset from ts_min cast to long, amount] pairs.
# NOTE(review): the offset is presumably in seconds — confirm the interval cast.
data_graph_agg = data.groupby(["source", "target", "source_bank", "target_bank", "source_currency"]).agg(
    sf.count("source").alias("num_transactions"),
    sf.sum("amount").alias("amount"),
    sf.sum("source_amount").alias("source_amount"),
    sf.collect_list(sf.array((sf.col("timestamp") - ts_min).cast("long"), sf.col("amount"))).alias("timestamps_amounts"),
)
# Keep the Spark version cached on disk (used later for the 1-hop features) and
# also pull a pandas copy to the driver.
data_graph_agg_sdf = data_graph_agg.persist(StorageLevel.DISK_ONLY)
print(data_graph_agg_sdf.count())
data_graph_agg = data_graph_agg_sdf.toPandas()

                                                                                

8177437


                                                                                

CPU times: user 16.8 s, sys: 5.4 s, total: 22.2 s
Wall time: 2min 40s


In [16]:
%%time

print("Communities features creation")

# Directed igraph carrying the aggregated edge attributes.
graph = ig.Graph.DataFrame(data_graph_agg, use_vids=False, directed=True)
# Project helper that computes the per-community features.
features = generate_features_spark(communities, graph, spark)
# Prefix every feature column except the join key.
features.columns = [f"{s.G_COMM_PREFIX}{x}" if x != "key" else x for x in features.columns]

del graph
del communities
del data_graph_agg

Communities features creation


                                                                                

CPU times: user 30min 26s, sys: 14min 47s, total: 45min 13s
Wall time: 1h 18min 48s


In [17]:
%%time

print("1-hop-source features creation")

# Group the aggregated edges by their source node and run the project UDF per group.
features_source = data_graph_agg_sdf.withColumn("key", sf.col("source")).groupby("key").applyInPandas(
    generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF
).toPandas()
# The UDF output carries a JSON string per row in "features"; expand it into columns.
features_source = pd.DataFrame(features_source["features"].apply(json.loads).tolist())
# Apply the declared dtypes for the columns that are present.
types = {k: v for k, v in FEATURE_TYPES.items() if k in features_source.columns}
features_source = features_source.astype(types)
# Prefix every feature column except the join key.
features_source.columns = [f"{s.G_1HOP_PREFIX}{x}" if x != "key" else x for x in features_source.columns]

1-hop-source features creation


                                                                                

CPU times: user 10.3 s, sys: 1.99 s, total: 12.3 s
Wall time: 23min 18s


In [18]:
%%time

print("1-hop-target features creation")

# Group the aggregated edges by their target node and run the project UDF per group.
features_target = data_graph_agg_sdf.withColumn("key", sf.col("target")).groupby("key").applyInPandas(
    generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF
).toPandas()
# The UDF output carries a JSON string per row in "features"; expand it into columns.
features_target = pd.DataFrame(features_target["features"].apply(json.loads).tolist())
# Apply the declared dtypes for the columns that are present.
types = {k: v for k, v in FEATURE_TYPES.items() if k in features_target.columns}
features_target = features_target.astype(types)
# Prefix every feature column except the join key.
features_target.columns = [f"{s.G_1HOP_PREFIX}{x}" if x != "key" else x for x in features_target.columns]

1-hop-target features creation


                                                                                

CPU times: user 8.43 s, sys: 992 ms, total: 9.43 s
Wall time: 18min 18s


In [20]:
%%time

all_features = features.set_index("key").join(
    features_source.set_index("key"), how="left", rsuffix=f"_1_hop_as_source"
)
all_features.index.name = "key"
all_features = all_features.reset_index()

all_features = all_features.set_index("key").join(
    features_target.set_index("key"), how="left", rsuffix=f"_1_hop_as_target"
)

all_features = all_features.join(
    pd.read_parquet(location_comm_as_source_features), how="left", rsuffix="_dispense"
).join(
    pd.read_parquet(location_comm_as_target_features), how="left", rsuffix="_sink"
).join(
    pd.read_parquet(location_comm_as_passthrough_features), how="left", rsuffix="_passthrough"
).join(
    pd.read_parquet(location_comm_as_passthrough_features_reverse), how="left", rsuffix="_passthrough_rev"
).join(
    pd.read_parquet(location_flow_dispense), how="left", rsuffix="_dispense"
).join(
    pd.read_parquet(location_flow_passthrough), how="left", rsuffix="_passthrough"
).join(
    pd.read_parquet(location_flow_sink), how="left", rsuffix="_sink"
)

all_features.to_parquet(location_features_node_level)
del all_features

CPU times: user 20.8 s, sys: 3.57 s, total: 24.4 s
Wall time: 21.9 s


In [21]:
# Reload the node-level feature table that was written to disk.
all_features = pd.read_parquet(location_features_node_level)

In [22]:
# Unsupervised anomaly score per node from an Isolation Forest fitted on the
# node-level features (missing values replaced by 0).
# The filled frame is built once and reused for fit and scoring, instead of copying
# the full feature table twice.
all_features_filled = all_features.fillna(0)
# Fixed seed so that the scores (and everything derived from them) are reproducible.
isolation_forest = IsolationForest(random_state=42).fit(all_features_filled)
anomalies = pd.DataFrame(
    {"anomaly_score": isolation_forest.decision_function(all_features_filled)},
    index=all_features.index,
)
del all_features_filled
del isolation_forest
# Shift the scores up by the magnitude of the smallest one.
anomalies.loc[:, "anomaly_score"] += abs(anomalies.loc[:, "anomaly_score"].min())

In [23]:
# Number of principal components depends on the dataset size; anything that is not
# "Small" or "Medium" uses 5.
n_components = {"Small": 50, "Medium": 20}.get(s.FILE_SIZE, 5)

# Rows are L1-normalised (missing values replaced by 0) before the projection.
# `random_state` only matters when scikit-learn selects a randomised solver; fixing
# it keeps the components reproducible between runs in that case.
pca = PCA(n_components=n_components, random_state=42)
all_features_dim_reduced = pd.DataFrame(
    pca.fit_transform(normalize(all_features.fillna(0), norm="l1", axis=1)),
    index=all_features.index
)
# Report the component count and the explained variance in percent.
print(n_components, round(sum(pca.explained_variance_ratio_) * 100, 2))
# Name the components pca_1 ... pca_n.
all_features_dim_reduced.columns = [
    f"pca_{x + 1}" for x in all_features_dim_reduced.columns
]
del all_features

5 93.55


In [24]:
%%time

print(f"Generating edge features")

to_select = ["source", "target", "format", "source_currency", "source_amount", "amount", "timestamp"]

# Pre-aggregate per (source, target, format, currency): summed amounts and the first /
# last transaction time as unix timestamps. Cached on disk and materialised by count().
edges_features_input = data.select(to_select).groupby(
    ["source", "target", "format", "source_currency"]
).agg(
    sf.sum("source_amount").alias("source_amount"), 
    sf.sum("amount").alias("amount"),
    sf.unix_timestamp(sf.min("timestamp")).alias("min_ts"),
    sf.unix_timestamp(sf.max("timestamp")).alias("max_ts"),
).persist(StorageLevel.DISK_ONLY)
_ = edges_features_input.count()

# Run the project UDF once per (source, target) edge and expand its JSON output
# into one column per feature.
edge_features = edges_features_input.groupby(["source", "target"]).applyInPandas(
    get_edge_features_udf, schema=SCHEMA_FEAT_UDF
).toPandas()
edge_features = pd.DataFrame(edge_features["features"].apply(json.loads).tolist())

edge_features.to_parquet(location_features_edges)
del edge_features

Generating edge features


                                                                                

CPU times: user 25.6 s, sys: 4.68 s, total: 30.3 s
Wall time: 28min 58s


In [25]:
# Reload the edge feature table that was written to disk.
edge_features = pd.read_parquet(location_features_edges)

In [26]:
%%time

train_edges = train.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)
valid_edges = validation.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)
test_edges = test.select("source", "target").drop_duplicates().toPandas().set_index(
    ["source", "target"]
)

train_features = train_edges.join(
    edge_features.set_index(["source", "target"]), how="left"
).reset_index()
validation_features = valid_edges.join(
    edge_features.set_index(["source", "target"]), how="left"
).reset_index()
test_features = test_edges.join(
    edge_features.set_index(["source", "target"]), how="left"
).reset_index()

                                                                                

CPU times: user 37.7 s, sys: 3.31 s, total: 41.1 s
Wall time: 1min 50s


In [27]:
def save_edge_features(features_in, location):
    """Attach node-level anomaly scores and PCA components to edges and save them.

    Each edge row is joined, first on "target" and then on "source", with the global
    `anomalies` frame and afterwards with the global `all_features_dim_reduced` frame.
    Columns coming from the source-side join carry the "_source" suffix. Four columns
    combining the two anomaly scores (difference, minimum, maximum, mean) are added
    before the frame is written to `location` as parquet.

    Args:
        features_in: pandas DataFrame with "source" and "target" columns.
        location: path of the parquet file to write.
    """
    enriched = features_in
    for node_level in (anomalies, all_features_dim_reduced):
        # Target-side columns keep their names ...
        enriched = enriched.set_index("target").join(node_level, how="left").reset_index()
        # ... source-side columns are suffixed.
        enriched = enriched.set_index("source").join(
            node_level, how="left", rsuffix="_source"
        ).reset_index()

    # Row 0: target score, row 1: source score.
    score_pair = np.vstack(
        [enriched["anomaly_score"].values, enriched["anomaly_score_source"].values]
    )
    enriched.loc[:, "anom_scores_diff"] = enriched["anomaly_score"] - enriched["anomaly_score_source"]
    enriched.loc[:, "anom_scores_min"] = score_pair.min(axis=0)
    enriched.loc[:, "anom_scores_max"] = score_pair.max(axis=0)
    enriched.loc[:, "anom_scores_mean"] = score_pair.mean(axis=0)
    enriched.to_parquet(location)

In [28]:
%%time

# Enrich and store the edge features of the training split.
save_edge_features(train_features, location_features_edges_train)

CPU times: user 20.7 s, sys: 3.72 s, total: 24.4 s
Wall time: 24.2 s


In [29]:
%%time

# Enrich and store the edge features of the validation split.
save_edge_features(validation_features, location_features_edges_valid)

CPU times: user 14.4 s, sys: 2.45 s, total: 16.8 s
Wall time: 16.4 s


In [30]:
%%time

# Enrich and store the edge features of the test split.
save_edge_features(test_features, location_features_edges_test)

CPU times: user 14.4 s, sys: 2.33 s, total: 16.8 s
Wall time: 16.4 s


In [31]:
def save_trx_features(data_in, location):
    """Write the per-transaction columns of one split to parquet.

    Selects the transaction columns from `data_in`, converts them to pandas, adds the
    boolean "inter_currency" column (source and target currency differ) and writes the
    result to `location`.

    Args:
        data_in: Spark DataFrame holding the transactions of one split.
        location: path of the parquet file to write.
    """
    trx_features = data_in.select(
        "source", "target", "source_currency", "target_currency", "format", "amount", "is_laundering"
    ).toPandas()
    trx_features["inter_currency"] = trx_features["source_currency"].ne(
        trx_features["target_currency"]
    )
    trx_features.to_parquet(location)

In [32]:
%%time

# Store the transaction-level features of every split.
save_trx_features(train, location_train_trx_features)
save_trx_features(validation, location_valid_trx_features)
save_trx_features(test, location_test_trx_features)

                                                                                

CPU times: user 1min 25s, sys: 12.9 s, total: 1min 38s
Wall time: 2min 10s


In [33]:
# To free up memory for training

# Names defined now, minus the snapshot taken in `to_keep` (and minus `to_keep`
# itself), are removed from the interactive namespace.
to_reset = %who_ls
to_reset = list(to_reset)
to_reset.remove("to_keep")
to_reset = set(to_reset) - set(to_keep)
for var_to_reset in list(to_reset):
    # %reset_selective takes a regex; anchor it so only the exact name matches.
    var_to_reset = f"^{var_to_reset}$"
    %reset_selective -f {var_to_reset}

# NOTE(review): project helper from `common`; its behaviour is not visible here.
delete_large_vars(globals(), locals())

True

In [34]:
def combine_features(location_features_trx, location_features_edges, location_features):
    """Join transaction features with their edge features and save the model input.

    The transaction parquet is left-joined with the edge parquet on (source, target);
    the join keys are dropped afterwards. The result is written by Spark as a single
    parquet part into a temporary folder, read back with pandas, missing values are
    replaced by 0, float64 columns are downcast to float32, the currency / format
    columns become categoricals, and the frame is written to `location_features`.

    Uses the notebook-level `spark` session.

    Args:
        location_features_trx: parquet path of the per-transaction features.
        location_features_edges: parquet path of the per-edge features.
        location_features: parquet path to write the combined features to.
    """
    columns_category = ["source_currency", "target_currency", "format"]
    new_types = {column: "category" for column in columns_category}
    temp_location = "temp-spark.parquet"

    edge_features_input = spark.read.parquet(location_features_edges)
    # Copy the join keys to "*_left" names so both sides can be referenced in the
    # join condition and dropped afterwards without ambiguity.
    trx_features_input = spark.read.parquet(location_features_trx).withColumn(
        "source_left", sf.col("source")
    ).withColumn(
        "target_left", sf.col("target")
    ).drop("source", "target")
    joined = trx_features_input.join(
        edge_features_input,
        (trx_features_input["source_left"] == edge_features_input["source"]) &
        (trx_features_input["target_left"] == edge_features_input["target"]),
        how="left"
    ).drop("source_left", "target_left", "source", "target")

    try:
        # `write.parquet` returns None, so its result is intentionally not bound.
        joined.coalesce(1).write.parquet(temp_location, mode="overwrite")
        features_input = pd.read_parquet(temp_location).fillna(0)
    finally:
        # Do not leave the intermediate Spark output behind.
        shutil.rmtree(temp_location, ignore_errors=True)

    for column in features_input.columns:
        if features_input[column].dtype == np.float64:
            new_types[column] = np.float32
    features_input = features_input.astype(new_types)
    features_input.to_parquet(location_features)

In [35]:
%%time

# Build the final model input of every split.
combine_features(location_train_trx_features, location_features_edges_train, location_train_features)
combine_features(location_valid_trx_features, location_features_edges_valid, location_valid_features)
combine_features(location_test_trx_features, location_features_edges_test, location_test_features)

25/07/12 14:09:48 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

CPU times: user 2min 45s, sys: 1min 19s, total: 4min 5s
Wall time: 9min 49s


In [36]:
# Remove the staging folder named by the project constant (missing folder is not an error).
shutil.rmtree(MULTI_PROC_STAGING_LOCATION, ignore_errors=True)

In [37]:
# Whole minutes elapsed since `start`; then restart the timer.
print((time.time() - start) // 60)
start = time.time()

257.0


In [38]:
# Load the combined feature tables of the three splits into pandas.
train_features = pd.read_parquet(location_train_features)
validation_features = pd.read_parquet(location_valid_features)
test_features = pd.read_parquet(location_test_features)

In [39]:
%%time

missing_columns = (
    (set(train_features.columns).symmetric_difference(validation_features.columns)) |
    (set(train_features.columns).symmetric_difference(test_features.columns)) |
    (set(test_features.columns).symmetric_difference(validation_features.columns))
)
for column in missing_columns:
    if missing in train_features.columns:
        print(f"Deleting missing column from train: {column}")
        del train_features[column]
    if missing in validation_features.columns:
        print(f"Deleting missing column from validation: {column}")
        del validation_features[column]
    if missing in test_features.columns:
        print(f"Deleting missing column from test: {column}")
        del test_features[column]

train_features_labels = train_features.loc[:, ["is_laundering"]].copy(deep=True)
del train_features["is_laundering"]

validation_features_labels = validation_features.loc[:, ["is_laundering"]].copy(deep=True)
validation_features = validation_features.loc[:, train_features.columns]

test_features_labels = test_features.loc[:, ["is_laundering"]].copy(deep=True)
test_features = test_features.loc[:, train_features.columns]

CPU times: user 1.4 s, sys: 2.82 s, total: 4.22 s
Wall time: 6.24 s


In [40]:
def f1_eval(y, y_):
    """Evaluation metric: 1 - F1, so a perfect F1 score yields 0.

    Args:
        y: true labels.
        y_: predicted values; rounded to the nearest integer before scoring.
    """
    hard_predictions = np.round(y_)
    return 1 - f1_score(y, hard_predictions)


def train_model(x, y, x_, y_, cv=False):
    """Fit an XGBoost classifier with early stopping on an evaluation set.

    Args:
        x: training features.
        y: training labels.
        x_: evaluation features used for early stopping.
        y_: evaluation labels used for early stopping.
        cv: when truthy, train at most 100 trees without per-round logging;
            otherwise train at most 500 trees with logging.

    Returns:
        The fitted `xgb.XGBClassifier`. Its best iteration is printed.
    """
    # The two modes differ only in the tree budget and in verbosity.
    hyper_params = dict(
        early_stopping_rounds=20,
        scale_pos_weight=5,
        eval_metric=f1_eval,
        disable_default_eval_metric=True,
        num_parallel_tree=1,
        max_depth=6,
        colsample_bytree=1,
        subsample=1,
        n_estimators=100 if cv else 500,
        enable_categorical=True,
    )
    model = xgb.XGBClassifier(**hyper_params)
    model.fit(x, y, eval_set=[(x_, y_)], verbose=not cv)
    print(f"Best iteration: {model.best_iteration}\n")
    return model

In [41]:
%%time

# Train on the full training split with early stopping on the validation split,
# then report F1 and recall (both in percent) on the test split.
model = train_model(
    train_features, train_features_labels["is_laundering"].values, 
    validation_features, validation_features_labels["is_laundering"].values,
)
y_test_predicted = model.predict(test_features)
f1_final = f1_score(test_features_labels["is_laundering"], y_test_predicted) * 100
print(
    round(f1_final, 2),
    round(recall_score(test_features_labels["is_laundering"], y_test_predicted) * 100, 2)
)
print()

[0]	validation_0-f1_eval:1.00000
[1]	validation_0-f1_eval:0.99704
[2]	validation_0-f1_eval:0.99704
[3]	validation_0-f1_eval:0.99506
[4]	validation_0-f1_eval:0.88028
[5]	validation_0-f1_eval:0.86268
[6]	validation_0-f1_eval:0.86089
[7]	validation_0-f1_eval:0.85624
[8]	validation_0-f1_eval:0.85533
[9]	validation_0-f1_eval:0.85548
[10]	validation_0-f1_eval:0.85296
[11]	validation_0-f1_eval:0.85227
[12]	validation_0-f1_eval:0.84973
[13]	validation_0-f1_eval:0.84943
[14]	validation_0-f1_eval:0.84867
[15]	validation_0-f1_eval:0.84860
[16]	validation_0-f1_eval:0.84225
[17]	validation_0-f1_eval:0.84151
[18]	validation_0-f1_eval:0.84223
[19]	validation_0-f1_eval:0.83910
[20]	validation_0-f1_eval:0.83820
[21]	validation_0-f1_eval:0.83831
[22]	validation_0-f1_eval:0.83781
[23]	validation_0-f1_eval:0.83766
[24]	validation_0-f1_eval:0.83773
[25]	validation_0-f1_eval:0.84362
[26]	validation_0-f1_eval:0.83802
[27]	validation_0-f1_eval:0.83775
[28]	validation_0-f1_eval:0.83761
[29]	validation_0-f1_eva

In [42]:
%%time

# Fraction of the train / validation rows sampled in each repetition, and the
# number of repetitions.
CV_FOLD_PERC = 0.8
N_FOLDS = 5

# Each "fold" draws a random subsample of the training and validation data, fits a
# model on it and scores that model on the full test split. The spread of these
# scores is reported later as the uncertainty of the final F1.
# NOTE(review): `sample` is called without `random_state`, so the folds (and the
# reported std) change between runs — consider seeding.
f1_scores = []
for fold in range(N_FOLDS):
    print("Fold", fold + 1)
    x_train = train_features.sample(frac=CV_FOLD_PERC)
    # Look up the labels of the sampled rows through the shared index.
    x_train_labels = x_train.loc[:, []].join(train_features_labels, how="left")
    x_validation = validation_features.sample(frac=CV_FOLD_PERC)
    x_validation_labels = x_validation.loc[:, []].join(validation_features_labels, how="left")
    model = train_model(
        x_train, x_train_labels["is_laundering"].values, 
        x_validation, x_validation_labels["is_laundering"].values,
        cv=True
    )
    y_test_predicted = model.predict(test_features)
    f1_cv = f1_score(test_features_labels["is_laundering"], y_test_predicted) * 100
    print(
        round(f1_cv, 2),
        round(recall_score(test_features_labels["is_laundering"], y_test_predicted) * 100, 2)
    )
    f1_scores.append(f1_cv)

Fold 1
Best iteration: 99

29.2 18.98
Fold 2
Best iteration: 87

27.96 18.24
Fold 3
Best iteration: 93

28.54 18.69
Fold 4
Best iteration: 98

28.86 18.87
Fold 5
Best iteration: 99

28.94 18.99
CPU times: user 2h 25min 40s, sys: 7min 3s, total: 2h 32min 44s
Wall time: 35min 30s


In [43]:
# Reference result used for the comparison below.
# NOTE(review): presumably the F1 (in percent) and its std of a "GFP" baseline obtained
# elsewhere — the source of these numbers is not shown in this notebook.
gfp_best = 24.23
gfp_std = 0.12

In [44]:
# Show the reference score with its spread.
print(f"GFP best: {gfp_best} ± {gfp_std}")

GFP best: 24.23 ± 0.12


In [45]:
# F1 of the final model, with the standard deviation of the per-fold F1 scores.
print(f"{round(f1_final, 2)} ±{round(np.std(f1_scores), 2)}")

30.35 ±0.43


In [46]:
# Relative improvement of the final F1 over the reference score, in percent.
uplift = round(((f1_final - gfp_best) / gfp_best) * 100, 2)
print(f"Uplift of {uplift}%")

Uplift of 25.25%


In [47]:
# Whole minutes elapsed since the timer was last restarted.
print((time.time() - start) // 60)

51.0
