### https://arxiv.org/pdf/2402.08593

In [1]:
import json
import random
import os
import pickle
import time
import shutil
import sys
import uuid
from collections import defaultdict
from datetime import timedelta, datetime
from glob import glob
from itertools import product
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, recall_score, RocCurveDisplay

import igraph as ig
import leidenalg as la
import numpy as np
import pandas as pd
import xgboost as xgb

import settings as s

os.environ["EXT_DATA_TYPE_FOLDER"] = "ethereum"

from common import get_weights, delete_large_vars, MULTI_PROC_STAGING_LOCATION
from communities import get_communities_spark
from features import (
    generate_features_spark, generate_features_udf_wrapper,
    SCHEMA_FEAT_UDF
)

%load_ext autoreload
%autoreload 2

In [2]:
# Reference point for the elapsed-time report printed by a later cell.
start_script = time.time()

In [3]:
# Abort unless the interpreter is exactly version 3.11.3 (major, minor and micro all match).
if sys.version_info[:3] != (3, 11, 3):
    raise EnvironmentError("Only runs efficiently on Python 3.11.3")

In [4]:
# Key/value settings handed to SparkConf.setAll() for the session created below.
SPARK_CONF = [
    ("spark.driver.memory", "32g"),
    ("spark.worker.memory", "32g"),
    ("spark.driver.maxResultSize", "32g"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.network.timeout", "600s"),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
    ("spark.local.dir", os.path.join(os.curdir, "spark-temp")),
]

# Remove what earlier runs left behind before a new session starts writing there again.
for stale_dir in ("artifacts", "spark-temp"):
    shutil.rmtree(stale_dir, ignore_errors=True)

session_builder = SparkSession.builder.appName("testing")
spark = session_builder.config(conf=SparkConf().setAll(SPARK_CONF)).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/17 22:25:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/17 22:25:02 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [5]:
# Share of accounts (ordered by first activity) assigned to the train and validation splits;
# the accounts left over after those two form the test split.
TRAIN_PERC = 0.65
VALIDATION_PERC = 0.15
TEST_PERC = 0.2

# Maximum number of heaviest neighbours kept per node when neighbourhoods are built.
KEEP_TOP_N = 100

# Compare with a tolerance: an exact `== 1` on a sum of binary floats only holds by luck and
# would start failing for other, equally valid fractions (e.g. 0.7 / 0.1 / 0.2).
assert abs(TRAIN_PERC + VALIDATION_PERC + TEST_PERC - 1.0) < 1e-9

location_main = os.path.join("features", os.environ["EXT_DATA_TYPE_FOLDER"])
# shutil.rmtree(location_main, ignore_errors=True)


def _in_main(file_name):
    """Return the location of `file_name` directly inside `location_main`."""
    return f"{location_main}{os.sep}{file_name}"


# Communities
location_communities_leiden = _in_main("communities_leiden.parquet")

# Community / neighbourhood level features
location_features_leiden = _in_main("features_leiden.parquet")
location_features_ego = _in_main("features_ego.parquet")
location_features_2_hop = _in_main("features_2_hop.parquet")
location_features_2_hop_out = _in_main("features_2_hop_out.parquet")
location_features_2_hop_in = _in_main("features_2_hop_in.parquet")
location_features_2_hop_combined = _in_main("features_2_hop_combined.parquet")
location_features_source = _in_main("features_source.parquet")
location_features_target = _in_main("features_target.parquet")

# Temporal flow statistics
location_flow_dispense = _in_main("location_flow_dispense.parquet")
location_flow_passthrough = _in_main("location_flow_passthrough.parquet")
location_flow_sink = _in_main("location_flow_sink.parquet")

# Multi-hop flow features
location_comm_as_source_features = _in_main("comm_as_source_features.parquet")
location_comm_as_target_features = _in_main("comm_as_target_features.parquet")
location_comm_as_passthrough_features = _in_main("comm_as_passthrough_features.parquet")
location_comm_as_passthrough_features_reverse = _in_main("comm_as_passthrough_features_reverse.parquet")

# Joined node-level features and aggregated edge features
location_features_node_level = _in_main("features_node_level.parquet")
location_features_edges = _in_main("features_edges.parquet")

# Edge features per split
location_features_edges_train = _in_main("features_edges_train.parquet")
location_features_edges_valid = _in_main("features_edges_valid.parquet")
location_features_edges_test = _in_main("features_edges_test.parquet")

# Transaction-level features per split
location_train_trx_features = _in_main("train_trx_features.parquet")
location_valid_trx_features = _in_main("valid_trx_features.parquet")
location_test_trx_features = _in_main("test_trx_features.parquet")

# Final combined features per split
location_train_features = _in_main("train_features.parquet")
location_valid_features = _in_main("valid_features.parquet")
location_test_features = _in_main("test_features.parquet")

# Create the output folder (and any missing parents); an already existing folder is fine.
os.makedirs(location_main, exist_ok=True)

In [6]:
data = pd.read_parquet(s.INPUT_DATA_FILE)
# Only the receiving side ("target") of a flagged transaction is collected as a phishing node
phishing_nodes = set(data[data["is_phishing"]]["target"].unique())
# Guard against running on a different input file than expected
assert len(phishing_nodes) == 1164

In [7]:
%%time

source_firsts = data.groupby("source").agg(first_trx=("timestamp", "min"))
target_firsts = data.groupby("target").agg(first_trx=("timestamp", "min"))
active_since = source_firsts.join(target_firsts, lsuffix="_left", how="outer").fillna(datetime.now())
active_since.loc[:, "active_since"] = active_since.apply(lambda x: min([x["first_trx_left"], x["first_trx"]]), axis=1)
active_since = active_since.loc[:, ["active_since"]]
active_since.sort_values("active_since", inplace=True)

number_of_train_accounts = int(np.floor(active_since.shape[0] * TRAIN_PERC))
number_of_validation_accounts = int(np.floor(active_since.shape[0] * VALIDATION_PERC))
train_accounts = set(active_since.head(number_of_train_accounts).index.tolist())
assert len(train_accounts) == number_of_train_accounts
remaining = active_since.loc[~active_since.index.isin(train_accounts), :].sort_values("active_since")
validation_accounts = set(remaining.head(number_of_validation_accounts).index.tolist())
assert len(validation_accounts) == number_of_validation_accounts
test_accounts = set(active_since.index) - train_accounts - validation_accounts
print(f"{len(train_accounts):,} | {len(validation_accounts):,} | {len(test_accounts):,}")
assert sorted(train_accounts | validation_accounts | test_accounts) == sorted(active_since.index)

1,932,767 | 446,023 | 594,699
CPU times: user 1min 23s, sys: 793 ms, total: 1min 23s
Wall time: 1min 24s


In [8]:
def _rows_within(accounts):
    """Return the transactions whose source and target both belong to `accounts`."""
    both_inside = data["source"].isin(accounts) & data["target"].isin(accounts)
    return data.loc[both_inside, :]


train = _rows_within(train_accounts)
validation = _rows_within(validation_accounts)
test = _rows_within(test_accounts)
# Share of all transactions that ends up in each split
print(*(round(part.shape[0] / data.shape[0], 2) for part in (train, validation, test)))
train_count, validation_count, test_count = len(train), len(validation), len(test)

# No transaction may appear in more than one split
assert not set(train.index) & set(validation.index)
assert not set(validation.index) & set(test.index)
assert not set(train.index) & set(test.index)

0.74 0.05 0.03


In [9]:
%%time

edges = data.groupby(["source", "target"]).agg(
    amount_usd=("amount_usd", "sum")
).reset_index()
weights = get_weights(edges)
edges_agg = edges.set_index(["source", "target"]).join(
    weights.set_index(["source", "target"]), how="left"
).reset_index()
edges_agg.loc[:, "amount_usd_weighted"] = (
    edges_agg.loc[:, "amount_usd"] * 
    (edges_agg.loc[:, "weight"] / edges_agg.loc[:, "weight"].max())
)

CPU times: user 1min 22s, sys: 2.19 s, total: 1min 24s
Wall time: 1min 24s


In [10]:
# Later on, we will reset the variables (to free up memory), while still keeping these intact
# `%who_ls` yields the names of the interactive variables defined up to this point.
to_keep = %who_ls
to_keep = list(to_keep)

In [11]:
# Number of top-ranked edges kept per node when the temporal-flow edges are selected.
TOP_N = 50
# Not referenced in this notebook itself; presumably read by generate_flow_features.ipynb — TODO confirm.
NUM_HOPS = 5

In [12]:
%%time

data_input = spark.createDataFrame(edges_agg)
nodes_source = set(edges_agg["source"].unique())
nodes_target = set(edges_agg["target"].unique())
nodes_passthrough = nodes_source.intersection(nodes_target)

%run generate_flow_features.ipynb

comm_as_source_features.to_parquet(location_comm_as_source_features)
comm_as_target_features.to_parquet(location_comm_as_target_features)
comm_as_passthrough_features.to_parquet(location_comm_as_passthrough_features)
comm_as_passthrough_features_reverse.to_parquet(location_comm_as_passthrough_features_reverse)

del comm_as_source_features
del comm_as_target_features
del comm_as_passthrough_features
del comm_as_passthrough_features_reverse

                                                                                

data_agg 5355155


                                                                                


Processing comm_as_source



                                                                                

Processed hop #1 | 4,097,207 | 2,113,092


                                                                                

Processed hop #2 | 28,187,498 | 1,061,380


25/08/17 12:23:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:23:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:23:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:23:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:23:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:23:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:23:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:23:33 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:23:33 WARN RowBasedKeyValueBatch: Calling spill() on

Processed hop #3 | 27,947,678 | 905,405


25/08/17 12:26:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:26:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:26:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:26:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:26:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:26:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:26:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:26:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:26:36 WARN RowBasedKeyValueBatch: Calling spill() on

Processed hop #4 | 33,630,298 | 900,985


25/08/17 12:31:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:31:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:31:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:31:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:31:12 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:31:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:31:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:31:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/08/17 12:31:13 WARN RowBasedKeyValueBatch: Calling spill() on

Processed hop #5 | 33,279,305 | 889,540

Processing comm_as_target



                                                                                

Processed hop #1 | 1,595,960 | 1,119,024


                                                                                

Processed hop #2 | 36,986,916 | 1,109,886


                                                                                

Processed hop #3 | 36,884,480 | 1,068,444


                                                                                

Processed hop #4 | 52,038,968 | 1,082,890


                                                                                

Processed hop #5 | 52,261,072 | 1,064,713

Processing comm_as_passthrough



                                                                                

Processed hop #1 | 1,214,666 | 258,627


                                                                                

Processed hop #2 | 5,402,821 | 182,194


                                                                                

Processed hop #3 | 5,449,649 | 159,934


                                                                                

Processed hop #4 | 6,521,836 | 157,566


                                                                                

Processed hop #5 | 6,354,889 | 156,225

Processing comm_as_passthrough_reverse



                                                                                

Processed hop #1 | 537,548 | 258,628


                                                                                

Processed hop #2 | 8,850,094 | 250,407


                                                                                

Processed hop #3 | 9,263,090 | 243,777


                                                                                

Processed hop #4 | 11,708,036 | 246,105


                                                                                

Processed hop #5 | 12,045,187 | 242,996


comm_as_source_features



                                                                                

CPU times: user 11 s, sys: 1.73 s, total: 12.8 s
Wall time: 30.5 s

comm_as_target_features



                                                                                

CPU times: user 6.12 s, sys: 1 s, total: 7.12 s
Wall time: 22.5 s

comm_as_passthrough_features



                                                                                

CPU times: user 3.55 s, sys: 531 ms, total: 4.08 s
Wall time: 8.76 s

comm_as_passthrough_features_reverse



                                                                                

CPU times: user 7.56 s, sys: 1.26 s, total: 8.81 s
Wall time: 14 s


CPU times: user 50.8 s, sys: 8.26 s, total: 59.1 s
Wall time: 31min 52s


In [13]:
%%time

# TODO: Use https://docs.rapids.ai/api/cugraph/legacy/api_docs/api/cugraph/cugraph.leiden/ ?

print("Constructing Leiden communities")

graph = ig.Graph.DataFrame(edges_agg.loc[:, ["source", "target", "amount_usd_weighted"]], use_vids=False, directed=True)
nodes_mapping = {x.index: x["name"] for x in graph.vs()}
communities_leiden = la.find_partition(
    graph, la.ModularityVertexPartition, n_iterations=100, weights="amount_usd_weighted"
)
communities_leiden = [[nodes_mapping[_] for _ in x] for x in communities_leiden]
communities_leiden = [(str(uuid.uuid4()), set(x)) for x in communities_leiden]
sizes_leiden = [len(x[1]) for x in communities_leiden]

with open(location_communities_leiden, "wb") as fl:
    pickle.dump(communities_leiden, fl)

Constructing Leiden communities
CPU times: user 2h 10min 31s, sys: 29.8 s, total: 2h 11min
Wall time: 2h 10min 57s


In [12]:
# Reload the pickled Leiden communities written by the previous cell.
# NOTE: pickle.load executes arbitrary code from the file; only load files produced by this notebook.
with open(location_communities_leiden, "rb") as fl:
    communities_leiden = pickle.load(fl)

In [26]:
# From here on `data` is a Spark DataFrame; the pandas original is no longer reachable under this name.
data = spark.createDataFrame(data)

In [14]:
%%time

data_agg_weights = get_weights(
    data.groupby(["source", "target"])
    .agg(
        sf.sum("amount_usd").alias("amount_usd")
    ).toPandas()
)

data_agg_weights_rev = data_agg_weights.rename(
    columns={"target": "source", "source": "target"}
).loc[:, ["source", "target", "weight"]]
data_agg_weights_ud = pd.concat([data_agg_weights, data_agg_weights_rev], ignore_index=True)
data_agg_weights_ud = data_agg_weights_ud.groupby(["source", "target"]).agg(weight=("weight", "sum")).reset_index()

data_agg_weights_ud.sort_values("weight", ascending=False, inplace=True)
grouped_ud = data_agg_weights_ud.groupby("source").head(KEEP_TOP_N).reset_index(drop=True)
grouped_ud = grouped_ud.groupby("source").agg(targets=("target", set))

total = grouped_ud.index.nunique()
nodes_neighborhoods = {}
for index, (source, targets) in enumerate(grouped_ud.iterrows()):
    community_candidates = {source}
    for target in targets["targets"]:
        community_candidates |= (grouped_ud.loc[target, "targets"] | {target})
    nodes_neighborhoods[source] = set(community_candidates)
    if not (index % 250_000):
        print(index, total)

del data_agg_weights_rev
del data_agg_weights_ud
del grouped_ud

                                                                                

0 2973489
250000 2973489
500000 2973489
750000 2973489
1000000 2973489
1250000 2973489
1500000 2973489
1750000 2973489
2000000 2973489
2250000 2973489
2500000 2973489
2750000 2973489
CPU times: user 6min 8s, sys: 14.4 s, total: 6min 23s
Wall time: 14min 48s


In [15]:
%%time

print("Constructing 2-hop communities")

communities_2_hop = get_communities_spark(
    nodes_neighborhoods,
    ig.Graph.DataFrame(edges_agg.loc[:, ["source", "target", "amount_usd_weighted"]], use_vids=False, directed=True), 
    os.cpu_count(), spark, 2, "all", 0.01, "amount_usd_weighted"
)
sizes_2_hop = [len(x[1]) for x in communities_2_hop]
del nodes_neighborhoods

Constructing 2-hop communities


                                                                                

CPU times: user 2min 39s, sys: 5.17 s, total: 2min 44s
Wall time: 21min 10s


In [16]:
%%time

ts_min = data.select(sf.min("timestamp").alias("x")).collect()[0]["x"] - timedelta(minutes=1)
data_graph_agg = data.groupby(["source", "target"]).agg(
    sf.sum("num_transactions").alias("num_transactions"),
    sf.sum("amount_usd").alias("amount_usd"),
    sf.collect_list(sf.array((sf.col("timestamp") - ts_min).cast("long"), sf.col("amount_usd"))).alias("timestamps_amounts"),
)
data_graph_agg_sdf = data_graph_agg.persist(StorageLevel.DISK_ONLY)
print(data_graph_agg_sdf.count())
data_graph_agg = data_graph_agg_sdf.toPandas()
index = ["source", "target"]
edges_agg.loc[:, index + ["weight"]].set_index(index)
data_graph_agg = data_graph_agg.set_index(index).join(
    edges_agg.loc[:, index + ["weight"]].set_index(index), how="left"
).reset_index()
data_graph_agg.loc[:, "amount_usd_weighted"] = (
    data_graph_agg.loc[:, "amount_usd"] * 
    (data_graph_agg.loc[:, "weight"] / data_graph_agg.loc[:, "weight"].max())
)

                                                                                

5355155


                                                                                

CPU times: user 31.7 s, sys: 2.83 s, total: 34.5 s
Wall time: 14min 11s


In [17]:
# Directed graph over the aggregated edges; vertices are identified by name (use_vids=False).
graph = ig.Graph.DataFrame(data_graph_agg, use_vids=False, directed=True)

In [18]:
%%time

print("Leiden communitites features creation")

communities_leiden = [(x, y) for x, y in communities_leiden if len(y) > 1]
features_leiden = generate_features_spark(communities_leiden, data_graph_agg, spark)
features_leiden = features_leiden.rename(columns={"key": "key_fake"})
communities_leiden_dict = dict(communities_leiden)
features_leiden.loc[:, "key"] = features_leiden.loc[:, "key_fake"].apply(lambda x: communities_leiden_dict[x])
features_leiden = features_leiden.explode("key")
del features_leiden["key_fake"]
features_leiden.set_index("key").to_parquet(location_features_leiden)

Leiden communitites features creation


25/08/17 16:56:18 WARN TaskSetManager: Stage 20 contains a task of very large size (1031 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1min 24s, sys: 2.49 s, total: 1min 27s
Wall time: 15min 46s


In [19]:
%%time

print("2-hop communitites features creation")

# Same feature generator as for the Leiden communities; the result is stored indexed by "key".
features_2_hop = generate_features_spark(communities_2_hop, data_graph_agg, spark)
features_2_hop.set_index("key").to_parquet(location_features_2_hop)

2-hop communitites features creation


25/08/17 17:36:49 WARN TaskSetManager: Stage 31 contains a task of very large size (1031 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 3min 9s, sys: 10.8 s, total: 3min 19s
Wall time: 1h 8min 51s


In [20]:
# Drop the notebook's reference to the graph so it can be garbage-collected.
del graph

In [21]:
%%time

print("Temporal flows features creation")

edges_totals = data.select("source", "target", "amount_usd").groupby(
    ["source", "target"]
).agg(sf.count("amount_usd").alias("amount_usd")).toPandas()
edges_totals = edges_totals.sort_values("amount_usd", ascending=False).reset_index(drop=True)
left_edges = spark.createDataFrame(edges_totals.groupby("target").head(TOP_N).loc[:, ["source", "target"]])
right_edges = spark.createDataFrame(edges_totals.groupby("source").head(TOP_N).loc[:, ["source", "target"]])

columns = ["source", "target", "timestamp", "amount_usd"]

left = left_edges.select(sf.col("source").alias("src"), sf.col("target").alias("tgt")).join(
    data.select(*columns),
    on=(sf.col("src") == sf.col("source")) & (sf.col("tgt") == sf.col("target")),
    how="left"
).drop("src", "tgt").persist(StorageLevel.DISK_ONLY)
select = []
for column in left.columns:
    select.append(sf.col(column).alias(f"left_{column}"))
left = left.select(*select)
right = right_edges.select(sf.col("source").alias("src"), sf.col("target").alias("tgt")).join(
    data.select(*columns),
    on=(sf.col("src") == sf.col("source")) & (sf.col("tgt") == sf.col("target")),
    how="left"
).drop("src", "tgt").persist(StorageLevel.DISK_ONLY)

print(left.count(), right.count())

flows_temporal = left.join(
    right,
    (left["left_target"] == right["source"]) &
    (left["left_timestamp"] <= right["timestamp"]),
    how="inner"
).groupby(["left_source", "left_target", "source", "target"]).agg(
    sf.sum("left_amount_usd").alias("left_amount_usd"),
    sf.sum("amount_usd").alias("amount_usd"),
).drop("left_target").select(
    sf.col("left_source").alias("dispense"),
    sf.col("source").alias("passthrough"),
    sf.col("target").alias("sink"),
    sf.least("left_amount_usd", "amount_usd").alias("amount_usd"),
)

aggregate = [
    sf.sum("amount_usd").alias("amount_sum"),
    sf.mean("amount_usd").alias("amount_mean"),
    sf.median("amount_usd").alias("amount_median"),
    sf.max("amount_usd").alias("amount_max"),
    sf.stddev("amount_usd").alias("amount_std"),
    sf.countDistinct("dispense").alias("dispense_count"),
    sf.countDistinct("passthrough").alias("passthrough_count"),
    sf.countDistinct("sink").alias("sink_count"),
]
for flow_location, flow_type in [
    (location_flow_dispense, "dispense"), 
    (location_flow_passthrough, "passthrough"), 
    (location_flow_sink, "sink")
]:
    print(flow_type)
    flows_temporal_stats = flows_temporal.groupby(flow_type).agg(*aggregate).toPandas()
    flows_temporal_cyclic_stats = flows_temporal.where(
        (sf.col("dispense") == sf.col("sink"))
    ).groupby(flow_type).agg(*aggregate).toPandas()
    flows_temporal_stats = flows_temporal_stats.set_index(flow_type).join(
        flows_temporal_cyclic_stats.set_index(flow_type),
        how="left", rsuffix="_cycle"
    )
    flows_temporal_stats.index.name = "key"
    flows_temporal_stats.to_parquet(flow_location)
    del flows_temporal_stats
    del flows_temporal_cyclic_stats

left.unpersist()
right.unpersist()

del edges_totals
del left_edges
del right_edges

Temporal flows features creation


25/08/17 18:25:53 WARN TaskSetManager: Stage 42 contains a task of very large size (6438 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

7874361 11725385
dispense


                                                                                

passthrough


                                                                                

sink


                                                                                

CPU times: user 17.9 s, sys: 658 ms, total: 18.5 s
Wall time: 1h 6min 44s


In [22]:
%%time

print("1-hop-source features creation")

features_source = spark.createDataFrame(data_graph_agg).withColumn(
    "key", sf.col("source")
).repartition(os.cpu_count() * 5, "key").groupby("key").applyInPandas(
    generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF
).toPandas()
features_source = pd.DataFrame(features_source["features"].apply(json.loads).tolist())
features_source.columns = [f"{s.G_1HOP_PREFIX}{x}" if x != "key" else x for x in features_source.columns]
features_source.set_index("key").to_parquet(location_features_source)

1-hop-source features creation


25/08/17 19:24:58 WARN TaskSetManager: Stage 246 contains a task of very large size (1031 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 2min 5s, sys: 2.43 s, total: 2min 8s
Wall time: 20min 40s


In [23]:
%%time

print("1-hop-target features creation")

features_target = spark.createDataFrame(data_graph_agg).withColumn(
    "key", sf.col("target")
).repartition(os.cpu_count() * 5, "key").groupby("key").applyInPandas(
    generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF
).toPandas()
features_target = pd.DataFrame(features_target["features"].apply(json.loads).tolist())
features_target.columns = [f"{s.G_1HOP_PREFIX}{x}" if x != "key" else x for x in features_target.columns]
features_target.set_index("key").to_parquet(location_features_target)

1-hop-target features creation


25/08/17 19:45:39 WARN TaskSetManager: Stage 249 contains a task of very large size (1031 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

CPU times: user 1min 36s, sys: 1.49 s, total: 1min 38s
Wall time: 12min 26s


In [24]:
# The aggregated edge frame is no longer needed once the 1-hop features are written.
del data_graph_agg

In [9]:
# (feature group name, parquet location) pairs that are joined into the node-level feature table.
# The group name is used as the column suffix when a column name already exists in the table.
ENABLED_FEATURES = [
    ("leiden", location_features_leiden),
    ("2_hop", location_features_2_hop),
    ("as_source", location_features_source),
    ("as_target", location_features_target),
    ("comm_as_source_features", location_comm_as_source_features),
    ("comm_as_target_features", location_comm_as_target_features),
    ("comm_as_passthrough_features", location_comm_as_passthrough_features),
    ("comm_as_passthrough_features_reverse", location_comm_as_passthrough_features_reverse),
    ("flow_dispense", location_flow_dispense),
    ("flow_passthrough", location_flow_passthrough),
    ("flow_sink", location_flow_sink),
]

In [10]:
# Start from an empty frame and outer-join every enabled feature group onto it, one at a time.
all_features = pd.DataFrame()
all_features.index.name = "key"

for group_name, parquet_path in ENABLED_FEATURES:
    all_features = all_features.join(
        pd.read_parquet(parquet_path),
        how="outer",
        rsuffix=f"_{group_name}",
    )

all_features.to_parquet(location_features_node_level)
print(f"Features: {all_features.shape}")
# Released here; a later cell reads the table back from disk
del all_features

Features: (2973489, 288)


In [9]:
# Read the joined node-level feature table back from disk.
all_features = pd.read_parquet(location_features_node_level)

In [10]:
# Columns with at most one distinct non-null value carry no information; drop them in place.
constants = [
    column
    for column in all_features.columns
    if all_features[column].nunique(dropna=True) <= 1
]
all_features.drop(columns=constants, inplace=True)
print(f"Deleted {len(constants)} constant columns")

Deleted 14 constant columns


In [11]:
# Per-column median that ignores NaN; used later to fill missing values.
medians = {
    column: np.nanmedian(all_features[column])
    for column in all_features.columns
}

In [12]:
# Whole seconds elapsed since `start_script` was set, printed as H:MM:SS.
delta = round(time.time() - start_script)
print(f"Script executed in {timedelta(seconds=delta)}")

Script executed in 0:03:18


In [13]:
%%time

print("Training the anomaly detection model")

anomalies = all_features.loc[:, []]
model_ad = IsolationForest(n_estimators=1_000)
anomalies.loc[:, "anomaly_score"] = -model_ad.fit(
    all_features.fillna(medians)
).decision_function(all_features.fillna(medians))
anomalies.loc[:, "anomaly_score"] += abs(anomalies.loc[:, "anomaly_score"].min()) + 1e-10
anomalies.loc[:, "anomaly_score"] /= anomalies.loc[:, "anomaly_score"].max()

Training the anomaly detection model
CPU times: user 1min 59s, sys: 18.3 s, total: 2min 18s
Wall time: 2min 10s


In [28]:
%%time

print(f"Generating edge features")

to_select = ["source", "target", "timestamp", "num_transactions", "amount", "amount_usd", "is_zero_transaction"]

edges_features_input = data.select(to_select).groupby(
    ["source", "target"]
).agg(
    sf.sum("num_transactions").alias("num_transactions"), 
    sf.sum("amount").alias("amount"),
    sf.sum("amount_usd").alias("amount_usd"),
    sf.count(sf.when(sf.col("is_zero_transaction"), 1).otherwise(0)).alias("count_zero_transactions"),
    sf.count(sf.when(sf.col("is_zero_transaction"), 0).otherwise(1)).alias("count_non_zero_transactions"),
    (sf.unix_timestamp(sf.max("timestamp")) - sf.unix_timestamp(sf.min("timestamp"))).alias("related_for"),
).persist(StorageLevel.DISK_ONLY)
_ = edges_features_input.count()

edge_features = edges_features_input.toPandas()
edge_features.to_parquet(location_features_edges)
del edge_features

Generating edge features


                                                                                

CPU times: user 4.85 s, sys: 540 ms, total: 5.39 s
Wall time: 17min 22s


In [14]:
# Read the aggregated per-edge features back from disk.
edge_features = pd.read_parquet(location_features_edges)

In [15]:
%%time

train_edges = train.loc[:, ["source", "target"]].drop_duplicates().set_index(
    ["source", "target"]
)
valid_edges = validation.loc[:, ["source", "target"]].drop_duplicates().set_index(
    ["source", "target"]
)
test_edges = test.loc[:, ["source", "target"]].drop_duplicates().set_index(
    ["source", "target"]
)

train_features = train_edges.join(
    edge_features.set_index(["source", "target"]), how="left"
).reset_index()
validation_features = valid_edges.join(
    edge_features.set_index(["source", "target"]), how="left"
).reset_index()
test_features = test_edges.join(
    edge_features.set_index(["source", "target"]), how="left"
).reset_index()

CPU times: user 34.7 s, sys: 1.31 s, total: 36 s
Wall time: 36.1 s


In [16]:
def save_edge_features(features_in, location):
    """Enrich edges with node-level data for both endpoints and write them to parquet.

    For the target and the source of every edge this joins the anomaly score
    (module-level `anomalies`) and the node features (module-level
    `all_features`), then adds the difference, minimum, maximum and mean of
    the two anomaly scores.

    Args:
        features_in: DataFrame with one row per edge and "source" / "target" columns.
        location: Destination passed to `DataFrame.to_parquet`.
    """
    # (endpoint column, node-level frame, suffix for clashing right-hand columns), in join order
    enrichments = (
        ("target", anomalies, ""),
        ("source", anomalies, "_source"),
        ("target", all_features, "_target"),
        ("source", all_features, "_source"),
    )
    for endpoint, node_frame, suffix in enrichments:
        features_in = (
            features_in.set_index(endpoint)
            .join(node_frame, how="left", rsuffix=suffix)
            .reset_index()
        )

    target_scores = features_in["anomaly_score"]
    source_scores = features_in["anomaly_score_source"]
    # Row 0 holds the target scores, row 1 the source scores
    score_pair = np.array([target_scores.values, source_scores.values])

    features_in["anom_scores_diff"] = target_scores - source_scores
    features_in["anom_scores_min"] = score_pair.min(axis=0)
    features_in["anom_scores_max"] = score_pair.max(axis=0)
    features_in["anom_scores_mean"] = score_pair.mean(axis=0)
    features_in.to_parquet(location)

In [17]:
%%time

# Enrich and store the edge features of the training split.
save_edge_features(train_features, location_features_edges_train)

CPU times: user 1min 59s, sys: 23.8 s, total: 2min 23s
Wall time: 2min 18s


In [18]:
%%time

# Enrich and store the edge features of the validation split.
save_edge_features(validation_features, location_features_edges_valid)

CPU times: user 26.9 s, sys: 2.5 s, total: 29.4 s
Wall time: 29.1 s


In [19]:
%%time

# Enrich and store the edge features of the test split.
save_edge_features(test_features, location_features_edges_test)

CPU times: user 24.5 s, sys: 1.55 s, total: 26 s
Wall time: 25.9 s


In [20]:
def save_trx_features(data_in, location):
    """Write the transaction-level columns of ``data_in`` to parquet.

    Args:
        data_in: pandas DataFrame that contains every column selected below.
        location: Destination handed to ``DataFrame.to_parquet``.
    """
    # Only these columns are exported; everything else in data_in is left out.
    data_in.loc[
        :,
        [
            "source",
            "target",
            "amount",
            "amount_usd",
            "is_zero_transaction",
            "source_dispensation",
            "target_accumulation",
            "source_positive_balance",
            "source_negative_balance",
            "target_positive_balance",
            "target_negative_balance",
            "source_active_for",
            "target_active_for",
            "is_phishing",
        ],
    ].to_parquet(location)

In [21]:
%%time

# Export the transaction-level columns of each split to its own parquet location.
save_trx_features(train, location_train_trx_features)
save_trx_features(validation, location_valid_trx_features)
save_trx_features(test, location_test_trx_features)

CPU times: user 7.3 s, sys: 382 ms, total: 7.68 s
Wall time: 7.69 s


In [22]:
# To free up memory for training

# Start from the names of all interactive variables in this session.
to_reset = %who_ls
to_reset = list(to_reset)
# Never reset `to_keep` itself, whether or not it lists its own name.
to_reset.remove("to_keep")
# Everything not explicitly listed in `to_keep` gets cleared.
to_reset = set(to_reset) - set(to_keep)
for var_to_reset in list(to_reset):
    # %reset_selective takes a regular expression, so anchor the name to
    # avoid clearing other variables that merely contain it.
    var_to_reset = f"^{var_to_reset}$"
    %reset_selective -f {var_to_reset}

# Helper imported from `common`; see its definition for what it removes.
delete_large_vars(globals(), locals())

In [23]:
def combine_features(location_features_trx, location_features_edges, location_features):
    """Left-join edge features onto transactions with Spark and write the result.

    Args:
        location_features_trx: Parquet location of the transaction-level features.
        location_features_edges: Parquet location of the edge-level features.
        location_features: Output parquet location; existing data is overwritten.
    """
    edge_frame = spark.read.parquet(location_features_edges)

    trx_frame = spark.read.parquet(location_features_trx)
    # Give these transaction columns a "_trx" suffix before joining
    # (presumably so they cannot clash with same-named edge columns).
    for column in ("source", "target", "amount", "amount_usd"):
        trx_frame = trx_frame.withColumnRenamed(column, f"{column}_trx")

    same_source = trx_frame["source_trx"] == edge_frame["source"]
    same_target = trx_frame["target_trx"] == edge_frame["target"]

    # Every transaction row is kept; the endpoint identifiers are dropped
    # from the output once the join is done.
    combined = trx_frame.join(edge_frame, same_source & same_target, how="left")
    combined = combined.drop("source_trx", "target_trx", "source", "target")
    combined.write.parquet(location_features, mode="overwrite")

In [24]:
%%time

# For each split, join the edge features onto the transactions and write
# the combined feature set to its final parquet location.
combine_features(location_train_trx_features, location_features_edges_train, location_train_features)
combine_features(location_valid_trx_features, location_features_edges_valid, location_valid_features)
combine_features(location_test_trx_features, location_features_edges_test, location_test_features)

25/08/17 21:57:31 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

CPU times: user 84.8 ms, sys: 19.2 ms, total: 104 ms
Wall time: 2min 45s


In [25]:
# Remove the staging directory; ignore_errors=True means a missing
# directory is not treated as a failure.
shutil.rmtree(MULTI_PROC_STAGING_LOCATION, ignore_errors=True)

In [6]:
%%time

# Load the combined training features (the output location of combine_features) into pandas.
train_features = pd.read_parquet(location_train_features)

CPU times: user 1min 2s, sys: 36.9 s, total: 1min 39s
Wall time: 9.01 s


In [7]:
%%time

# Load the combined validation features (the output location of combine_features) into pandas.
validation_features = pd.read_parquet(location_valid_features)

CPU times: user 3.99 s, sys: 2min 46s, total: 2min 50s
Wall time: 5.97 s


In [8]:
%%time

# Load the combined test features (the output location of combine_features) into pandas.
test_features = pd.read_parquet(location_test_features)

CPU times: user 2.79 s, sys: 4.21 s, total: 7.01 s
Wall time: 416 ms


In [9]:
# Column names available in each split.
train_columns = set(train_features.columns)
validation_columns = set(validation_features.columns)
test_columns = set(test_features.columns)

all_columns = train_columns | validation_columns | test_columns
shared_columns = train_columns & validation_columns & test_columns

# A column is only usable when every split has it, so anything outside the
# intersection is removed from whichever splits still carry it.
# Sorting keeps the "Deleting ..." log in a stable order between runs
# (plain set iteration over strings varies with hash randomisation).
for missing in sorted(all_columns - shared_columns):
    if missing in train_columns:
        print(f"Deleting {missing} from train")
        del train_features[missing]
    if missing in validation_columns:
        print(f"Deleting {missing} from validation")
        del validation_features[missing]
    if missing in test_columns:
        print(f"Deleting {missing} from test")
        del test_features[missing]

# Put validation and test into exactly the training column order.
validation_features = validation_features.loc[:, list(train_features.columns)]
test_features = test_features.loc[:, list(train_features.columns)]

In [11]:
# Sanity check: every combined feature frame must have exactly the expected
# number of rows (the *_count values are defined outside this section).
assert train_features.shape[0] == train_count
assert validation_features.shape[0] == validation_count
assert test_features.shape[0] == test_count

In [10]:
label_column = "is_phishing"

# Training split: copy the label out, then remove it from the feature frame.
train_features_labels = train_features[[label_column]].copy(deep=True)
del train_features[label_column]

# Validation split: copy the label, then keep only the training feature
# columns (which no longer include the label).
validation_features_labels = validation_features[[label_column]].copy(deep=True)
validation_features = validation_features.loc[:, train_features.columns]

# Test split: same treatment as validation.
test_features_labels = test_features[[label_column]].copy(deep=True)
test_features = test_features.loc[:, train_features.columns]

In [12]:
# torch is optional here: it is only used to find out whether CUDA can be used.
try:
    import torch
except ImportError:
    cuda_available = False
else:
    cuda_available = torch.cuda.is_available()


def f1_eval(y, y_):
    """Return ``1 - F1`` of the rounded predictions against the true labels.

    Args:
        y: Ground-truth labels.
        y_: Predicted scores; ``np.round`` turns them into hard labels.

    Returns:
        The complement of the F1 score, so smaller values mean a better fit.
    """
    hard_predictions = np.round(y_)
    return 1 - f1_score(y, hard_predictions)


# Constructor arguments for xgb.XGBClassifier. The custom metric f1_eval
# replaces the default evaluation metric, and the device / thread count
# depend on whether CUDA was detected.
xgb_args = {
    "early_stopping_rounds": 10,
    "scale_pos_weight": 1000,
    "eval_metric": f1_eval,
    "disable_default_eval_metric": True,
    "num_parallel_tree": 10,
    "max_depth": 6,
    "colsample_bytree": 1,
    "subsample": 0.5,
    "device": "cuda" if cuda_available else "cpu",
    "nthread": 2 if cuda_available else 16,
    "n_estimators": 100,
    "seed": 0,
}

# Arguments for .fit(): the validation split is the only evaluation set.
xgb_fit_args = dict(
    eval_set=[(validation_features, validation_features_labels["is_phishing"].values)],
    verbose=True,
)

In [14]:
%%time

# Baseline run with the seed configured in xgb_args.
model = xgb.XGBClassifier(**xgb_args)
model.fit(train_features, train_features_labels["is_phishing"].values, **xgb_fit_args)

# Score the held-out test split; both metrics are reported as percentages.
y_test_predicted = model.predict(test_features)
y_test_true = test_features_labels["is_phishing"]
f1_final = f1_score(y_test_true, y_test_predicted) * 100
recall_final = recall_score(y_test_true, y_test_predicted) * 100
print(round(f1_final, 2), round(recall_final, 2))
print()

[0]	validation_0-f1_eval:0.86539
[1]	validation_0-f1_eval:0.82877
[2]	validation_0-f1_eval:0.80248
[3]	validation_0-f1_eval:0.79393
[4]	validation_0-f1_eval:0.77311
[5]	validation_0-f1_eval:0.75992
[6]	validation_0-f1_eval:0.74926
[7]	validation_0-f1_eval:0.73140
[8]	validation_0-f1_eval:0.71620
[9]	validation_0-f1_eval:0.70261
[10]	validation_0-f1_eval:0.68067
[11]	validation_0-f1_eval:0.66696
[12]	validation_0-f1_eval:0.65585
[13]	validation_0-f1_eval:0.64675
[14]	validation_0-f1_eval:0.63479
[15]	validation_0-f1_eval:0.60401
[16]	validation_0-f1_eval:0.59719
[17]	validation_0-f1_eval:0.59574
[18]	validation_0-f1_eval:0.58282
[19]	validation_0-f1_eval:0.57521
[20]	validation_0-f1_eval:0.57050
[21]	validation_0-f1_eval:0.56473
[22]	validation_0-f1_eval:0.55605
[23]	validation_0-f1_eval:0.54864
[24]	validation_0-f1_eval:0.54100
[25]	validation_0-f1_eval:0.52017
[26]	validation_0-f1_eval:0.50567
[27]	validation_0-f1_eval:0.49363
[28]	validation_0-f1_eval:0.48584
[29]	validation_0-f1_eva

In [None]:
%%time

# Repeat the training with further seeds; the first entry is the score of
# the run that produced f1_final.
f1_scores = [f1_final]
xgb_fit_args["verbose"] = False
for seed in (10, 20, 30, 40):
    print("seed", seed)
    xgb_args["seed"] = seed
    model = xgb.XGBClassifier(**xgb_args)
    model.fit(train_features, train_features_labels["is_phishing"].values, **xgb_fit_args)
    seed_f1 = f1_score(test_features_labels["is_phishing"], model.predict(test_features)) * 100
    f1_scores.append(seed_f1)
    print(round(seed_f1, 2))

seed 10


In [None]:
# Hard-coded reference score and spread that the results below are compared
# against (used on the same scale as f1_scores, i.e. percentages).
# NOTE(review): presumably the GFP figures from the paper linked at the top
# of the notebook - confirm the source of these numbers.
gfp_best = 51.49
gfp_std = 4.29

In [None]:
# Show the reference score with its spread.
print(f"GFP best: {gfp_best} ± {gfp_std}")

In [None]:
# Best F1 over all runs, with the standard deviation across those runs.
print(f"{round(max(f1_scores), 2)} ±{round(np.std(f1_scores), 2)}")

In [None]:
# Relative improvement of the best run over the reference score, in percent.
relative_gain = (max(f1_scores) - gfp_best) / gfp_best
uplift = round(relative_gain * 100, 2)
print(f"Uplift of {uplift}%")