### https://dl.acm.org/doi/pdf/10.1145/3677052.3698648

In [None]:
import json
import random
import os
import pickle
import time
import shutil
import sys
import uuid
from collections import defaultdict
from datetime import timedelta, datetime
from glob import glob
from itertools import product
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, recall_score, RocCurveDisplay

import igraph as ig
import leidenalg as la
import numpy as np
import pandas as pd
import xgboost as xgb

import settings as s

os.environ["EXT_DATA_TYPE_FOLDER"] = "ethereum"

from common import get_weights, delete_large_vars, MULTI_PROC_STAGING_LOCATION
from communities import get_communities_spark
from features import (
    generate_features_spark, generate_features_udf_wrapper,
    SCHEMA_FEAT_UDF
)

%load_ext autoreload
%autoreload 2

In [None]:
# Seed for every random number generator used in this notebook; can be
# overridden through the EXSTRAQT_SEED environment variable.
SEED = int(os.getenv("EXSTRAQT_SEED", default=42))
print(f"{SEED=}")
np.random.seed(SEED)
random.seed(SEED)

In [None]:
# Refuse to run on any interpreter other than the exact pinned release.
if sys.version_info[:3] != (3, 11, 8):
    raise EnvironmentError("Only runs efficiently on Python 3.11.8")

In [None]:
# Key/value pairs handed to SparkConf.setAll when the session is built.
SPARK_CONF = [
    ("spark.driver.memory", "32g"),
    ("spark.worker.memory", "32g"),
    ("spark.driver.maxResultSize", "32g"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.network.timeout", "600s"),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
    ("spark.local.dir", f".{os.sep}temp-spark"),
]

# Only log errors when a seed is supplied through the environment.
if os.environ.get("EXSTRAQT_SEED") is not None:
    SPARK_CONF += [("spark.log.level", "ERROR")]

# Start from a clean slate: drop leftovers of earlier runs before the
# session (whose local dir is ./temp-spark) is created.
shutil.rmtree("artifacts", ignore_errors=True)
shutil.rmtree("temp-spark", ignore_errors=True)

spark = SparkSession.builder.appName("testing").config(
    conf=SparkConf().setAll(SPARK_CONF)
).getOrCreate()

In [None]:
# Fractions of the (chronologically ordered) accounts assigned to each split.
TRAIN_PERC = 0.65
VALIDATION_PERC = 0.15
TEST_PERC = 0.2

# NOTE(review): not referenced by any cell visible in this notebook —
# presumably consumed by a notebook executed via %run; confirm.
KEEP_TOP_N = 100

# The fractions must add up to one. Compare with a tolerance: an exact float
# comparison rejects valid splits such as 0.7 / 0.1 / 0.2, whose binary
# floating-point sum is 0.9999999999999999.
assert abs((TRAIN_PERC + VALIDATION_PERC + TEST_PERC) - 1.0) < 1e-9

# Root folder for every parquet artefact written by this notebook:
# features/<EXT_DATA_TYPE_FOLDER> (the variable is set next to the imports).
location_main = os.path.join("features", os.environ["EXT_DATA_TYPE_FOLDER"])
# shutil.rmtree(location_main, ignore_errors=True)

# NOTE(review): the community / ego / 2-hop / flow / comm_* locations below are
# not used by any cell visible here — presumably they are consumed by
# node_level_features.ipynb (executed via %run); confirm.
location_communities_leiden = f"{location_main}{os.sep}communities_leiden.parquet"

location_features_leiden = f"{location_main}{os.sep}features_leiden.parquet"
location_features_ego = f"{location_main}{os.sep}features_ego.parquet"
location_features_2_hop = f"{location_main}{os.sep}features_2_hop.parquet"
location_features_2_hop_out = f"{location_main}{os.sep}features_2_hop_out.parquet"
location_features_2_hop_in = f"{location_main}{os.sep}features_2_hop_in.parquet"
location_features_2_hop_combined = f"{location_main}{os.sep}features_2_hop_combined.parquet"
location_features_source = f"{location_main}{os.sep}features_source.parquet"
location_features_target = f"{location_main}{os.sep}features_target.parquet"

location_flow_dispense = f"{location_main}{os.sep}location_flow_dispense.parquet"
location_flow_passthrough = f"{location_main}{os.sep}location_flow_passthrough.parquet"
location_flow_sink = f"{location_main}{os.sep}location_flow_sink.parquet"

location_comm_as_source_features = f"{location_main}{os.sep}comm_as_source_features.parquet"
location_comm_as_target_features = f"{location_main}{os.sep}comm_as_target_features.parquet"
location_comm_as_passthrough_features = f"{location_main}{os.sep}comm_as_passthrough_features.parquet"
location_comm_as_passthrough_features_reverse = f"{location_main}{os.sep}comm_as_passthrough_features_reverse.parquet"

# Read by add_node_features_to_edges / written by generate_edge_features.
location_features_node_level = f"{location_main}{os.sep}features_node_level.parquet"
location_features_edges = f"{location_main}{os.sep}features_edges.parquet"

# Per-split edge features enriched with node-level features.
location_features_edges_train = f"{location_main}{os.sep}features_edges_train.parquet"
location_features_edges_valid = f"{location_main}{os.sep}features_edges_valid.parquet"
location_features_edges_test = f"{location_main}{os.sep}features_edges_test.parquet"

# Per-split transaction-level features written by save_trx_features.
location_train_trx_features = f"{location_main}{os.sep}train_trx_features.parquet"
location_valid_trx_features = f"{location_main}{os.sep}valid_trx_features.parquet"
location_test_trx_features = f"{location_main}{os.sep}test_trx_features.parquet"

# Final per-split model inputs written by combine_features.
location_train_features = f"{location_main}{os.sep}train_features.parquet"
location_valid_features = f"{location_main}{os.sep}valid_features.parquet"
location_test_features = f"{location_main}{os.sep}test_features.parquet"

# Create the output folder if needed. `exist_ok=True` tolerates an existing
# directory but still raises if the path is occupied by a regular file,
# instead of silently continuing and failing later on the first write.
os.makedirs(location_main, exist_ok=True)

In [None]:
# Full transactions table, as configured in settings.
data = pd.read_parquet(s.INPUT_DATA_FILE)
# Only interested when "target" is phishing
phishing_nodes = set(data["target"][data["is_phishing"]].unique())
# Sanity check against the expected number of labelled phishing accounts.
assert len(phishing_nodes) == 1164

In [None]:
%%time

# First transaction time of every account, once as sender and once as receiver.
source_firsts = data.groupby("source").agg(first_trx=("timestamp", "min"))
target_firsts = data.groupby("target").agg(first_trx=("timestamp", "min"))
# Outer join: an account seen on only one side has a missing value for the
# other side. The row-wise `min` skips missing values, so neither a
# wall-clock sentinel nor a Python-level row loop is needed.
active_since = source_firsts.join(target_firsts, lsuffix="_left", how="outer")
active_since = active_since.min(axis=1).to_frame("active_since")
active_since.sort_values("active_since", inplace=True)

# Chronological split: the earliest accounts go to train, the next ones to
# validation and whatever remains to test.
number_of_train_accounts = int(np.floor(active_since.shape[0] * TRAIN_PERC))
number_of_validation_accounts = int(np.floor(active_since.shape[0] * VALIDATION_PERC))
train_accounts = set(active_since.head(number_of_train_accounts).index.tolist())
assert len(train_accounts) == number_of_train_accounts
remaining = active_since.loc[~active_since.index.isin(train_accounts), :].sort_values("active_since")
validation_accounts = set(remaining.head(number_of_validation_accounts).index.tolist())
assert len(validation_accounts) == number_of_validation_accounts
test_accounts = set(active_since.index) - train_accounts - validation_accounts
print(f"{len(train_accounts):,} | {len(validation_accounts):,} | {len(test_accounts):,}")
# Every account must end up in exactly one of the three sets.
assert sorted(train_accounts | validation_accounts | test_accounts) == sorted(active_since.index)

# [To prevent data leakage]
### Each account set is _exclusive_ to one of the `train`, `validation`, and `test` data

In [None]:
# A transaction belongs to a split only when BOTH of its endpoints belong to
# that split's account set; transactions crossing two sets are left out.
train = data[data["source"].isin(train_accounts) & data["target"].isin(train_accounts)]
validation = data[data["source"].isin(validation_accounts) & data["target"].isin(validation_accounts)]
test = data[data["source"].isin(test_accounts) & data["target"].isin(test_accounts)]

# Share of all transactions retained by each split.
print(*(round(part.shape[0] / data.shape[0], 2) for part in (train, validation, test)))
train_count, validation_count, test_count = len(train), len(validation), len(test)

# No transaction (row label) may be shared between two splits.
assert set(train.index).isdisjoint(validation.index)
assert set(validation.index).isdisjoint(test.index)
assert set(train.index).isdisjoint(test.index)

In [None]:
def generate_edge_features(input_data):
    """Aggregate transactions per (source, target) edge and save them as parquet.

    Args:
        input_data: Spark DataFrame holding at least the columns `source`,
            `target`, `timestamp`, `num_transactions`, `amount`,
            `amount_usd` and `is_zero_transaction`.

    Side effects:
        Writes one row per edge to `location_features_edges`.
    """
    print("Generating edge features")
    to_select = ["source", "target", "timestamp", "num_transactions", "amount", "amount_usd", "is_zero_transaction"]
    is_zero = sf.col("is_zero_transaction")
    edges_features_input = input_data.select(*to_select).groupby(
        ["source", "target"]
    ).agg(
        sf.sum("num_transactions").alias("num_transactions"),
        sf.sum("amount").alias("amount"),
        sf.sum("amount_usd").alias("amount_usd"),
        # `count` counts every non-null value, so counting a 1/0 indicator
        # returns the group size for both columns. Summing the indicator
        # yields the intended number of (non-)zero transactions.
        sf.sum(sf.when(is_zero, 1).otherwise(0)).alias("count_zero_transactions"),
        sf.sum(sf.when(is_zero, 0).otherwise(1)).alias("count_non_zero_transactions"),
        # Seconds between the first and the last transaction on the edge.
        (sf.unix_timestamp(sf.max("timestamp")) - sf.unix_timestamp(sf.min("timestamp"))).alias("related_for"),
    ).persist(StorageLevel.DISK_ONLY)
    # Force materialisation of the persisted aggregate before collecting it.
    _ = edges_features_input.count()
    try:
        edge_features = edges_features_input.toPandas()
    finally:
        # Release the on-disk cache; it is not needed once collected.
        edges_features_input.unpersist()
    edge_features.to_parquet(location_features_edges)

In [None]:
def add_node_features_to_edges(features_in, location):
    """Enrich edge features with node-level features of both endpoints.

    The node-level features stored at `location_features_node_level` are
    left-joined twice: first on `target`, then on `source`. In the second
    join every node-level column clashes with the copy added for the
    target, so the source copy receives the `_source` suffix. Four
    combinations of the two endpoints' anomaly scores are added as well.

    Args:
        features_in: pandas DataFrame with one row per edge and the columns
            `source` and `target`.
        location: path the enriched frame is written to as parquet.

    NOTE(review): assumes the node-level parquet is indexed by the node
    identifier and provides an `anomaly_score` column — confirm.
    """
    # Read once and reuse for both joins instead of loading the file twice.
    node_features = pd.read_parquet(location_features_node_level)
    features_in = (
        features_in.set_index("target")
        .join(node_features, how="left", rsuffix="_target")
        .reset_index()
        .set_index("source")
        .join(node_features, how="left", rsuffix="_source")
        .reset_index()
    )

    features_in.loc[:, "anomaly_scores_diff"] = (
        features_in.loc[:, "anomaly_score"] - features_in.loc[:, "anomaly_score_source"]
    )
    # Stack both score vectors once; row 0 is the unsuffixed (target-side)
    # score, row 1 the source-side score. Reductions run across the rows.
    both_scores = np.array(
        [
            features_in.loc[:, "anomaly_score"].values,
            features_in.loc[:, "anomaly_score_source"].values,
        ],
    )
    features_in.loc[:, "anomaly_scores_min"] = both_scores.min(axis=0)
    features_in.loc[:, "anomaly_scores_max"] = both_scores.max(axis=0)
    features_in.loc[:, "anomaly_scores_mean"] = both_scores.mean(axis=0)

    features_in.to_parquet(location)

In [None]:
def save_trx_features(data_in, location):
    """Select the transaction-level features, add balance ratios and save them.

    Args:
        data_in: pandas DataFrame of transactions containing every column
            listed in `columns` below.
        location: path the resulting frame is written to as parquet.
    """
    columns = [
        "source",
        "target",
        "amount",
        "amount_usd",
        "is_zero_transaction",
        "source_dispensation",
        "target_accumulation",
        "source_positive_balance",
        "source_negative_balance",
        "target_positive_balance",
        "target_negative_balance",
        "source_active_for",
        "target_active_for",
        "is_phishing",
    ]
    trx_features = data_in.loc[:, columns]
    for side in ("source", "target"):
        ratio = trx_features[f"{side}_positive_balance"] / trx_features[f"{side}_negative_balance"]
        # Undefined ratios (0/0 -> NaN, x/0 -> +/-inf) are mapped to 0.
        # Both signs of infinity are handled so none can leak downstream.
        trx_features.loc[:, f"{side}_balance_ratio"] = ratio.fillna(0).replace([np.inf, -np.inf], 0)
    trx_features.to_parquet(location)

In [None]:
%%time

# Progress message with the thousands-separated number of transactions.
# NOTE(review): the %run below executes node_level_features.ipynb; later cells
# use names that no visible cell defines (e.g. `to_keep`) — presumably they
# are defined by that notebook; confirm.
print(f"Constructing node-level features: {data.shape[0]:,}")

%run node_level_features.ipynb

In [None]:
%%time

# NOTE(review): generate_edge_features calls Spark DataFrame methods
# (select / groupby / agg / persist), whereas `data` was loaded with pandas
# earlier in this notebook — presumably node_level_features.ipynb rebinds
# `data` to a Spark DataFrame; confirm.
generate_edge_features(data)

In [None]:
%%time

# Distinct (source, target) pairs that occur in the training transactions.
train_edges = (
    train[["source", "target"]]
    .drop_duplicates()
    .set_index(["source", "target"])
)
# Attach the per-edge aggregates stored at location_features_edges.
train_features = train_edges.join(
    pd.read_parquet(location_features_edges).set_index(["source", "target"]),
    how="left",
).reset_index()
add_node_features_to_edges(train_features, location_features_edges_train)
save_trx_features(train, location_train_trx_features)

In [None]:
%%time

# Distinct (source, target) pairs that occur in the validation transactions.
validation_edges = (
    validation[["source", "target"]]
    .drop_duplicates()
    .set_index(["source", "target"])
)
# Attach the per-edge aggregates stored at location_features_edges.
validation_features = validation_edges.join(
    pd.read_parquet(location_features_edges).set_index(["source", "target"]),
    how="left",
).reset_index()
add_node_features_to_edges(validation_features, location_features_edges_valid)
save_trx_features(validation, location_valid_trx_features)

In [None]:
%%time

# Distinct (source, target) pairs that occur in the test transactions.
test_edges = (
    test[["source", "target"]]
    .drop_duplicates()
    .set_index(["source", "target"])
)
# Attach the per-edge aggregates stored at location_features_edges.
test_features = test_edges.join(
    pd.read_parquet(location_features_edges).set_index(["source", "target"]),
    how="left",
).reset_index()
add_node_features_to_edges(test_features, location_features_edges_test)
save_trx_features(test, location_test_trx_features)

In [None]:
# To free up memory for training
# Every interactive variable except the names listed in `to_keep` is removed.
# NOTE(review): `to_keep` is not defined in any cell visible here — presumably
# it is defined by node_level_features.ipynb (executed via %run); confirm.

# %who_ls returns the names of all interactive variables.
to_reset = %who_ls
to_reset = list(to_reset)
# Keep the list itself alive; raises ValueError if `to_keep` does not exist.
to_reset.remove("to_keep")
to_reset = set(to_reset) - set(to_keep)
for var_to_reset in list(to_reset):
    # %reset_selective takes a regular expression: anchor it so that only the
    # exact name is removed, not every name containing it. -f skips the prompt.
    var_to_reset = f"^{var_to_reset}$"
    %reset_selective -f {var_to_reset}

# NOTE(review): helper imported from `common`; what it deletes is not visible here.
delete_large_vars(globals(), locals())

In [None]:
def combine_features(location_features_trx, location_features_edges, location_features):
    """Join transaction-level and edge-level features into one parquet dataset.

    Args:
        location_features_trx: parquet path of the transaction-level features.
        location_features_edges: parquet path of the edge-level features.
        location_features: output path; existing data is overwritten.
    """
    join_keys = ["source", "target"]
    edge_level = spark.read.parquet(location_features_edges)
    trx_level = spark.read.parquet(location_features_trx)
    # Both inputs carry `amount` / `amount_usd`; rename the transaction-level
    # ones so they stay distinguishable after the join.
    trx_level = trx_level.withColumnRenamed("amount", "amount_trx")
    trx_level = trx_level.withColumnRenamed("amount_usd", "amount_usd_trx")
    # One output row per transaction; the join keys are dropped afterwards.
    combined = trx_level.join(edge_level, on=join_keys, how="left")
    combined = combined.drop(*join_keys)
    combined.write.parquet(location_features, mode="overwrite")

In [None]:
%%time

combine_features(location_train_trx_features, location_features_edges_train, location_train_features)
combine_features(location_valid_trx_features, location_features_edges_valid, location_valid_features)
combine_features(location_test_trx_features, location_features_edges_test, location_test_features)

In [None]:
# Remove the staging folder imported from `common`; errors such as a missing
# folder are ignored.
shutil.rmtree(MULTI_PROC_STAGING_LOCATION, ignore_errors=True)

In [None]:
%%time

# Load the combined training features written by combine_features into pandas.
train_features = pd.read_parquet(location_train_features)

In [None]:
%%time

# Load the combined validation features written by combine_features into pandas.
validation_features = pd.read_parquet(location_valid_features)

In [None]:
%%time

# Load the combined test features written by combine_features into pandas.
test_features = pd.read_parquet(location_test_features)

In [None]:
all_columns = set(train_features.columns) | set(validation_features.columns) | set(test_features.columns)

# Columns that at least one split lacks, i.e. everything outside the
# intersection of the three column sets. Iterating in sorted order keeps the
# log output identical between runs (plain set iteration order is not stable
# for strings).
for missing in sorted(
    all_columns
    - (
        set(train_features.columns)
        & set(validation_features.columns)
        & set(test_features.columns)
    )
):
    if missing in train_features.columns:
        print(f"Deleting {missing} from train")
        del train_features[missing]
    if missing in validation_features.columns:
        print(f"Deleting {missing} from validation")
        del validation_features[missing]
    if missing in test_features.columns:
        print(f"Deleting {missing} from test")
        del test_features[missing]

# Same column order in all three splits, dictated by the training frame.
validation_features = validation_features.loc[:, list(train_features.columns)]
test_features = test_features.loc[:, list(train_features.columns)]

In [None]:
# Each split must still hold exactly one row per transaction selected earlier.
assert len(train_features) == train_count
assert len(validation_features) == validation_count
assert len(test_features) == test_count

In [None]:
# Separate the label column from the model inputs of every split. The column
# set of the training frame (after removing the label) defines the inputs.
train_features_labels = train_features[["is_phishing"]].copy(deep=True)
train_features.drop(columns="is_phishing", inplace=True)

validation_features_labels = validation_features[["is_phishing"]].copy(deep=True)
validation_features = validation_features[list(train_features.columns)]

test_features_labels = test_features[["is_phishing"]].copy(deep=True)
test_features = test_features[list(train_features.columns)]

In [None]:
# torch is optional and only used to find out whether a CUDA device exists.
try:
    import torch
except ImportError:
    cuda_available = False
else:
    cuda_available = torch.cuda.is_available()

# Constructor arguments for the classifier; with CUDA the model runs on the
# GPU and uses fewer CPU threads.
xgb_args = {
    "seed": SEED,
    "max_depth": 6,
    "scale_pos_weight": 35,
    "eta": 0.025,
    "subsample": 0.5,
    "colsample_bytree": 0.9,
    "num_parallel_tree": 10,
    "n_estimators": 100,
    "early_stopping_rounds": 10,
    "eval_metric": "aucpr",
    "disable_default_eval_metric": True,
    "nthread": 2 if cuda_available else 10,
    "device": "cuda" if cuda_available else "cpu",
}

# Arguments for fit(): the validation split is the evaluation set.
xgb_fit_args = dict(
    eval_set=[(validation_features, validation_features_labels["is_phishing"].values)],
    verbose=True,
)

In [None]:
%%time

# Fit on the training split; xgb_fit_args supplies the validation
# evaluation set.
model = xgb.XGBClassifier(**xgb_args)
model.fit(train_features, train_features_labels["is_phishing"].values, **xgb_fit_args)

# Score the held-out test split; F1 and recall are reported in percent.
y_test_predicted = model.predict(test_features)
f1_test = 100 * f1_score(test_features_labels["is_phishing"], y_test_predicted)
print(
    f"{SEED=}",
    f"f1={round(f1_test, 2)}",
    f"recall={round(recall_score(test_features_labels['is_phishing'], y_test_predicted) * 100, 2)}",
)
print(f1_test)