In [1]:
import json
import os
import pickle
import shutil
import sys
import time
from datetime import timedelta, datetime

import igraph as ig
import numpy as np
import pandas as pd
import xgboost as xgb
from pyspark.sql import functions as sf
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from sklearn.metrics import f1_score, recall_score
from sklearn.ensemble import IsolationForest

import settings as s

os.environ["EXT_DATA_TYPE_FOLDER"] = "ethereum"

from common import get_weights, MULTI_PROC_STAGING_LOCATION, delete_large_vars
from communities import get_communities_spark
from features import (
    generate_features_spark, generate_features_udf_wrapper, 
    SCHEMA_FEAT_UDF, FEATURE_TYPES
)

%load_ext autoreload
%autoreload 2

In [2]:
if (
    sys.version_info.major,
    sys.version_info.minor,
    sys.version_info.micro,
) != (3, 11, 8):
    raise EnvironmentError(
        "Only runs efficiently on Python 3.11.8 (Tested on: Conda 24.1.2 | Apple M3 Pro)"
    )

In [3]:
SPARK_CONF = [
    ("spark.driver.memory", "32g"),
    ("spark.worker.memory", "32g"),
    ("spark.driver.maxResultSize", "32g"),
    ("spark.driver.bindAddress", "127.0.0.1"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.network.timeout", "600s"),
]

shutil.rmtree("artifacts", ignore_errors=True)
spark = (
    SparkSession.builder.appName("testing")
    .config(conf=SparkConf().setAll(SPARK_CONF))
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/11 16:40:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
start = time.time()

In [5]:
data = pd.read_parquet(s.INPUT_DATA_FILE)
# Only interested when "target" is phishing
phishing_nodes = set(data.loc[data["is_phishing"], "target"].unique())
print(len(phishing_nodes))

1164


In [6]:
TRAIN_PERC = 0.65
VALIDATION_PERC = 0.15
TEST_PERC = 0.2

assert(sum([TRAIN_PERC, VALIDATION_PERC, TEST_PERC]) == 1)

In [7]:
%%time

source_firsts = data.groupby("source").agg(first_trx=("timestamp", "min"))
target_firsts = data.groupby("target").agg(first_trx=("timestamp", "min"))
active_since = source_firsts.join(target_firsts, lsuffix="_left", how="outer").fillna(datetime.now())
active_since.loc[:, "active_since"] = active_since.apply(lambda x: min([x["first_trx_left"], x["first_trx"]]), axis=1)
active_since = active_since.loc[:, ["active_since"]]
active_since.sort_values("active_since", inplace=True)

number_of_train_accounts = int(np.floor(active_since.shape[0] * TRAIN_PERC))
number_of_validation_accounts = int(np.floor(active_since.shape[0] * VALIDATION_PERC))
train_accounts = set(active_since.head(number_of_train_accounts).index.tolist())
assert len(train_accounts) == number_of_train_accounts
remaining = active_since.loc[~active_since.index.isin(train_accounts), :].sort_values("active_since")
validation_accounts = set(remaining.head(number_of_validation_accounts).index.tolist())
assert len(validation_accounts) == number_of_validation_accounts
test_accounts = set(active_since.index) - train_accounts - validation_accounts
print(f"{len(train_accounts):,} | {len(validation_accounts):,} | {len(test_accounts):,}")
assert sorted(train_accounts | validation_accounts | test_accounts) == sorted(active_since.index)

1,932,767 | 446,023 | 594,699
CPU times: user 23.5 s, sys: 271 ms, total: 23.8 s
Wall time: 23.8 s


In [8]:
data_orig = pd.read_parquet(s.INPUT_DATA_ORIG_FILE)

train = data_orig.loc[data_orig["source"].isin(train_accounts) & data_orig["target"].isin(train_accounts), :]
validation = data_orig.loc[data_orig["source"].isin(validation_accounts) & data_orig["target"].isin(validation_accounts), :]
test = data_orig.loc[data_orig["source"].isin(test_accounts) & data_orig["target"].isin(test_accounts), :]
print(
    round(train.shape[0] / data_orig.shape[0], 2), 
    round(validation.shape[0] / data_orig.shape[0], 2), 
    round(test.shape[0] / data_orig.shape[0], 2)
)

assert set(train.index).intersection(validation.index) == set()
assert set(validation.index).intersection(test.index) == set()
assert set(train.index).intersection(test.index) == set()

0.74 0.05 0.03


In [9]:
location_main = "features"
# shutil.rmtree(location_main, ignore_errors=True)

location_flow_dispense = f"{location_main}{os.sep}flow_dispense.parquet"
location_flow_passthrough = f"{location_main}{os.sep}flow_passthrough.parquet"
location_flow_sink = f"{location_main}{os.sep}flow_sink.parquet"

location_comm_as_source_features = f"{location_main}{os.sep}comm_as_source_features.parquet"
location_comm_as_target_features = f"{location_main}{os.sep}comm_as_target_features.parquet"
location_comm_as_passthrough_features = f"{location_main}{os.sep}comm_as_passthrough_features.parquet"
location_comm_as_passthrough_features_reverse = f"{location_main}{os.sep}comm_as_passthrough_features_reverse.parquet"

location_features_node_level = f"{location_main}{os.sep}features_node_level.parquet"
location_features_edges = f"{location_main}{os.sep}features_edges.parquet"

location_features_edges_train = f"{location_main}{os.sep}features_edges_train.parquet"
location_features_edges_valid = f"{location_main}{os.sep}features_edges_valid.parquet"
location_features_edges_test = f"{location_main}{os.sep}features_edges_test.parquet"

location_train_trx_features = f"{location_main}{os.sep}train_trx_features.parquet"
location_valid_trx_features = f"{location_main}{os.sep}valid_trx_features.parquet"
location_test_trx_features = f"{location_main}{os.sep}test_trx_features.parquet"

location_train_features = f"{location_main}{os.sep}train_features.parquet"
location_valid_features = f"{location_main}{os.sep}valid_features.parquet"
location_test_features = f"{location_main}{os.sep}test_features.parquet"

try:
    os.makedirs(location_main)
except FileExistsError:
    pass

In [10]:
data = spark.createDataFrame(data)

In [11]:
# Later on, we will reset the variables (to free up memory), while still keeping these intact
to_keep = %who_ls
to_keep = list(to_keep)

In [12]:
def flow_stats_wrapper(group_type):
    def flow_stats(group):
        key = group.iloc[0][group_type]
        prefix = f"{s.G_FLOW_PREFIX}{group_type}_"
        cycle = group[(group["dispense"] == group["sink"]) & (group["dispense"] != group["passthrough"])]
        row = {
            "key": key,
            f"{prefix}amount_sum": group["amount_usd"].sum(),
            f"{prefix}amount_mean": group["amount_usd"].mean(),
            f"{prefix}amount_max": group["amount_usd"].max(),
            f"{prefix}dispense_count": group["dispense"].nunique(),
            f"{prefix}passthrough_count": group["passthrough"].nunique(),
            f"{prefix}sink_count": group["sink"].nunique(),
            f"{prefix}cycle_sum": cycle["amount_usd"].sum(),
            f"{prefix}cycle_mean": cycle["amount_usd"].mean(),
            f"{prefix}cycle_max": cycle["amount_usd"].max(),
            f"{prefix}cycle_passthrough_count": cycle["passthrough"].nunique(),
        }
        return pd.DataFrame([json.dumps(row)], columns=["features"])
    return flow_stats

In [13]:
%%time

left = data.select("source", "target", "timestamp", "amount_usd")
select = []
for column in left.columns:
    select.append(sf.col(column).alias(f"left_{column}"))
left = left.select(*select)
right = data.select("source", "target", "timestamp", "amount_usd")

flows_temporal = left.join(
    right,
    (left["left_target"] == right["source"]) &
    (left["left_timestamp"] <= right["timestamp"]),
    how="inner"
).groupby(["left_source", "left_target", "source", "target"]).agg(
    sf.sum("left_amount_usd").alias("left_amount_usd"),
    sf.sum("amount_usd").alias("amount_usd"),
).drop("left_target").select(
    sf.col("left_source").alias("dispense"),
    sf.col("source").alias("passthrough"),
    sf.col("target").alias("sink"),
    sf.least("left_amount_usd", "amount_usd").alias("amount_usd"),
).persist(StorageLevel.DISK_ONLY)
flows_temporal.count()

for flow_location, flow_type in [
    (location_flow_dispense, "dispense"), (location_flow_passthrough, "passthrough"), (location_flow_sink, "sink")
]:
    print(flow_type)
    flow_data = flows_temporal.groupby(flow_type).applyInPandas(
        flow_stats_wrapper(flow_type), schema=SCHEMA_FEAT_UDF
    ).toPandas()
    flow_data = pd.DataFrame(flow_data["features"].apply(json.loads).tolist()).set_index("key")
    flow_data.to_parquet(flow_location)
    del flow_data

_ = flows_temporal.unpersist()

                                                                                

dispense


                                                                                

passthrough


                                                                                

sink


                                                                                

CPU times: user 8.86 s, sys: 2.98 s, total: 11.8 s
Wall time: 21min 31s


In [14]:
%%time

data_input = data.select("*")
nodes_source = set(data.select("source").distinct().toPandas()["source"])
nodes_target = set(data.select("target").distinct().toPandas()["target"])
nodes_passthrough = nodes_source.intersection(nodes_target)

%run generate_flow_features.ipynb

comm_as_source_features.to_parquet(location_comm_as_source_features)
comm_as_target_features.to_parquet(location_comm_as_target_features)
comm_as_passthrough_features.to_parquet(location_comm_as_passthrough_features)
comm_as_passthrough_features_reverse.to_parquet(location_comm_as_passthrough_features_reverse)

del comm_as_source_features
del comm_as_target_features
del comm_as_passthrough_features
del comm_as_passthrough_features_reverse

                                                                                


Processing comm_as_source

Processed hop #1 | 4,097,207 | 2,113,092
Processed hop #2 | 28,186,799 | 1,061,379
Processed hop #3 | 27,626,031 | 906,951
Processed hop #4 | 33,319,933 | 902,616

Processing comm_as_target

Processed hop #1 | 1,595,960 | 1,119,024
Processed hop #2 | 36,986,915 | 1,109,888
Processed hop #3 | 37,572,633 | 1,088,264
Processed hop #4 | 50,059,052 | 1,084,408

Processing comm_as_passthrough

Processed hop #1 | 1,214,666 | 258,627
Processed hop #2 | 5,402,153 | 182,193
Processed hop #3 | 5,494,028 | 160,174
Processed hop #4 | 6,453,218 | 157,969

Processing comm_as_passthrough_reverse

Processed hop #1 | 537,548 | 258,628
Processed hop #2 | 8,850,244 | 250,410
Processed hop #3 | 9,209,746 | 247,019
Processed hop #4 | 11,328,001 | 246,129


comm_as_source_features

CPU times: user 2min 20s, sys: 8.67 s, total: 2min 29s
Wall time: 2min 24s

comm_as_target_features

CPU times: user 2min 5s, sys: 6.8 s, total: 2min 12s
Wall time: 2min 8s

comm_as_passthrough_features

In [15]:
%%time

KEEP_TOP_N = 100

data_agg_weights = get_weights(
    data.groupby(["source", "target"])
    .agg(
        sf.sum("amount").alias("amount")
    ).toPandas()
)

data_agg_weights_rev = data_agg_weights.rename(
    columns={"target": "source", "source": "target"}
).loc[:, ["source", "target", "weight"]]
data_agg_weights_ud = pd.concat([data_agg_weights, data_agg_weights_rev], ignore_index=True)
data_agg_weights_ud = data_agg_weights_ud.groupby(["source", "target"]).agg(weight=("weight", "sum")).reset_index()

data_agg_weights_ud.sort_values("weight", ascending=False, inplace=True)
grouped_ud = data_agg_weights_ud.groupby("source").head(KEEP_TOP_N).reset_index(drop=True)
grouped_ud = grouped_ud.groupby("source").agg(targets=("target", set))

total = grouped_ud.index.nunique()
nodes_neighborhoods = {}
for index, (source, targets) in enumerate(grouped_ud.iterrows()):
    community_candidates = {source}
    for target in targets["targets"]:
        community_candidates |= (grouped_ud.loc[target, "targets"] | {target})
    nodes_neighborhoods[source] = set(community_candidates)
    if not (index % 250_000):
        print(index, total)

del data_agg_weights_rev
del data_agg_weights_ud
del grouped_ud

                                                                                

0 2973489
250000 2973489
500000 2973489
750000 2973489
1000000 2973489
1250000 2973489
1500000 2973489
1750000 2973489
2000000 2973489
2250000 2973489
2500000 2973489
2750000 2973489
CPU times: user 2min 33s, sys: 4.3 s, total: 2min 38s
Wall time: 3min 57s


In [16]:
%%time

print("Constructing communities")

graph = ig.Graph.DataFrame(data_agg_weights, use_vids=False, directed=True)
communities = get_communities_spark(nodes_neighborhoods, graph, os.cpu_count(), spark)

del graph
del data_agg_weights
del nodes_neighborhoods

Constructing communities


                                                                                

CPU times: user 1min 14s, sys: 12.3 s, total: 1min 26s
Wall time: 13min 40s


In [17]:
%%time

ts_min = data.select(sf.min("timestamp").alias("x")).collect()[0]["x"] - timedelta(minutes=1)
data_graph_agg = data.groupby(["source", "target"]).agg(
    sf.sum("num_transactions").alias("num_transactions"),
    sf.sum("amount_usd").alias("amount_usd"),
    sf.collect_list(sf.array((sf.col("timestamp") - ts_min).cast("long"), sf.col("amount_usd"))).alias("timestamps_amounts"),
)
data_graph_agg_sdf = data_graph_agg.persist(StorageLevel.DISK_ONLY)
print(data_graph_agg_sdf.count())
data_graph_agg = data_graph_agg_sdf.toPandas()

                                                                                

5355155


                                                                                

CPU times: user 2.29 s, sys: 666 ms, total: 2.96 s
Wall time: 1min 28s


In [18]:
%%time

print("Communities features creation")

graph = ig.Graph.DataFrame(data_graph_agg, use_vids=False, directed=True)
features = generate_features_spark(communities, graph, spark)
features.columns = [f"{s.G_COMM_PREFIX}{x}" if x != "key" else x for x in features.columns]

del graph
del communities
del data_graph_agg

Communities features creation


                                                                                

CPU times: user 33min 13s, sys: 28min 1s, total: 1h 1min 15s
Wall time: 1h 22min 35s


In [19]:
%%time

print("1-hop-source features creation")

features_source = data_graph_agg_sdf.withColumn("key", sf.col("source")).groupby("key").applyInPandas(
    generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF
).toPandas()
features_source = pd.DataFrame(features_source["features"].apply(json.loads).tolist())
types = {k: v for k, v in FEATURE_TYPES.items() if k in features_source.columns}
features_source = features_source.astype(types)
features_source.columns = [f"{s.G_1HOP_PREFIX}{x}" if x != "key" else x for x in features_source.columns]

1-hop-source features creation


                                                                                

CPU times: user 6.43 s, sys: 625 ms, total: 7.05 s
Wall time: 11min 41s


In [20]:
%%time

print("1-hop-target features creation")

features_target = data_graph_agg_sdf.withColumn("key", sf.col("target")).groupby("key").applyInPandas(
    generate_features_udf_wrapper(False), schema=SCHEMA_FEAT_UDF
).toPandas()
features_target = pd.DataFrame(features_target["features"].apply(json.loads).tolist())
types = {k: v for k, v in FEATURE_TYPES.items() if k in features_target.columns}
features_target = features_target.astype(types)
features_target.columns = [f"{s.G_1HOP_PREFIX}{x}" if x != "key" else x for x in features_target.columns]

1-hop-target features creation


                                                                                

CPU times: user 3.4 s, sys: 347 ms, total: 3.75 s
Wall time: 6min 4s


In [21]:
%%time

all_features = features.set_index("key").join(
    features_source.set_index("key"), how="left", rsuffix=f"_1_hop_as_source"
)
all_features.index.name = "key"
all_features = all_features.reset_index()

all_features = all_features.set_index("key").join(
    features_target.set_index("key"), how="left", rsuffix=f"_1_hop_as_target"
)

all_features = all_features.join(
    pd.read_parquet(location_comm_as_source_features), how="left", rsuffix="_dispense"
).join(
    pd.read_parquet(location_comm_as_target_features), how="left", rsuffix="_sink"
).join(
    pd.read_parquet(location_comm_as_passthrough_features), how="left", rsuffix="_passthrough"
).join(
    pd.read_parquet(location_comm_as_passthrough_features_reverse), how="left", rsuffix="_passthrough_rev"
).join(
    pd.read_parquet(location_flow_dispense), how="left"
).join(
    pd.read_parquet(location_flow_passthrough), how="left"
).join(
    pd.read_parquet(location_flow_sink), how="left"
)

all_features.to_parquet(location_features_node_level)
del all_features

CPU times: user 18.4 s, sys: 3.25 s, total: 21.7 s
Wall time: 19.9 s


In [22]:
all_features = pd.read_parquet(location_features_node_level)

In [23]:
anomalies = all_features.loc[:, []]
anomalies.loc[:, "anomaly_score"] = IsolationForest().fit(
    all_features.fillna(0)
).decision_function(all_features.fillna(0))
anomalies.loc[:, "anomaly_score"] += abs(anomalies.loc[:, "anomaly_score"].min())

In [24]:
%%time

print(f"Generating edge features")

to_select = ["source", "target", "timestamp", "num_transactions", "amount", "amount_usd", "is_zero_transaction"]

edges_features_input = data.select(to_select).groupby(
    ["source", "target"]
).agg(
    sf.sum("num_transactions").alias("num_transactions"), 
    sf.sum("amount").alias("amount"),
    sf.sum("amount_usd").alias("amount_usd"),
    sf.count(sf.when(sf.col("is_zero_transaction"), 1).otherwise(0)).alias("count_zero_transactions"),
    sf.count(sf.when(sf.col("is_zero_transaction"), 0).otherwise(1)).alias("count_non_zero_transactions"),
    (sf.unix_timestamp(sf.max("timestamp")) - sf.unix_timestamp(sf.min("timestamp"))).alias("related_for"),
).persist(StorageLevel.DISK_ONLY)
_ = edges_features_input.count()

edge_features = edges_features_input.toPandas()
edge_features.to_parquet(location_features_edges)
del edge_features

Generating edge features


                                                                                

CPU times: user 2.05 s, sys: 433 ms, total: 2.48 s
Wall time: 1min 27s


In [25]:
edge_features = pd.read_parquet(location_features_edges)

In [26]:
%%time

train_edges = train.loc[:, ["source", "target"]].drop_duplicates().set_index(
    ["source", "target"]
)
valid_edges = validation.loc[:, ["source", "target"]].drop_duplicates().set_index(
    ["source", "target"]
)
test_edges = test.loc[:, ["source", "target"]].drop_duplicates().set_index(
    ["source", "target"]
)

train_features = train_edges.join(
    edge_features.set_index(["source", "target"]), how="left"
).reset_index()
validation_features = valid_edges.join(
    edge_features.set_index(["source", "target"]), how="left"
).reset_index()
test_features = test_edges.join(
    edge_features.set_index(["source", "target"]), how="left"
).reset_index()

CPU times: user 17.1 s, sys: 668 ms, total: 17.8 s
Wall time: 18.4 s


In [27]:
def save_edge_features(features_in, location):
    features_in = features_in.set_index("target").join(
        anomalies, how="left"
    ).reset_index().set_index("source").join(
        anomalies, how="left", rsuffix="_source"
    ).reset_index().set_index("target").join(
        all_features, how="left"
    ).reset_index().set_index("source").join(
        all_features, how="left", rsuffix="_source"
    ).reset_index()
    features_in.loc[:, "anom_scores_diff"] = features_in.loc[:, "anomaly_score"] - features_in.loc[:, "anomaly_score_source"]
    features_in.loc[:, "anom_scores_min"] = np.array(
        [
            features_in.loc[:, "anomaly_score"].values, 
            features_in.loc[:, "anomaly_score_source"].values
        ],
    ).min(axis=0)
    features_in.loc[:, "anom_scores_max"] = np.array(
        [
            features_in.loc[:, "anomaly_score"].values, 
            features_in.loc[:, "anomaly_score_source"].values
        ],
    ).max(axis=0)
    features_in.loc[:, "anom_scores_mean"] = np.array(
        [
            features_in.loc[:, "anomaly_score"].values, 
            features_in.loc[:, "anomaly_score_source"].values
        ],
    ).mean(axis=0)
    features_in.to_parquet(location)

In [28]:
%%time

save_edge_features(train_features, location_features_edges_train)

CPU times: user 37.6 s, sys: 11.9 s, total: 49.4 s
Wall time: 45.9 s


In [29]:
%%time

save_edge_features(validation_features, location_features_edges_valid)

CPU times: user 4.89 s, sys: 734 ms, total: 5.63 s
Wall time: 5.31 s


In [30]:
%%time

save_edge_features(test_features, location_features_edges_test)

CPU times: user 4.38 s, sys: 314 ms, total: 4.7 s
Wall time: 4.45 s


In [31]:
def save_trx_features(data_in, location):
    columns = ["source", "target", "amount", "amount_usd", "is_zero_transaction", "is_phishing"]
    trx_features = data_in.loc[:, columns]
    trx_features.to_parquet(location)
    del trx_features

In [32]:
%%time

save_trx_features(train, location_train_trx_features)
save_trx_features(validation, location_valid_trx_features)
save_trx_features(test, location_test_trx_features)

CPU times: user 1.84 s, sys: 153 ms, total: 1.99 s
Wall time: 2.15 s


In [33]:
# To free up memory for training

to_reset = %who_ls
to_reset = list(to_reset)
to_reset.remove("to_keep")
to_reset = set(to_reset) - set(to_keep)
for var_to_reset in list(to_reset):
    var_to_reset = f"^{var_to_reset}$"
    %reset_selective -f {var_to_reset}

delete_large_vars(globals(), locals())

Deleted `global` DataFrame: source_firsts
Deleted `global` DataFrame: target_firsts
Deleted `global` DataFrame: active_since
Deleted `global` large object: train_accounts
Deleted `global` DataFrame: remaining
Deleted `global` large object: validation_accounts
Deleted `global` large object: test_accounts
Deleted `global` DataFrame: data_orig
Deleted `global` DataFrame: train
Deleted `global` DataFrame: validation
Deleted `global` DataFrame: test


True

In [34]:
def combine_features(location_features_trx, location_features_edges, location_features):
    features_input = pd.read_parquet(location_features_edges)
    trx_features_input = pd.read_parquet(location_features_trx)
    features_input = trx_features_input.set_index(["source", "target"]).join(
        features_input.set_index(["source", "target"]),
        how="left",
        lsuffix="_trx"
    ).reset_index()
    features_input.to_parquet(location_features)
    del features_input

In [35]:
%%time

combine_features(location_train_trx_features, location_features_edges_train, location_train_features)
combine_features(location_valid_trx_features, location_features_edges_valid, location_valid_features)
combine_features(location_test_trx_features, location_features_edges_test, location_test_features)

CPU times: user 1min 42s, sys: 43.3 s, total: 2min 25s
Wall time: 2min 10s


In [36]:
shutil.rmtree(MULTI_PROC_STAGING_LOCATION, ignore_errors=True)

In [37]:
print((time.time() - start) // 60)
start = time.time()

174.0


In [38]:
train_features = pd.read_parquet(location_train_features)
validation_features = pd.read_parquet(location_valid_features)
test_features = pd.read_parquet(location_test_features)

In [39]:
%%time

missing_columns = (
    (set(train_features.columns).symmetric_difference(validation_features.columns)) |
    (set(train_features.columns).symmetric_difference(test_features.columns)) |
    (set(test_features.columns).symmetric_difference(validation_features.columns))
)
for column in missing_columns:
    if missing in train_features.columns:
        print(f"Deleting missing column from train: {column}")
        del train_features[column]
    if missing in validation_features.columns:
        print(f"Deleting missing column from validation: {column}")
        del validation_features[column]
    if missing in test_features.columns:
        print(f"Deleting missing column from test: {column}")
        del test_features[column]

train_features_labels = train_features.loc[:, ["is_phishing"]].copy(deep=True)
del train_features["is_phishing"]
del train_features["source"]
del train_features["target"]

validation_features_labels = validation_features.loc[:, ["is_phishing"]].copy(deep=True)
validation_features = validation_features.loc[:, train_features.columns]

test_features_labels = test_features.loc[:, ["is_phishing"]].copy(deep=True)
test_features = test_features.loc[:, train_features.columns]

CPU times: user 225 ms, sys: 295 ms, total: 520 ms
Wall time: 588 ms


In [56]:
def f1_eval(y, y_):
    return 1 - f1_score(y, np.round(y_))


def train_model(x, y, x_, y_, cv=False):
    if cv:
        model = xgb.XGBClassifier(
            early_stopping_rounds=10, scale_pos_weight=10,
            eval_metric=f1_eval, disable_default_eval_metric=True, num_parallel_tree=1, max_depth=6,
            colsample_bytree=1, subsample=1, n_estimators=200,
            enable_categorical=True,
        )
    else:
        model = xgb.XGBClassifier(
            early_stopping_rounds=10, scale_pos_weight=10,
            eval_metric=f1_eval, disable_default_eval_metric=True, 
            num_parallel_tree=10, max_depth=6,
            colsample_bytree=0.5, subsample=0.5, 
            n_estimators=200, enable_categorical=True,
        )
    model.fit(x, y, verbose=not cv, eval_set=[(x_, y_)])
    print(f"Best iteration: {model.best_iteration}\n")
    return model

In [55]:
%%time

model = train_model(
    train_features, train_features_labels["is_phishing"].values, 
    validation_features, validation_features_labels["is_phishing"].values,
)
y_test_predicted = model.predict(test_features)
f1_final = f1_score(test_features_labels["is_phishing"], y_test_predicted) * 100
print(
    round(f1_final, 2),
    round(recall_score(test_features_labels["is_phishing"], y_test_predicted) * 100, 2)
)
print()

[0]	validation_0-f1_eval:0.50215
[1]	validation_0-f1_eval:0.44037
[2]	validation_0-f1_eval:0.38793
[3]	validation_0-f1_eval:0.36963
[4]	validation_0-f1_eval:0.36568
[5]	validation_0-f1_eval:0.36762
[6]	validation_0-f1_eval:0.38913
[7]	validation_0-f1_eval:0.37905
[8]	validation_0-f1_eval:0.38101
[9]	validation_0-f1_eval:0.38016
[10]	validation_0-f1_eval:0.37641
[11]	validation_0-f1_eval:0.37503
[12]	validation_0-f1_eval:0.37472
[13]	validation_0-f1_eval:0.37604
[14]	validation_0-f1_eval:0.37558
[15]	validation_0-f1_eval:0.37610
[16]	validation_0-f1_eval:0.37860
[17]	validation_0-f1_eval:0.37894
[18]	validation_0-f1_eval:0.37788
[19]	validation_0-f1_eval:0.37825
[20]	validation_0-f1_eval:0.39230
[21]	validation_0-f1_eval:0.39169
[22]	validation_0-f1_eval:0.39159
[23]	validation_0-f1_eval:0.38846
[24]	validation_0-f1_eval:0.38608
[25]	validation_0-f1_eval:0.38597
[26]	validation_0-f1_eval:0.38411
[27]	validation_0-f1_eval:0.38232
[28]	validation_0-f1_eval:0.38446
[29]	validation_0-f1_eva

In [57]:
%%time

CV_FOLD_PERC = 0.8
N_FOLDS = 5

f1_scores = []
for fold in range(N_FOLDS):
    print("Fold", fold + 1)
    x_train = train_features.sample(frac=CV_FOLD_PERC)
    x_train_labels = x_train.loc[:, []].join(train_features_labels, how="left")
    x_validation = validation_features.sample(frac=CV_FOLD_PERC)
    x_validation_labels = x_validation.loc[:, []].join(validation_features_labels, how="left")
    model = train_model(
        x_train, x_train_labels["is_phishing"].values, 
        x_validation, x_validation_labels["is_phishing"].values,
        cv=True
    )
    y_test_predicted = model.predict(test_features)
    f1_cv = f1_score(test_features_labels["is_phishing"], y_test_predicted) * 100
    print(
        round(f1_cv, 2),
        round(recall_score(test_features_labels["is_phishing"], y_test_predicted) * 100, 2)
    )
    f1_scores.append(f1_cv)

Fold 1
Best iteration: 3

46.16 39.09
Fold 2
Best iteration: 11

59.4 52.6
Fold 3
Best iteration: 31

48.88 38.43
Fold 4
Best iteration: 2

45.24 38.02
Fold 5
Best iteration: 27

54.9 44.24
CPU times: user 40min 36s, sys: 2min 52s, total: 43min 28s
Wall time: 8min 25s


In [58]:
gfp_best = 51.49
gfp_std = 4.29

In [59]:
print(f"GFP best: {gfp_best} ± {gfp_std}")

GFP best: 51.49 ± 4.29


In [60]:
print(f"{round(f1_final, 2)} ±{round(np.std(f1_scores), 2)}")

62.85 ±5.42


In [61]:
uplift = round(((f1_final - gfp_best) / gfp_best) * 100, 2)
print(f"Uplift of {uplift}%")

Uplift of 22.06%


In [47]:
print((time.time() - start) // 60)

5.0
