In [1]:
import bisect
import os
import pickle
import random
import shutil
import sys
import time
import uuid
from glob import glob
from datetime import timedelta, datetime
from itertools import combinations

import igraph as ig
import numpy as np
import pandas as pd
import xgboost as xgb
import networkx as nx
from sklearn.metrics import f1_score, recall_score

import settings as s
from communities import get_communities_multi_proc
from features import get_features_multi_proc
from common import create_workload_for_multi_proc

%load_ext autoreload
%autoreload 2

The Ethereum dataset [15, 82] contains 1,165 accounts labelled as phishing.
To enable transaction classification using the ETH Phishing dataset,
we label a transaction of this dataset as phishing if its destination
account is labelled as phishing. As a result, 0.278% of Ethereum
transactions are labelled as phishing.

In [2]:
%%time

# SECURITY: pickle.load executes arbitrary code embedded in the file; only load
# MulDiGraph.pkl when it comes from a source you trust.
with open("./data/MulDiGraph.pkl", "rb") as f:
    G = pickle.load(f)

# Give every node a compact "id-<n>" name and remember its "isp" attribute
# (compared with 1 below to decide whether the account is phishing).
nodes_mapping = {}
phishing_nodes = {}
for idx, nd in enumerate(nx.nodes(G)):
    nodes_mapping[nd] = f"id-{idx}"
    phishing_nodes[nodes_mapping[nd]] = G.nodes[nd]["isp"]

# One row per edge yielded by nx.edges(G).
# NOTE(review): on a multigraph nx.edges(G) yields one (u, v) pair per parallel
# edge, but G[u][v][0] always reads the edge stored under key 0, so parallel
# edges all receive the first edge's amount/timestamp. Iterating
# G.edges(data=True) would read each edge's own attributes. Left unchanged here
# because the parquet files read later and the reported metrics were produced
# from this edge list and would have to be regenerated together.
# NOTE(review): datetime.fromtimestamp converts to the machine's local
# timezone, so the calendar dates used for the rate lookup depend on where the
# notebook runs - confirm whether UTC was intended.
rows = []
for edge in nx.edges(G):
    source, target = edge
    attrs = G[source][target][0]
    amount, timestamp = attrs["amount"], attrs["timestamp"]
    source, target = nodes_mapping[source], nodes_mapping[target]
    rows.append(
        {
            "source": source,
            "target": target,
            "timestamp": datetime.fromtimestamp(timestamp),
            "amount": amount,
        }
    )
data = pd.DataFrame(rows)
data = data.sort_values("timestamp").reset_index(drop=True)
data.index.name = "transaction_id"
size_orig = data.shape[0]
print(size_orig)
print()

# Only interested when "target" is phishing: a phishing node that never
# receives a transaction is relabelled as 0.
data.loc[:, "is_phishing"] = data.loc[:, "target"].apply(lambda x: phishing_nodes[x] == 1)
phishing_nodes_filtered = set(data.loc[data["is_phishing"], "target"].unique())
phishing_nodes = {k: 1 if k in phishing_nodes_filtered else 0 for k in phishing_nodes}
print(sum(phishing_nodes.values()))
print()

# Untouched (source, target) pair of every original transaction; used at the
# end of the notebook to expand predictions back to one row per transaction.
data_orig_copy = data.loc[:, ["source", "target"]].copy(deep=True)

13551303

1164

CPU times: user 34 s, sys: 1.47 s, total: 35.4 s
Wall time: 35.4 s


In [3]:
# Fractions of accounts (ordered by first activity) assigned to each split.
TRAIN_PERC = 0.65
VALIDATION_PERC = 0.15
TEST_PERC = 0.2

# Not referenced by the cells of this notebook; presumably read by the
# notebooks executed through %run - TODO confirm.
NUM_PROCS = 10

# Compare with a tolerance: exact equality on a sum of binary floats only holds
# by luck for some combinations of fractions.
assert np.isclose(TRAIN_PERC + VALIDATION_PERC + TEST_PERC, 1.0)

In [4]:
%%time

# Earliest timestamp at which each account appears as sender / as receiver.
source_firsts = data.groupby("source").agg(first_trx=("timestamp", "min"))
target_firsts = data.groupby("target").agg(first_trx=("timestamp", "min"))

# The outer join keeps accounts seen on one side only; the side they never
# appear on is NaT. min(axis=1) skips NaT, so no placeholder date (previously
# datetime.now()) and no row-wise Python apply are needed.
first_seen = source_firsts.join(target_firsts, lsuffix="_left", how="outer")
active_since = (
    first_seen.min(axis=1)
    .to_frame("active_since")
    .sort_values("active_since")
)

CPU times: user 25.4 s, sys: 217 ms, total: 25.6 s
Wall time: 25.6 s


In [5]:
# Chronological split by account: active_since is sorted by first activity, so
# the earliest TRAIN_PERC of accounts form the training set, the next
# VALIDATION_PERC the validation set, and whatever is left the test set.
number_of_train_accounts = int(np.floor(active_since.shape[0] * TRAIN_PERC))
number_of_validation_accounts = int(np.floor(active_since.shape[0] * VALIDATION_PERC))
train_accounts = set(active_since.head(number_of_train_accounts).index.tolist())
# Fails if the index contained duplicate account ids.
assert len(train_accounts) == number_of_train_accounts
remaining = active_since.loc[~active_since.index.isin(train_accounts), :].sort_values("active_since")
validation_accounts = set(remaining.head(number_of_validation_accounts).index.tolist())
assert len(validation_accounts) == number_of_validation_accounts
test_accounts = set(active_since.index) - train_accounts - validation_accounts
print(f"{len(train_accounts):,} | {len(validation_accounts):,} | {len(test_accounts):,}")
# The three sets must cover every account exactly once.
assert sorted(train_accounts | validation_accounts | test_accounts) == sorted(active_since.index)

1,932,767 | 446,023 | 594,699


In [6]:
# Collapse rows sharing (source, target, timestamp) into one row carrying the
# summed amount and the number of rows merged. Columns not listed here
# (including "is_phishing") are dropped by the aggregation.
data = data.groupby(["source", "target", "timestamp"]).agg(
    amount=("amount", "sum"),
    num_transactions=("amount", "count"),
).reset_index()
# Re-establish chronological order and a fresh 0..n-1 transaction_id index.
data = data.sort_values("timestamp").reset_index(drop=True)
data.index.name = "transaction_id"
size_aggd = data.shape[0]
# Row count after aggregation and its ratio to the original row count.
print(size_aggd, round(size_aggd / size_orig, 2))

5355155 0.4


In [7]:
%%time

# Daily conversion rate = midpoint of the day's low and high, keyed by the
# calendar date of "timeOpen".
rates = pd.read_csv("data/rates.csv", sep=";")
rates.loc[:, "rate"] = (rates["low"] + rates["high"]) / 2
rates.index = pd.to_datetime(rates["timeOpen"]).dt.date
rates = rates["rate"].to_dict()

# Vectorised lookup instead of a row-wise apply. A date without a rate still
# raises KeyError (as the per-row dict lookup did) rather than yielding NaN.
trx_dates = data["timestamp"].dt.date
missing_dates = set(trx_dates.unique()) - set(rates)
if missing_dates:
    raise KeyError(f"no rate available for transaction dates such as {sorted(missing_dates)[:5]}")
data.loc[:, "amount_usd"] = trx_dates.map(rates) * data["amount"]

# Record zero-value transactions before amounts are clamped below.
data.loc[:, "is_zero_transaction"] = data.loc[:, "amount"] == 0

# Clamp tiny amounts to 1e-6 so no amount is zero downstream.
data.loc[data["amount"] < 1e-6, "amount"] = 1e-6
data.loc[data["amount_usd"] < 1e-6, "amount_usd"] = 1e-6
data = data.astype({"amount": np.float32, "amount_usd": np.float32})
columns = [
    "source", "target", "timestamp", "num_transactions",
    "amount", "amount_usd", "is_zero_transaction",
]
data = data.loc[:, columns]

CPU times: user 19.6 s, sys: 226 ms, total: 19.9 s
Wall time: 19.9 s


In [8]:
# %%time

# num_unique = data["source"].nunique()
# source_dispensation = []
# for index, (_, group) in enumerate(data[["source", "amount_usd"]].groupby("source")):
#     group.loc[:, "source_dispensation"] = group["amount_usd"].cumsum()
#     source_dispensation.append(group)
#     if not (index % 200_000):
#         print(index, num_unique)
# source_dispensation = pd.concat(source_dispensation, ignore_index=False)
# source_dispensation.to_parquet("source_dispensation.parquet")

In [9]:
# Running total of USD sent by each source, in transaction_id order.
# Computed directly instead of being read from "source_dispensation.parquet":
# the code that wrote that file is commented out, so a fresh checkout could not
# run this cell, and a stale file would be joined on transaction_id silently.
source_dispensation = data[["source", "amount_usd"]].assign(
    source_dispensation=data.groupby("source")["amount_usd"].cumsum()
)

In [10]:
# %%time

# num_unique = data["target"].nunique()
# target_accumulation = []
# for index, (_, group) in enumerate(data[["target", "amount_usd"]].groupby("target")):
#     group.loc[:, "target_accumulation"] = group["amount_usd"].cumsum()
#     target_accumulation.append(group)
#     if not (index % 200_000):
#         print(index, num_unique)
# target_accumulation = pd.concat(target_accumulation, ignore_index=False)
# target_accumulation.to_parquet("target_accumulation.parquet")

In [11]:
# Running total of USD received by each target, in transaction_id order.
# Computed directly instead of being read from "target_accumulation.parquet":
# the code that wrote that file is commented out, so a fresh checkout could not
# run this cell, and a stale file would be joined on transaction_id silently.
target_accumulation = data[["target", "amount_usd"]].assign(
    target_accumulation=data.groupby("target")["amount_usd"].cumsum()
)

In [12]:
# Attach both running totals to the transactions (all three frames are indexed
# by transaction_id) and restore transaction_id order.
data = (
    source_dispensation.loc[:, ["source_dispensation"]]
    .join(target_accumulation.loc[:, ["target_accumulation"]])
    .join(data)
    .sort_index()
)

In [13]:
%%time

# Per account: (list of transaction ids, list of running totals at those ids).
# The id lists are in ascending order because `data` is sorted by its index,
# which is what allows the bisect lookups in get_dis_acc_data.
dispensation_mapping = {}
for source, group in data[["source", "source_dispensation"]].groupby("source"):
    dispensation_mapping[source] = (group.index.tolist(), group["source_dispensation"].tolist())

# Same structure for the amounts received by each account.
accumulation_mapping = {}
for target, group in data[["target", "target_accumulation"]].groupby("target"):
    accumulation_mapping[target] = (group.index.tolist(), group["target_accumulation"].tolist())

CPU times: user 53.4 s, sys: 5.68 s, total: 59.1 s
Wall time: 53.8 s


In [14]:
def get_dis_acc_data(node, mapping_dis, mapping_acc, trx_id):
    """Return (dispensed, accumulated) running totals of `node` as of `trx_id`.

    Each mapping associates a node with a pair
    (ascending transaction ids, running totals at those ids). The total taken
    is the one at the last id <= trx_id, so the transaction `trx_id` itself is
    included when it belongs to the node; 0 is used when the node has no entry
    at or before `trx_id`, or is absent from that mapping.

    Raises:
        KeyError: if `node` is missing from both mappings.
    """

    def _total_as_of(history):
        # Number of recorded ids that are <= trx_id.
        position = bisect.bisect_right(history[0], trx_id)
        return history[1][position - 1] if position else 0

    dispensed_history = mapping_dis.get(node)
    if dispensed_history is None:
        # Receiver-only node; plain indexing keeps the KeyError for unknown nodes.
        return 0, _total_as_of(mapping_acc[node])

    accumulated_history = mapping_acc.get(node)
    if accumulated_history is None:
        # Sender-only node.
        return _total_as_of(dispensed_history), 0

    return _total_as_of(dispensed_history), _total_as_of(accumulated_history)

In [15]:
def source_dis_acc_data(row):
    """Row-wise adapter: running (dispensed, accumulated) totals of the row's source.

    `row.name` (the row's index label) is passed as the transaction id.
    """
    node, trx_id = row["source"], row.name
    return get_dis_acc_data(node, dispensation_mapping, accumulation_mapping, trx_id)


def target_dis_acc_data(row):
    """Row-wise adapter: running (dispensed, accumulated) totals of the row's target.

    `row.name` (the row's index label) is passed as the transaction id.
    """
    node, trx_id = row["target"], row.name
    return get_dis_acc_data(node, dispensation_mapping, accumulation_mapping, trx_id)

In [16]:
%%time

# Temporary tuple columns: (dispensed so far, accumulated so far) for the
# sender and for the receiver of every transaction.
data.loc[:, "dis_acc_source"] = data.apply(source_dis_acc_data, axis=1)
data.loc[:, "dis_acc_target"] = data.apply(target_dis_acc_data, axis=1)

CPU times: user 35.7 s, sys: 562 ms, total: 36.3 s
Wall time: 36.3 s


In [17]:
def _add_dis_acc_features(frame, side):
    """Add the five balance features of `side` ("source" or "target") to `frame` in place.

    Reads the tuple column "dis_acc_<side>", whose items are
    (dispensed so far, accumulated so far).
    """
    pairs = frame.loc[:, f"dis_acc_{side}"]
    # True when the account has sent more than it has received.
    frame.loc[:, f"{side}_more_dispensed"] = pairs.apply(lambda p: p[0] > p[1])
    # Ratios are capped at 1: the smaller total is divided by the larger one.
    frame.loc[:, f"{side}_dis_acc_ratio"] = pairs.apply(
        lambda p: p[0] / (p[1] or 1) if p[0] < p[1] else 1
    )
    frame.loc[:, f"{side}_acc_dis_ratio"] = pairs.apply(
        lambda p: p[1] / (p[0] or 1) if p[1] < p[0] else 1
    )
    # Net balance split into a non-negative surplus and a non-negative deficit.
    frame.loc[:, f"{side}_positive_balance"] = pairs.apply(
        lambda p: p[1] - p[0] if p[1] > p[0] else 0
    )
    frame.loc[:, f"{side}_negative_balance"] = pairs.apply(
        lambda p: p[0] - p[1] if p[0] > p[1] else 0
    )


_add_dis_acc_features(data, "source")
_add_dis_acc_features(data, "target")

In [18]:
# The tuple columns were only needed to derive the balance features.
del data["dis_acc_source"]
del data["dis_acc_target"]

In [19]:
%%time

# NOTE: rebinding active_since from a DataFrame to a dict means this cell
# cannot be re-run without first re-running the cell that builds the frame.
active_since = active_since["active_since"].to_dict()
# last_trx_ts, first_trx_ts and active_for are not used by any cell of this
# notebook; presumably they are read by the notebooks run via %run - TODO confirm.
last_trx_ts = data["timestamp"].max() + timedelta(hours=1)
first_trx_ts = data["timestamp"].min() - timedelta(hours=1)
active_for = {k : (last_trx_ts - v).total_seconds() for k, v in active_since.items()}

# Seconds between each party's first recorded activity and this transaction.
data.loc[:, "source_active_for"] = data.apply(
    lambda x: (x["timestamp"] - active_since[x["source"]]).total_seconds(), axis=1
)
data.loc[:, "target_active_for"] = data.apply(
    lambda x: (x["timestamp"] - active_since[x["target"]]).total_seconds(), axis=1
)

CPU times: user 49.5 s, sys: 1.03 s, total: 50.6 s
Wall time: 50.6 s


In [20]:
def update_types(df):
    """Return a copy of `df` with amounts rounded up and columns downcast.

    The columns in `round_columns` are rounded up with np.ceil, then every
    column listed in `new_types` is cast to the given compact dtype. The input
    frame is left untouched (the rounding used to be written back into it).

    Args:
        df: frame containing every column named in `new_types`.

    Returns:
        A new DataFrame with the same columns in the same order.

    Raises:
        KeyError: if a column named in `new_types` is missing from `df`.
    """
    round_columns = [
        "source_dispensation",
        "target_accumulation",
        "amount",
        "amount_usd",
        "source_active_for",
        "target_active_for",
    ]
    new_types = {
        "source_dispensation": np.uint64,
        "target_accumulation": np.uint64,
        "num_transactions": np.uint16,
        "amount": np.uint64,
        "amount_usd": np.uint64,
        "source_dis_acc_ratio": np.float16,
        "source_acc_dis_ratio": np.float16,
        "source_positive_balance": np.uint64,
        "source_negative_balance": np.uint64,
        "target_dis_acc_ratio": np.float16,
        "target_acc_dis_ratio": np.float16,
        "target_positive_balance": np.uint64,
        "target_negative_balance": np.uint64,
        "source_active_for": np.uint32,
        "target_active_for": np.uint32,
    }
    # assign() works on a copy, so the caller's frame keeps its original values;
    # existing columns keep their position.
    rounded = df.assign(**{col: np.ceil(df[col]) for col in round_columns})
    return rounded.astype(new_types)

In [21]:
data = update_types(data)
# Label every transaction by its receiver: phishing when the target account is
# flagged in phishing_nodes.
data.loc[:, "is_phishing"] = data.loc[:, "target"].apply(lambda x: bool(phishing_nodes[x]))

In [22]:
# The index must be exactly 0..n-1 for the positional transaction ids to hold.
assert data.index.tolist() == list(range(data.shape[0]))


def _both_endpoints_in(accounts):
    """Boolean mask of transactions whose source and target are both in `accounts`."""
    return data["source"].isin(accounts) & data["target"].isin(accounts)


# A transaction belongs to a split only when both of its accounts do, so
# transactions crossing two splits appear in none of train/validation/test.
train = data.loc[_both_endpoints_in(train_accounts), :]
validation = data.loc[_both_endpoints_in(validation_accounts), :]
train_validation = data.loc[_both_endpoints_in(train_accounts | validation_accounts), :]
test = data.loc[_both_endpoints_in(test_accounts), :]

# Share of all transactions that ends up in each split.
print(*(round(split.shape[0] / data.shape[0], 2) for split in (train, validation, test)))

assert set(train.index).isdisjoint(validation.index)
assert set(validation.index).isdisjoint(test.index)
assert set(train.index).isdisjoint(test.index)

0.67 0.06 0.04


In [23]:
# Number of phishing accounts per split, followed by the overall count.
phishing_accounts = [node for node, flag in phishing_nodes.items() if flag == 1]
print(
    len(train_accounts.intersection(phishing_accounts)),
    len(validation_accounts.intersection(phishing_accounts)),
    len(test_accounts.intersection(phishing_accounts)),
    len(phishing_accounts),
)

502 354 308 1164


In [24]:
def get_trx_features(df, source_target):
    """Aggregate per-account transaction statistics.

    Groups `df` by the column named `source_target` and computes summary
    statistics of the transaction-level columns. The result is indexed by
    account (index name "key") and its columns are named
    "trx_feats_<source_target>_<column>_<statistic>".
    """
    ratio_stats = ["mean", "std"]
    balance_stats = ["max", "mean", "std"]
    activity_stats = ["max", "std"]
    # Insertion order fixes the order of the output columns.
    aggregations = dict([
        ("source_dispensation", ["max"]),
        ("target_accumulation", ["max"]),
        ("amount_usd", ["mean", "median", "max", "std"]),  # skew, kurtosis ?
        ("num_transactions", ["sum", "count"]),
        ("is_zero_transaction", ["sum"]),
        ("source_dis_acc_ratio", list(ratio_stats)),
        ("source_acc_dis_ratio", list(ratio_stats)),
        ("target_dis_acc_ratio", list(ratio_stats)),
        ("target_acc_dis_ratio", list(ratio_stats)),
        ("source_positive_balance", list(balance_stats)),
        ("source_negative_balance", list(balance_stats)),
        ("target_positive_balance", list(balance_stats)),
        ("target_negative_balance", list(balance_stats)),
        ("source_active_for", list(activity_stats)),
        ("target_active_for", list(activity_stats)),
    ])
    per_account = df.groupby(source_target).agg(aggregations)

    # Flatten the (column, statistic) MultiIndex into single strings.
    prefix = f"trx_feats_{source_target}"
    flat_names = []
    for column, statistic in per_account.columns:
        flat_names.append(f"{prefix}_{column}_{statistic}")
    per_account.columns = flat_names
    per_account.index.name = "key"
    return per_account

In [25]:
%%time

# Per-account statistics as sender and as receiver, outer-joined so accounts
# seen on only one side are kept (with NaN for the other side).
# Each split sees a wider window: train only, train+validation, then all data.
train_trx_features = get_trx_features(train, "source").join(
    get_trx_features(train, "target"), how="outer"
)
validation_trx_features = get_trx_features(train_validation, "source").join(
    get_trx_features(train_validation, "target"), how="outer"
)
test_trx_features = get_trx_features(data, "source").join(
    get_trx_features(data, "target"), how="outer"
)

CPU times: user 15.2 s, sys: 1.79 s, total: 17 s
Wall time: 17.2 s


In [26]:
# Wall-clock reference for the feature-generation stage; the elapsed minutes
# are printed once the test features have been written.
start = time.time()

In [27]:
# shutil.rmtree(location_main_features, ignore_errors=True)

In [28]:
# Output locations; the per-split paths end with a separator so file names can
# be appended directly.
location_main_features = "features"
location_train = f"{location_main_features}{os.sep}train{os.sep}"
location_validation = f"{location_main_features}{os.sep}validation{os.sep}"
location_test = f"{location_main_features}{os.sep}test{os.sep}"

# exist_ok=True tolerates an existing directory but, unlike a blanket
# `except FileExistsError: pass`, still fails if the path is a regular file.
os.makedirs(location_main_features, exist_ok=True)

In [29]:
%%time

# Inputs for communities_global.ipynb (train split): total USD per
# (source, target) pair plus the node sets of the split.
data_agg = (
    train.groupby(["source", "target"])
    .agg(
        amount=("amount_usd", "sum")
    )
).reset_index()
nodes_source = set(train["source"].unique())
nodes_target = set(train["target"].unique())
# Nodes that both send and receive within the split.
nodes_passthrough = nodes_source.intersection(nodes_target)

# The three communities_as_*_features frames saved below are not assigned
# anywhere in this notebook; presumably this %run defines them from the
# variables set above - TODO confirm against communities_global.ipynb.
%run communities_global.ipynb

communities_as_source_features.to_parquet(f"{location_main_features}/train_communities_as_source_features.parquet")
communities_as_target_features.to_parquet(f"{location_main_features}/train_communities_as_target_features.parquet")
communities_as_passthrough_features.to_parquet(f"{location_main_features}/train_communities_as_passthrough_features.parquet")

Processing communities_as_source

Processed hop #1 | 2,731,722 | 1,317,135
Processed hop #2 | 19,331,689 | 717,007
Processed hop #3 | 19,170,949 | 593,559
Processed hop #4 | 22,535,755 | 590,463
Processing communities_as_target

Processed hop #1 | 1,133,184 | 803,740
Processed hop #2 | 25,956,156 | 796,967
Processed hop #3 | 24,870,974 | 778,478
Processed hop #4 | 35,492,511 | 774,624
Processing communities_as_passthrough

Processed hop #1 | 910,130 | 188,108
Processed hop #2 | 3,653,972 | 132,093
Processed hop #3 | 3,791,310 | 112,810
Processed hop #4 | 4,441,974 | 111,536
CPU times: user 13min 22s, sys: 1min 23s, total: 14min 46s
Wall time: 15min 1s


In [30]:
# Reload the train community features from disk, so the long community
# computation can be skipped when its output files already exist.
communities_as_source_features = pd.read_parquet(f"{location_main_features}/train_communities_as_source_features.parquet")
communities_as_target_features = pd.read_parquet(f"{location_main_features}/train_communities_as_target_features.parquet")
communities_as_passthrough_features = pd.read_parquet(f"{location_main_features}/train_communities_as_passthrough_features.parquet")

In [31]:
%%time

try:
    os.makedirs(location_train)
except FileExistsError:
    pass

in_scope_window = train.copy(deep=True)
in_scope_nodes = list(set(train["source"].unique()).union(train["target"].unique()))
%run model_experiment_nested.ipynb
train.set_index("target").join(
    communities_as_source_features, how="left", rsuffix="_sf_target"
).reset_index().set_index("source").join(
    communities_as_source_features, how="left", rsuffix="_sf_source"
).reset_index().set_index("target").join(
    communities_as_target_features, how="left", rsuffix="_tf_target"
).reset_index().set_index("source").join(
    communities_as_target_features, how="left", rsuffix="_tf_source"
).reset_index().set_index("target").join(
    communities_as_passthrough_features, how="left", rsuffix="_pf_target"
).reset_index().set_index("source").join(
    communities_as_passthrough_features, how="left", rsuffix="_pf_source"
).reset_index().set_index("target").join(
    train_trx_features, how="left", rsuffix="_trx_target"
).reset_index().set_index("source").join(
    train_trx_features, how="left", rsuffix="_trx_source"
).reset_index().set_index("target").join(
    features_all.set_index("key"), how="left", rsuffix="_gf_target"
).reset_index().set_index("source").join(
    features_all.set_index("key"), how="left", rsuffix="_gf_source"
).reset_index().to_parquet(f"{location_train}data.parquet")

del train_trx_features

5 91 9033324
CPU times: user 4min 16s, sys: 1min 10s, total: 5min 26s
Wall time: 2h 10min 32s


In [32]:
%%time

# Inputs for communities_global.ipynb (validation split): edges come from the
# train+validation window, while the node sets are restricted to validation.
data_agg = (
    train_validation.groupby(["source", "target"])
    .agg(
        amount=("amount_usd", "sum")
    )
).reset_index()
nodes_source = set(validation["source"].unique())
nodes_target = set(validation["target"].unique())
# Nodes that both send and receive within the split.
nodes_passthrough = nodes_source.intersection(nodes_target)

# The three communities_as_*_features frames saved below are not assigned
# anywhere in this notebook; presumably this %run defines them from the
# variables set above - TODO confirm against communities_global.ipynb.
%run communities_global.ipynb

communities_as_source_features.to_parquet(f"{location_main_features}/valid_communities_as_source_features.parquet")
communities_as_target_features.to_parquet(f"{location_main_features}/valid_communities_as_target_features.parquet")
communities_as_passthrough_features.to_parquet(f"{location_main_features}/valid_communities_as_passthrough_features.parquet")

Processing communities_as_source

Processed hop #1 | 357,609 | 178,569
Processed hop #2 | 1,081,137 | 87,342
Processed hop #3 | 740,762 | 77,555
Processed hop #4 | 646,741 | 76,702
Processing communities_as_target

Processed hop #1 | 67,761 | 29,249
Processed hop #2 | 766,632 | 25,546
Processed hop #3 | 269,533 | 18,632
Processed hop #4 | 756,821 | 18,336
Processing communities_as_passthrough

Processed hop #1 | 31,536 | 5,519
Processed hop #2 | 66,064 | 3,346
Processed hop #3 | 54,074 | 2,543
Processed hop #4 | 67,521 | 2,285
CPU times: user 32.7 s, sys: 4.35 s, total: 37.1 s
Wall time: 45.2 s


In [33]:
# Reload the validation community features from disk, so the community
# computation can be skipped when its output files already exist.
communities_as_source_features = pd.read_parquet(f"{location_main_features}/valid_communities_as_source_features.parquet")
communities_as_target_features = pd.read_parquet(f"{location_main_features}/valid_communities_as_target_features.parquet")
communities_as_passthrough_features = pd.read_parquet(f"{location_main_features}/valid_communities_as_passthrough_features.parquet")

In [34]:
%%time

try:
    os.makedirs(location_validation)
except FileExistsError:
    pass

in_scope_window = train_validation.copy(deep=True)
in_scope_nodes = list(set(validation["source"].unique()).union(validation["target"].unique()))
%run model_experiment_nested.ipynb
validation.set_index("target").join(
    communities_as_source_features, how="left", rsuffix="_sf_target"
).reset_index().set_index("source").join(
    communities_as_source_features, how="left", rsuffix="_sf_source"
).reset_index().set_index("target").join(
    communities_as_target_features, how="left", rsuffix="_tf_target"
).reset_index().set_index("source").join(
    communities_as_target_features, how="left", rsuffix="_tf_source"
).reset_index().set_index("target").join(
    communities_as_passthrough_features, how="left", rsuffix="_pf_target"
).reset_index().set_index("source").join(
    communities_as_passthrough_features, how="left", rsuffix="_pf_source"
).reset_index().set_index("target").join(
    validation_trx_features, how="left", rsuffix="_trx_target"
).reset_index().set_index("source").join(
    validation_trx_features, how="left", rsuffix="_trx_source"
).reset_index().set_index("target").join(
    features_all.set_index("key"), how="left", rsuffix="_gf_target"
).reset_index().set_index("source").join(
    features_all.set_index("key"), how="left", rsuffix="_gf_source"
).reset_index().to_parquet(f"{location_validation}data.parquet")

del validation_trx_features

5 92 912230
CPU times: user 3min 24s, sys: 13.4 s, total: 3min 37s
Wall time: 31min 37s


In [35]:
%%time

# Inputs for communities_global.ipynb (test split): edges come from the full
# transaction set, while the node sets are restricted to the test split.
data_agg = (
    data.groupby(["source", "target"])
    .agg(
        amount=("amount_usd", "sum")
    )
).reset_index()
nodes_source = set(test["source"].unique())
nodes_target = set(test["target"].unique())
# Nodes that both send and receive within the split.
nodes_passthrough = nodes_source.intersection(nodes_target)

# The three communities_as_*_features frames saved below are not assigned
# anywhere in this notebook; presumably this %run defines them from the
# variables set above - TODO confirm against communities_global.ipynb.
%run communities_global.ipynb

communities_as_source_features.to_parquet(f"{location_main_features}/test_communities_as_source_features.parquet")
communities_as_target_features.to_parquet(f"{location_main_features}/test_communities_as_target_features.parquet")
communities_as_passthrough_features.to_parquet(f"{location_main_features}/test_communities_as_passthrough_features.parquet")

Processing communities_as_source

Processed hop #1 | 229,905 | 146,344
Processed hop #2 | 2,156,008 | 86,402
Processed hop #3 | 816,499 | 77,163
Processed hop #4 | 1,834,126 | 74,966
Processing communities_as_target

Processed hop #1 | 92,315 | 44,292
Processed hop #2 | 1,611,384 | 42,234
Processed hop #3 | 490,965 | 40,678
Processed hop #4 | 1,655,175 | 40,446
Processing communities_as_passthrough

Processed hop #1 | 31,946 | 7,629
Processed hop #2 | 237,115 | 6,589
Processed hop #3 | 95,876 | 5,946
Processed hop #4 | 230,346 | 5,333
CPU times: user 42.6 s, sys: 1.44 s, total: 44.1 s
Wall time: 44.2 s


In [36]:
# Reload the test community features from disk, so the community computation
# can be skipped when its output files already exist.
communities_as_source_features = pd.read_parquet(f"{location_main_features}/test_communities_as_source_features.parquet")
communities_as_target_features = pd.read_parquet(f"{location_main_features}/test_communities_as_target_features.parquet")
communities_as_passthrough_features = pd.read_parquet(f"{location_main_features}/test_communities_as_passthrough_features.parquet")

In [37]:
%%time

try:
    os.makedirs(location_test)
except FileExistsError:
    pass

in_scope_window = data.copy(deep=True)
in_scope_nodes = list(set(test["source"].unique()).union(test["target"].unique()))
%run model_experiment_nested.ipynb
test.set_index("target").join(
    communities_as_source_features, how="left", rsuffix="_sf_target"
).reset_index().set_index("source").join(
    communities_as_source_features, how="left", rsuffix="_sf_source"
).reset_index().set_index("target").join(
    communities_as_target_features, how="left", rsuffix="_tf_target"
).reset_index().set_index("source").join(
    communities_as_target_features, how="left", rsuffix="_tf_source"
).reset_index().set_index("target").join(
    communities_as_passthrough_features, how="left", rsuffix="_pf_target"
).reset_index().set_index("source").join(
    communities_as_passthrough_features, how="left", rsuffix="_pf_source"
).reset_index().set_index("target").join(
    test_trx_features, how="left", rsuffix="_trx_target"
).reset_index().set_index("source").join(
    test_trx_features, how="left", rsuffix="_trx_source"
).reset_index().set_index("target").join(
    features_all.set_index("key"), how="left", rsuffix="_gf_target"
).reset_index().set_index("source").join(
    features_all.set_index("key"), how="left", rsuffix="_gf_source"
).reset_index().to_parquet(f"{location_test}data.parquet")

del test_trx_features

4 84 789229
CPU times: user 4min 7s, sys: 15 s, total: 4min 22s
Wall time: 34min 30s


In [38]:
# Whole minutes elapsed since `start` was recorded.
print((time.time() - start) // 60)

213.0


In [39]:
def f1_eval(y, y_):
    """Evaluation metric for XGBoost: 1 - F1, so that lower is better.

    `y` holds the true labels and `y_` the predicted scores, which are rounded
    to the nearest integer before the F1 score is computed.
    """
    hard_predictions = np.round(y_)
    score = f1_score(y, hard_predictions)
    return 1 - score

In [40]:
def train_model(x, y, x_, y_):
    """Fit an XGBoost classifier on (x, y) with early stopping on (x_, y_).

    The evaluation set is scored with `f1_eval`; fitting stops after 20 rounds
    without improvement. The best iteration is printed and the fitted model is
    returned.
    """
    hyper_parameters = dict(
        early_stopping_rounds=20,
        scale_pos_weight=10,
        eval_metric=f1_eval,
        disable_default_eval_metric=True,
        num_parallel_tree=15,
        colsample_bytree=0.5,
        subsample=0.5,
        max_depth=6,
    )
    model = xgb.XGBClassifier(**hyper_parameters)
    model.fit(x, y, eval_set=[(x_, y_)], verbose=False)
    print(f"Best iteration: {model.best_iteration}\n")
    return model

In [41]:
%%time

# Columns that identify or label a transaction and must not be fed to the model.
label_columns = ["source", "target", "timestamp", "is_phishing"]


def _load_features_and_labels(location):
    """Read `<location>data.parquet` and split it into (features, labels).

    The label columns are removed from the feature frame in place, one by one,
    so no second copy of the wide feature frame is created.
    """
    features = pd.read_parquet(f"{location}data.parquet")
    labels = features.loc[:, label_columns]
    for label_column in label_columns:
        del features[label_column]
    return features, labels


train_features, train_features_labels = _load_features_and_labels(location_train)
validation_features, validation_features_labels = _load_features_and_labels(location_validation)
test_features, test_features_labels = _load_features_and_labels(location_test)

model = train_model(
    train_features, train_features_labels["is_phishing"].values,
    validation_features, validation_features_labels["is_phishing"].values
)

# Scores on the aggregated test rows.
y_test_predicted = model.predict(test_features)
print("F1", round(f1_score(test_features_labels["is_phishing"], y_test_predicted) * 100, 2))
print("Recall", round(recall_score(test_features_labels["is_phishing"], y_test_predicted) * 100, 2))

# Expand the predictions to one row per original transaction by joining on the
# (source, target) pair, then score again.
test_features_labels.loc[:, "predicted"] = y_test_predicted
pair_key = ["source", "target"]
test_labels_orig = (
    test_features_labels.set_index(pair_key)
    .join(data_orig_copy.set_index(pair_key), how="inner")
    .reset_index()
)

print()
f1_final = round(f1_score(test_labels_orig["is_phishing"], test_labels_orig["predicted"]) * 100, 2)
print("F1", f1_final)
print("Recall", round(recall_score(test_labels_orig["is_phishing"], test_labels_orig["predicted"]) * 100, 2))
print()
print()

Best iteration: 15

F1 68.62
Recall 59.2

F1 64.11
Recall 54.54

CPU times: user 34min 19s, sys: 11.9 s, total: 34min 31s
Wall time: 4min


In [42]:
%%time

# Repeated random sub-sampling: each round trains on a random CV_FOLD_PERC of
# the train rows, early-stops on a random CV_FOLD_PERC of the validation rows,
# and is scored on the full test set.
CV_FOLD_PERC = 0.8
N_FOLDS = 5

f1_scores = []
for fold in range(N_FOLDS):
    print("Fold", fold + 1)
    # Seed each round with its fold number so the samples - and therefore the
    # reported spread - are reproducible across runs while differing per fold.
    x_train = train_features.sample(frac=CV_FOLD_PERC, random_state=fold)
    # Joining an empty column selection fetches the labels of the sampled rows.
    x_train_labels = x_train.loc[:, []].join(train_features_labels, how="left")
    x_validation = validation_features.sample(frac=CV_FOLD_PERC, random_state=fold)
    x_validation_labels = x_validation.loc[:, []].join(validation_features_labels, how="left")
    model = train_model(
        x_train, x_train_labels["is_phishing"].values,
        x_validation, x_validation_labels["is_phishing"].values
    )
    y_test_predicted = model.predict(test_features)
    test_features_labels.loc[:, "predicted"] = y_test_predicted
    # Expand the predictions to one row per original transaction.
    test_labels_orig = test_features_labels.set_index(["source", "target"]).join(
        data_orig_copy.set_index(["source", "target"]), how="inner"
    ).reset_index()
    f1_cv = f1_score(test_labels_orig["is_phishing"], test_labels_orig["predicted"]) * 100
    print(
        round(f1_cv, 2),
        round(recall_score(test_labels_orig["is_phishing"], test_labels_orig["predicted"]) * 100, 2)
    )
    f1_scores.append(f1_cv)

Fold 1
Best iteration: 22

63.12 52.04
Fold 2
Best iteration: 26

62.47 52.29
Fold 3
Best iteration: 20

64.11 54.18
Fold 4
Best iteration: 23

62.04 51.53
Fold 5
Best iteration: 15

64.25 54.59
CPU times: user 3h 6min 21s, sys: 1min 23s, total: 3h 7min 45s
Wall time: 21min 7s


In [43]:
# F1 of the model trained on the full train set, with the standard deviation
# of the F1 scores collected in f1_scores as the spread.
print(f"{f1_final} ±{round(np.std(f1_scores), 2)}")

64.11 ±0.87
