In [1]:
import bisect
import os
import pickle
import random
import shutil
import sys
import time
import uuid
from glob import glob
from datetime import timedelta, datetime
from itertools import combinations

import igraph as ig
import numpy as np
import pandas as pd
import xgboost as xgb
import networkx as nx
from sklearn.metrics import f1_score, recall_score

import settings as s
from communities import get_communities_multi_proc
from features import get_features_multi_proc, pov_features
from common import create_workload_for_multi_proc

%load_ext autoreload
%autoreload 2

In [2]:
%%time

with open("./data/MulDiGraph.pkl", "rb") as f:
    G = pickle.load(f)

nodes_mapping = {}
phishing_nodes = {}
for idx, nd in enumerate(nx.nodes(G)):
    nodes_mapping[nd] = f"id-{idx}"
    phishing_nodes[nodes_mapping[nd]] = G.nodes[nd]["isp"]

rows = []
for edge in nx.edges(G):
    source, target = edge
    attrs = G[source][target][0]
    amount, timestamp = attrs["amount"], attrs["timestamp"]
    source, target = nodes_mapping[source], nodes_mapping[target]
    rows.append(
        {
            "source": source, 
            "target": target,
            "timestamp": datetime.fromtimestamp(timestamp),
            "amount": amount, 
        }
    )
data = pd.DataFrame(rows)
data = data.sort_values("timestamp").reset_index(drop=True)
data.index.name = "transaction_id"
size_orig = data.shape[0]
print(size_orig)
print()

13551303

CPU times: user 31.7 s, sys: 1.43 s, total: 33.1 s
Wall time: 33.1 s


In [3]:
# Share of accounts assigned to each split (accounts are later ordered by
# their first activity and cut according to these fractions).
TRAIN_PERC = 0.65
VALIDATION_PERC = 0.15
TEST_PERC = 0.2

# presumably the worker count for the multi-process helpers — TODO confirm
NUM_PROCS = 10

# Compare with a tolerance: sums of binary floats such as 0.7 + 0.1 + 0.2 are
# not exactly 1, so a strict == would reject perfectly valid configurations.
assert np.isclose(TRAIN_PERC + VALIDATION_PERC + TEST_PERC, 1.0)

In [4]:
%%time

source_firsts = data.groupby("source").agg(first_trx=("timestamp", "min"))
target_firsts = data.groupby("target").agg(first_trx=("timestamp", "min"))
active_since = source_firsts.join(target_firsts, lsuffix="_left", how="outer").fillna(datetime.now())
active_since.loc[:, "active_since"] = active_since.apply(lambda x: min([x["first_trx_left"], x["first_trx"]]), axis=1)
active_since = active_since.loc[:, ["active_since"]]
active_since.sort_values("active_since", inplace=True)

CPU times: user 25.4 s, sys: 239 ms, total: 25.7 s
Wall time: 25.7 s


In [5]:
# Number of accounts per split, rounded down from the configured shares.
number_of_train_accounts = int(np.floor(active_since.shape[0] * TRAIN_PERC))
number_of_validation_accounts = int(np.floor(active_since.shape[0] * VALIDATION_PERC))
# active_since is sorted by first activity, so the head holds the accounts
# that became active earliest.
train_accounts = set(active_since.head(number_of_train_accounts).index.tolist())
assert len(train_accounts) == number_of_train_accounts
# Validation accounts are the next-earliest ones among those not in train.
remaining = active_since.loc[~active_since.index.isin(train_accounts), :].sort_values("active_since")
validation_accounts = set(remaining.head(number_of_validation_accounts).index.tolist())
assert len(validation_accounts) == number_of_validation_accounts
# Everything left over (the most recently active accounts) is the test split.
test_accounts = set(active_since.index) - train_accounts - validation_accounts
print(f"{len(train_accounts):,} | {len(validation_accounts):,} | {len(test_accounts):,}")
# The three splits together must cover every account exactly.
assert sorted(train_accounts | validation_accounts | test_accounts) == sorted(active_since.index)

1,932,767 | 446,023 | 594,699


In [6]:
# Collapse rows that share sender, receiver and timestamp into one row that
# carries the summed amount and the number of rows merged, then restore the
# chronological transaction_id index.
data = (
    data.groupby(["source", "target", "timestamp"])["amount"]
    .agg(amount="sum", num_transactions="count")
    .reset_index()
    .sort_values("timestamp")
    .reset_index(drop=True)
    .rename_axis("transaction_id")
)
size_aggd = data.shape[0]
# Row count after aggregation and its ratio to the original row count.
print(size_aggd, round(size_aggd / size_orig, 2))

5355155 0.4


In [7]:
%%time

# Daily exchange rates: semicolon-separated file; the rate used is the
# midpoint of the "low" and "high" columns, keyed by the date of "timeOpen".
rates = pd.read_csv("data/rates.csv", sep=";")
rates.loc[:, "rate"] = (rates["low"] + rates["high"]) / 2
rates.index = pd.to_datetime(rates["timeOpen"]).dt.date
rates = rates["rate"].to_dict()
# Amount multiplied by the rate of the transaction's date; a date missing
# from the rates file raises KeyError here.
data.loc[:, "amount_usd"] = data.apply(lambda x: rates[x["timestamp"].date()] * x["amount"], axis=1)
# Flag zero-value rows BEFORE the amounts are floored below.
data.loc[:, "is_zero_transaction"] = data.loc[:, "amount"] == 0

# Floor both amount columns at 1e-6.
# presumably to keep later ratio features away from zero — TODO confirm
data.loc[data["amount"] < 1e-6, "amount"] = 1e-6
data.loc[data["amount_usd"] < 1e-6, "amount_usd"] = 1e-6
data = data.astype({"amount": np.float32, "amount_usd": np.float32})
# Fix the column set and order used by the following cells.
columns = [
    "source", "target", "timestamp", "num_transactions", 
    "amount", "amount_usd", "is_zero_transaction",
]
data = data.loc[:, columns]

CPU times: user 19.2 s, sys: 213 ms, total: 19.4 s
Wall time: 19.4 s


In [8]:
%%time

num_unique = data["source"].nunique()
source_dispensation = []
for index, (_, group) in enumerate(data[["source", "amount_usd"]].groupby("source")):
    group.loc[:, "source_dispensation"] = group["amount_usd"].cumsum()
    source_dispensation.append(group)
    if not (index % 200_000):
        print(index, num_unique)
source_dispensation = pd.concat(source_dispensation, ignore_index=False)

0 2113093
200000 2113093
400000 2113093
600000 2113093
800000 2113093
1000000 2113093
1200000 2113093
1400000 2113093
1600000 2113093
1800000 2113093
2000000 2113093
CPU times: user 5min 21s, sys: 11.6 s, total: 5min 32s
Wall time: 5min 27s


In [9]:
%%time

num_unique = data["target"].nunique()
target_accumulation = []
for index, (_, group) in enumerate(data[["target", "amount_usd"]].groupby("target")):
    group.loc[:, "target_accumulation"] = group["amount_usd"].cumsum()
    target_accumulation.append(group)
    if not (index % 200_000):
        print(index, num_unique)
target_accumulation = pd.concat(target_accumulation, ignore_index=False)

0 1119024
200000 1119024
400000 1119024
600000 1119024
800000 1119024
1000000 1119024
CPU times: user 2min 41s, sys: 4.88 s, total: 2min 46s
Wall time: 2min 43s


In [10]:
# Attach both running totals to the transactions (aligned on transaction_id)
# and bring the rows back into transaction_id order.
running_out = source_dispensation[["source_dispensation"]]
running_in = target_accumulation[["target_accumulation"]]
data = running_out.join(running_in).join(data).sort_index()

In [11]:
%%time

dispensation_mapping = {}
for source, group in data[["source", "source_dispensation"]].groupby("source"):
    dispensation_mapping[source] = (group.index.tolist(), group["source_dispensation"].tolist())

accumulation_mapping = {}
for target, group in data[["target", "target_accumulation"]].groupby("target"):
    accumulation_mapping[target] = (group.index.tolist(), group["target_accumulation"].tolist())

CPU times: user 44.1 s, sys: 4.59 s, total: 48.7 s
Wall time: 44.6 s


In [12]:
def get_dis_acc_data(node, mapping_dis, mapping_acc, trx_id):
    """Return (dispensed, accumulated) running totals of `node` up to `trx_id`.

    Each mapping value is a pair (sorted transaction ids, running totals).
    For each side the total belonging to the last transaction id that is
    <= `trx_id` is returned, or 0 when the node has no entry on that side or
    no transaction up to `trx_id`. Raises KeyError when the node is missing
    from both mappings.
    """

    def _total_so_far(history):
        # No history on this side, or nothing recorded up to trx_id yet.
        if history is None:
            return 0
        trx_ids, totals = history[0], history[1]
        position = bisect.bisect_right(trx_ids, trx_id)
        return totals[position - 1] if position else 0

    dispensed_history = mapping_dis.get(node)
    if dispensed_history is None:
        # A plain lookup on purpose: unknown nodes must surface as KeyError.
        accumulated_history = mapping_acc[node]
    else:
        accumulated_history = mapping_acc.get(node)
    return _total_so_far(dispensed_history), _total_so_far(accumulated_history)

In [13]:
def source_dis_acc_data(row):
    """Running (dispensed, accumulated) totals of the row's sender at this row."""
    return get_dis_acc_data(
        node=row["source"],
        mapping_dis=dispensation_mapping,
        mapping_acc=accumulation_mapping,
        trx_id=row.name,  # the row label is the transaction id
    )


def target_dis_acc_data(row):
    """Running (dispensed, accumulated) totals of the row's receiver at this row."""
    return get_dis_acc_data(
        node=row["target"],
        mapping_dis=dispensation_mapping,
        mapping_acc=accumulation_mapping,
        trx_id=row.name,  # the row label is the transaction id
    )

In [14]:
%%time

data.loc[:, "dis_acc_source"] = data.apply(source_dis_acc_data, axis=1)
data.loc[:, "dis_acc_target"] = data.apply(target_dis_acc_data, axis=1)

CPU times: user 35.6 s, sys: 485 ms, total: 36.1 s
Wall time: 36.1 s


In [15]:
# Every dis_acc_<side> cell is a (dispensed, accumulated) pair. The same five
# derived columns are produced for the sender and then for the receiver.
_pair_features = (
    # True when more has been paid out than taken in.
    ("more_dispensed", lambda pair: pair[0] > pair[1]),
    # dispensed / accumulated, capped at 1 when dispensed is not smaller.
    ("dis_acc_ratio", lambda pair: pair[0] / (pair[1] or 1) if pair[0] < pair[1] else 1),
    # accumulated / dispensed, capped at 1 when accumulated is not smaller.
    ("acc_dis_ratio", lambda pair: pair[1] / (pair[0] or 1) if pair[1] < pair[0] else 1),
    # Surplus of accumulated over dispensed, otherwise 0.
    ("positive_balance", lambda pair: pair[1] - pair[0] if pair[1] > pair[0] else 0),
    # Surplus of dispensed over accumulated, otherwise 0.
    ("negative_balance", lambda pair: pair[0] - pair[1] if pair[0] > pair[1] else 0),
)
for _side in ("source", "target"):
    _pairs = data.loc[:, f"dis_acc_{_side}"]
    for _suffix, _derive in _pair_features:
        data.loc[:, f"{_side}_{_suffix}"] = _pairs.apply(_derive)

In [16]:
# The tuple-valued helper columns are no longer needed once the derived
# balance features exist.
data.drop(columns=["dis_acc_source", "dis_acc_target"], inplace=True)

In [17]:
%%time

# NOTE: rebinds active_since from a DataFrame to a plain dict
# (account -> first-seen timestamp). Earlier cells that treat it as a
# DataFrame cannot be re-run after this cell without recomputing it.
active_since = active_since["active_since"].to_dict()
# Observation window padded by one hour on both ends.
# first_trx_ts is not used by the cells visible in this notebook.
last_trx_ts = data["timestamp"].max() + timedelta(hours=1)
first_trx_ts = data["timestamp"].min() - timedelta(hours=1)
# Seconds from each account's first activity to the (padded) end of the data.
active_for = {k : (last_trx_ts - v).total_seconds() for k, v in active_since.items()}

# Seconds between the sender's first activity and this transaction.
data.loc[:, "source_active_for"] = data.apply(
    lambda x: (x["timestamp"] - active_since[x["source"]]).total_seconds(), axis=1
)
# Seconds between the receiver's first activity and this transaction.
data.loc[:, "target_active_for"] = data.apply(
    lambda x: (x["timestamp"] - active_since[x["target"]]).total_seconds(), axis=1
)

CPU times: user 49.1 s, sys: 884 ms, total: 50 s
Wall time: 50 s


In [18]:
# The index must still be the contiguous transaction id range.
assert data.index.tolist() == list(range(data.shape[0]))


def _both_ends_in(accounts):
    """Mask of transactions whose sender and receiver both belong to `accounts`."""
    return data["source"].isin(accounts) & data["target"].isin(accounts)


train = data.loc[_both_ends_in(train_accounts), :]
validation = data.loc[_both_ends_in(validation_accounts), :]
train_validation = data.loc[_both_ends_in(train_accounts | validation_accounts), :]
test = data.loc[_both_ends_in(test_accounts), :]
# Share of all transactions that lies entirely inside each split.
print(
    round(train.shape[0] / data.shape[0], 2),
    round(validation.shape[0] / data.shape[0], 2),
    round(test.shape[0] / data.shape[0], 2)
)

# No transaction may belong to two splits.
assert set(train.index).isdisjoint(validation.index)
assert set(validation.index).isdisjoint(test.index)
assert set(train.index).isdisjoint(test.index)

0.67 0.06 0.04


In [19]:
# Accounts flagged as phishing, then their count per split and overall.
_flagged_accounts = {account for account, flag in phishing_nodes.items() if flag == 1}
print(
    len(train_accounts & _flagged_accounts),
    len(validation_accounts & _flagged_accounts),
    len(test_accounts & _flagged_accounts),
    len(_flagged_accounts),
)

502 354 309 1165


In [20]:
def get_trx_features(df, source_target):
    """Aggregate transaction statistics per account.

    `df` is grouped by the column named in `source_target` ("source" or
    "target"). The result has one row per account (index named "key") and
    one column per (input column, statistic) pair, named
    "trx_feats_<source_target>_<column>_<statistic>".
    """
    ratio_columns = [
        "source_dis_acc_ratio", "source_acc_dis_ratio",
        "target_dis_acc_ratio", "target_acc_dis_ratio",
    ]
    balance_columns = [
        "source_positive_balance", "source_negative_balance",
        "target_positive_balance", "target_negative_balance",
    ]
    activity_columns = ["source_active_for", "target_active_for"]

    # Insertion order of this dict fixes the order of the output columns.
    statistics = {
        "source_dispensation": ["max"],
        "target_accumulation": ["max"],
        "amount_usd": ["mean", "median", "max", "std"],
        "num_transactions": ["sum", "count"],
        "is_zero_transaction": ["sum"],
    }
    for column in ratio_columns:
        statistics[column] = ["mean", "std"]
    for column in balance_columns:
        statistics[column] = ["max", "mean", "std"]
    for column in activity_columns:
        statistics[column] = ["max", "std"]

    aggregated = df.groupby(source_target).agg(statistics)
    # Flatten the (column, statistic) header into single prefixed names.
    flat_names = []
    for column, statistic in aggregated.columns:
        flat_names.append(f"trx_feats_{source_target}_{column}_{statistic}")
    aggregated.columns = flat_names
    aggregated.index.name = "key"
    return aggregated

In [21]:
%%time

train_trx_features = get_trx_features(train, "source").join(
    get_trx_features(train, "target"), how="outer"
)
validation_trx_features = get_trx_features(train_validation, "source").join(
    get_trx_features(train_validation, "target"), how="outer"
)
test_trx_features = get_trx_features(data, "source").join(
    get_trx_features(data, "target"), how="outer"
)

CPU times: user 13.6 s, sys: 1.33 s, total: 14.9 s
Wall time: 14.9 s


In [22]:
# Wall-clock reference; the elapsed minutes are printed after the three
# feature-generation cells have run.
start = time.time()

In [23]:
# Root folder for generated features; each split gets its own sub-folder.
location_main_features = "features"

# Joining with a trailing empty component keeps the trailing separator, so
# file names can be appended directly to these prefixes.
location_train = os.path.join(location_main_features, "train", "")
location_validation = os.path.join(location_main_features, "validation", "")
location_test = os.path.join(location_main_features, "test", "")

In [24]:
# shutil.rmtree(location_main_features, ignore_errors=True)

In [25]:
%%time

try:
    os.makedirs(location_train)
except FileExistsError:
    pass

in_scope_window = train.copy(deep=True)
in_scope_nodes = list(set(train["source"].unique()).union(train["target"].unique()))
%run model_experiment_nested.ipynb
features_all = features_all.join(train_trx_features, how="left")
features_all.to_parquet(f"{location_train}data.parquet")

5 74 9079121
0 194
20 194
40 194
60 194
80 194
100 194
120 194
140 194
160 194
180 194
CPU times: user 7min 17s, sys: 26.4 s, total: 7min 43s
Wall time: 59min 14s


In [26]:
%%time

try:
    os.makedirs(location_validation)
except FileExistsError:
    pass

in_scope_window = train_validation.copy(deep=True)
in_scope_nodes = list(set(validation["source"].unique()).union(validation["target"].unique()))
%run model_experiment_nested.ipynb
features_all = features_all.join(validation_trx_features, how="left")
features_all.to_parquet(f"{location_validation}data.parquet")

4 65 871854
0 21
20 21
CPU times: user 4min 52s, sys: 13.5 s, total: 5min 6s
Wall time: 25min 51s


In [27]:
%%time

try:
    os.makedirs(location_test)
except FileExistsError:
    pass

in_scope_window = data.copy(deep=True)
in_scope_nodes = list(set(test["source"].unique()).union(test["target"].unique()))
%run model_experiment_nested.ipynb
features_all = features_all.join(test_trx_features, how="left")
features_all.to_parquet(f"{location_test}data.parquet")

4 81 791044
0 19
CPU times: user 5min 42s, sys: 15.9 s, total: 5min 58s
Wall time: 30min 11s


In [28]:
# Whole minutes elapsed since `start` was set before feature generation.
print((time.time() - start) // 60)

115.0


In [29]:
def f1_eval_multiclass(y, y_):
    """Summed F1 error (1 - F1) of classes 1 and 2, each scored one-vs-rest.

    `y` holds the true class labels and `y_` the per-class scores; the
    predicted label is the argmax over axis 1 of `y_`.
    """

    def _isolate(labels, wanted):
        # Work on a copy so the caller's array is never modified.
        marked = labels.copy()
        if wanted == 1:
            # Class 2 counts as negative when scoring class 1.
            marked[marked == 2] = 0
        else:
            # Class 1 counts as negative; class 2 becomes the positive label.
            marked[marked == 1] = 0
            marked[marked == 2] = 1
        return marked

    predicted = y_.argmax(axis=1)
    total_error = 0
    for wanted in (1, 2):
        total_error += 1 - f1_score(_isolate(y, wanted), _isolate(predicted, wanted))
    return total_error


def f1_eval(y, y_):
    """F1 error (1 - F1) of scores `y_` rounded to hard labels against `y`."""
    hard_labels = np.round(y_)
    score = f1_score(y, hard_labels)
    return 1 - score

In [30]:
def load_and_merge_features(features_normal, features_graph, feat_for):
    """Join per-account graph features onto transaction rows for both endpoints.

    `features_normal` is modified in place: it gains a "transaction_id"
    column (copied from its index) and an "is_phishing" column looked up in
    the global `phishing_nodes` for the account in column `feat_for`.
    `features_graph` must have a "key" column holding the account id.

    Returns (result, labels): `result` is indexed by transaction_id and
    carries the source-side join plus the target-side join (overlapping
    column names get a "_target" suffix); `labels` holds the columns listed
    in `label_columns`, which are removed from `result`.

    NOTE(review): `label_columns` is read from the enclosing scope and is not
    defined in the visible part of this notebook — confirm where it comes from.
    """
    features_normal.loc[:, "transaction_id"] = features_normal.index.tolist()
    # Always rebuild the label so a stale column cannot leak through.
    if "is_phishing" in features_normal.columns:
        del features_normal["is_phishing"]
    features_normal.loc[:, "is_phishing"] = features_normal.loc[:, feat_for].apply(lambda x: bool(phishing_nodes[x]))
    # Graph features of the sending account, matched on source == key.
    features_graph_source = features_normal.set_index("source").join(
        features_graph.set_index("key"), how="left", rsuffix="_source"
    ).reset_index()
    # Graph features of the receiving account; only transaction_id is kept
    # from the transaction side so the two halves can be joined afterwards.
    features_graph_target = features_normal[["transaction_id", "target"]].set_index("target").join(
        features_graph.set_index("key"), how="left", rsuffix="_target"
    ).reset_index()
    del features_graph_target["target"]
    # Combine both halves per transaction.
    result = features_graph_source.set_index("transaction_id").join(
        features_graph_target.set_index("transaction_id"), how="left", rsuffix="_target"
    )
    # Split the label columns off into their own frame.
    labels = result.loc[:, label_columns].copy(deep=True)
    for c in label_columns:
        del result[c]
    return result, labels

In [31]:
# Load the persisted train features (missing values become 0), derive the
# boolean label from the account key, and keep key + label in a separate frame.
train_features = pd.read_parquet(f"{location_train}data.parquet").fillna(0)
train_features.loc[:, "is_phishing"] = train_features.loc[:, "key"].map(
    lambda account: bool(phishing_nodes[account])
)
train_features_labels = train_features.loc[:, ["key", "is_phishing"]]
# The model input must contain neither the identifier nor the label.
train_features.drop(columns=["key", "is_phishing"], inplace=True)

In [50]:
%%time

all_test_nodes = set(test["source"].unique()).union(test["target"].unique())
phishers_data = pd.DataFrame(all_test_nodes, columns=["node"])
phishers_data.loc[:, "is_phisher"] = False
phishers_data.loc[:, "is_phisher"] = phishers_data["node"].apply(lambda x: bool(phishing_nodes[x]))
phishers_data.loc[:, "is_phisher_predicted"] = False

train_features = pd.read_parquet(f"{location_train}data.parquet").fillna(0)
train_features.loc[:, "is_phishing"] = train_features.loc[:, "key"].apply(lambda x: bool(phishing_nodes[x]))
train_features_labels = train_features.loc[:, ["key", "is_phishing"]]
del train_features["key"]
del train_features["is_phishing"]

validation_features = pd.read_parquet(f"{location_validation}data.parquet").fillna(0)
validation_features.loc[:, "is_phishing"] = validation_features.loc[:, "key"].apply(lambda x: bool(phishing_nodes[x]))
validation_features_labels = validation_features.loc[:, ["key", "is_phishing"]]
del validation_features["key"]
del validation_features["is_phishing"]

test_features = pd.read_parquet(f"{location_test}data.parquet").fillna(0)
test_features.loc[:, "is_phishing"] = test_features.loc[:, "key"].apply(lambda x: bool(phishing_nodes[x]))
test_features_labels = test_features.loc[:, ["key", "is_phishing"]]
del test_features["key"]
del test_features["is_phishing"]

# scale_pos_weight = int(train_features_labels.shape[0] / (train_features_labels["is_phishing"].sum() or 1))
scale_pos_weight = 10

# model = xgb.XGBClassifier(
#     early_stopping_rounds=100,
#     objective="multi:softprob", num_class=3, eval_metric=f1_eval, disable_default_eval_metric=True
# )
# model.fit(
#     train_features, train_features_labels["is_phishing"].values,
#     eval_set=[
#         # (train_features, train_features_labels["is_laundering"].values), 
#         (validation_features, validation_features_labels["is_phishing"].values)
#     ]
# )

model = xgb.XGBClassifier(
    early_stopping_rounds=20, scale_pos_weight=scale_pos_weight,
    eval_metric=f1_eval, disable_default_eval_metric=True, num_parallel_tree=10,
    colsample_bytree=0.5, subsample=0.5, max_depth=6,
)
model.fit(
    train_features, train_features_labels["is_phishing"].values, verbose=False,
    eval_set=[
        # (train_features, train_features_labels["is_laundering"].values), 
        (validation_features, validation_features_labels["is_phishing"].values)
    ]
)
print(f"Best iteration: {model.best_iteration}")

y_test_predicted = model.predict(test_features)
print("F1", round(f1_score(test_features_labels["is_phishing"], y_test_predicted), 8))
print("Recall", round(recall_score(test_features_labels["is_phishing"], y_test_predicted), 8))

Best iteration: 14
F1 0.62567812
Recall 0.60278746
CPU times: user 2min 21s, sys: 2.48 s, total: 2min 24s
Wall time: 19.5 s


In [35]:
# Actual positives among the test feature rows vs. number of predicted positives.
test_features_labels["is_phishing"].sum(), y_test_predicted.sum()

(287, 227)

In [36]:
# 309 = phishing accounts in the test split (third value printed by In [19]);
# 287 = positives that actually have a test feature row (In [35]).
# The difference is the number of test phishers without any feature row.
309 - 287

22

In [37]:
k = list(test_features_labels["is_phishing"]) + ([1]*22)

In [38]:
l = list(y_test_predicted) + ([0] * 22)

In [39]:
f1_score(k, l)

0.582089552238806