In [1]:
import bisect
import os
import pickle
import time
from collections import defaultdict
from datetime import datetime, timedelta

import networkx as nx
import numpy as np
import pandas as pd

import settings as s

In [2]:
start = time.time()

In [3]:
%%time

with open(s.INPUT_GRAPH_FILE, "rb") as f:
    G = pickle.load(f)

nodes_mapping = {}
phishing_nodes = {}
for idx, nd in enumerate(nx.nodes(G)):
    nodes_mapping[nd] = f"id-{idx}"
    phishing_nodes[nodes_mapping[nd]] = G.nodes[nd]["isp"]

CPU times: user 12.1 s, sys: 931 ms, total: 13 s
Wall time: 13.1 s


In [4]:
%%time

rows = []
for edge in nx.edges(G):
    source, target = edge
    attrs = G[source][target][0]
    amount, timestamp = attrs["amount"], attrs["timestamp"]
    source, target = nodes_mapping[source], nodes_mapping[target]
    rows.append(
        {
            "source": source,
            "target": target,
            "timestamp": datetime.fromtimestamp(timestamp),
            "amount": amount,
            "num_transactions": 1,
        }
    )
data = pd.DataFrame(rows)
data = data.sort_values("timestamp").reset_index(drop=True)
data.index.name = "transaction_id"
size_orig = data.shape[0]

CPU times: user 19.7 s, sys: 716 ms, total: 20.4 s
Wall time: 20.5 s


In [5]:
%%time

rates = pd.read_csv(s.INPUT_RATES_FILE, sep=";")
rates.loc[:, "rate"] = (rates["low"] + rates["high"]) / 2
rates.index = pd.to_datetime(rates["timeOpen"]).dt.date
rates = rates["rate"].to_dict()

data.loc[:, "amount_usd"] = data.apply(lambda x: rates[x["timestamp"].date()] * x["amount"], axis=1)
data.loc[:, "is_zero_transaction"] = data.loc[:, "amount"] == 0
columns = [
    "source", "target", "timestamp", "num_transactions", 
    "amount", "amount_usd", "is_zero_transaction",
]
data = data.loc[:, columns]
data.loc[:, "is_phishing"] = data.loc[:, "target"].apply(lambda x: phishing_nodes[x] == 1)

CPU times: user 42.1 s, sys: 601 ms, total: 42.7 s
Wall time: 42.7 s


In [6]:
location_main = os.path.abspath(f".{os.sep}data")

location_source_dispensation = os.path.join(location_main, "source_dispensation.parquet")
location_target_accumulation = os.path.join(location_main, "target_accumulation.parquet")

In [7]:
%%time

num_unique = data["source"].nunique()
source_dispensation = []
for index, (_, group) in enumerate(data[["source", "amount_usd"]].groupby("source")):
    group.loc[:, "source_dispensation"] = group["amount_usd"].cumsum()
    source_dispensation.append(group)
    if not (index % 200_000):
        print(index, num_unique)
source_dispensation = pd.concat(source_dispensation, ignore_index=False)
source_dispensation.to_parquet(location_source_dispensation)

0 2113093
200000 2113093
400000 2113093
600000 2113093
800000 2113093
1000000 2113093
1200000 2113093
1400000 2113093
1600000 2113093
1800000 2113093
2000000 2113093
CPU times: user 4min 22s, sys: 9.15 s, total: 4min 31s
Wall time: 4min 32s


In [8]:
source_dispensation = pd.read_parquet(location_source_dispensation)

In [9]:
%%time

num_unique = data["target"].nunique()
target_accumulation = []
for index, (_, group) in enumerate(data[["target", "amount_usd"]].groupby("target")):
    group.loc[:, "target_accumulation"] = group["amount_usd"].cumsum()
    target_accumulation.append(group)
    if not (index % 200_000):
        print(index, num_unique)
target_accumulation = pd.concat(target_accumulation, ignore_index=False)
target_accumulation.to_parquet(location_target_accumulation)

0 1119024
200000 1119024
400000 1119024
600000 1119024
800000 1119024
1000000 1119024
CPU times: user 2min 17s, sys: 8.26 s, total: 2min 26s
Wall time: 2min 27s


In [10]:
target_accumulation = pd.read_parquet(location_target_accumulation)

In [11]:
data = source_dispensation[["source_dispensation"]].join(
    target_accumulation[["target_accumulation"]], how="outer"
).join(data)
data.sort_index(inplace=True)

In [12]:
%%time

dispensation_mapping = {}
for source, group in data[["source", "source_dispensation"]].groupby("source"):
    dispensation_mapping[source] = (group.index.tolist(), group["source_dispensation"].tolist())

accumulation_mapping = {}
for target, group in data[["target", "target_accumulation"]].groupby("target"):
    accumulation_mapping[target] = (group.index.tolist(), group["target_accumulation"].tolist())

CPU times: user 43.1 s, sys: 7.1 s, total: 50.2 s
Wall time: 44.8 s


In [13]:
def get_dis_acc_data(node, mapping_dis, mapping_acc, trx_id):
    data_dis = mapping_dis.get(node)
    if data_dis is None:
        data_acc = mapping_acc[node]
        index_acc = bisect.bisect_right(data_acc[0], trx_id)
        if index_acc:
            index_acc -= 1
        else:
            return 0, 0
        return 0, data_acc[1][index_acc]
    data_acc = mapping_acc.get(node)
    if data_acc is None:
        data_dis = mapping_dis[node]
        index_dis = bisect.bisect_right(data_dis[0], trx_id)
        if index_dis:
            index_dis -= 1
        else:
            return 0, 0
        return data_dis[1][index_dis], 0
    index_dis = bisect.bisect_right(data_dis[0], trx_id)
    index_acc = bisect.bisect_right(data_acc[0], trx_id)
    so_far_dispensed = 0
    if index_dis:
        index_dis -= 1
        so_far_dispensed = data_dis[1][index_dis]
    so_far_accumulated = 0
    if index_acc:
        index_acc -= 1
        so_far_accumulated = data_acc[1][index_acc]
    return so_far_dispensed, so_far_accumulated


def source_dis_acc_data(row):
    return get_dis_acc_data(row["source"], dispensation_mapping, accumulation_mapping, row.name)


def target_dis_acc_data(row):
    return get_dis_acc_data(row["target"], dispensation_mapping, accumulation_mapping, row.name)

In [14]:
%%time

data.loc[:, "dis_acc_source"] = data.apply(source_dis_acc_data, axis=1)
data.loc[:, "dis_acc_target"] = data.apply(target_dis_acc_data, axis=1)

data.loc[:, "source_positive_balance"] = data.loc[:, "dis_acc_source"].apply(
    lambda x: x[1] - x[0] if x[1] > x[0] else 0
)
data.loc[:, "source_negative_balance"] = data.loc[:, "dis_acc_source"].apply(
    lambda x: x[0] - x[1] if x[0] > x[1] else 0
)
data.loc[:, "target_positive_balance"] = data.loc[:, "dis_acc_target"].apply(
    lambda x: x[1] - x[0] if x[1] > x[0] else 0
)
data.loc[:, "target_negative_balance"] = data.loc[:, "dis_acc_target"].apply(
    lambda x: x[0] - x[1] if x[0] > x[1] else 0
)

del data["dis_acc_source"]
del data["dis_acc_target"]

CPU times: user 1min 26s, sys: 6.57 s, total: 1min 33s
Wall time: 1min 35s


In [15]:
%%time

source_firsts = data.groupby("source").agg(first_trx=("timestamp", "min"))
target_firsts = data.groupby("target").agg(first_trx=("timestamp", "min"))
active_since = source_firsts.join(target_firsts, lsuffix="_left", how="outer").fillna(datetime.now())
active_since.loc[:, "active_since"] = active_since.apply(lambda x: min([x["first_trx_left"], x["first_trx"]]), axis=1)
active_since = active_since.loc[:, ["active_since"]]
active_since.sort_values("active_since", inplace=True)

active_since = active_since["active_since"].to_dict()
last_trx_ts = data["timestamp"].max() + timedelta(hours=1)
first_trx_ts = data["timestamp"].min() - timedelta(hours=1)
active_for = {k : (last_trx_ts - v).total_seconds() for k, v in active_since.items()}

data.loc[:, "source_active_for"] = data.apply(
    lambda x: (x["timestamp"] - active_since[x["source"]]).total_seconds(), axis=1
)
data.loc[:, "target_active_for"] = data.apply(
    lambda x: (x["timestamp"] - active_since[x["target"]]).total_seconds(), axis=1
)

CPU times: user 2min 5s, sys: 9.4 s, total: 2min 15s
Wall time: 2min 19s


In [16]:
types = [
    ("source", "str"),
    ("target", "str"),
    ("timestamp", "<M8[ns]"),
    ("amount", np.float32),
    ("amount_usd", np.float32),
    ("num_transactions", np.int64),
    ("is_zero_transaction", bool),
    ("is_phishing", bool),
    ("source_dispensation", np.float32),
    ("target_accumulation", np.float32),
    ("source_positive_balance", np.float32),
    ("source_negative_balance", np.float32),
    ("target_positive_balance", np.float32),
    ("target_negative_balance", np.float32),
    ("source_active_for", np.int64),
    ("target_active_for", np.int64),
]

data.loc[data["amount"] < 1e-6, "amount"] = 1e-6
data.loc[data["amount_usd"] < 1e-6, "amount_usd"] = 1e-6

data = data.loc[:, [x[0] for x in types]].astype(dict(types))

In [17]:
assert data.shape[0] == size_orig

In [18]:
data.to_parquet(s.INPUT_DATA_FILE)

In [19]:
print((time.time() - start) // 60)

13.0
