In [1]:
import pickle
import time
from collections import defaultdict
from datetime import datetime

import networkx as nx
import numpy as np
import pandas as pd

import settings as s

In [2]:
start = time.time()

In [3]:
%%time

with open(s.INPUT_GRAPH_FILE, "rb") as f:
    G = pickle.load(f)

nodes_mapping = {}
phishing_nodes = {}
for idx, nd in enumerate(nx.nodes(G)):
    nodes_mapping[nd] = f"id-{idx}"
    phishing_nodes[nodes_mapping[nd]] = G.nodes[nd]["isp"]

CPU times: user 11 s, sys: 1.5 s, total: 12.5 s
Wall time: 14 s


In [4]:
%%time

rows_orig = []
rows = defaultdict(int)
for count, (src, trg) in enumerate({x for x in nx.edges(G)}):
    count += 1
    source, target = nodes_mapping[src], nodes_mapping[trg]
    for index, attrs in G[src][trg].items():
        amount, timestamp = attrs["amount"], int(attrs["timestamp"])
        rows[(source, target, timestamp, amount)] += 1
        rows_orig.append((source, target, timestamp, amount))
    if not (count % 500_000):
        print(count)

data = pd.DataFrame(
    [list(k) + [v] for k, v in rows.items()], 
    columns=["source", "target", "ts", "amount", "num_transactions"]
)
del rows
data.loc[:, "timestamp"] = data.loc[:, "ts"].apply(datetime.fromtimestamp)
del data["ts"]
data_orig = pd.DataFrame(rows_orig, columns=["source", "target", "ts", "amount"])
del rows_orig
data_orig.loc[:, "timestamp"] = data_orig.loc[:, "ts"].apply(datetime.fromtimestamp)
del data_orig["ts"]

500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
4500000
5000000
CPU times: user 51.3 s, sys: 1.57 s, total: 52.9 s
Wall time: 53.4 s


In [5]:
%%time

rates = pd.read_csv(s.INPUT_RATES_FILE, sep=";")
rates.loc[:, "rate"] = (rates["low"] + rates["high"]) / 2
rates.index = pd.to_datetime(rates["timeOpen"]).dt.date
rates = rates["rate"].to_dict()

data.loc[:, "amount_usd"] = data.apply(lambda x: rates[x["timestamp"].date()] * x["amount"], axis=1)
data.loc[:, "is_zero_transaction"] = data.loc[:, "amount"] == 0

data_orig.loc[:, "amount_usd"] = data_orig.apply(lambda x: rates[x["timestamp"].date()] * x["amount"], axis=1)
data_orig.loc[:, "is_zero_transaction"] = data_orig.loc[:, "amount"] == 0
data_orig.loc[:, "is_phishing"] = data_orig.loc[:, "target"].apply(lambda x: phishing_nodes[x] == 1)

data.loc[data["amount"] < 1e-6, "amount"] = 1e-6
data.loc[data["amount_usd"] < 1e-6, "amount_usd"] = 1e-6
data = data.astype({"amount": np.float32, "amount_usd": np.float32})
columns = [
    "source", "target", "timestamp", "num_transactions", 
    "amount", "amount_usd", "is_zero_transaction",
]
data = data.loc[:, columns]
data.loc[:, "is_phishing"] = data.loc[:, "target"].apply(lambda x: phishing_nodes[x] == 1)
data = data.sort_values("timestamp").reset_index(drop=True)
data.index.name = "transaction_id"

data.to_parquet(s.INPUT_DATA_FILE)
data_orig.to_parquet(s.INPUT_DATA_ORIG_FILE)

CPU times: user 1min 38s, sys: 1.57 s, total: 1min 39s
Wall time: 1min 39s


In [6]:
print((time.time() - start) // 60)

2.0
