In [1]:
import networkx as nx
import pandas as pd
import tqdm

from dataset import extract_features

In [2]:
# Load the raw transaction log.
trx = pd.read_csv('data/mettl/transactions.csv')
# Keep only rows where both endpoints are present. A single dropna replaces the
# chained `trx[mask_a][mask_b]` indexing, where the second mask was built on the
# unfiltered frame and had to be re-aligned by pandas (emitting a
# "Boolean Series key will be reindexed" warning). The surviving rows are the same.
trx = trx.dropna(subset=['from_account', 'to_account'])
# Normalise dtypes: account ids and timestamps as strings, amounts as floats.
trx = trx.astype({'from_account': str, 'to_account': str, 'transaction_time_utc': str, 'value': float})

# Account tables; the `account` column of each is kept as a plain array of ids.
test_acc = pd.read_csv('data/mettl/test_accounts.csv')
test_acc_address = test_acc.account.values
train_acc = pd.read_csv('data/mettl/train_accounts.csv')
train_acc_address = train_acc.account.values

In [3]:
# Build a multigraph over accounts from the transaction table: `from_account` and
# `to_account` are the edge endpoints, and edge_attr=True attaches the remaining
# columns of each row to its edge.
G = nx.from_pandas_edgelist(
    trx,
    source='from_account',
    target='to_account',
    edge_attr=True,
    create_using=nx.MultiGraph,
)

In [4]:
# Restrict the train / test account lists to ids that are nodes of the
# transaction graph G (original order preserved).
valid_train = list(filter(G.has_node, train_acc_address))
valid_test = list(filter(G.has_node, test_acc_address))

In [5]:
import warnings

# Installs a process-wide "ignore everything" filter: all warnings stay hidden
# from this cell onwards.
warnings.filterwarnings(action='ignore')

# Per-account features for the train accounts first, then the test accounts
# (only accounts known to be in the graph are passed in).
train_features, test_features = (
    extract_features(G, accounts) for accounts in (valid_train, valid_test)
)

Extracting features for account a09863: 100%|███████████████████████████████████████████████████████████████████| 25198/25198 [05:25<00:00, 77.44it/s]
Extracting features for account a03148: 100%|█████████████████████████████████████████████████████████████████████| 6300/6300 [01:13<00:00, 85.73it/s]


In [6]:
# Persist the extracted feature tables; the DataFrame index is not written.
train_features.to_csv(path_or_buf='data/mettl/train_cleaned_features.csv', index=False)
test_features.to_csv(path_or_buf='data/mettl/test_cleaned_features.csv', index=False)

In [7]:
# Join the train features with the train account table on `account`.
train_feat_flag = pd.merge(train_features, train_acc, on='account')
cols = list(train_feat_flag.columns)
# Reorder: first column, then the last column, then everything in between.
# (The last column comes from train_acc -- presumably the label/flag; confirm.)
train_feat_flag = train_feat_flag[[*cols[:1], *cols[-1:], *cols[1:-1]]]
train_feat_flag.to_csv(path_or_buf='data/mettl/train_feat_flag.csv', index=False)

In [8]:
# Join the test features with the test account table on `account`. Unlike the
# train table, the columns are written in merge order (no reordering).
test_feat_flag = pd.merge(test_features, test_acc, on='account')
# NOTE(review): `cols` is reassigned here but not read again in the visible cells.
cols = list(test_feat_flag.columns)
test_feat_flag.to_csv(path_or_buf='data/mettl/test_feat_flag.csv', index=False)

In [9]:
import warnings
warnings.filterwarnings('ignore')
from dataset import aggregate_neighbors_features

# Reload the per-account feature tables written by the earlier cells.
train_feat_flag = pd.read_csv('data/mettl/train_feat_flag.csv')
test_feat_flag = pd.read_csv('data/mettl/test_feat_flag.csv')

# Loop-invariant membership set of train accounts; built once rather than on
# every iteration.
train_acc_set = set(train_acc_address)

# One merged row per account is collected here and concatenated once at the end,
# instead of re-copying the growing frame with pd.concat on every iteration.
train_rows = []
# Iterate over valid_train (accounts that are nodes of G): G[acc] raises KeyError
# for an account that never appears in a transaction.
for acc in tqdm.tqdm(valid_train):
    # Neighbours of `acc` in the graph, restricted to train accounts.
    # NOTE(review): if `acc` has a self-loop, G[acc] contains `acc` itself, so its
    # own row enters the neighbour aggregate -- confirm this is intended.
    neighbors = set(G[acc]).intersection(train_acc_set)
    agg_feat_neighbors = train_feat_flag[train_feat_flag.account.isin(list(neighbors))]
    row = train_feat_flag[train_feat_flag.account == acc]
    # Aggregate the neighbours' rows, then attach the result to the account's own
    # row via the shared `account` key.
    row_agg = aggregate_neighbors_features(df=agg_feat_neighbors)
    row_agg['account'] = acc
    train_rows.append(row.merge(pd.DataFrame([row_agg]), on='account'))

# Stays None when there was nothing to process, as before.
train_all_features = pd.concat(train_rows, axis=0, ignore_index=True) if train_rows else None

100%|██████████| 25198/25198 [04:21<00:00, 96.30it/s] 


In [10]:
# Loop-invariant membership set of train accounts; built once rather than on
# every iteration.
train_acc_set = set(train_acc_address)

# One merged row per account is collected here and concatenated once at the end,
# instead of re-copying the growing frame with pd.concat on every iteration.
test_rows = []
# Iterate over valid_test (accounts that are nodes of G): G[acc] raises KeyError
# for an account that never appears in a transaction.
for acc in tqdm.tqdm(valid_test):
    # Neighbours of the test account, restricted to train accounts; their rows
    # are looked up in the train table.
    neighbors = set(G[acc]).intersection(train_acc_set)
    agg_feat_neighbors = train_feat_flag[train_feat_flag.account.isin(list(neighbors))]
    row = test_feat_flag[test_feat_flag.account == acc]
    # Aggregate the neighbours' rows, then attach the result to the account's own
    # row via the shared `account` key.
    row_agg = aggregate_neighbors_features(df=agg_feat_neighbors)
    row_agg['account'] = acc
    test_rows.append(row.merge(pd.DataFrame([row_agg]), on='account'))

# Stays None when there was nothing to process, as before.
test_all_features = pd.concat(test_rows, axis=0, ignore_index=True) if test_rows else None

100%|██████████| 6300/6300 [00:47<00:00, 132.77it/s]


In [11]:
# Write the per-account tables enriched with neighbour aggregates; the DataFrame
# index is not written.
train_all_features.to_csv(path_or_buf='data/mettl/train_agg.csv', index=False)
test_all_features.to_csv(path_or_buf='data/mettl/test_agg.csv', index=False)