# Data Preparation

In [1]:
import os
import random
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
# Notebook-wide configuration.
# Seed shared by the stochastic steps further down (e.g. the train/test split).
random_seed = 2024
# Show full cell contents in rendered frames instead of truncating long values.
pd.set_option("display.max_colwidth", None)

## Load Data
Source: https://chartalist.org/eth/TaskTypePrediction.html

In [3]:
# Directory holding the raw CSVs read below and the train/test CSVs written at the end.
data_dir = "../data/ethereum-exchanges"
# Display the directory contents as the cell output.
os.listdir(data_dir)

['exchangeLabels.csv',
 'token_transfers_full.csv',
 'alphacore_labels_stablecoin.csv',
 'README.md',
 '.ipynb_checkpoints',
 'train_data.csv',
 'eth_trans_graph.pickle',
 'test_data.csv']

In [4]:
# Read the token-transfer table; the file is decoded as windows-1252.
trans_fp = os.path.join(data_dir, "token_transfers_full.csv")
df_trans = pd.read_csv(trans_fp, encoding="windows-1252")

# Report the table size, then preview the first rows.
print(df_trans.shape)
df_trans.head()

(38901039, 7)


Unnamed: 0,token_address,from_address,to_address,value,transaction_hash,log_index,block_number
0,0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac,0x78903c52de3923d203881f1b766160cba5a37f59,0xc20d6d7d2e7cde1cabc7f20c553fe93dada380f3,209000000,0x430d2a02f678d28bb6e441cd383a6ddd02f30c05378d64fb4e34a17cf0162735,29,6525300
1,0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac,0x78903c52de3923d203881f1b766160cba5a37f59,0xc4c49dbf1b4bc997c9a9758c1b1f86d33da4232d,191000000,0x5e138acd30d124a6ff031326fa91e1c4d0a345a2955d4a8be9f753fce73abd09,33,6525300
2,0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac,0x78903c52de3923d203881f1b766160cba5a37f59,0xab194b0c3e3423ec6cbe44fdf096cb43251d1ccd,181000000,0xacccc26bb6ed8e03b7b6b17e1a7ee9a209ad0e3d7d19bac5703f62dd31d6762a,36,6525300
3,0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac,0x78903c52de3923d203881f1b766160cba5a37f59,0xe29b47e4d0cc68b847226c652f2a5e73d1a94343,204000000,0x0600e6d229b403adb01bb6e54bebf0af329e30016427cc959fb91833986d23eb,37,6525300
4,0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac,0x78903c52de3923d203881f1b766160cba5a37f59,0x5d0a1a7c7507142f06c04ffd604ab7b85b442f98,209000000,0xd4527c3f59fb6fa066646ab6aff57de86bb688c93569d47dfbec90d84abc9982,38,6525300


In [5]:
# Read the labelled exchange addresses.
exchange_fp = os.path.join(data_dir, 'exchangeLabels.csv')
df_exchange = pd.read_csv(exchange_fp)

# Report the table size, then preview the first rows.
print(df_exchange.shape)
df_exchange.head()

(296, 3)


Unnamed: 0,type,address,name
0,dex,0x11111254369792b2ca5d084ab5eea397ca8fa48b,1inch.exchange
1,dex,0xe8fff15bb5e14095bfdfa8bb85d83cc900c23c56,AfroDex
2,dex,0x4572f2554421bd64bef1c22c8a81840e8d496bea,AirSwap
3,dex,0xdc1882f350b42ac9a23508996254b1915c78b204,Allbit 1
4,dex,0xff6b1cdfd2d3e37977d7938aa06b6d89d6675e27,Allbit 2


## Generate Train and Test Data

In [6]:
# Hold out 40% of the labelled exchanges for testing; the fixed seed keeps
# the partition stable between runs.
df_ex_train, df_ex_test = train_test_split(
    df_exchange,
    random_state=random_seed,
    test_size=0.4,
)

# Print the shape of each partition (train first, then test).
for df_split in (df_ex_train, df_ex_test):
    print(df_split.shape)

(177, 3)
(119, 3)


In [7]:
# Preprocess exchange data: keep only exchange addresses that appear in the
# transaction data, either as a sender or as a receiver.

# Unique addresses seen on either side of a transfer.
from_add_lst = df_trans["from_address"].tolist()
to_add_lst = df_trans["to_address"].tolist()
all_add_lst = list(set(from_add_lst + to_add_lst))
print(f"Total Addresses: {len(all_add_lst)}")

# Drop exchanges that never occur in the transactions and renumber the rows.
train_in_trans = df_ex_train['address'].isin(all_add_lst)
df_ex_train = df_ex_train.loc[train_in_trans].reset_index(drop=True)

test_in_trans = df_ex_test['address'].isin(all_add_lst)
df_ex_test = df_ex_test.loc[test_in_trans].reset_index(drop=True)

print(f"train: {df_ex_train.shape}")
print(f"test: {df_ex_test.shape}")

Total Addresses: 6083422
train: (162, 3)
test: (108, 3)


In [8]:
# Generate fair evaluation dataset.
# Both splits must contain (i) Exchange Addresses and (ii) Non Exchange Addresses.
# Negatives are drawn from transaction addresses that are not exchanges in either split.

n_neg_samples_train = 1000
n_neg_samples_test = 1000

# Exchange addresses already assigned to a split; these must not be drawn as negatives.
exchange_add_set = set(df_ex_train['address'].tolist() + df_ex_test['address'].tolist())

# Sort the candidate pool before sampling: iteration order of a set of strings
# changes between interpreter processes (string hashing is randomized), so a
# seed alone would not reproduce the draw. key=str keeps the sort well-defined
# even if a non-string value (e.g. a missing address) is present in the pool.
tmp_all_add_lst = sorted(set(all_add_lst) - exchange_add_set, key=str)

# Dedicated generator seeded with the notebook seed: re-running this cell (or
# the whole notebook) yields the same negatives, independent of any other use
# of the global `random` state.
neg_rng = random.Random(random_seed)
tmp_all_add_lst = neg_rng.sample(tmp_all_add_lst, n_neg_samples_train + n_neg_samples_test)

# The first n_neg_samples_train negatives go to train, the remainder to test;
# each split then gets its own exchange (positive) addresses appended.
train_node_lst = tmp_all_add_lst[:n_neg_samples_train] + df_ex_train['address'].tolist()
test_node_lst = tmp_all_add_lst[n_neg_samples_train:] + df_ex_test['address'].tolist()

print(len(train_node_lst))
print(len(test_node_lst))

1162
1108


In [9]:
# Generate training labels: lbl is 1 when the address is listed in the
# exchange table and 0 otherwise.
df_train = pd.DataFrame({'address': train_node_lst})
is_exchange_train = df_train['address'].isin(df_exchange['address'].tolist())
df_train['lbl'] = is_exchange_train.astype('int64')

print(df_train.shape)
df_train.head()

(1162, 2)


Unnamed: 0,address,lbl
0,0xb9ce8fdd626309ab1e642d30dc14ad05a27c9b23,0
1,0xc5858e01e42bb82c0ab3fc9f9523c3d40c4075b9,0
2,0x3af3e15ac95f38dd606eda46f67c1d4f379ed634,0
3,0xa5b606acae97c95cc61b9b955e41aa3431028db1,0
4,0xb4b079ea5537cf9436215a62a409740795da8c47,0


In [10]:
# Generate testing labels: lbl is 1 when the address is listed in the
# exchange table and 0 otherwise.
df_test = pd.DataFrame({'address': test_node_lst})
is_exchange_test = df_test['address'].isin(df_exchange['address'].tolist())
df_test['lbl'] = is_exchange_test.astype('int64')

print(df_test.shape)
df_test.head()

(1108, 2)


Unnamed: 0,address,lbl
0,0x41e8a0e3fad348186bb3bdd3a0d013d040e16d6d,0
1,0x9524a46630aedcb5a4a3a32fba936149bba3217d,0
2,0x7ab88e36dd39d2f13f1eb108540a5b8fed4db5d0,0
3,0xc5dad67d940ffbc6031220920dfee4a28ff3e569,0
4,0xd015f7932ddd3117279f60e88692d2a1712cae84,0


## Save Train and Test Datasets

In [12]:
# Output locations for the two labelled datasets, both inside data_dir.
train_fp, test_fp = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('train_data.csv', 'test_data.csv')
)

In [13]:
# Write the training set to disk without the row index, then confirm the path.
df_train.to_csv(path_or_buf=train_fp, index=False)
print(f"Saved: {train_fp}")

Saved: ../data/ethereum-exchanges/train_data.csv


In [14]:
# Write the test set to disk without the row index, then confirm the path.
df_test.to_csv(path_or_buf=test_fp, index=False)
print(f"Saved: {test_fp}")

Saved: ../data/ethereum-exchanges/test_data.csv
