# Classify Ethereum Exchange Addresses
Each wallet on the Ethereum (ETH) blockchain is identified by a unique 42-character hexadecimal address (e.g. 0xDe12C3d2257fc9bB1c1A00d409f292eecD55fFaF).

In [1]:
import re
import os

from tqdm import tqdm
import pandas as pd

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# Single seed reused for the train/validation split and for the CatBoost model.
random_seed: int = 2024

In [None]:
# Relative path of the directory that holds the CSV inputs used below.
data_dir = '../data/ethereum-exchanges'
# Show the directory contents as the cell output.
os.listdir(data_dir)

In [4]:
# File paths of the train / test address tables inside data_dir.
train_fp, test_fp = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('train_data.csv', 'test_data.csv')
)

## Preprocess Data

In [None]:
# Read the token-transfer table; the file is decoded as windows-1252.
df_trans = pd.read_csv(
    filepath_or_buffer=os.path.join(data_dir, "token_transfers_full.csv"),
    encoding="windows-1252",
)
# Report the table size, then preview the first rows as the cell output.
print(df_trans.shape)
df_trans.head()

In [None]:
# Each token has a unique token contract address.
# The data contains 28 unique token contract addresses.

# Count the transfer rows recorded for each token contract address.
df_trans['token_address'].value_counts()

In [7]:
# Token details obtained from the ETH blockchain using web3.py and Infura.
#
# Maps a lower-case token contract address to (name, symbol, decimals).
# `decimals` is the number of base-10 digits the raw integer transfer value
# must be shifted by to obtain the human-readable token amount.
#
# NOTE(review): two different contracts carry the symbol "DAI"
# ("Dai Stablecoin v1.0" and "Dai Stablecoin"), so any grouping by symbol
# merges them - confirm that this is intended.

token_details_mapper = {
    "0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac": ("StorjToken", "STORJ", 8),
    "0xa15c7ebe1f07caf6bff097d8a589fb8ac49ae5b3": ("Pundi X Token", "NPXS", 18),
    "0x1f573d6fb3f13d689ff844b4ce37794d79a7ff1c": ("Bancor Network Token", "BNT", 18),
    "0xb8c77482e45f1f44de1745f52c74426c631bdd52": ("BNB", "BNB", 18),
    "0xe41d2489571d322189246dafa5ebde1f4699f498": ("0x Protocol Token", "ZRX", 18),
    "0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2": ("Wrapped Ether", "WETH", 18),
    "0x514910771af9ca656af840dff83e8264ecf986ca": ("ChainLink Token", "LINK", 18),
    "0xdd974d5c2e2928dea5f71b9825b8b646686bd200": ("Kyber Network Crystal", "KNC", 18),
    "0xd26114cd6ee289accf82350c8d8487fedb8a0c07": ("OMGToken", "OMG", 18),
    "0x89d24a6b4ccb1b6faa2625fe562bdd9a23260359": ("Dai Stablecoin v1.0", "DAI", 18),
    "0x0f5d2fb29fb7d3cfee444a200298f468908cc942": ("Decentraland MANA", "MANA", 18),
    "0x8e1b448ec7adfc7fa35fc2e885678bd323176e34": ("Egretia", "EGT", 18),
    # USDC uses 6 decimals, not the common ERC-20 default of 18.
    "0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48": ("USD//C", "USDC", 6),
    "0xf629cbd94d3791c9250152bd8dfbdf380e2a3b9c": ("Enjin Coin", "ENJ", 18),
    "0x9f8f72aa9304c8b593d555f12ef6589cc3a579a2": ("Maker", "MKR", 18),
    "0x0d8775f648430679a709e98d2b0cb6250d2887ef": ("Basic Attention Token", "BAT", 18),
    "0x8e870d67f660d95d5be530380d0ec0bd388289e1": ("Paxos Token", "PAX", 18),
    "0x6f259637dcd74c767781e37bc6133cd6a68aa161": ("HuobiToken", "HT", 18),
    "0x4dc3643dbc642b72c158e7f3d2ff232df61cb6ce": ("Amber Token", "AMB", 18),
    "0x8971f9fd7196e5cee2c1032b50f656855af7dd26": ("Lambda", "LAMB", 18),
    "0x0000000000085d4780b73119b644ae5ecd22b376": ("TrueUSD", "TUSD", 18),
    "0x77fe30b2cf39245267c0a5084b66a560f1cf9e1f": ("Azbit", "AZ", 18),
    "0x174bfa6600bf90c885c7c01c7031389ed1461ab9": ("More Gold Coin", "MGC", 18),
    "0x8e766f57f7d16ca50b4a0b90b88f6468a09b0439": ("Maximine Coin", "MXM", 18),
    "0xc12d1c73ee7dc3615ba4e37e4abfdbddfa38907e": ("KickToken", "KICK", 8),
    "0xbddab785b306bcd9fb056da189615cc8ece1d823": ("Ebakus", "EBK", 18),
    "0x6b175474e89094c44da98b954eedeac495271d0f": ("Dai Stablecoin", "DAI", 18),
    "0x2b591e99afe9f32eaa6214f7b7629768c40eeb39": ("HEX", "HEX", 8),
}

In [8]:
# Attach the token symbol and decimals to every transfer and convert the raw
# integer `value` into a human-readable token amount (`token_value`).

# Fail loudly (as a plain dict lookup would) if a transfer references a
# contract that is missing from token_details_mapper; Series.map alone would
# silently produce NaN for such rows.
unknown_addresses = set(df_trans["token_address"].unique()) - set(token_details_mapper)
if unknown_addresses:
    raise KeyError(
        "token_address values missing from token_details_mapper: "
        f"{sorted(map(str, unknown_addresses))}"
    )

df_trans["token_symbol"] = df_trans["token_address"].map(
    {address: details[1] for address, details in token_details_mapper.items()}
)
df_trans["token_decimal"] = (
    df_trans["token_address"]
    .map({address: details[-1] for address, details in token_details_mapper.items()})
    .astype(float)
)

# Cast to float
df_trans["value"] = df_trans["value"].astype(float)

# Calculate adjusted token value with one vectorised expression instead of a
# row-wise apply. Dividing by 10**decimals avoids the rounding error of the
# inexact float 10**-decimals, and also works on an empty frame.
df_trans["token_value"] = df_trans["value"] / 10.0 ** df_trans["token_decimal"]

## Generate Features

In [None]:
# Count the transfer rows recorded for each token symbol.
df_trans['token_symbol'].value_counts()

In [10]:
# Token symbols kept for feature generation (27 symbols, original order).
top_n_token_lst = [
    "STORJ", "NPXS", "BNT", "BNB", "ZRX", "WETH", "LINK", "KNC", "OMG",
    "DAI", "MANA", "EGT", "USDC", "ENJ", "MKR", "BAT", "PAX", "HT",
    "AMB", "LAMB", "TUSD", "AZ", "MGC", "MXM", "KICK", "EBK", "HEX",
]

In [None]:
# Keep only transfers whose token symbol is listed in top_n_token_lst and
# renumber the remaining rows from 0.
df_trans = (
    df_trans[df_trans["token_symbol"].isin(top_n_token_lst)]
    .reset_index(drop=True)
)

# Report the filtered size, then preview the first rows as the cell output.
print(df_trans.shape)
df_trans.head()

In [None]:
%%time
# Generate address-level features from transfers grouped by `to_address`.
#
# Step 1: one row per (to_address, token_symbol) with summary statistics.
# Step 2: pivot so each address is a single row with one column per
#         (statistic, token) pair; pairs never observed stay NaN.
# NOTE(review): rows are grouped by `to_address`, i.e. the address on the
# receiving side of the transfer, yet the features are prefixed `f_out_` -
# confirm the intended direction of the naming.
df_trans_out_agg = (
    df_trans.groupby(["to_address", "token_symbol"])
    .agg(
        f_out_total_token_value=pd.NamedAgg(column="token_value", aggfunc="sum"),
        f_out_mean_token_value=pd.NamedAgg(column="token_value", aggfunc="mean"),
        f_out_median_token_value=pd.NamedAgg(column="token_value", aggfunc="median"),
        f_out_min_token_value=pd.NamedAgg(column="token_value", aggfunc="min"),
        f_out_max_token_value=pd.NamedAgg(column="token_value", aggfunc="max"),
        # Non-null and distinct counterpart (from_address) counts
        f_out_address_count=pd.NamedAgg(column="from_address", aggfunc="count"),
        f_out_address_unique_count=pd.NamedAgg(column="from_address", aggfunc="nunique"),
        # Span between the last and first block in which a transfer occurred
        f_out_block_number_diff=pd.NamedAgg(
            column="block_number",
            aggfunc=lambda blocks: blocks.max() - blocks.min(),
        ),
    )
    .reset_index()
    .pipe(
        lambda per_token: per_token.pivot_table(
            index="to_address",
            columns="token_symbol",
            # Every statistic column, in the order it was declared above.
            values=[
                name for name in per_token.columns if name.startswith("f_out_")
            ],
        )
    )
)

# Flatten the (statistic, token) column pairs into "<statistic>_<token>".
df_trans_out_agg.columns = [
    f"{feature}_{symbol}" for feature, symbol in df_trans_out_agg.columns
]
# Expose the grouping key as a regular column named "address".
df_trans_out_agg = df_trans_out_agg.reset_index().rename(
    columns={"to_address": "address"}
)
df_trans_out_agg.head(3)

In [None]:
%%time
# Generate address-level features from transfers grouped by `from_address`.
#
# Step 1: one row per (from_address, token_symbol) with summary statistics.
# Step 2: pivot so each address is a single row with one column per
#         (statistic, token) pair; pairs never observed stay NaN.
# NOTE(review): rows are grouped by `from_address`, i.e. the address on the
# sending side of the transfer, yet the features are prefixed `f_in_` -
# confirm the intended direction of the naming.
df_trans_in_agg = (
    df_trans.groupby(["from_address", "token_symbol"])
    .agg(
        f_in_total_token_value=("token_value", "sum"),
        f_in_mean_token_value=("token_value", "mean"),
        f_in_median_token_value=("token_value", "median"),
        # Was misspelled `f_in_mf_in_token_value` (search-and-replace damage).
        f_in_min_token_value=("token_value", "min"),
        f_in_max_token_value=("token_value", "max"),
        # Non-null and distinct counterpart (to_address) counts
        f_in_address_count=("to_address", "count"),
        f_in_address_unique_count=("to_address", "nunique"),
        # Span between the last and first block in which a transfer occurred
        f_in_block_number_diff=("block_number", lambda x: x.max() - x.min()),
    )
    .reset_index()
)


df_trans_in_agg = df_trans_in_agg.pivot_table(
    index="from_address",
    columns="token_symbol",
    values=[
        "f_in_total_token_value",
        "f_in_mean_token_value",
        "f_in_median_token_value",
        "f_in_min_token_value",
        "f_in_max_token_value",
        "f_in_address_count",
        "f_in_address_unique_count",
        "f_in_block_number_diff",
    ],
)

# Flatten the (statistic, token) column pairs into "<statistic>_<token>".
df_trans_in_agg.columns = [f'{stat}_{token}' for stat, token in df_trans_in_agg.columns]
# Expose the grouping key as a regular column named "address".
df_trans_in_agg = df_trans_in_agg.reset_index()
df_trans_in_agg.rename(columns={'from_address': 'address'}, inplace=True)
df_trans_in_agg.head(3)

In [None]:
# Load the training addresses and left-join both feature tables on "address";
# addresses without matching transfers keep NaN in the feature columns.
df_train = (
    pd.read_csv(train_fp)
    .merge(df_trans_in_agg, how="left", on="address")
    .merge(df_trans_out_agg, how="left", on="address")
)
df_train.head()

In [None]:
# Load the test addresses and left-join both feature tables on "address";
# addresses without matching transfers keep NaN in the feature columns.
df_test = (
    pd.read_csv(test_fp)
    .merge(df_trans_in_agg, how="left", on="address")
    .merge(df_trans_out_agg, how="left", on="address")
)
# Report the joined size, then preview the first rows as the cell output.
print(df_test.shape)
df_test.head()

In [None]:
# Select the feature columns: every df_train column whose name contains "f_",
# minus anything listed in drop_cols.
#
# The columns are kept in df_train order. Round-tripping through a set (as
# before) makes the order depend on per-process string-hash randomisation,
# so the feature order fed to the model differed between kernel restarts.
# NOTE(review): "f_" is matched anywhere in the name, not only as a prefix -
# confirm no non-feature column contains "f_".
drop_cols = []
f_cols = [col for col in df_train.columns if "f_" in col and col not in drop_cols]
print(f_cols)

# Name of the target column.
lbl_col = "lbl"

In [17]:
# Feature columns actually present in df_train, kept in df_train column order.
# A set intersection (as before) yields an order that changes between Python
# processes, which makes the trained model non-reproducible.
final_f_cols = [col for col in df_train.columns if col in f_cols]
x_train = df_train[final_f_cols]
y_train = df_train[lbl_col]

In [18]:
# Test features (same columns as training) and the test labels.
x_test, y_test = df_test[final_f_cols], df_test[lbl_col]

In [None]:
# Sanity check: train and test must expose the same feature columns in the
# same order. Index.equals returns False when the two differ in length,
# whereas an elementwise `==` raises ValueError in that case.
all_columns_match = x_test.columns.equals(x_train.columns)
print(f"all_columns_match: {all_columns_match}")

In [None]:
# Hold out 20% of the training rows as a validation set, seeded with
# random_seed.
# NOTE(review): the split is not stratified on the label - confirm this is
# acceptable for the class balance of the data.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    random_state=random_seed,
    test_size=0.2,
)
# Report the shapes of both partitions, one per line.
print(
    f"train: {x_train.shape}, {y_train.shape}",
    f"val: {x_val.shape}, {y_val.shape}",
    sep="\n",
)

## Modelling

In [None]:
# Focus on feature engineering, not on model fine-tuning
#
# NOTE(review): the CatBoost docs describe `subsample` as an option of the
# Poisson/Bernoulli/MVS bootstrap types and `bagging_temperature` as an option
# of the Bayesian bootstrap - confirm that setting both is intended.
model = CatBoostClassifier(
    learning_rate=0.05,
    iterations=2000,         # Equivalent to n_estimators
    depth=8,                 # Equivalent to max_depth
    subsample=0.8,
    rsm=0.8,                 # Equivalent to feature_fraction (RSM = Random Subspace Method)
    bagging_temperature=1.0,
    random_seed=random_seed,
    verbose=0,               # Silence the training output
    # early_stopping_rounds is deliberately not set: all 2000 iterations run.
)

model.fit(
    x_train,
    y_train,
    eval_set=(x_val, y_val),  # Validation set scored during training
    use_best_model=True,      # Keep the iteration that scores best on eval_set
    # Was verbose=True, which contradicted its own "Suppress logs" comment
    # and the constructor's verbose=0.
    verbose=False,
)


In [None]:
# Score the test set; column 0 / 1 of predict_proba are stored as the
# negative / positive class probabilities.
y_pred_proba = model.predict_proba(x_test)

# One row per test address with both probabilities, the true label and the
# predicted label (1 when the positive probability is strictly larger).
df_result = pd.DataFrame(
    {
        "address": df_test["address"].tolist(),
        "pred_pos": y_pred_proba[:, 1],
        "pred_neg": y_pred_proba[:, 0],
        "lbl": y_test,
    }
).assign(
    pred=lambda scored: (scored["pred_pos"] > scored["pred_neg"]).astype("int64")
)
df_result

In [None]:
# Evaluate the hard predictions against the true test labels.
prec = precision_score(y_true=df_result['lbl'], y_pred=df_result['pred'])
recall = recall_score(y_true=df_result['lbl'], y_pred=df_result['pred'])
f1 = f1_score(y_true=df_result['lbl'], y_pred=df_result['pred'])

# One metric per line.
print(
    f"Precision: {prec}",
    f"Recall: {recall}",
    f"F1: {f1}",
    sep="\n",
)

## Next Steps: 
- Tune the decision threshold on the predicted probability (predictions currently just take the more probable class)
- Reduce sparse features