# Classify Ethereum Exchange Addresses
Each wallet on the Ethereum (ETH) blockchain is identified by a unique 42-character hexadecimal addresses (e.g. 0xDe12C3d2257fc9bB1c1A00d409f292eecD55fFaF). 

In [1]:
import re
import os

from tqdm import tqdm
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
random_seed = 2024

In [3]:
data_dir = '../data/ethereum-exchanges'
os.listdir(data_dir)

['exchangeLabels.csv',
 'token_transfers_full.csv',
 'alphacore_labels_stablecoin.csv',
 'README.md',
 '.ipynb_checkpoints',
 'train_data.csv',
 'eth_trans_graph.pickle',
 'test_data.csv']

In [4]:
train_fp = os.path.join(data_dir, 'train_data.csv')
test_fp = os.path.join(data_dir, 'test_data.csv')

## Preprocess Data

In [5]:
# Load transaction data
df_trans = pd.read_csv(
    os.path.join(data_dir, "token_transfers_full.csv"), encoding="windows-1252"
)
print(df_trans.shape)
df_trans.head()

(38901039, 7)


Unnamed: 0,token_address,from_address,to_address,value,transaction_hash,log_index,block_number
0,0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac,0x78903c52de3923d203881f1b766160cba5a37f59,0xc20d6d7d2e7cde1cabc7f20c553fe93dada380f3,209000000,0x430d2a02f678d28bb6e441cd383a6ddd02f30c05378d...,29,6525300
1,0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac,0x78903c52de3923d203881f1b766160cba5a37f59,0xc4c49dbf1b4bc997c9a9758c1b1f86d33da4232d,191000000,0x5e138acd30d124a6ff031326fa91e1c4d0a345a2955d...,33,6525300
2,0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac,0x78903c52de3923d203881f1b766160cba5a37f59,0xab194b0c3e3423ec6cbe44fdf096cb43251d1ccd,181000000,0xacccc26bb6ed8e03b7b6b17e1a7ee9a209ad0e3d7d19...,36,6525300
3,0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac,0x78903c52de3923d203881f1b766160cba5a37f59,0xe29b47e4d0cc68b847226c652f2a5e73d1a94343,204000000,0x0600e6d229b403adb01bb6e54bebf0af329e30016427...,37,6525300
4,0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac,0x78903c52de3923d203881f1b766160cba5a37f59,0x5d0a1a7c7507142f06c04ffd604ab7b85b442f98,209000000,0xd4527c3f59fb6fa066646ab6aff57de86bb688c93569...,38,6525300


In [6]:
# Each Token has a unique token contract address
# Data contains 28 unique token contract address

df_trans['token_address'].value_counts()

token_address
0x174bfa6600bf90c885c7c01c7031389ed1461ab9    8947221
0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2    3712482
0x89d24a6b4ccb1b6faa2625fe562bdd9a23260359    2845098
0x514910771af9ca656af840dff83e8264ecf986ca    2806558
0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48    2278533
0x6b175474e89094c44da98b954eedeac495271d0f    2047728
0x8e766f57f7d16ca50b4a0b90b88f6468a09b0439    1628182
0x8e870d67f660d95d5be530380d0ec0bd388289e1    1538689
0x1f573d6fb3f13d689ff844b4ce37794d79a7ff1c    1508981
0x0d8775f648430679a709e98d2b0cb6250d2887ef    1447519
0xc12d1c73ee7dc3615ba4e37e4abfdbddfa38907e    1311534
0xa15c7ebe1f07caf6bff097d8a589fb8ac49ae5b3     779563
0xf629cbd94d3791c9250152bd8dfbdf380e2a3b9c     750574
0x2b591e99afe9f32eaa6214f7b7629768c40eeb39     694213
0xe41d2489571d322189246dafa5ebde1f4699f498     682927
0x0000000000085d4780b73119b644ae5ecd22b376     650468
0x9f8f72aa9304c8b593d555f12ef6589cc3a579a2     639482
0xd26114cd6ee289accf82350c8d8487fedb8a0c07     537507
0xb8c77482e45f

In [7]:
# Token details obtained from the ETH blockchain using web3.py and Infura.

token_details_mapper = {
    "0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac": ("StorjToken", "STORJ", 8),
    "0xa15c7ebe1f07caf6bff097d8a589fb8ac49ae5b3": ("Pundi X Token", "NPXS", 18),
    "0x1f573d6fb3f13d689ff844b4ce37794d79a7ff1c": ("Bancor Network Token", "BNT", 18),
    "0xb8c77482e45f1f44de1745f52c74426c631bdd52": ("BNB", "BNB", 18),
    "0xe41d2489571d322189246dafa5ebde1f4699f498": ("0x Protocol Token", "ZRX", 18),
    "0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2": ("Wrapped Ether", "WETH", 18),
    "0x514910771af9ca656af840dff83e8264ecf986ca": ("ChainLink Token", "LINK", 18),
    "0xdd974d5c2e2928dea5f71b9825b8b646686bd200": ("Kyber Network Crystal", "KNC", 18),
    "0xd26114cd6ee289accf82350c8d8487fedb8a0c07": ("OMGToken", "OMG", 18),
    "0x89d24a6b4ccb1b6faa2625fe562bdd9a23260359": ("Dai Stablecoin v1.0", "DAI", 18),
    "0x0f5d2fb29fb7d3cfee444a200298f468908cc942": ("Decentraland MANA", "MANA", 18),
    "0x8e1b448ec7adfc7fa35fc2e885678bd323176e34": ("Egretia", "EGT", 18),
    "0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48": ("USD//C", "USDC", 18),
    "0xf629cbd94d3791c9250152bd8dfbdf380e2a3b9c": ("Enjin Coin", "ENJ", 18),
    "0x9f8f72aa9304c8b593d555f12ef6589cc3a579a2": ("Maker", "MKR", 18),
    "0x0d8775f648430679a709e98d2b0cb6250d2887ef": ("Basic Attention Token", "BAT", 18),
    "0x8e870d67f660d95d5be530380d0ec0bd388289e1": ("Paxos Token", "PAX", 18),
    "0x6f259637dcd74c767781e37bc6133cd6a68aa161": ("HuobiToken", "HT", 18),
    "0x4dc3643dbc642b72c158e7f3d2ff232df61cb6ce": ("Amber Token", "AMB", 18),
    "0x8971f9fd7196e5cee2c1032b50f656855af7dd26": ("Lambda", "LAMB", 18),
    "0x0000000000085d4780b73119b644ae5ecd22b376": ("TrueUSD", "TUSD", 18),
    "0x77fe30b2cf39245267c0a5084b66a560f1cf9e1f": ("Azbit", "AZ", 18),
    "0x174bfa6600bf90c885c7c01c7031389ed1461ab9": ("More Gold Coin", "MGC", 18),
    "0x8e766f57f7d16ca50b4a0b90b88f6468a09b0439": ("Maximine Coin", "MXM", 18),
    "0xc12d1c73ee7dc3615ba4e37e4abfdbddfa38907e": ("KickToken", "KICK", 8),
    "0xbddab785b306bcd9fb056da189615cc8ece1d823": ("Ebakus", "EBK", 18),
    "0x6b175474e89094c44da98b954eedeac495271d0f": ("Dai Stablecoin", "DAI", 18),
    "0x2b591e99afe9f32eaa6214f7b7629768c40eeb39": ("HEX", "HEX", 8),
}

In [8]:
df_trans["token_symbol"] = df_trans["token_address"].apply(
    lambda x: token_details_mapper[x][1]
)
df_trans["token_decimal"] = df_trans["token_address"].apply(
    lambda x: token_details_mapper[x][-1]
)

# Cast to float
df_trans["value"] = df_trans["value"].astype(float)
df_trans["token_decimal"] = df_trans["token_decimal"].astype(float)

# Calculate adjusted token value
df_trans["token_value"] = df_trans.apply(
    lambda x: x.value * (10**-x.token_decimal), axis=1
)

## Generate Features

In [9]:
df_trans['token_symbol'].value_counts()

token_symbol
MGC      8947221
DAI      4892826
WETH     3712482
LINK     2806558
USDC     2278533
MXM      1628182
PAX      1538689
BNT      1508981
BAT      1447519
KICK     1311534
NPXS      779563
ENJ       750574
HEX       694213
ZRX       682927
TUSD      650468
MKR       639482
OMG       537507
BNB       459492
KNC       452316
STORJ     448603
EGT       443271
LAMB      438828
EBK       403344
AMB       401293
AZ        355295
MANA      346920
HT        344418
Name: count, dtype: int64

In [10]:
top_n_token_lst = [
    "STORJ",
    "NPXS",
    "BNT",
    "BNB",
    "ZRX",
    "WETH",
    "LINK",
    "KNC",
    "OMG",
    "DAI",
    "MANA",
    "EGT",
    "USDC",
    "ENJ",
    "MKR",
    "BAT",
    "PAX",
    "HT",
    "AMB",
    "LAMB",
    "TUSD",
    "AZ",
    "MGC",
    "MXM",
    "KICK",
    "EBK",
    "HEX",
]

In [11]:
df_trans = df_trans.loc[
    (df_trans["token_symbol"].isin(top_n_token_lst))
].reset_index(drop=True)

print(df_trans.shape)
df_trans.head()

(38901039, 10)


Unnamed: 0,token_address,from_address,to_address,value,transaction_hash,log_index,block_number,token_symbol,token_decimal,token_value
0,0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac,0x78903c52de3923d203881f1b766160cba5a37f59,0xc20d6d7d2e7cde1cabc7f20c553fe93dada380f3,209000000.0,0x430d2a02f678d28bb6e441cd383a6ddd02f30c05378d...,29,6525300,STORJ,8.0,2.09
1,0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac,0x78903c52de3923d203881f1b766160cba5a37f59,0xc4c49dbf1b4bc997c9a9758c1b1f86d33da4232d,191000000.0,0x5e138acd30d124a6ff031326fa91e1c4d0a345a2955d...,33,6525300,STORJ,8.0,1.91
2,0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac,0x78903c52de3923d203881f1b766160cba5a37f59,0xab194b0c3e3423ec6cbe44fdf096cb43251d1ccd,181000000.0,0xacccc26bb6ed8e03b7b6b17e1a7ee9a209ad0e3d7d19...,36,6525300,STORJ,8.0,1.81
3,0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac,0x78903c52de3923d203881f1b766160cba5a37f59,0xe29b47e4d0cc68b847226c652f2a5e73d1a94343,204000000.0,0x0600e6d229b403adb01bb6e54bebf0af329e30016427...,37,6525300,STORJ,8.0,2.04
4,0xb64ef51c888972c908cfacf59b47c1afbc0ab8ac,0x78903c52de3923d203881f1b766160cba5a37f59,0x5d0a1a7c7507142f06c04ffd604ab7b85b442f98,209000000.0,0xd4527c3f59fb6fa066646ab6aff57de86bb688c93569...,38,6525300,STORJ,8.0,2.09


In [12]:
%%time
# Generate Address-level features
df_trans_out_agg = (
    df_trans.groupby(["to_address", "token_symbol"])
    .agg(
        feat_out_total_token_value=("token_value", "sum"),
        feat_out_mean_token_value=("token_value", "mean"),
        feat_out_median_token_value=("token_value", "median"),
        feat_out_min_token_value=("token_value", "min"),
        feat_out_max_token_value=("token_value", "max"),
        # Number of addresses trades were made from
        feat_out_address_count=("from_address", "count"),
        feat_out_address_unique_count=("from_address", "nunique"),
        # Total Duration of trade
        feat_out_block_number_diff=("block_number", lambda x: x.max() - x.min()),
    )
    .reset_index()
)

df_trans_out_agg = df_trans_out_agg.pivot_table(
    index="to_address",
    columns="token_symbol",
    values=[
        "feat_out_total_token_value",
        "feat_out_mean_token_value",
        "feat_out_median_token_value",
        "feat_out_min_token_value",
        "feat_out_max_token_value",
        "feat_out_address_count",
        "feat_out_address_unique_count",
        "feat_out_block_number_diff",
    ],
)


df_trans_out_agg.columns = [f'{stat}_{token}' for stat, token in df_trans_out_agg.columns]
df_trans_out_agg = df_trans_out_agg.reset_index()
df_trans_out_agg.rename(columns={'to_address': 'address'}, inplace=True)
df_trans_out_agg.head(3)

CPU times: user 2min, sys: 4min 44s, total: 6min 45s
Wall time: 8min 14s


Unnamed: 0,address,feat_out_address_count_AMB,feat_out_address_count_AZ,feat_out_address_count_BAT,feat_out_address_count_BNB,feat_out_address_count_BNT,feat_out_address_count_DAI,feat_out_address_count_EBK,feat_out_address_count_EGT,feat_out_address_count_ENJ,...,feat_out_total_token_value_MKR,feat_out_total_token_value_MXM,feat_out_total_token_value_NPXS,feat_out_total_token_value_OMG,feat_out_total_token_value_PAX,feat_out_total_token_value_STORJ,feat_out_total_token_value_TUSD,feat_out_total_token_value_USDC,feat_out_total_token_value_WETH,feat_out_total_token_value_ZRX
0,0x0000000000000000000000000000000000000000,13.0,,4.0,,,143044.0,,,,...,1e-07,,,8904.71119,1318087000.0,0.059119,749633800.0,0.001491694,0.004571,1.6
1,0x0000000000000000000000000000000000000001,3.0,1.0,2.0,,,9.0,1.0,,1.0,...,,,,1.000152,,,,2.21143e-13,3.8e-05,
2,0x0000000000000000000000000000000000000005,1.0,,,,,,,,,...,,,,,,,,,,


In [13]:
%%time
df_trans_in_agg = (
    df_trans.groupby(["from_address", "token_symbol"])
    .agg(
        feat_in_total_token_value=("token_value", "sum"),
        feat_in_mean_token_value=("token_value", "mean"),
        feat_in_median_token_value=("token_value", "median"),
        feat_in_mfeat_in_token_value=("token_value", "min"),
        feat_in_max_token_value=("token_value", "max"),
        feat_in_address_count=("to_address", "count"),
        feat_in_address_unique_count=("to_address", "nunique"),
        feat_in_block_number_diff=("block_number", lambda x: x.max() - x.min()),
    )
    .reset_index()
)


df_trans_in_agg = df_trans_in_agg.pivot_table(
    index="from_address",
    columns="token_symbol",
    values=[
        "feat_in_total_token_value",
        "feat_in_mean_token_value",
        "feat_in_median_token_value",
        "feat_in_mfeat_in_token_value",
        "feat_in_max_token_value",
        "feat_in_address_count",
        "feat_in_address_unique_count",
        "feat_in_block_number_diff",
    ],
)

df_trans_in_agg.columns = [f'{stat}_{token}' for stat, token in df_trans_in_agg.columns]
df_trans_in_agg = df_trans_in_agg.reset_index()
df_trans_in_agg.rename(columns={'from_address': 'address'}, inplace=True)
df_trans_in_agg.head(3)

CPU times: user 1min 14s, sys: 3min 4s, total: 4min 18s
Wall time: 5min 15s


Unnamed: 0,address,feat_in_address_count_AMB,feat_in_address_count_AZ,feat_in_address_count_BAT,feat_in_address_count_BNB,feat_in_address_count_BNT,feat_in_address_count_DAI,feat_in_address_count_EBK,feat_in_address_count_EGT,feat_in_address_count_ENJ,...,feat_in_total_token_value_MKR,feat_in_total_token_value_MXM,feat_in_total_token_value_NPXS,feat_in_total_token_value_OMG,feat_in_total_token_value_PAX,feat_in_total_token_value_STORJ,feat_in_total_token_value_TUSD,feat_in_total_token_value_USDC,feat_in_total_token_value_WETH,feat_in_total_token_value_ZRX
0,0x0000000000000000000000000000000000000000,,1.0,,,,123374.0,1.0,,3.0,...,,,,,1520904000.0,,681521200.0,0.002169,,
1,0x0000000000000000000000000000000000000027,,,,,,,,,,...,,,,,,,4976680.0,,,
2,0x000000000000000000000000000000000000002e,,,,,,,,,,...,,,,,,,10315.0,,,


In [14]:
# train_node_lst = df_ex_train['address'].tolist()
df_train = pd.read_csv(train_fp)
df_train = pd.merge(df_train, df_trans_in_agg, how='left', on=['address'])
df_train = pd.merge(df_train, df_trans_out_agg, how='left', on=['address'])
df_train.head()

Unnamed: 0,address,lbl,feat_in_address_count_AMB,feat_in_address_count_AZ,feat_in_address_count_BAT,feat_in_address_count_BNB,feat_in_address_count_BNT,feat_in_address_count_DAI,feat_in_address_count_EBK,feat_in_address_count_EGT,...,feat_out_total_token_value_MKR,feat_out_total_token_value_MXM,feat_out_total_token_value_NPXS,feat_out_total_token_value_OMG,feat_out_total_token_value_PAX,feat_out_total_token_value_STORJ,feat_out_total_token_value_TUSD,feat_out_total_token_value_USDC,feat_out_total_token_value_WETH,feat_out_total_token_value_ZRX
0,0xb9ce8fdd626309ab1e642d30dc14ad05a27c9b23,0,,,,,,,,,...,,,,,100.0,,,,,
1,0xc5858e01e42bb82c0ab3fc9f9523c3d40c4075b9,0,,,,,,,,,...,,,963231.340744,,,,,,,
2,0x3af3e15ac95f38dd606eda46f67c1d4f379ed634,0,,,,,,,,,...,,,,,,,,,,
3,0xa5b606acae97c95cc61b9b955e41aa3431028db1,0,,,,,,,,,...,,,,,,,,,,
4,0xb4b079ea5537cf9436215a62a409740795da8c47,0,,,,,,,,,...,,,,,,,,,,


In [15]:
# test_node_lst = df_ex_test['address'].tolist()
df_test = pd.read_csv(test_fp)
df_test = pd.merge(df_test, df_trans_in_agg, how='left', on=['address'])
df_test = pd.merge(df_test, df_trans_out_agg, how='left', on=['address'])
print(df_test.shape)
df_test.head()

(1108, 434)


Unnamed: 0,address,lbl,feat_in_address_count_AMB,feat_in_address_count_AZ,feat_in_address_count_BAT,feat_in_address_count_BNB,feat_in_address_count_BNT,feat_in_address_count_DAI,feat_in_address_count_EBK,feat_in_address_count_EGT,...,feat_out_total_token_value_MKR,feat_out_total_token_value_MXM,feat_out_total_token_value_NPXS,feat_out_total_token_value_OMG,feat_out_total_token_value_PAX,feat_out_total_token_value_STORJ,feat_out_total_token_value_TUSD,feat_out_total_token_value_USDC,feat_out_total_token_value_WETH,feat_out_total_token_value_ZRX
0,0x41e8a0e3fad348186bb3bdd3a0d013d040e16d6d,0,,,,,,,,,...,,,,,,,,,,
1,0x9524a46630aedcb5a4a3a32fba936149bba3217d,0,,,,,,,,,...,,,,,,,,,,
2,0x7ab88e36dd39d2f13f1eb108540a5b8fed4db5d0,0,,,2.0,,,,,,...,,,10936590.0,,,,,,,
3,0xc5dad67d940ffbc6031220920dfee4a28ff3e569,0,,,,,,,,,...,,,,,,,,,,
4,0xd015f7932ddd3117279f60e88692d2a1712cae84,0,,,,,,,,,...,,,,,,,,5.000697e-08,,


In [16]:
feat_cols = [x for x in df_train.columns if bool(re.search("feat_", x))]
drop_cols = []
feat_cols = list(set(feat_cols) - set(drop_cols))
print(feat_cols)

lbl_col = "lbl"

['feat_in_mean_token_value_LINK', 'feat_in_max_token_value_HT', 'feat_in_total_token_value_MANA', 'feat_out_median_token_value_OMG', 'feat_in_address_unique_count_DAI', 'feat_out_address_unique_count_LINK', 'feat_in_address_count_OMG', 'feat_in_mean_token_value_WETH', 'feat_out_mean_token_value_DAI', 'feat_out_median_token_value_HT', 'feat_in_median_token_value_BNT', 'feat_out_address_count_BNT', 'feat_in_address_count_MGC', 'feat_out_mean_token_value_HEX', 'feat_in_block_number_diff_EBK', 'feat_out_median_token_value_AZ', 'feat_in_median_token_value_KICK', 'feat_in_max_token_value_ZRX', 'feat_out_total_token_value_ENJ', 'feat_in_address_count_EBK', 'feat_in_total_token_value_HEX', 'feat_in_block_number_diff_NPXS', 'feat_in_address_unique_count_LAMB', 'feat_in_max_token_value_USDC', 'feat_out_block_number_diff_EGT', 'feat_out_total_token_value_LAMB', 'feat_out_max_token_value_AMB', 'feat_out_address_count_AMB', 'feat_out_address_unique_count_WETH', 'feat_out_address_count_KICK', 'feat_

In [17]:
final_feat_cols = list(set(df_train.columns).intersection(feat_cols))
x_train = df_train[final_feat_cols]
y_train = df_train[lbl_col]

In [18]:
x_test = df_test[final_feat_cols]
y_test = df_test[lbl_col]

In [19]:
all_columns_match = (x_test.columns == x_train.columns).all()
print(f"all_columns_match: {all_columns_match}")

all_columns_match: True


In [20]:
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=random_seed
)
print(f"train: {x_train.shape}, {y_train.shape}")
print(f"val: {x_val.shape}, {y_val.shape}")

train: (929, 432), (929,)
val: (233, 432), (233,)


## Modelling

In [21]:
# Default parameters for LGBM. Focus is on feature generate not fine-tuning hyperparam
model = LGBMClassifier(
    objective="binary",
#     num_leaves=5,
    learning_rate=0.05,
    n_estimators=1000,
    subsample=0.8,
    max_depth=6,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    silent=True,
    is_unbalance=True,
#     early_stopping_rounds=50,
    seed=random_seed,
)


In [22]:
model.fit(
    x_train,
    y_train,
    eval_set=[(x_val, y_val)],
    eval_metric="auc",
)

[LightGBM] [Info] Number of positive: 137, number of negative: 792
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003941 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6340
[LightGBM] [Info] Number of data points in the train set: 929, number of used features: 360
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.147470 -> initscore=-1.754580
[LightGBM] [Info] Start training from score -1.754580


In [23]:
df_feat_imp = pd.DataFrame(
    {
        "feature_names": final_feat_cols,
        "importance": model.booster_.feature_importance(importance_type="gain"),
    }
)

df_feat_imp["importance"] = df_feat_imp["importance"] / df_feat_imp["importance"].sum()
df_feat_imp.sort_values("importance", ascending=False, inplace=True)
df_feat_imp.reset_index(drop=True, inplace=True)
df_feat_imp.head(10)

Unnamed: 0,feature_names,importance
0,feat_out_min_token_value_HEX,0.213275
1,feat_out_block_number_diff_KICK,0.09491
2,feat_out_address_count_KICK,0.075034
3,feat_out_mean_token_value_EBK,0.067096
4,feat_out_address_unique_count_BAT,0.035672
5,feat_out_mean_token_value_HEX,0.031892
6,feat_out_min_token_value_EBK,0.031713
7,feat_out_total_token_value_HEX,0.02522
8,feat_out_block_number_diff_EBK,0.022898
9,feat_out_min_token_value_AZ,0.021537


In [24]:
y_pred_proba = model.predict_proba(x_test)
df_result = pd.DataFrame(
    {
        "address": df_test["address"].tolist(),
        "pred_pos": y_pred_proba[:, 1],
        "pred_neg": y_pred_proba[:, 0],
        "lbl": y_test
    }
)

df_result['pred'] = 0
df_result.loc[(df_result['pred_pos']>df_result['pred_neg']), 'pred'] = 1
df_result



Unnamed: 0,address,pred_pos,pred_neg,lbl,pred
0,0x41e8a0e3fad348186bb3bdd3a0d013d040e16d6d,0.000184,0.999816,0,0
1,0x9524a46630aedcb5a4a3a32fba936149bba3217d,0.087545,0.912455,0,0
2,0x7ab88e36dd39d2f13f1eb108540a5b8fed4db5d0,0.006892,0.993108,0,0
3,0xc5dad67d940ffbc6031220920dfee4a28ff3e569,0.210720,0.789280,0,0
4,0xd015f7932ddd3117279f60e88692d2a1712cae84,0.065879,0.934121,0,0
...,...,...,...,...,...
1103,0x629a7144235259336ea2694167f3c8b856edd7dc,0.371216,0.628784,1,0
1104,0x4aea7cf559f67cedcad07e12ae6bc00f07e8cf65,0.000025,0.999975,1,0
1105,0x65f9b2e4d7aaeb40ffea8c6f5844d5ad7da257e0,0.002493,0.997507,1,0
1106,0x2faf487a4414fe77e2327f0bf4ae2a264a776ad2,0.999999,0.000001,1,1


In [25]:
prec = precision_score(df_result['lbl'], df_result['pred'])
recall = recall_score(df_result['lbl'], df_result['pred'])
f1 = f1_score(df_result['lbl'], df_result['pred'])

print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

Precision: 0.7876106194690266
Recall: 0.8240740740740741
F1: 0.8054298642533937


## Next Steps: 
- Threshold based on predicted probability
- Reduce sparse features