In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn import preprocessing, metrics, model_selection
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import train_test_split

In [97]:
collector_path = "collectors.csv"
flippers_path = "flippers.csv"
profitablt_flippers_path = "profitable_flippers.csv"
whales_path = "whales.csv"

In [98]:
def load_local_data():
    collectors = pd.read_csv(collector_path)
    flippers = pd.read_csv(flippers_path)
    profitable_flippers = pd.read_csv(profitablt_flippers_path)
    whales = pd.read_csv(whales_path)
    return collectors, flippers, profitable_flippers, whales

In [99]:
collectors, flippers, profitable_flippers, whales = load_local_data()
collectors.columns = [collectors.columns[0]] + [col + '_c' for col in collectors.columns[1:]]
flippers.columns = [flippers.columns[0]] + [col + '_f' for col in flippers.columns[1:]]
profitable_flippers.columns = [profitable_flippers.columns[0]] + [col + '_pf' for col in profitable_flippers.columns[1:]]
whales.columns = [whales.columns[0]] + [col + '_w' for col in whales.columns[1:]]

# Merge the tables on 'wallet_address' column
merged_df = collectors.merge(flippers, on='wallet_address', how='outer')
merged_df = merged_df.merge(profitable_flippers, on='wallet_address')
df = merged_df.merge(whales, on='wallet_address', how='outer')

In [100]:
df.head()

Unnamed: 0,wallet_address,num_high_value_c,num_collections_c,num_owned_c,num_flipped_f,num_collection_f,num_profit_f,avg_profit_f,tot_profit_f,num_flipped_pf,num_collection_pf,num_profit_pf,avg_profit_pf,tot_profit_pf,num_high_value_w,num_collections_w,num_owned_w
0,0x05e5a014067a5f01cbc128ca99f631e91dec2fd9,0.0,45.0,124.0,50.0,31.0,54.0,0.023049,1.705601,50.0,31.0,54.0,0.023049,1.705601,,,
1,0x07ea687c51103d3ea021c019e2b2fffa40fca432,0.0,35.0,61.0,19.0,27.0,30.0,0.018498,0.721422,19.0,27.0,30.0,0.018498,0.721422,,,
2,0x08ea5aea8517312705d98ecda3d9bcc06ecc85e8,0.0,36.0,102.0,32.0,28.0,33.0,0.016255,0.715213,32.0,28.0,33.0,0.016255,0.715213,,,
3,0x0cdbd3c78615d7f9f94db33bd9a435b42e92a721,0.0,29.0,61.0,29.0,26.0,27.0,0.035074,1.192518,29.0,26.0,27.0,0.035074,1.192518,,,
4,0x1178c316e3f64edcc29ade6bf5e3f559499bfc64,0.0,40.0,79.0,25.0,31.0,41.0,0.023951,1.38916,25.0,31.0,41.0,0.023951,1.38916,,,


In [101]:
df['collector'] = df['wallet_address'].isin(collectors['wallet_address']).astype(int)
df['flipper'] = df['wallet_address'].isin(flippers['wallet_address']).astype(int)
df['profitable_flipper'] = df['wallet_address'].isin(profitable_flippers['wallet_address']).astype(int)
df['whale'] = df['wallet_address'].isin(whales['wallet_address']).astype(int)
df = df.fillna(0)

In [102]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 811 entries, 0 to 810
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   wallet_address      811 non-null    object 
 1   num_high_value_c    811 non-null    float64
 2   num_collections_c   811 non-null    float64
 3   num_owned_c         811 non-null    float64
 4   num_flipped_f       811 non-null    float64
 5   num_collection_f    811 non-null    float64
 6   num_profit_f        811 non-null    float64
 7   avg_profit_f        811 non-null    float64
 8   tot_profit_f        811 non-null    float64
 9   num_flipped_pf      811 non-null    float64
 10  num_collection_pf   811 non-null    float64
 11  num_profit_pf       811 non-null    float64
 12  avg_profit_pf       811 non-null    float64
 13  tot_profit_pf       811 non-null    float64
 14  num_high_value_w    811 non-null    float64
 15  num_collections_w   811 non-null    float64
 16  num_owne

Unnamed: 0,num_high_value_c,num_collections_c,num_owned_c,num_flipped_f,num_collection_f,num_profit_f,avg_profit_f,tot_profit_f,num_flipped_pf,num_collection_pf,num_profit_pf,avg_profit_pf,tot_profit_pf,num_high_value_w,num_collections_w,num_owned_w,collector,flipper,profitable_flipper,whale
count,811.0,811.0,811.0,811.0,811.0,811.0,811.0,811.0,811.0,811.0,811.0,811.0,811.0,811.0,811.0,811.0,811.0,811.0,811.0,811.0
mean,0.135635,7.373613,16.861899,7.346486,6.102343,9.246609,0.008045,0.503963,7.346486,6.102343,9.246609,0.008045,0.503963,3.78545,11.193588,27.89889,0.451295,0.219482,0.187423,0.837238
std,1.091539,18.717187,49.032171,19.950112,14.752045,24.810409,0.072644,4.640787,19.950112,14.752045,24.810409,0.072644,4.640787,7.30708,16.061556,60.854631,0.497929,0.414151,0.390491,0.369376
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,6.0,9.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,8.0,15.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,12.0,29.5,1.0,0.0,0.0,1.0
max,20.0,172.0,638.0,259.0,139.0,298.0,2.019387,129.24075,259.0,139.0,298.0,2.019387,129.24075,149.0,305.0,1280.0,1.0,1.0,1.0,1.0


In [103]:
# df = df.drop('wallet_address', axis=1)
df

Unnamed: 0,wallet_address,num_high_value_c,num_collections_c,num_owned_c,num_flipped_f,num_collection_f,num_profit_f,avg_profit_f,tot_profit_f,num_flipped_pf,...,num_profit_pf,avg_profit_pf,tot_profit_pf,num_high_value_w,num_collections_w,num_owned_w,collector,flipper,profitable_flipper,whale
0,0x05e5a014067a5f01cbc128ca99f631e91dec2fd9,0.0,45.0,124.0,50.0,31.0,54.0,0.023049,1.705601,50.0,...,54.0,0.023049,1.705601,0.0,0.0,0.0,1,1,1,0
1,0x07ea687c51103d3ea021c019e2b2fffa40fca432,0.0,35.0,61.0,19.0,27.0,30.0,0.018498,0.721422,19.0,...,30.0,0.018498,0.721422,0.0,0.0,0.0,1,1,1,0
2,0x08ea5aea8517312705d98ecda3d9bcc06ecc85e8,0.0,36.0,102.0,32.0,28.0,33.0,0.016255,0.715213,32.0,...,33.0,0.016255,0.715213,0.0,0.0,0.0,1,1,1,0
3,0x0cdbd3c78615d7f9f94db33bd9a435b42e92a721,0.0,29.0,61.0,29.0,26.0,27.0,0.035074,1.192518,29.0,...,27.0,0.035074,1.192518,0.0,0.0,0.0,1,1,1,0
4,0x1178c316e3f64edcc29ade6bf5e3f559499bfc64,0.0,40.0,79.0,25.0,31.0,41.0,0.023951,1.389160,25.0,...,41.0,0.023951,1.389160,0.0,0.0,0.0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806,0xff1777854bdc3507d581e3a762b0f85832302611,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,6.0,6.0,13.0,0,0,0,1
807,0xff567d26a66a556afe5b9183db01370aa78d7bda,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,7.0,12.0,25.0,0,0,0,1
808,0xffb6d97bd1e7b7bd08595096d15037401a1f416b,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,5.0,6.0,10.0,0,0,0,1
809,0xffba913bb056544b75e57312ec3eae2528c285e1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,2.0,6.0,8.0,0,0,0,1


In [104]:
features = [x for x in collectors.columns if x not in ['wallet_address', 'collector', 'flipper', 'profitable_flipper', 'whale']]
labels = ['collector', 'flipper', 'profitable_flipper', 'whale']

In [105]:
df['labels'] = df.apply(lambda row: ','.join(col for col in labels if row[col]), axis=1)
lbl_enc = LabelEncoder()
df['labels'] = lbl_enc.fit_transform(df['labels'].values)

In [106]:
for feat in features:
    lbl_enc = preprocessing.LabelEncoder()
    df[feat] = lbl_enc.fit_transform(df[feat].values)

In [107]:
df

Unnamed: 0,wallet_address,num_high_value_c,num_collections_c,num_owned_c,num_flipped_f,num_collection_f,num_profit_f,avg_profit_f,tot_profit_f,num_flipped_pf,...,avg_profit_pf,tot_profit_pf,num_high_value_w,num_collections_w,num_owned_w,collector,flipper,profitable_flipper,whale,labels
0,0x05e5a014067a5f01cbc128ca99f631e91dec2fd9,0,29,67,50.0,31.0,54.0,0.023049,1.705601,50.0,...,0.023049,1.705601,0.0,0.0,0.0,1,1,1,0,0
1,0x07ea687c51103d3ea021c019e2b2fffa40fca432,0,20,30,19.0,27.0,30.0,0.018498,0.721422,19.0,...,0.018498,0.721422,0.0,0.0,0.0,1,1,1,0,0
2,0x08ea5aea8517312705d98ecda3d9bcc06ecc85e8,0,21,55,32.0,28.0,33.0,0.016255,0.715213,32.0,...,0.016255,0.715213,0.0,0.0,0.0,1,1,1,0,0
3,0x0cdbd3c78615d7f9f94db33bd9a435b42e92a721,0,16,30,29.0,26.0,27.0,0.035074,1.192518,29.0,...,0.035074,1.192518,0.0,0.0,0.0,1,1,1,0,0
4,0x1178c316e3f64edcc29ade6bf5e3f559499bfc64,0,25,39,25.0,31.0,41.0,0.023951,1.389160,25.0,...,0.023951,1.389160,0.0,0.0,0.0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806,0xff1777854bdc3507d581e3a762b0f85832302611,0,0,0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,6.0,6.0,13.0,0,0,0,1,7
807,0xff567d26a66a556afe5b9183db01370aa78d7bda,0,0,0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,7.0,12.0,25.0,0,0,0,1,7
808,0xffb6d97bd1e7b7bd08595096d15037401a1f416b,0,0,0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,5.0,6.0,10.0,0,0,0,1,7
809,0xffba913bb056544b75e57312ec3eae2528c285e1,0,0,0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,2.0,6.0,8.0,0,0,0,1,7


In [108]:
# for feat in features:
#     lbl_enc = preprocessing.LabelEncoder()
#     df[feat] = lbl_enc.fit_transform(df[feat].values)

In [109]:
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [117]:
def create_model(data, catcols):    
    inputs = []
    outputs = []
    for c in catcols:
        if c in data.columns:
            num_unique_values = int(data[c].nunique())
            embed_dim = int(min(np.ceil((num_unique_values)/2), 50))
            inp = tf.keras.layers.Input(shape=(1,))
            out = tf.keras.layers.Embedding(num_unique_values + 1, embed_dim, name=c)(inp)
            out = tf.keras.layers.SpatialDropout1D(0.3)(out)
            out = tf.keras.layers.Reshape(target_shape=(embed_dim, ))(out)
            inputs.append(inp)
            outputs.append(out)
    
    if len(outputs) == 0:
        raise ValueError("No valid categorical columns found in data")
    
    x = tf.keras.layers.Concatenate()(outputs)
    x = tf.keras.layers.BatchNormalization()(x)
    
    x = tf.keras.layers.Dense(300, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.3)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    
    x = tf.keras.layers.Dense(300, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.3)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    
    y = tf.keras.layers.Dense(4, activation="softmax")(x)

    model = tf.keras.Model(inputs=inputs, outputs=y)
    return model

In [111]:
def auc(y_true, y_pred):
    def fallback_auc(y_true, y_pred):
        try:
            return metrics.roc_auc_score(y_true, y_pred)
        except:
            return 0.5
    return tf.py_function(fallback_auc, (y_true, y_pred), tf.double)

In [118]:
model = create_model(df, features)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[auc])

In [119]:
X_train = [train.loc[:, features].values[:, k] for k in range(train.loc[:, features].values.shape[1])]
X_test = [test.loc[:, features].values[:, k] for k in range(test.loc[:, features].values.shape[1])]

In [120]:
es = tf.keras.callbacks.EarlyStopping(monitor='val_auc', min_delta=0.001, patience=5,
                                 verbose=1, mode='max', baseline=None, restore_best_weights=True)

rlr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.5,
                                      patience=3, min_lr=1e-6, mode='max', verbose=1)

In [121]:
model.fit(X_train, 
          train[labels], 
          verbose=1,
          epochs=100, 
          batch_size=64, 
          validation_split=0.2,
          callbacks=[es, rlr])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 5/100
Epoch 6/100
Epoch 6: early stopping


<keras.callbacks.History at 0x7fee71a2d670>

In [122]:
# Evaluate the model on test set
model.evaluate(X_test, test[labels])



[2.179177761077881, 0.8444618582725525]