In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
import joblib

In [4]:
import utils

In [5]:
# Apply seaborn's "ticks" style globally for plots created in this notebook.
sns.set_theme(style="ticks")

In [6]:
import random

In [7]:
import gc

In [8]:
# Date tag that is interpolated into the names of the evaluation files
# (outcomes, combined features, masks) read further down.
date_string = "20210720"

### Import data

In [9]:
# Load the outcome labels; these are later used as the training target.
y = np.load(utils.get_base_path("evaluation/{}_outcomes.npy".format(date_string)))

In [10]:
# Read the combined feature table. The first CSV column becomes the index and
# the per-column dtypes are supplied by utils.get_X_dtypes().
X = pd.read_csv(
    utils.get_base_path("evaluation/{}-combined.csv".format(date_string)),
    dtype=utils.get_X_dtypes(),
    index_col=0,
)

  mask |= (ar1 == a)


In [11]:
# Bucket block heights into consecutive epochs of 1008 blocks each.
# NOTE(review): 1008 presumably corresponds to ~1 week of blocks — confirm.
X["ct_epoch"] = X["ct_block_height"].floordiv(1008)

In [12]:
# Display every column currently in the feature table (ct_epoch included).
X.columns

Index(['optimal_change', 'optimal_change_with_fee', 'address_type',
       'power_of_ten_2', 'power_of_ten_3', 'power_of_ten_4', 'power_of_ten_5',
       'power_of_ten_6', 'power_of_ten_7', 'fp_inout_count', 'fp_output_count',
       'fp_zeroconf', 'fp_multisig', 'fp_p2pkh', 'fp_absolute_fee',
       'fp_relative_fee', 'fp_version', 'fp_locktime', 'fp_rbf', 'fp_segwit',
       'fp_possible_segwit', 'fp_ordered_inouts', 'fp_address_type', 'fp_p2sh',
       'fp_p2wsh', 'fp_p2wpkh', 'co_output_value', 'co_is_larger_output',
       'co_output_value_ratio', 'co_output_index', 'co_fresh_output',
       'co_other_fresh', 'ct_fee', 'ct_fee_per_byte', 'ct_tx_value',
       'ct_version', 'ct_segwit_tx', 'ct_has_locktime', 'ct_block_height',
       'ct_input_count', 'ct_epoch'],
      dtype='object')

In [13]:
# Remove six columns from X in place; the columns that remain are the ones
# later passed to the classifiers.
X.drop(
    columns=[
        "ct_fee",
        "co_output_value",
        "ct_block_height",
        "co_is_larger_output",
        "co_fresh_output",
        "co_other_fresh",
    ],
    inplace=True,
)

In [14]:
# Remember the remaining column index; the column groups below are built from it.
X_cols = X.columns

In [15]:
# Number of feature columns left after the drop.
X.shape[1]

35

In [17]:
# Display the retained column names.
X_cols

Index(['optimal_change', 'optimal_change_with_fee', 'address_type',
       'power_of_ten_2', 'power_of_ten_3', 'power_of_ten_4', 'power_of_ten_5',
       'power_of_ten_6', 'power_of_ten_7', 'fp_inout_count', 'fp_output_count',
       'fp_zeroconf', 'fp_multisig', 'fp_p2pkh', 'fp_absolute_fee',
       'fp_relative_fee', 'fp_version', 'fp_locktime', 'fp_rbf', 'fp_segwit',
       'fp_possible_segwit', 'fp_ordered_inouts', 'fp_address_type', 'fp_p2sh',
       'fp_p2wsh', 'fp_p2wpkh', 'co_output_value_ratio', 'co_output_index',
       'ct_fee_per_byte', 'ct_tx_value', 'ct_version', 'ct_segwit_tx',
       'ct_has_locktime', 'ct_input_count', 'ct_epoch'],
      dtype='object')

In [21]:
# Every retained column whose name carries the "fp_" prefix.
FINGERPRINT_COLS = [name for name in X_cols if name.startswith("fp_")]

In [22]:
# Every retained column whose name carries the "co_" or "ct_" prefix.
CHARACTERISTICS_COLS = [name for name in X_cols if name.startswith(("co_", "ct_"))]

### Remove transactions for which no heuristic makes a prediction

In [23]:
# Row mask: True where at least one non-characteristic column (everything
# except the co_/ct_ columns) is non-zero.
# `axis` is passed by keyword: positional arguments to DataFrame.any were
# deprecated in pandas 1.5 and raise a TypeError from pandas 2.0 onwards.
mask_full = (X.drop(columns=CHARACTERISTICS_COLS) != 0).any(axis=1)

In [24]:
# Sanity check: the mask recomputed here must equal the mask stored on disk
# for this date tag, element by element.
assert (mask_full == np.load(utils.get_base_path("evaluation/{}_mask.npy".format(date_string)))).all()

In [25]:
# Number of rows kept by the mask, halved.
# NOTE(review): the halving presumably reflects two rows per transaction — confirm.
mask_full.sum() // 2

34398846

In [26]:
# Total number of rows in X, halved, for comparison with the masked count.
X.shape[0] // 2

35257428

In [27]:
# The mask must provide exactly one entry per row of X.
assert mask_full.shape[0] == X.shape[0]

In [28]:
# Keep only the rows selected by mask_full, for features and labels alike.
# Boolean-mask selection already returns a new DataFrame, so no additional
# .copy() is made: it would only hold a second full copy of the selected
# rows in memory for a moment, and X1 is never modified afterwards.
X1 = X[mask_full]
y1 = y[mask_full]

In [29]:
# The filtered frame must contain exactly as many rows as the mask selects.
assert X1.shape[0] == mask_full.sum()

### Train classifier on normal and fingerprint heuristics

In [31]:
# Random forest for the full feature set. The seed is fixed so that training
# is repeatable; n_jobs sets the number of parallel workers.
rfc1 = RandomForestClassifier(
    n_estimators=100,
    max_features=7,
    min_samples_split=20,
    min_samples_leaf=10,
    n_jobs=34,
    random_state=1337,
)

In [33]:
# Fit the forest on the masked feature rows and their outcome labels.
rfc1.fit(X1, y1)

RandomForestClassifier(max_features=7, min_samples_leaf=10,
                       min_samples_split=20, n_jobs=34, random_state=1337)

In [34]:
# Persist the column index of the training frame next to the model
# (presumably so consumers can rebuild the same feature order — confirm).
joblib.dump(X1.columns, utils.get_base_path('model/X_columns_full.pkl'))

['/n/fs/scratch/mmoeser/2021/model/X_columns_full.pkl']

In [35]:
# Serialize the fitted full-feature forest to disk.
joblib.dump(rfc1, utils.get_base_path("model/rfc-full.pkl"))

['/n/fs/scratch/mmoeser/2021/model/rfc-full.pkl']

In [36]:
# Drop the references to the filtered training data so it can be freed.
del X1
del y1

In [37]:
# Drop the reference to the fitted model; it has already been written to disk.
del rfc1

### Select only transactions that have a prediction from a "normal" heuristic

In [38]:
# Start the second experiment from the full feature table and labels.
# Plain references suffice (as was already done for y): the subsequent
# drop() and mask selection both return new objects, so X and y are never
# modified through X2 / y2 and a full-frame copy would only waste memory.
X2 = X
y2 = y

In [39]:
# Strip all "fp_" columns; drop() returns a new frame that X2 is rebound to.
X2 = X2.drop(FINGERPRINT_COLS, axis="columns")

In [40]:
# Row count before filtering (dropping columns leaves it unchanged).
X2.shape[0]

70514856

In [41]:
# Row mask: True where at least one of the remaining non-characteristic
# columns (the fp_ columns were already dropped from X2) is non-zero.
# `axis` is passed by keyword: positional arguments to DataFrame.any were
# deprecated in pandas 1.5 and raise a TypeError from pandas 2.0 onwards.
mask_nofp = (X2.drop(columns=CHARACTERISTICS_COLS) != 0).any(axis=1)

In [42]:
# Sanity check: the recomputed no-fingerprint mask must equal the mask stored
# on disk for this date tag, element by element.
assert (mask_nofp == np.load(utils.get_base_path("evaluation/{}_mask_nofp.npy".format(date_string)))).all()

In [43]:
# Number of rows kept by the no-fingerprint mask, halved.
# NOTE(review): the halving presumably reflects two rows per transaction — confirm.
mask_nofp.sum() // 2

27493455

In [44]:
# Mask, features and labels must all have the same length before filtering.
assert len(mask_nofp) == len(X2) == len(y2)

In [45]:
# Apply the same row mask to the features and to the labels.
X2, y2 = X2[mask_nofp], y2[mask_nofp]

In [46]:
# After filtering, features and labels must both contain exactly the number
# of rows selected by the mask.
assert len(X2) == len(y2) == np.sum(mask_nofp)

### Train classifier on normal heuristics only

In [47]:
# Random forest for the feature set without fingerprint columns. It uses the
# same settings as the full model except max_features, which is 6 here.
rfc2 = RandomForestClassifier(
    n_estimators=100,
    max_features=6,
    min_samples_split=20,
    min_samples_leaf=10,
    n_jobs=34,
    random_state=1337,
)

In [48]:
# Fit the forest on the masked, fingerprint-free feature rows and their labels.
rfc2.fit(X2, y2)

RandomForestClassifier(max_features=6, min_samples_leaf=10,
                       min_samples_split=20, n_jobs=34, random_state=1337)

In [49]:
# Persist the column index of the fingerprint-free training frame next to the
# model (presumably so consumers can rebuild the same feature order — confirm).
joblib.dump(X2.columns, utils.get_base_path('model/X_columns_nofp.pkl'))

['/n/fs/scratch/mmoeser/2021/model/X_columns_nofp.pkl']

In [50]:
# Serialize the fitted fingerprint-free forest to disk.
joblib.dump(rfc2, utils.get_base_path("model/rfc-nofp.pkl"))

['/n/fs/scratch/mmoeser/2021/model/rfc-nofp.pkl']

In [51]:
# Drop the references to the filtered training data so it can be freed.
del X2
del y2

In [52]:
# Drop the reference to the fitted model; it has already been written to disk.
del rfc2