In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
from sklearn.model_selection import KFold, GroupKFold, cross_val_score, train_test_split, ShuffleSplit, GroupShuffleSplit

In [4]:
from sklearn.metrics import roc_curve, plot_roc_curve, RocCurveDisplay, auc, roc_auc_score

In [5]:
from sklearn.experimental import enable_halving_search_cv

In [6]:
from sklearn.model_selection import HalvingGridSearchCV

In [7]:
# Apply seaborn's "ticks" theme to every figure drawn in this notebook.
sns.set_theme(style="ticks")

In [8]:
import random

In [9]:
import collections

In [10]:
import gc

In [11]:
import utils

In [12]:
# Date tag that is substituted into the file name of every input loaded below.
date_string = "20210720"

### Import data

In [13]:
# Outcome labels, stored as a NumPy array next to the feature table.
outcomes_path = utils.get_base_path("evaluation/{}_outcomes.npy".format(date_string))
y = np.load(outcomes_path)

In [14]:
# Feature table: the first CSV column becomes the index and the column dtypes
# come from the project helper rather than being inferred by pandas.
features_path = utils.get_base_path("evaluation/{}-combined.csv".format(date_string))
X = pd.read_csv(
    features_path,
    index_col=0,
    dtype=utils.get_X_dtypes(),
)

  mask |= (ar1 == a)


In [15]:
# Bucket block heights into fixed-width epochs.
# NOTE(review): 1008 blocks is presumably about one week of Bitcoin blocks at
# ~10 minutes each - confirm the intended epoch length.
BLOCKS_PER_EPOCH = 1008
X["ct_epoch"] = X["ct_block_height"] // BLOCKS_PER_EPOCH

In [16]:
# Show the full list of feature columns before any of them are removed.
X.columns

Index(['optimal_change', 'optimal_change_with_fee', 'address_type',
       'power_of_ten_2', 'power_of_ten_3', 'power_of_ten_4', 'power_of_ten_5',
       'power_of_ten_6', 'power_of_ten_7', 'fp_inout_count', 'fp_output_count',
       'fp_zeroconf', 'fp_multisig', 'fp_p2pkh', 'fp_absolute_fee',
       'fp_relative_fee', 'fp_version', 'fp_locktime', 'fp_rbf', 'fp_segwit',
       'fp_possible_segwit', 'fp_ordered_inouts', 'fp_address_type', 'fp_p2sh',
       'fp_p2wsh', 'fp_p2wpkh', 'co_output_value', 'co_is_larger_output',
       'co_output_value_ratio', 'co_output_index', 'co_fresh_output',
       'co_other_fresh', 'ct_fee', 'ct_fee_per_byte', 'ct_tx_value',
       'ct_version', 'ct_segwit_tx', 'ct_has_locktime', 'ct_block_height',
       'ct_input_count', 'ct_epoch'],
      dtype='object')

In [17]:
# Columns removed before modelling. ct_block_height is only needed to derive
# ct_epoch, which stays in the table.
# NOTE: the drop is in place, so running this cell a second time raises KeyError.
excluded_cols = [
    "ct_fee",
    "co_output_value",
    "ct_block_height",
    "co_is_larger_output",
    "co_fresh_output",
    "co_other_fresh",
]
X.drop(columns=excluded_cols, inplace=True)

In [18]:
# Column labels still present at this point; used next to pick out the
# fingerprint ("fp_") features.
X_cols = X.columns

In [19]:
# Every remaining column whose name carries the "fp_" prefix.
FINGERPRINT_COLS = [col for col in X_cols if col.startswith("fp_")]

In [20]:
# Remove all fingerprint features so the model is trained without them.
# NOTE: in-place drop - re-running this cell on its own raises KeyError because
# the columns listed in FINGERPRINT_COLS are already gone.
X.drop(columns=FINGERPRINT_COLS, inplace=True)

In [21]:
# Row mask for the "no fingerprint" subset; applied to X, y and the cluster ids
# further down.
mask_path = utils.get_base_path("evaluation/{}_mask_nofp.npy".format(date_string))
mask_nofp = np.load(mask_path)

In [22]:
# Number of rows selected by the mask, halved.
# NOTE(review): the // 2 presumably converts rows into transactions (two rows
# per transaction) - confirm.
np.sum(mask_nofp) // 2

27493455

In [23]:
# Total number of rows before masking, halved for comparison with the masked
# count shown in the previous cell.
# NOTE(review): same two-rows-per-transaction assumption as above - confirm.
len(X) // 2

35257428

In [24]:
# The mask must line up row-for-row with X before it is used for filtering.
assert len(mask_nofp) == len(X)

In [25]:
# Keep only the rows selected by the mask, in both the features and the labels.
# NOTE: X and y are overwritten, so this cell is not re-runnable without first
# reloading them (the mask would no longer match their length).
X = X.loc[mask_nofp].copy()
y = y[mask_nofp]

In [26]:
# After filtering, features and labels must both have one row per True entry
# in the mask.
n_selected = np.sum(mask_nofp)
assert len(X) == n_selected
assert len(y) == n_selected

In [27]:
# Cluster id per row of the unfiltered data; used later as the grouping key so
# that a cluster never ends up on both sides of a split.
cluster_ids_path = utils.get_base_path("heuristics/{}-cluster-ids.npy".format(date_string))
cluster_ids = np.load(cluster_ids_path)

In [28]:
# Cluster ids are indexed like the unfiltered data, i.e. like the mask itself.
assert len(cluster_ids) == len(mask_nofp)

In [29]:
# Restrict the cluster ids to the same rows that were kept in X and y.
cluster_groups = cluster_ids[mask_nofp]

In [30]:
# One group label per retained row.
assert len(cluster_groups) == np.sum(mask_nofp)

### Apply to Out of Bag Sample (without having to rerun previous code)

In [31]:
# 21 random 80/20 splits that keep every cluster entirely on one side.
# `shuffler` is a one-shot generator: after the training loop has consumed it,
# this cell must be re-run before the loop can iterate again.
splitter = GroupShuffleSplit(n_splits=21, test_size=.2, random_state=1337)
shuffler = splitter.split(X=X, groups=cluster_groups)

In [32]:
# Per-split accumulators filled by the training loop: held-out labels and the
# matching predicted probabilities.
true_outcomes, predictions = [], []

In [33]:
# Train one random forest per group-aware split and score it on the held-out
# rows. `shuffler` is a generator, so this loop only iterates if the cell that
# creates it was (re-)run beforehand; results are appended to `true_outcomes`
# and `predictions`.
for iteration, (ib_idx, oob_idx) in enumerate(shuffler):
    # The two index arrays must partition X, with the in-bag part being larger.
    assert len(ib_idx) + len(oob_idx) == len(X)
    assert len(ib_idx) > len(oob_idx)

    # Indexing with an integer array already yields new data, so an extra
    # .copy() would only double the transient memory use.
    X_ib = X.iloc[ib_idx]
    y_ib = y[ib_idx]
    X_oob = X.iloc[oob_idx]
    y_oob = y[oob_idx]

    clusters_ib = cluster_groups[ib_idx]
    clusters_oob = cluster_groups[oob_idx]

    assert len(X_ib) == len(y_ib)
    assert len(X_oob) == len(y_oob)
    assert len(X_ib) + len(X_oob) == len(X)
    # Neither a row label nor a cluster may appear on both sides of the split.
    # Only the smaller (held-out) side is turned into a set.
    assert set(X_oob.index).isdisjoint(X_ib.index)
    assert set(clusters_oob).isdisjoint(clusters_ib)

    # A different seed per split keeps the forests independent of each other
    # while the whole run stays reproducible.
    rfc = RandomForestClassifier(
        random_state=1337 + iteration,
        n_jobs=34,
        n_estimators=100,
        max_features=6,
        min_samples_split=20,
        min_samples_leaf=10,
    )
    rfc.fit(X_ib, y_ib)
    # Column 1 of predict_proba is the probability of the second class in
    # rfc.classes_.
    oob_prediction = rfc.predict_proba(X_oob)[:, 1]
    true_outcomes.append(y_oob)
    predictions.append(oob_prediction)
    print(iteration, roc_auc_score(y_oob, oob_prediction))

0 0.9980716988737255
1 0.9848762788010387
2 0.9977125286421359
3 0.9978089530351082
4 0.9978804579234758
5 0.9977909055578175
6 0.9979569188723146
7 0.9971553650605947
8 0.9975684020653233
9 0.9981406684502269
10 0.9971710571374522
11 0.9975609562363963
12 0.9981079901888215
13 0.9976827235263912
14 0.9981598232562869
15 0.9975216649986716
16 0.9977670733990103
17 0.9864515146850925
18 0.9982901207622831
19 0.9973245847368104
20 0.9975808211404055


In [34]:
# Sanity check: one prediction array per split is expected (21 in total).
len(predictions)

21

In [35]:
# Recompute the ROC AUC of every split from the stored labels and predictions.
auc_scores = [
    roc_auc_score(split_labels, split_probs)
    for split_labels, split_probs in zip(true_outcomes, predictions)
]

In [36]:
# Display the per-split AUC values (same order as the splits were generated).
auc_scores

[0.9980716988737255,
 0.9848762788010387,
 0.9977125286421359,
 0.9978089530351082,
 0.9978804579234758,
 0.9977909055578175,
 0.9979569188723146,
 0.9971553650605947,
 0.9975684020653233,
 0.9981406684502269,
 0.9971710571374522,
 0.9975609562363963,
 0.9981079901888215,
 0.9976827235263912,
 0.9981598232562869,
 0.9975216649986716,
 0.9977670733990103,
 0.9864515146850925,
 0.9982901207622831,
 0.9973245847368104,
 0.9975808211404055]

In [37]:
# Mean and standard deviation of the AUC over all splits except the first.
# NOTE(review): split 0 is left out - presumably because it was already used
# elsewhere (e.g. for hyper-parameter tuning); confirm and document the reason.
reported_scores = auc_scores[1:]
np.mean(reported_scores), np.std(reported_scores)

(0.9965254404237829, 0.0036416234821124494)