In [1]:
import pandas as pd
import numpy as np

In [2]:
import collections
import random

In [3]:
import gc

In [4]:
import utils

In [5]:
# Date tag embedded in the names of all files this notebook reads and writes
# (presumably in YYYYMMDD form — confirm).
date_string = "20210720"

### Import predictions

In [8]:
# Load the per-transaction heuristic votes; the first CSV column becomes the index.
predictions_csv = utils.get_base_path("heuristics/{}_smart_prediction.csv".format(date_string))
df_predictions = pd.read_csv(predictions_csv, index_col=0)

  mask |= (ar1 == a)


In [9]:
# Show which columns were loaded (the ground-truth label plus the heuristic votes).
df_predictions.columns

Index(['true_change', 'optimal_change', 'optimal_change_with_fee',
       'address_type', 'power_of_ten_2', 'power_of_ten_3', 'power_of_ten_4',
       'power_of_ten_5', 'power_of_ten_6', 'power_of_ten_7', 'pow10',
       'fp_inout_count', 'fp_output_count', 'fp_zeroconf', 'fp_multisig',
       'fp_p2pkh', 'fp_absolute_fee', 'fp_relative_fee', 'two_hops',
       'fp_version', 'fp_locktime', 'fp_rbf', 'fp_segwit',
       'fp_possible_segwit', 'fp_ordered_inouts', 'fp_address_type', 'fp_p2sh',
       'fp_p2wsh', 'fp_p2wpkh'],
      dtype='object')

In [10]:
# pow10 is dropped because the individual power_of_ten_* heuristics can be fed
# to the random forest directly.
# two_hops is dropped because it may be biased towards our ground truth set
# (due to its higher rate of clustering).
df_predictions = df_predictions.drop(columns=["pow10", "two_hops"])

### Create a boolean mask for outputs where the transaction has at least one vote

In [11]:
# True for every transaction where at least one column other than the
# ground-truth label holds a vote (a value different from -1).
# `axis` is passed by keyword: the positional form `.any(1)` is deprecated and
# rejected by pandas 2.x.
has_prediction_full = (df_predictions.drop(columns=["true_change"]) != -1).any(axis=1)

In [12]:
# Expand the per-transaction mask to one entry per output: every flag is
# repeated twice, in order (t0, t0, t1, t1, ...). np.repeat does this in C
# instead of iterating over the Series element by element in Python.
has_prediction_full_outputs = np.repeat(np.asarray(has_prediction_full), 2)

In [13]:
# The per-transaction mask is no longer needed once the per-output mask exists.
del has_prediction_full

In [14]:
# Names of all columns carrying the "fp_" prefix.
FP_COLUMNS = [name for name in df_predictions.columns if name.startswith("fp_")]

In [15]:
# Display the selected "fp_" column names for inspection.
FP_COLUMNS

['fp_inout_count',
 'fp_output_count',
 'fp_zeroconf',
 'fp_multisig',
 'fp_p2pkh',
 'fp_absolute_fee',
 'fp_relative_fee',
 'fp_version',
 'fp_locktime',
 'fp_rbf',
 'fp_segwit',
 'fp_possible_segwit',
 'fp_ordered_inouts',
 'fp_address_type',
 'fp_p2sh',
 'fp_p2wsh',
 'fp_p2wpkh']

In [16]:
# Same mask as above but ignoring the "fp_" columns: True where at least one
# of the remaining heuristic columns holds a vote (a value different from -1).
# `axis` is passed by keyword: the positional form `.any(1)` is deprecated and
# rejected by pandas 2.x.
has_prediction_nofp = (df_predictions.drop(columns=(["true_change"] + FP_COLUMNS)) != -1).any(axis=1)

In [17]:
# Expand the per-transaction mask to one entry per output: every flag is
# repeated twice, in order (t0, t0, t1, t1, ...). np.repeat does this in C
# instead of iterating over the Series element by element in Python.
has_prediction_nofp_outputs = np.repeat(np.asarray(has_prediction_nofp), 2)

In [15]:
# The per-transaction mask is no longer needed once the per-output mask exists.
del has_prediction_nofp

In [18]:
# Persist the per-output mask computed over all heuristic columns.
np.save(utils.get_base_path("evaluation/{0}_mask.npy".format(date_string)), has_prediction_full_outputs)

In [19]:
# Persist the per-output mask computed without the "fp_" columns.
np.save(utils.get_base_path("evaluation/{0}_mask_nofp.npy".format(date_string)), has_prediction_nofp_outputs)

In [20]:
# Both masks are saved to disk; drop the in-memory copies.
del has_prediction_full_outputs, has_prediction_nofp_outputs

### Convert tx-based predictions into output-based predictions

Input: the index of the output that the heuristic votes for, or -1 if the heuristic casts no vote.

Output: 1 (True) or 0 (False), indicating whether the respective output is the one voted to be the change, or -1 if the heuristic casts no vote.

In [21]:
# Sanity check on the first row only: the namedtuple fields that follow the
# leading index field must match the DataFrame columns, because the conversion
# below accesses row values by position.
first_row = next(df_predictions.itertuples(), None)
if first_row is not None:
    assert list(first_row._fields)[1:] == list(df_predictions.columns)

In [22]:
# Turn every transaction row into two output rows (output 0, then output 1).
# A vote `v` becomes int(v == 0) in the first row and int(v == 1) in the
# second; an abstention (-1) stays -1 in both. The new index is
# `tx_index * 10` for output 0 and `tx_index * 10 + 1` for output 1.
#
# This is done with whole-array NumPy operations instead of a Python loop over
# itertuples(), which would materialise one Python list per output row.
raw = df_predictions.to_numpy()
voted = raw != -1

# int64 matches the dtype pandas infers from lists of Python ints.
values = np.empty((2 * len(raw), raw.shape[1]), dtype=np.int64)
values[0::2] = np.where(voted, raw == 0, -1)
values[1::2] = np.where(voted, raw == 1, -1)

# Interleave tx_index*10 and tx_index*10 + 1.
indexes = np.repeat(df_predictions.index.to_numpy() * 10, 2)
indexes[1::2] += 1

del raw, voted

In [23]:
# Assemble the per-output frame; column labels are assigned in the next cell.
df_predictions_outs = pd.DataFrame(values, index=indexes)

In [24]:
# Reuse the column labels of the per-transaction frame (same order).
df_predictions_outs.columns = df_predictions.columns

In [25]:
# Preview the first three transactions (two output rows each) after conversion.
df_predictions_outs.head(6)

Unnamed: 0,true_change,optimal_change,optimal_change_with_fee,address_type,power_of_ten_2,power_of_ten_3,power_of_ten_4,power_of_ten_5,power_of_ten_6,power_of_ten_7,...,fp_version,fp_locktime,fp_rbf,fp_segwit,fp_possible_segwit,fp_ordered_inouts,fp_address_type,fp_p2sh,fp_p2wsh,fp_p2wpkh
364850,1,-1,-1,0,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,0,-1,-1,-1
364851,0,-1,-1,1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,-1
386190,1,1,1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,-1
386191,0,0,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,0,-1,-1,-1
444470,0,0,0,-1,-1,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,0,-1,-1,-1
444471,1,1,1,-1,-1,-1,-1,-1,-1,0,...,-1,-1,-1,-1,-1,-1,1,-1,-1,-1


In [26]:
# The same three transactions before conversion, for visual comparison.
df_predictions.head(3)

Unnamed: 0,true_change,optimal_change,optimal_change_with_fee,address_type,power_of_ten_2,power_of_ten_3,power_of_ten_4,power_of_ten_5,power_of_ten_6,power_of_ten_7,...,fp_version,fp_locktime,fp_rbf,fp_segwit,fp_possible_segwit,fp_ordered_inouts,fp_address_type,fp_p2sh,fp_p2wsh,fp_p2wpkh
36485,0,-1,-1,1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,-1
38619,0,0,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,0,-1,-1,-1
44447,1,1,1,-1,-1,-1,-1,-1,-1,0,...,-1,-1,-1,-1,-1,-1,1,-1,-1,-1


In [27]:
# The per-transaction frame has been fully converted; release it.
del df_predictions

In [28]:
# Drop the intermediate containers used to build df_predictions_outs.
del values, indexes

In [29]:
# Run a garbage-collection pass after deleting the large intermediates.
gc.collect()

66

### Split into features and outcome

In [30]:
# Outcome: whether the output is the true change.
y = df_predictions_outs["true_change"]

In [31]:
# Store the label as unsigned 8-bit integers.
# NOTE(review): assumes true_change only holds 0/1 at this point — confirm that no -1 remains.
y = y.astype("uint8")

In [32]:
# Persist the outcome vector as a plain NumPy array (index is not stored).
np.save(utils.get_base_path("evaluation/{0}_outcomes.npy".format(date_string)), y.values)

In [33]:
# Features: every column except the ground-truth label.
X = df_predictions_outs.drop(columns=["true_change"])

In [34]:
# Recode the votes by swapping 0 and -1 (1 is left unchanged). Afterwards
# 1 = vote for this output, -1 = vote for the other output, 0 = no vote.
X = X.replace([0, -1], [-1, 0])

In [35]:
# Convert every vote column to a categorical dtype (done column by column).
X = X.astype("category")

In [36]:
# Preview the recoded feature frame.
X.head()

Unnamed: 0,optimal_change,optimal_change_with_fee,address_type,power_of_ten_2,power_of_ten_3,power_of_ten_4,power_of_ten_5,power_of_ten_6,power_of_ten_7,fp_inout_count,...,fp_version,fp_locktime,fp_rbf,fp_segwit,fp_possible_segwit,fp_ordered_inouts,fp_address_type,fp_p2sh,fp_p2wsh,fp_p2wpkh
364850,0,0,-1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,-1,0,0,0
364851,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
386190,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
386191,-1,-1,0,0,0,0,0,0,0,-1,...,0,0,0,0,0,0,-1,0,0,0
444470,-1,-1,0,0,0,0,0,0,1,-1,...,0,0,0,0,0,0,-1,0,0,0


In [37]:
# X and y now hold everything needed from the per-output frame.
del df_predictions_outs

### Concatenate with tx/output characteristics

See notebook 9a) for export of characteristics

In [38]:
# Explicit dtypes for the columns of the output-features CSV, passed to
# pd.read_csv below.
# Boolean columns use the built-in `bool`: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where accessing it raises AttributeError.
df_output_dtypes = {
    "co_output_value": np.uint64,
    "co_output_value_ratio": np.float64,
    "co_is_larger_output": bool,
    "co_output_index": np.uint8,
    "ct_fee": np.uint64,
    "ct_fee_per_byte": np.uint64,
    "ct_tx_value": np.uint64,
    "ct_segwit_tx": np.uint8,
    "ct_has_locktime": bool,
    "ct_block_height": np.uint32,
    "ct_input_count": np.uint32,
    "co_fresh_output": bool,
    "co_other_fresh": bool
}

In [39]:
# Load the tx/output characteristics with the explicit dtypes declared above;
# the first CSV column becomes the index.
output_features_csv = utils.get_base_path("heuristics/{}-output-features.csv".format(date_string))
df_outputs = pd.read_csv(output_features_csv, index_col=0, dtype=df_output_dtypes)

  mask |= (ar1 == a)


In [40]:
# Both frames must describe the same number of outputs before they are joined.
assert len(X) == len(df_outputs)

In [41]:
# Adopt X's index so that the concat below pairs the rows one-to-one.
# NOTE(review): this assumes both frames list the outputs in the same order — only the length is checked.
df_outputs.index = X.index

In [42]:
# Append the characteristics as additional columns (aligned on the shared index).
X = pd.concat([X, df_outputs], axis=1)

In [43]:
# The characteristics now live in X; release the separate frame.
del df_outputs

In [44]:
# Number of output rows in the combined feature frame.
len(X)

70514856

### Export

In [46]:
# Write the combined feature frame (including its index) to CSV.
combined_csv = utils.get_base_path("heuristics/{}-combined.csv".format(date_string))
X.to_csv(combined_csv)