# Export Ground Truth

Filter and select ground truth transactions from the set of transactions with revealed change.

In [1]:
import blocksci

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [3]:
import collections
import random

In [4]:
import datetime

In [5]:
import utils

In [6]:
# Open the blockchain using the configuration provided by the local `utils` helper module.
chain = blocksci.Blockchain(utils.blocksci_config())

In [7]:
# Cluster manager for the clustering that utils.latest_clustering() points to; every
# cluster lookup and cluster-based heuristic in this notebook goes through it.
my_cm = blocksci.cluster.ClusterManager(utils.latest_clustering(), chain)

### Heuristics and helpers

In [8]:
# Change heuristic bound to our clustering. Its result for a transaction is used below via
# .size and .all(...); the notebook treats the outputs it returns as the revealed change.
# NOTE(review): presumably it selects outputs whose address is in the inputs' cluster — confirm.
cluster_member = blocksci.heuristics.change.cluster_member(my_cm)

In [9]:
# Change heuristic for direct address reuse; the selections below require it to return no outputs.
address_reuse = blocksci.heuristics.change.address_reuse

In [10]:
# Maps a transaction to a cluster id within my_cm.
# NOTE(review): presumably the cluster of the transaction's inputs — confirm against BlockSci.
cluster_id = blocksci.heuristics.cluster_id(my_cm)

In [11]:
# Output-level counterpart bound to the same clustering; not referenced by the cells shown in this notebook.
output_cluster_id = blocksci.heuristics.output_cluster_id(my_cm)

### Statistics for both fresh and non-fresh outputs

In [12]:
# Count standard transactions without direct address reuse that have at least one cluster-member output.
chain.blocks.txes.where(
    lambda tx: blocksci.heuristics.is_standard_tx(tx)
    & (address_reuse(tx).size == 0)
    & (cluster_member(tx).size > 0)
).size

53412629

In [15]:
# Alternative count: same selection as above (cluster_member(tx).size > 0), but additionally
# requiring every output of the transaction to be spent.
chain.blocks.txes.where(
    lambda tx: blocksci.heuristics.is_standard_tx(tx)
    & (tx.outputs.all(lambda o: o.is_spent))
    & (address_reuse(tx).size == 0)
    & (cluster_member(tx).size > 0)
).size

52328198

In [16]:
# Fresh change: exactly one cluster-member output, and the first transaction of that
# output's address is this very transaction.
chain.blocks.txes.where(
    lambda tx: blocksci.heuristics.is_standard_tx(tx)
    & (tx.outputs.all(lambda o: o.is_spent))
    & (address_reuse(tx).size == 0)
    & (cluster_member(tx).size == 1)
).where(
    lambda tx: cluster_member(tx).all(lambda o: o.address.first_tx == o.tx)
).size

34501985

In [17]:
# Non-fresh change: exactly one cluster-member output, and the first transaction of that
# output's address is a different transaction than this one.
chain.blocks.txes.where(
    lambda tx: blocksci.heuristics.is_standard_tx(tx)
    & (tx.outputs.all(lambda o: o.is_spent))
    & (address_reuse(tx).size == 0)
    & (cluster_member(tx).size == 1)
).where(
    lambda tx: cluster_member(tx).all(lambda o: o.address.first_tx != o.tx)
).size

16858612

### Selection criteria
- 2 outputs
- all outputs are spent
- no direct address reuse
- exactly one cluster member (change)
- change does not have to be fresh

In [18]:
def two_out_txes():
    """Select the candidate ground-truth transactions from the whole chain.

    A transaction qualifies when it has exactly two outputs, all of its outputs
    are spent, `address_reuse` returns no outputs for it, and `cluster_member`
    returns exactly one output (taken as the change).

    Returns:
        Whatever `chain.blocks.txes.where(...)` yields for that predicate.
    """
    all_txes = chain.blocks.txes
    return all_txes.where(
        lambda tx: (tx.output_count == 2)
        & tx.outputs.all(lambda out: out.is_spent)
        & (address_reuse(tx).size == 0)
        & (cluster_member(tx).size == 1)
    )

In [19]:
# Initial ground-truth set, built from the indexes of all transactions selected by two_out_txes().
# NOTE(review): the third argument looks like the on-disk location of the set — confirm whether
# an existing directory is overwritten before re-running.
gt_0 = blocksci.GroundTruth.create(chain, two_out_txes().index, "/home/ubuntu/Data/groundtruth/20210715-0-2out")

In [20]:
# Pin the size of the initial selection to the recorded value; the notebook stops here if
# the selection yields a different number of transactions.
assert 51360597 == gt_0.transactions().size

### Identify clusters where too many transactions have outputs in the same cluster (collapse)

In [21]:
def two_out_txes_with_collapse():
    """Select two-output transactions with more than one cluster-member output.

    Same criteria as the ground-truth selection (exactly two outputs, all
    outputs spent, `address_reuse` returns no outputs), except that
    `cluster_member` must return more than one output.

    Returns:
        Whatever `chain.blocks.txes.where(...)` yields for that predicate.
    """
    all_txes = chain.blocks.txes
    return all_txes.where(
        lambda tx: (tx.output_count == 2)
        & tx.outputs.all(lambda out: out.is_spent)
        & (address_reuse(tx).size == 0)
        & (cluster_member(tx).size > 1)
    )

In [22]:
# One cluster id per transaction that shows collapse; a cluster appears once for each such transaction.
collapsed_clusters = two_out_txes_with_collapse().map(lambda tx: cluster_id(tx))

In [23]:
# Number of transactions with collapse (one entry per transaction, not per distinct cluster).
len(collapsed_clusters)

967601

In [24]:
# Number of collapse transactions per cluster id.
cluster_counter = collections.Counter(collapsed_clusters)

##### Identify clusters in which the share of transactions with such collapse reaches a threshold. We use 10% (the share is rounded to two decimals before it is compared).

In [25]:
# Minimum share (in percent) of a cluster's transactions that must show collapse
# for the cluster to be excluded from the ground truth.
COLLAPSE_SHARE_THRESHOLD_PCT = 10

# Fetch the cluster range once instead of on every loop iteration.
all_clusters = my_cm.clusters()

filtered_clusters = []    # ids of clusters at or above the threshold
collapse_percentage = []  # their collapse share in percent, same order as filtered_clusters

# most_common() yields clusters by descending collapse count; filtered_clusters keeps that order.
for cid, collapse_count in cluster_counter.most_common():
    cluster = all_clusters[cid]
    # The share is rounded to two decimals *before* the comparison, so a value marginally
    # below the threshold can round up and pass. Kept as is so the recorded counts reproduce.
    collapse_share = round(collapse_count / cluster.tx_count * 100, 2)
    if collapse_share >= COLLAPSE_SHARE_THRESHOLD_PCT:
        collapse_percentage.append(collapse_share)
        filtered_clusters.append(cid)

In [26]:
# Both numbers should match: the ids come from Counter keys, so each cluster can appear only once.
len(filtered_clusters), len(set(filtered_clusters))

(9967, 9967)

In [27]:
# Persist the ids of the excluded clusters (still a list at this point, in the order they were collected).
np.save("/home/ubuntu/Data/groundtruth/filtered-clusters-collapse.npy", filtered_clusters)

In [28]:
# Rebind to a set for fast membership tests when the transactions are filtered below.
filtered_clusters = set(filtered_clusters)

##### Remove transactions from those clusters from our ground truth set

In [29]:
# Transaction indexes of the initial ground truth and the cluster id of each of its transactions.
# A mask computed from the ids is later applied to the indexes, so both sequences are
# expected to be in the same transaction order.
full_gt_indexes = gt_0.transactions().index
full_gt_cluster_ids = gt_0.transactions().map(lambda tx: cluster_id(tx))

In [30]:
# Boolean mask: True where the transaction's cluster is NOT one of the filtered (collapsed) clusters.
# np.isin performs the membership test vectorised instead of materialising one Python bool per
# transaction (tens of millions). The ids are passed as a list because np.isin would treat a
# set as a single object rather than as a collection of values.
index_mask = np.isin(full_gt_cluster_ids, list(filtered_clusters), invert=True)

In [31]:
# Number of ground-truth transactions that the collapse filter removes.
gt_0.transactions().size - np.sum(index_mask)

480845

In [32]:
# Keep only the indexes of transactions whose cluster was not filtered out.
filtered_indexes = full_gt_indexes[index_mask]

In [33]:
# The number of kept indexes must equal the number of True entries in the mask.
assert len(filtered_indexes) == np.sum(index_mask)

In [34]:
# Ground-truth set after removing transactions from clusters with a high collapse share.
gt_1 = blocksci.GroundTruth.create(chain, filtered_indexes, "/home/ubuntu/Data/groundtruth/20210715-1-2out-nobothclustered")

In [35]:
# gt_1 must be smaller than gt_0 by exactly the recorded number of removed transactions.
assert gt_1.transactions().size == gt_0.transactions().size - 480845

In [36]:
# Sanity check: the transaction indexes stored in gt_1 are already in ascending order,
# i.e. sorting them changes nothing.
assert np.array_equal(gt_1.transactions().index, np.sort(gt_1.transactions().index))

In [37]:
# Drop the large intermediates of the collapse filter; kernel memory is not released otherwise.
del (
    cluster_counter,
    filtered_clusters,
    collapsed_clusters,
    collapse_percentage,
    full_gt_indexes,
    full_gt_cluster_ids,
    index_mask,
    filtered_indexes,
)

### Ignore MtGox supercluster

This cluster already has cluster collapse, due to users being able to import their own private keys.

In [39]:
# Index of the cluster containing this address, which the notebook uses to identify the MtGox cluster.
MT_GOX_CLUSTER_ID = my_cm.cluster_with_address(chain.address_from_string("1LNWw6yCxkUmkhArb2Nf2MPw6vG7u5WG7q")).index

In [40]:
# Display the id for the record; it is specific to the clustering loaded above.
MT_GOX_CLUSTER_ID

221

In [41]:
# Indexes of the gt_1 transactions whose cluster id is not the MtGox cluster.
no_mt_gox = gt_1.transactions().where(lambda tx: cluster_id(tx) != MT_GOX_CLUSTER_ID).index

In [42]:
# Number of transactions removed by excluding the MtGox cluster.
gt_1.transactions().size - len(no_mt_gox)

366926

In [43]:
# Ground-truth set after additionally excluding the MtGox cluster.
gt_2 = blocksci.GroundTruth.create(chain, no_mt_gox, "/home/ubuntu/Data/groundtruth/20210715-2-2out-nobothclustered-nomtgox")

In [44]:
# The index array is no longer needed once gt_2 exists.
del no_mt_gox

In [45]:
# gt_2 must be smaller than gt_1 by exactly the recorded number of MtGox transactions.
assert gt_1.transactions().size - gt_2.transactions().size == 366926

### No cluster collapse

Exclude potential pre-existing cluster collapse based on address tags from walletexplorer.com

See notebook `8b` for details and to re-run the web scraping.

In [46]:
# Index of the cluster containing this address, which the notebook uses to identify the
# old LocalBitcoins cluster.
LOCAL_BITCOINS_OLD_CLUSTER_ID = my_cm.cluster_with_address(chain.address_from_string("16B33Afe8hPHLwaoaQu3m3XFTagQLAAAGh")).index

In [47]:
# Display the id for the record; it is specific to the clustering loaded above.
LOCAL_BITCOINS_OLD_CLUSTER_ID

91784202

In [48]:
# Indexes of the gt_2 transactions whose cluster id is not the old LocalBitcoins cluster.
no_cluster_collapse = gt_2.transactions().where(lambda tx: cluster_id(tx) != LOCAL_BITCOINS_OLD_CLUSTER_ID).index

In [49]:
# Number of transactions remaining after this exclusion.
len(no_cluster_collapse)

50424879

In [50]:
# Number of transactions removed by excluding the old LocalBitcoins cluster.
gt_2.transactions().size - len(no_cluster_collapse)

87947

In [51]:
# Ground-truth set after additionally excluding the old LocalBitcoins cluster.
gt_3 = blocksci.GroundTruth.create(chain, no_cluster_collapse, "/home/ubuntu/Data/groundtruth/20210715-3-2out-nobothclustered-nomtgox-nolocalbitcoins")
# Post-creation check, as done for the other ground-truth sets in this notebook:
# the stored set must contain exactly as many transactions as indexes were passed in.
assert gt_3.transactions().size == len(no_cluster_collapse)

In [52]:
# The index array is no longer needed once gt_3 exists.
del no_cluster_collapse

### Separate into fresh and non-fresh change

In [53]:
# Fresh change: for every cluster-member output, the first transaction of its address is this transaction.
fresh_indexes    = gt_3.transactions().where(lambda tx: cluster_member(tx).all(lambda o: o.address.first_tx == o.tx)).index

In [54]:
# Non-fresh change: for every cluster-member output, the first transaction of its address is a different transaction.
nonfresh_indexes = gt_3.transactions().where(lambda tx: cluster_member(tx).all(lambda o: o.address.first_tx != o.tx)).index

In [55]:
# Number of transactions with fresh change.
len(fresh_indexes)

33714510

In [56]:
# Persist the transaction indexes of the fresh-change subset.
np.save("/home/ubuntu/Data/groundtruth/20210715-fresh-indexes.npy", fresh_indexes)

In [57]:
# Every gt_3 transaction has exactly one cluster-member output, so the fresh and non-fresh
# selections must split gt_3 completely; verify that no transaction fell through.
assert len(fresh_indexes) + len(nonfresh_indexes) == gt_3.transactions().size
# Number of transactions with non-fresh change.
len(nonfresh_indexes)

16710369

In [58]:
# Persist the transaction indexes of the non-fresh subset (before the additional filtering below).
np.save("/home/ubuntu/Data/groundtruth/20210715-nonfresh-indexes.npy", nonfresh_indexes)

In [59]:
# Separate ground-truth set holding only the non-fresh transactions; it is the input to filter_txes below.
nonfresh_gt = blocksci.GroundTruth.create(chain, nonfresh_indexes, "/home/ubuntu/Data/groundtruth/20210715-nonfresh")

In [60]:
# The stored set must contain as many transactions as indexes were passed in.
assert len(nonfresh_indexes) == nonfresh_gt.transactions().size

In [61]:
# The stored indexes must already be in ascending order (sorting changes nothing).
assert (nonfresh_gt.transactions().index == np.sort(nonfresh_gt.transactions().index)).all()

In [62]:
# Round-trip check: the stored indexes equal the input indexes element by element
# (equal length was asserted above).
assert (nonfresh_indexes == nonfresh_gt.transactions().index).all()

### Filter non-fresh ground truth

We exclude transactions where the change output was already known through clustering at the time of the transaction.

In [63]:
# Long-running step: about 3.5 hours of wall time in the recorded run below.
# NOTE(review): per the section text, the result should contain only transactions whose change
# was not already known through clustering at the time — confirm against filter_txes itself.
%time filtered_txes_cm = blocksci.cluster.ClusterManager.filter_txes(chain, nonfresh_gt, my_cm)

Linking nested addresses
314031 distinct clusters
99.97% done
CPU times: user 4min 55s, sys: 13min 6s, total: 18min 1s
Wall time: 3h 34min 30s


In [64]:
# Number of non-fresh transactions returned by the filter.
len(filtered_txes_cm)

1542918

In [65]:
# Sizes of the two parts that are merged into the final ground truth below.
len(fresh_indexes), len(filtered_txes_cm)

(33714510, 1542918)

In [66]:
# Transaction indexes of the filtered non-fresh transactions, in ascending order.
filtered_nonfresh_indexes = sorted(tx.index for tx in filtered_txes_cm)

In [67]:
# Must match the number of transactions returned by the filter.
len(filtered_nonfresh_indexes)

1542918

In [68]:
# Final ground truth: all fresh transactions plus the filtered non-fresh ones,
# merged into one ascending list of transaction indexes.
full_gt_indexes = sorted([*fresh_indexes.tolist(), *filtered_nonfresh_indexes])

In [69]:
# Nothing was lost in the merge ...
assert len(full_gt_indexes) == len(fresh_indexes) + len(filtered_nonfresh_indexes)
# ... and no index occurs twice, i.e. the fresh and non-fresh parts do not overlap.
assert len(full_gt_indexes) == len(set(full_gt_indexes))

In [70]:
# Holds by construction (the list was produced by sorted()); kept as a guard should that cell change.
assert full_gt_indexes == sorted(full_gt_indexes)

In [71]:
# Size of the final ground-truth set.
len(full_gt_indexes)

35257428

In [72]:
# Final ground-truth set built from the merged index list.
full_gt = blocksci.GroundTruth.create(chain, full_gt_indexes, "/home/ubuntu/Data/groundtruth/20210715-4-final")

In [73]:
# The stored set must contain as many transactions as indexes were passed in.
assert len(full_gt_indexes) == full_gt.transactions().size

In [74]:
# Round-trip check: the stored indexes equal the merged index list element by element.
# full_gt_indexes is a plain Python list, so np.array_equal is used instead of `==` + .all():
# it returns a plain bool and yields False (rather than raising) if the shapes ever differ.
assert np.array_equal(full_gt_indexes, full_gt.transactions().index)