In [1]:
import blocksci

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [3]:
import collections
import random

In [4]:
import utils

In [5]:
import analysis

In [6]:
import gc

In [7]:
import joblib

In [8]:
from utils import grouper

In [9]:
# Open the blockchain with the configuration returned by the local `utils` helper.
chain = blocksci.Blockchain(utils.blocksci_config())

In [10]:
# Baseline clustering; the refined Meiklejohn clustering below is compared against it.
my_cm = blocksci.cluster.ClusterManager(utils.latest_clustering(), chain)

In [11]:
# Transaction set the change heuristic is applied to (via stored_txes.transactions()).
stored_txes = blocksci.GroundTruth(utils.remaining_txes(), chain)

In [12]:
# Date tag; not referenced by any of the cells visible in this section.
datestring = "20210803"

In [13]:
# Date tag embedded in the file names of the Meiklejohn prediction file and clustering.
datestring_new = "20210809"

In [14]:
# Used at the end of the notebook to convert satoshi amounts at a transaction's block time.
# NOTE(review): the target currency is not visible here (presumably USD) — confirm.
converter = blocksci.CurrencyConverter()

'Exchange rates are provided by CoinDesk (https://www.coindesk.com/price/).'

### Create a clustering with the Meiklejohn heuristic

In [None]:
# Deliberate stop so that "Run All" halts here.
# NOTE(review): presumably guards the expensive prediction cells below from being re-run — confirm.
assert False

In [18]:
# Change heuristic under test; the filter loop below expects results of -1, 0 or 1.
meiklejohn_heuristic = blocksci.heuristics.change.meiklejohn.unique_index

In [19]:
# One heuristic result per stored transaction, in the order of stored_txes.transactions().
mj_indizes = stored_txes.transactions().map(lambda tx: meiklejohn_heuristic(tx))

In [20]:
# Collect (transaction index, predicted output index) pairs for every transaction
# where the heuristic returned a prediction.
txes_to_merge = []
timestamps = []  # never filled in this cell

for tx_index, prediction in zip(stored_txes.transactions().index, mj_indizes):
    # -1 entries are dropped (presumably "no unique change output found" — confirm).
    if prediction != -1:
        # Every prediction that is kept must point at output 0 or 1.
        assert prediction in (0, 1)
        txes_to_merge.append((tx_index, prediction))

In [22]:
# Number of transactions for which the heuristic produced a prediction.
len(txes_to_merge)

147600208

In [23]:
# Persist the predictions; the comparison section further down reloads this file.
np.save(
    "/home/ubuntu/Data/full-prediction/{}-local-meiklejohn-txes-to-cluster.npy".format(datestring_new),
    txes_to_merge,
)

In [24]:
# Share of the stored transactions for which the heuristic produced a prediction.
len(txes_to_merge) / stored_txes.transactions().size

0.47666317542600434

In [None]:
# Deliberate stop so that "Run All" halts here: the refined clustering below does not need
# to be regenerated, and previously stored data may need to be kept.
assert False

In [25]:
# Refine the latest clustering with the Meiklejohn predictions and store the result
# under a path tagged with datestring_new.
refined_clustering_mj = blocksci.cluster.ClusterManager.refine_clustering(
    "/home/ubuntu/Data/clusters/{}-meiklejohn".format(datestring_new),
    utils.latest_clustering(),
    txes_to_merge,
    chain,
)

Refining clustering at path /home/ubuntu/Data/clusters/20210715-base-clustering.
100.00% done
Successfully stored cluster data at /home/ubuntu/Data/clusters/20210809-meiklejohn
Serializing cluster tx counts.
Flattening cluster tx vector.
Tx counts match.
Serializing transaction IDs.
Computing transaction offsets.
Serializing cluster tx offsets.


In [26]:
# Drop the in-memory prediction list; it is reloaded from the saved .npy file later on.
del txes_to_merge

In [27]:
# Drop the raw heuristic results; they are not used again in the cells visible here.
del mj_indizes

## Cluster Collapse

In [16]:
# Open the refined clustering from the path it was stored at, so the cells below can run
# without repeating the refinement step.
refined_clustering_mj = blocksci.cluster.ClusterManager("/home/ubuntu/Data/clusters/{}-meiklejohn".format(datestring_new), chain)

In [28]:
# Number of clusters in the baseline clustering.
my_cm.clusters().size

658522992

In [29]:
# Number of clusters after refining with the Meiklejohn heuristic.
refined_clustering_mj.clusters().size

520462579

### Check for Mt. Gox cluster collapse

In [30]:
# Address used to locate the Mt. Gox cluster in both clusterings.
mtgox_address = chain.address_from_string("1LNWw6yCxkUmkhArb2Nf2MPw6vG7u5WG7q")

In [31]:
# Cluster containing the Mt. Gox address in the baseline clustering, fetched by its index.
mt_gox_before = my_cm.clusters()[my_cm.cluster_with_address(mtgox_address).index]

In [32]:
# Size figures of the baseline cluster, for comparison with the refined cluster below.
mt_gox_before.tx_count, mt_gox_before.type_equiv_size, len(mt_gox_before)

(4161568, 13214890, 13201430)

In [33]:
# Index of the cluster within the baseline clustering.
mt_gox_before.index

221

In [34]:
# Cluster containing the same Mt. Gox address in the refined clustering.
mt_gox_mj = refined_clustering_mj.clusters()[refined_clustering_mj.cluster_with_address(mtgox_address).index]

In [35]:
# Index of the cluster within the refined clustering.
mt_gox_mj.index

213

In [36]:
# Same size figures as for the baseline cluster, now for the refined cluster.
mt_gox_mj.tx_count, mt_gox_mj.type_equiv_size, len(mt_gox_mj)

(124794883, 426967378, 281330288)

### Full cluster dataset

In [37]:
import yaml

In [38]:
# Read the tag file. The encoding is given explicitly so decoding does not depend on the
# machine's locale; safe_load is used, so the file cannot construct arbitrary objects.
with open('walletexplorer.yaml', encoding='utf-8') as f:
    cluster_tags = yaml.safe_load(f)

In [39]:
# Unwrap the entries stored under the top-level 'tags' key.
# The isinstance guard keeps this cell idempotent: once unwrapped, cluster_tags is no longer
# a mapping, and indexing it with 'tags' a second time would raise a TypeError.
if isinstance(cluster_tags, dict):
    cluster_tags = cluster_tags['tags']

In [40]:
# Map each tag label to its address; a label that occurs more than once keeps the
# address of its last entry.
more_clusters = {tag['label']: tag['address'] for tag in cluster_tags}

In [41]:
# Group the tag labels by the refined cluster their address belongs to.
# Several labels under one cluster index means those entities were merged.
new_clusters_mj = collections.defaultdict(list)

for label, address_string in more_clusters.items():
    tagged_cluster = refined_clustering_mj.cluster_with_address(
        chain.address_from_string(address_string)
    )
    new_clusters_mj[tagged_cluster.index].append(label)

In [42]:
# Number of tagged entities vs. number of distinct refined clusters they fall into.
len(more_clusters), len(new_clusters_mj)

(273, 108)

In [44]:
# Number of tagged entities that ended up in the refined Mt. Gox cluster.
# The index is taken from mt_gox_mj rather than hard-coded, so the lookup stays correct
# if the clustering is regenerated and cluster indexes change.
len(new_clusters_mj[mt_gox_mj.index])

166

### Differences between RF and Meiklejohn heuristic

In [45]:
# Predictions from the other ("RF") approach, loaded from a file with a fixed date tag.
# NOTE(review): the cells below unpack each row as (tx index, prediction), the same layout
# as the Meiklejohn file — confirm against the notebook that wrote this file.
rf_txes = np.load("/home/ubuntu/Data/full-prediction/20210804-refined-txes-to-cluster.npy")

In [47]:
# Reload the Meiklejohn predictions saved earlier. From here on txes_to_merge is a
# NumPy array rather than the list of tuples built above.
txes_to_merge = np.load(
    "/home/ubuntu/Data/full-prediction/{}-local-meiklejohn-txes-to-cluster.npy".format(datestring_new)
)

In [48]:
# Transaction indexes (first column of the loaded prediction array) as a plain list.
# Slicing the column replaces a Python-level loop that unpacked every row individually.
meiklejohn_tx_indexes = txes_to_merge[:, 0].tolist()

In [49]:
# Transaction indexes (first column of the RF prediction array) as a plain list.
# Slicing the column replaces a Python-level loop that unpacked every row individually.
# Assumes rf_txes is a 2-D array of (tx index, prediction) rows, as the row unpacking
# elsewhere in this notebook implies.
rf_tx_indexes = rf_txes[:, 0].tolist()

In [50]:
# Each transaction may appear only once; the positional alignment of predictions further
# down relies on it.
assert len(meiklejohn_tx_indexes) == len(set(meiklejohn_tx_indexes))

In [51]:
# Each transaction may appear only once; the positional alignment of predictions further
# down relies on it.
assert len(rf_tx_indexes) == len(set(rf_tx_indexes))

In [52]:
# Number of transactions with a prediction: RF vs. Meiklejohn.
len(rf_tx_indexes), len(meiklejohn_tx_indexes)

(150865528, 147600208)

In [53]:
# Transactions for which both approaches produced a prediction.
overlap = set(rf_tx_indexes) & set(meiklejohn_tx_indexes)

In [54]:
# Number of transactions predicted by both approaches.
len(overlap)

71185366

In [55]:
# check is sorted (ascending, ties allowed); the positional alignment below depends on it.
# One vectorised comparison of neighbouring elements replaces a Python loop over every index.
meiklejohn_sorted_check = np.asarray(meiklejohn_tx_indexes)
assert np.all(meiklejohn_sorted_check[:-1] <= meiklejohn_sorted_check[1:])
del meiklejohn_sorted_check

In [56]:
# check is sorted (ascending, ties allowed); the positional alignment below depends on it.
# One vectorised comparison of neighbouring elements replaces a Python loop over every index.
rf_sorted_check = np.asarray(rf_tx_indexes)
assert np.all(rf_sorted_check[:-1] <= rf_sorted_check[1:])
del rf_sorted_check

In [57]:
# Meiklejohn predictions restricted to the shared transactions, in file order.
meiklejohn_predict_overlap = np.array(
    [prediction for tx_index, prediction in txes_to_merge if tx_index in overlap]
)

In [58]:
# RF predictions restricted to the shared transactions, in file order.
rf_predict_overlap = np.array(
    [prediction for tx_index, prediction in rf_txes if tx_index in overlap]
)

In [59]:
# Exactly one Meiklejohn prediction must have been picked per shared transaction.
assert len(meiklejohn_predict_overlap) == len(overlap)

In [60]:
# Number of shared transactions where both approaches predict the same output index.
(meiklejohn_predict_overlap == rf_predict_overlap).sum()

69851984

In [61]:
# Number of shared transactions where the two predictions differ.
len(overlap) - (meiklejohn_predict_overlap == rf_predict_overlap).sum()

1333382

In [62]:
# Shared transaction indexes in ascending order, so positions line up with the
# (sorted) prediction arrays.
overlap_txes = np.array(sorted(overlap))

In [63]:
# Indexes of the shared transactions where the two predictions differ.
tx_indexes_with_mismatch = overlap_txes[meiklejohn_predict_overlap != rf_predict_overlap]

In [64]:
# Should equal the disagreement count computed above.
len(tx_indexes_with_mismatch)

1333382

In [65]:
# Meiklejohn's predicted output index for each mismatching transaction.
meiklejohn_selection = meiklejohn_predict_overlap[meiklejohn_predict_overlap != rf_predict_overlap]

In [66]:
# RF's predicted output index for each mismatching transaction.
rf_selection = rf_predict_overlap[meiklejohn_predict_overlap != rf_predict_overlap]

In [67]:
# Both selections use the same mask, so their lengths must match.
len(meiklejohn_selection), len(rf_selection)

(1333382, 1333382)

In [68]:
# For every mismatching transaction, record the value of the output each approach picked,
# paired with the time of the block containing the transaction.
mj_value = []
rf_value = []
mismatches = zip(tx_indexes_with_mismatch, meiklejohn_selection, rf_selection)
for tx_index, mj_output, rf_output in mismatches:
    tx = chain.tx_with_index(tx_index)
    # By construction only disagreeing predictions are in the selection.
    assert mj_output != rf_output
    block_time = tx.block.time
    mj_value.append((tx.outputs[mj_output].value, block_time))
    rf_value.append((tx.outputs[rf_output].value, block_time))

In [69]:
# output value of mj heuristic (sum of the selected outputs, divided by 1e8)
sum(value for value, _ in mj_value) / 1e8

486606.03163605

In [70]:
# output value of rf (sum of the selected outputs, divided by 1e8)
sum(value for value, _ in rf_value) / 1e8

1842926.98327128

In [71]:
# total difference in value between outputs (absolute, per transaction, divided by 1e8)
sum(abs(rf_amount - mj_amount) for (rf_amount, _), (mj_amount, _) in zip(rf_value, mj_value)) / 1e8

2060535.19556347

In [72]:
# Meiklejohn-selected output values, each converted at its transaction's block time.
sum(converter.satoshi_to_currency(amount, block_time) for amount, block_time in mj_value)

1888569530.9701493

In [73]:
# RF-selected output values, each converted at its transaction's block time.
sum(converter.satoshi_to_currency(amount, block_time) for amount, block_time in rf_value)

16266170788.681551

In [74]:
# Absolute per-transaction value difference, converted at the transaction's block time.
sum(
    converter.satoshi_to_currency(abs(rf_amount - mj_amount), block_time)
    for (rf_amount, block_time), (mj_amount, _) in zip(rf_value, mj_value)
)

16579214619.855774

In [75]:
# sanity check for correct ordering: paired entries must carry the same block time
np.array(
    [rf_time == mj_time for (_, rf_time), (_, mj_time) in zip(rf_value, mj_value)]
).all()

True