In [1]:
import blocksci

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [3]:
import collections
import random

In [4]:
import utils

In [5]:
import analysis

In [6]:
import gc

In [7]:
import joblib

In [8]:
from utils import grouper

In [9]:
# Open the blockchain using the configuration returned by utils.blocksci_config().
chain = blocksci.Blockchain(utils.blocksci_config())

In [10]:
# Cluster manager for the clustering returned by utils.latest_clustering();
# the cells below use it as the "before" baseline.
my_cm = blocksci.cluster.ClusterManager(utils.latest_clustering(), chain)

In [11]:
# Transaction set loaded from utils.remaining_txes(); the heuristic below is run on these transactions.
stored_txes = blocksci.GroundTruth(utils.remaining_txes(), chain)

In [12]:
# Date tag (not referenced by any of the cells shown in this section).
datestring = "20210803"

In [13]:
# Date tag embedded in the file names of the prediction file and the refined clustering below.
datestring_new = "20210809"

In [14]:
# Converter used further down to value satoshi amounts at each transaction's block time.
converter = blocksci.CurrencyConverter()

'Exchange rates are provided by CoinDesk (https://www.coindesk.com/price/).'

### Create a clustering with the Global Meiklejohn heuristic

In [15]:
# Short alias for the heuristic. The cells below treat its result as either -1
# (transaction skipped) or an output index that must be 0 or 1.
meiklejohn_heuristic = blocksci.heuristics.change.meiklejohn_global.unique_index

In [None]:
# Apply the heuristic to every stored transaction; %time reports how long the map takes.
%time mj_indizes = stored_txes.transactions().map(lambda tx: meiklejohn_heuristic(tx))

In [18]:
# Collect (tx_index, predicted_output_index) pairs for every transaction where
# the heuristic returned something other than -1.
txes_to_merge = []
timestamps = []  # never filled by this cell

for tx_index, prediction in zip(stored_txes.transactions().index, mj_indizes):
    if prediction != -1:
        # Any real prediction must point at output 0 or output 1.
        assert prediction in (0, 1)
        txes_to_merge.append((tx_index, prediction))

In [19]:
# Number of transactions for which the heuristic produced a prediction.
len(txes_to_merge)

168474519

In [20]:
# Share of all stored transactions that received a prediction.
len(txes_to_merge) / stored_txes.transactions().size

0.5440751086536999

In [21]:
# Persist the (tx_index, prediction) pairs so they can be reloaded without recomputing.
np.save(
    "/home/ubuntu/Data/full-prediction/{}-global-meiklejohn-txes-to-cluster.npy".format(datestring_new),
    txes_to_merge,
)

In [22]:
# Drop the raw per-transaction predictions; the cells shown below only use txes_to_merge.
del mj_indizes

In [None]:
# Deliberate stop: this assertion always fails, so execution halts at this cell.
# No need to regenerate; we may want to keep certain data.
assert False

In [15]:
# Reload the saved (tx_index, prediction) pairs from disk.
txes_to_merge = np.load(
    "/home/ubuntu/Data/full-prediction/{}-global-meiklejohn-txes-to-cluster.npy".format(datestring_new)
)

In [16]:
# Refine the base clustering with the heuristic's predictions and store the
# result under a path tagged with datestring_new.
refined_clustering_mj = blocksci.cluster.ClusterManager.refine_clustering(
    "/home/ubuntu/Data/clusters/{}-meiklejohn-global".format(datestring_new),
    utils.latest_clustering(),
    txes_to_merge,
    chain,
)

Refining clustering at path /home/ubuntu/Data/clusters/20210715-base-clustering.
100.00% done
Successfully stored cluster data at /home/ubuntu/Data/clusters/20210809-meiklejohn-global
Serializing cluster tx counts.
Flattening cluster tx vector.
Tx counts match.
Serializing transaction IDs.
Computing transaction offsets.
Serializing cluster tx offsets.


## Cluster Collapse

In [17]:
# Number of clusters in the base clustering.
my_cm.clusters().size

658522992

In [25]:
# Open the refined clustering from the on-disk path it was stored under.
refined_clustering_mj = blocksci.cluster.ClusterManager("/home/ubuntu/Data/clusters/{}-meiklejohn-global".format(datestring_new), chain)

In [18]:
# Number of clusters in the refined clustering.
refined_clustering_mj.clusters().size

501238058

### Check for Mt. Gox cluster collapse

In [19]:
# Address used to look up the Mt. Gox cluster in both clusterings.
mtgox_address = chain.address_from_string("1LNWw6yCxkUmkhArb2Nf2MPw6vG7u5WG7q")

In [20]:
# Cluster containing the Mt. Gox address in the base clustering, fetched by its index from my_cm.clusters().
mt_gox_before = my_cm.clusters()[my_cm.cluster_with_address(mtgox_address).index]

In [21]:
# (transaction count, type-equivalent size, len()) of the base-clustering Mt. Gox cluster.
mt_gox_before.tx_count, mt_gox_before.type_equiv_size, len(mt_gox_before)

(4161568, 13214890, 13201430)

In [22]:
# Index of that cluster within the base clustering.
mt_gox_before.index

221

In [23]:
# Cluster containing the Mt. Gox address in the refined clustering, fetched the same way.
mt_gox_mj = refined_clustering_mj.clusters()[refined_clustering_mj.cluster_with_address(mtgox_address).index]

In [24]:
# Index of that cluster within the refined clustering.
mt_gox_mj.index

220

In [25]:
# (transaction count, type-equivalent size, len()) of the refined-clustering Mt. Gox cluster.
mt_gox_mj.tx_count, mt_gox_mj.type_equiv_size, len(mt_gox_mj)

(133088283, 448471396, 298436086)

### Check for large clusters (over 10 million type-equivalent addresses)

In [26]:
# Base-clustering clusters whose type-equivalent size exceeds 10 million.
largest_cluster_before = (
    my_cm.clusters()
    .where(lambda candidate: candidate.type_equiv_size > 10000000)
    .to_list()
)

In [27]:
# Refined-clustering clusters whose type-equivalent size exceeds 10 million.
largest_cluster_mj = (
    refined_clustering_mj.clusters()
    .where(lambda candidate: candidate.type_equiv_size > 10000000)
    .to_list()
)

In [28]:
# Print index, type-equivalent size and transaction count of each large base cluster.
for cluster in largest_cluster_before:
    print(cluster.index, cluster.type_equiv_size, cluster.tx_count)
# Drop the loop variable. NOTE: raises NameError if the list is empty and
# `cluster` is not otherwise defined.
del cluster

221 13214890 4161568
34508 34534228 8851845


In [29]:
# Print index, len() and transaction count of each large refined cluster.
# NOTE(review): the middle column is len(cluster), whereas the preceding cell
# prints cluster.type_equiv_size (the quantity the filter is based on), so the
# two printed tables are not directly comparable; confirm which is intended.
for cluster in largest_cluster_mj:
    print(cluster.index, len(cluster), cluster.tx_count)
# Drop the loop variable. NOTE: raises NameError if the list is empty and
# `cluster` is not otherwise defined.
del cluster

220 298436086 133088283


### Full cluster dataset

In [30]:
import yaml

In [31]:
# Parse the tag file located in the working directory.
with open('walletexplorer.yaml') as tags_file:
    cluster_tags = yaml.safe_load(tags_file)

In [32]:
# Keep only the entries stored under the top-level 'tags' key.
# NOTE: this rebinds cluster_tags to its own sub-entry, so re-running this cell
# without first re-running the load cell will presumably fail.
cluster_tags = cluster_tags['tags']

In [33]:
# Map each tag's label to its address (a later duplicate label overwrites an earlier one).
more_clusters = {entry['label']: entry['address'] for entry in cluster_tags}

In [34]:
# Group the tag labels by the index of the refined cluster that contains each tagged address.
new_clusters_mj = collections.defaultdict(list)

for label, address_string in more_clusters.items():
    tagged_address = chain.address_from_string(address_string)
    cluster_index = refined_clustering_mj.cluster_with_address(tagged_address).index
    new_clusters_mj[cluster_index].append(label)

In [35]:
# (number of tagged labels, number of distinct refined clusters they fall into)
len(more_clusters), len(new_clusters_mj)

(273, 97)

In [47]:
# Number of tagged labels that fall into the refined cluster containing the
# Mt. Gox address. The cluster is looked up via mt_gox_mj.index rather than a
# hard-coded index (indexes differ between clusterings), and .get() is used so
# that a missing key does not insert an empty list into the defaultdict.
len(new_clusters_mj.get(mt_gox_mj.index, []))

177

### Differences between RF and Meiklejohn heuristic

In [36]:
# Load the stored pairs for the "RF" side of the comparison; the cells below
# unpack each row as (tx_index, predicted_output_index).
rf_txes = np.load("/home/ubuntu/Data/full-prediction/20210804-refined-txes-to-cluster.npy")

In [38]:
# Transaction indexes that have a Meiklejohn prediction.
meiklejohn_tx_indexes = [tx_index for tx_index, _ in txes_to_merge]

In [39]:
# Transaction indexes that have an RF prediction.
rf_tx_indexes = [tx_index for tx_index, _ in rf_txes]

In [40]:
# Each transaction index may appear only once on the Meiklejohn side.
assert len(meiklejohn_tx_indexes) == len(set(meiklejohn_tx_indexes))

In [41]:
# Each transaction index may appear only once on the RF side.
assert len(rf_tx_indexes) == len(set(rf_tx_indexes))

In [42]:
# (number of RF predictions, number of Meiklejohn predictions)
len(rf_tx_indexes), len(meiklejohn_tx_indexes)

(150865528, 168474519)

In [43]:
# Transaction indexes for which both methods made a prediction.
overlap = set(rf_tx_indexes) & set(meiklejohn_tx_indexes)

In [44]:
# Number of transactions predicted by both methods.
len(overlap)

81055341

In [45]:
# Check that the Meiklejohn transaction indexes are in non-decreasing order;
# the positional comparison of predictions further down relies on this ordering.
all(meiklejohn_tx_indexes[i] <= meiklejohn_tx_indexes[i+1] for i in range(len(meiklejohn_tx_indexes)-1))

True

In [46]:
# Check that the RF transaction indexes are in non-decreasing order;
# the positional comparison of predictions further down relies on this ordering.
all(rf_tx_indexes[i] <= rf_tx_indexes[i+1] for i in range(len(rf_tx_indexes)-1))

True

In [48]:
# Meiklejohn predictions restricted to the overlapping transactions, in file order.
meiklejohn_predict_overlap = np.array(
    [prediction for tx_index, prediction in txes_to_merge if tx_index in overlap]
)

In [49]:
# RF predictions restricted to the overlapping transactions, in file order.
rf_predict_overlap = np.array(
    [prediction for tx_index, prediction in rf_txes if tx_index in overlap]
)

In [50]:
# Both prediction arrays must contain exactly one entry per overlapping
# transaction; otherwise the element-wise comparison below would not line up.
assert len(meiklejohn_predict_overlap) == len(overlap)
assert len(rf_predict_overlap) == len(overlap)

In [51]:
# Number of overlapping transactions where both methods selected the same output index.
(meiklejohn_predict_overlap == rf_predict_overlap).sum()

79171004

In [52]:
# Number of overlapping transactions where the two methods selected different output indexes.
len(overlap) - (meiklejohn_predict_overlap == rf_predict_overlap).sum()

1884337

In [53]:
# Overlapping transaction indexes in ascending order, as an array usable with boolean masks.
overlap_txes = np.array(sorted(overlap))

In [54]:
# Transaction indexes where the predictions differ. This pairs overlap_txes with the
# prediction arrays by position, so all three must be in the same ascending tx-index order.
tx_indexes_with_mismatch = overlap_txes[meiklejohn_predict_overlap != rf_predict_overlap]

In [55]:
# Number of mismatching transactions (should equal the disagreement count above).
len(tx_indexes_with_mismatch)

1884337

In [56]:
# Meiklejohn's output index for each mismatching transaction.
meiklejohn_selection = meiklejohn_predict_overlap[meiklejohn_predict_overlap != rf_predict_overlap]

In [57]:
# RF's output index for each mismatching transaction.
rf_selection = rf_predict_overlap[meiklejohn_predict_overlap != rf_predict_overlap]

In [58]:
# Both selections are taken with the same mask, so their lengths should match.
len(meiklejohn_selection), len(rf_selection)

(1884337, 1884337)

In [59]:
# For every transaction where the methods disagree, record the value of the
# output each method selected, paired with the transaction's block time.
mj_value = []
rf_value = []
for txidx, meiklejohn_out, rf_out in zip(tx_indexes_with_mismatch, meiklejohn_selection, rf_selection):
    tx = chain.tx_with_index(txidx)
    assert meiklejohn_out != rf_out
    block_time = tx.block.time
    mj_value.append((tx.outputs[meiklejohn_out].value, block_time))
    rf_value.append((tx.outputs[rf_out].value, block_time))

In [60]:
# Total value of the Meiklejohn-selected outputs, divided by 1e8.
sum(value for value, _ in mj_value) / 1e8

904159.43239078

In [61]:
# Total value of the RF-selected outputs, divided by 1e8.
sum(value for value, _ in rf_value) / 1e8

3854233.83645997

In [62]:
# Sum of the absolute per-transaction differences between the two selected output values, divided by 1e8.
sum(
    abs(rf_amount - mj_amount)
    for (rf_amount, _), (mj_amount, _) in zip(rf_value, mj_value)
) / 1e8

4126764.50863259

In [63]:
# Meiklejohn-selected output values converted with the block time of each transaction, then summed.
sum(converter.satoshi_to_currency(amount, when) for amount, when in mj_value)

4285667953.7110844

In [64]:
# RF-selected output values converted with the block time of each transaction, then summed.
sum(converter.satoshi_to_currency(amount, when) for amount, when in rf_value)

39714515799.08565

In [65]:
# Absolute per-transaction value differences, converted with each transaction's block time, then summed.
sum(
    converter.satoshi_to_currency(abs(rf_amount - mj_amount), when)
    for (rf_amount, when), (mj_amount, _) in zip(rf_value, mj_value)
)

38715493331.70055

In [66]:
# Sanity check for correct ordering: both value lists must carry the same
# block time at every position.
np.array([x == y for (_, x), (_, y) in zip(rf_value, mj_value)]).all()

True