# Check for cluster collapse

Check our base clustering for pre-existing cluster collapse by comparing our clusters with the wallet labels of WalletExplorer.com.

Note: WalletExplorer.com enforces rate limits on web scraping, and running the script did not work on AWS.

In [1]:
import blocksci

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
import collections
import random

In [3]:
from time import sleep

In [4]:
import requests

In [5]:
import pickle

In [7]:
import utils

In [8]:
# Open the blockchain with the configuration returned by the local `utils` helper.
chain = blocksci.Blockchain(utils.blocksci_config())

In [9]:
# Cluster manager for the clustering returned by `utils.latest_clustering()`;
# this is the clustering that is compared against WalletExplorer.com below.
my_cm = blocksci.cluster.ClusterManager(utils.latest_clustering(), chain)

In [11]:
# Ground-truth data set whose transactions are used below to pick the clusters to inspect.
# NOTE(review): hardcoded absolute path -- only valid on the original machine; consider
# moving it into `utils` or a configurable data directory.
gt = blocksci.GroundTruth("/home/ubuntu/Data/groundtruth/20210715-2-2out-nobothclustered-nomtgox", chain)

In [12]:
# Change heuristic built from our clustering (presumably selects outputs that belong to
# the spending cluster -- TODO confirm). Not referenced in the visible part of this notebook.
change_output = blocksci.heuristics.change.cluster_member(my_cm)

In [13]:
# Heuristic built from our clustering (semantics defined by the BlockSci build in use --
# TODO confirm). Not referenced in the visible part of this notebook.
actual_spend = blocksci.heuristics.change.actual_spend(my_cm)

In [14]:
# Callable applied to each ground-truth transaction below to obtain the id of a cluster
# in `my_cm` (presumably the cluster of the transaction's inputs -- TODO confirm).
cluster_id = blocksci.heuristics.cluster_id(my_cm)

In [15]:
# Alias for BlockSci's address-reuse change heuristic (note: stored without calling it).
# Not referenced in the visible part of this notebook.
address_reuse = blocksci.heuristics.change.address_reuse

## Check for cluster collapse by comparing clustering against walletexplorer.com

In [6]:
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: pickle.load executes arbitrary code; only use it on files written by this notebook.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# False: reuse the sampled addresses / labels cached by an earlier run.
# True:  start from empty containers and sample the clusters again in the cells below.
parse_new = False

# cluster id -> list of sampled address strings
cluster_addresses = {}
# cluster id -> {address string: WalletExplorer label}
cluster_labels = collections.defaultdict(dict)

if not parse_new:
    cluster_addresses = _read_pickle("cluster_addresses_2021.pickle")
    cluster_labels = _read_pickle("cluster_labels_2021.pickle")

In [17]:
if parse_new:
    # Apply the `cluster_id` heuristic to every ground-truth transaction
    # (presumably yielding one cluster id per transaction -- TODO confirm).
    change_clusters = gt.transactions().map(lambda tx: cluster_id(tx))
    # Count how often each value occurs, so the most frequent clusters can be picked below.
    change_cluster_counter = collections.Counter(change_clusters)

In [21]:
if parse_new:
    # Sample up to 100 addresses from each of the 100 most frequent clusters
    # that is not yet part of `cluster_addresses`.
    random.seed(42)  # fixed seed so the sample is reproducible
    for clus_id, _count in change_cluster_counter.most_common(100):
        if clus_id in cluster_addresses:
            continue
        members = my_cm.clusters()[clus_id].addresses.to_list()
        sampled = random.sample(members, min(100, len(members)))
        cluster_addresses[clus_id] = [member.address_string for member in sampled]
        # progress output: cluster id and total number of addresses in the cluster
        print(clus_id, len(members))

91628086 1140
34508 26158495
108873629 1197905
98893639 6
70048 2182838
107201 20
159191 2551617
32721661 50269
61817017 855717
23594133 2096593
97299909 77
13699970 343319
135574604 581328
105794406 314508
98223133 154
109995066 280738
23084303 4883462
20988006 623686
119680253 234008
98290865 3609
50075702 7
12503851 2758037
40523343 263074
23425421 809648
100336032 497982
96265016 259401
42739939 238479
127814957 189776
104400659 665
49550557 997254
99387706 350036
100100593 197164
91901708 372753
90536077 348445
103041771 505919
108227938 268487
324204316 1000
5282699 1239413
349441895 18969
12148954 18912
91784202 912682
188958464 170486
95627541 95994
136802 223835
109970812 96888
17786597 265686
102513996 1639518
281979729 116950
21928741 204948
21357591 133084
347160319 33682
97989378 412
101127446 808170
200307103 29
8093053 143556
17991302 301218
44807457 51
12287598 464137
24513187 186719
15449584 660
15025899 797581
24516747 183933
42090904 120562
951310 496084
24511841 174

### Parse labels from Walletexplorer.com

In [9]:
TIMEOUT = 20  # seconds to wait for a response before the request is abandoned

def get_cluster_from_walletexplorer(address_string):
    """Look up the WalletExplorer.com wallet (cluster) name of an address.

    The search URL ``/?q=<address>`` is requested. For a known address the
    site is expected to redirect to ``/wallet/<name>?from_address=<address>``;
    the wallet name is taken from the path of that final URL.

    Parameters
    ----------
    address_string : str
        Address to look up.

    Returns
    -------
    str or bool
        * the wallet name, if the request ended on a wallet page;
        * ``"<Unknown>"``, if the request was not redirected (address not found);
        * ``False``, if the lookup failed (timeout, any other request
          exception, HTTP 429 or another non-200 status, or a redirect to
          something that is not a wallet page). ``False`` signals the caller
          that the lookup may be retried.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import urlparse

    wallet_prefix = "/wallet/"
    get_url = "https://www.walletexplorer.com/?q={}".format(address_string)

    try:
        r = requests.get(get_url, timeout=TIMEOUT)
    except requests.exceptions.Timeout:
        print("Request for address {} timed out".format(address_string))
        return False
    except Exception as err:
        print("Exception {} occurred for address {}".format(err, address_string))
        return False
    if r.status_code == 429:
        # Rate limited: stay quiet, the caller backs off and retries.
        return False
    if r.status_code != 200:
        print("Error retrieving label for address {}: Status code {}".format(address_string, r.status_code))
        return False
    if r.url == get_url:
        # No redirect happened, i.e. the site has no wallet for this address.
        print("Address {} not found".format(address_string))
        return "<Unknown>"

    # Parse the final URL instead of slicing it with fixed offsets, so that a
    # different scheme/host or additional query parameters cannot corrupt the name.
    path = urlparse(r.url).path
    cluster_name = path[len(wallet_prefix):] if path.startswith(wallet_prefix) else ""
    if not cluster_name:
        print("Unexpected redirect target {} for address {}".format(r.url, address_string))
        return False
    return cluster_name

In [11]:
ADDRESS_COUNT_NORMAL = 25

# Collect up to ADDRESS_COUNT_NORMAL labels per sampled cluster. Labels already stored in
# `cluster_labels` are kept, so re-running this cell resumes an interrupted crawl.
for cid, addresses in cluster_addresses.items():
    cl = cluster_labels[cid]

    for address_string in addresses:
        # enough labels collected for this cluster (up to 100 addresses were sampled per cluster)
        if not len(cl) < ADDRESS_COUNT_NORMAL:
            print("Finished/skipping cluster {}".format(cid))
            break

        # label for this address was retrieved earlier
        if address_string in cl:
            continue

        # at most five attempts; after a failed attempt wait 60 s, 120 s, 180 s, 240 s
        for attempt in range(1, 6):
            label = get_cluster_from_walletexplorer(address_string)
            if label:
                cl[address_string] = label
                print("Cluster label {} for address {} in cluster {}".format(label, address_string, cid))
                # short random pause between successful requests
                sleep(random.randint(0, 10))
                break
            if attempt == 5:
                # give up on this address and pause before moving on to the next one
                print("Unable to retrieve label for address {} in cluster {}".format(address_string, cid))
                sleep(120)
                break
            sleep(60 * attempt)

Cluster label SatoshiDice.com-original for address 1LuLbLZR1TZyk2XtQVXR888FDhdj82qAA2 in cluster 91628086
Cluster label SatoshiDice.com-original for address 18wQeuWhrC7f96QDuyZM4VnGEDQpgmcRnH in cluster 91628086
Cluster label SatoshiDice.com-original for address 18thk7NCdqoGqhd93PsG9hecYdSCWDiB7w in cluster 91628086
Cluster label SatoshiDice.com-original for address 1MXZpCNEFYSX4s6stRfLagND5sdWCLJBUq in cluster 91628086
Cluster label SatoshiDice.com-original for address 162d1ArFmninqV5gWYvCVkqTN6xtdLsxtG in cluster 91628086
Cluster label SatoshiDice.com-original for address 1bank3Uzo8MCfhhRLSZwQnyX1Q8bkAiXg in cluster 91628086
Cluster label SatoshiDice.com-original for address 1KDzgXk7aEv7wax94oE3FmP5tg1pDmnnDF in cluster 91628086
Cluster label SatoshiDice.com-original for address 1Bhz89mv4MXEXFnK3uEaNfHxcr8eXu1CEj in cluster 91628086
Cluster label SatoshiDice.com-original for address 1MQ175hiJHFN1YiXMYerWdE8pTAc5V9KZS in cluster 91628086
Cluster label SatoshiDice.com-original for addr

In [12]:
# Number of clusters that have an entry in `cluster_labels`.
len(cluster_labels)

100

#### Export updated labels / addresses

In [13]:
import pickle

In [14]:
# Persist the collected labels; this is the file read back when `parse_new` is False.
with open("cluster_labels_2021.pickle", "wb") as labels_file:
    pickle.dump(cluster_labels, labels_file)

In [15]:
# Persist the sampled addresses; this is the file read back when `parse_new` is False.
with open("cluster_addresses_2021.pickle", "wb") as addresses_file:
    pickle.dump(cluster_addresses, addresses_file)

### Analyse clusters with more than one label

In [16]:
# A cluster whose sampled addresses carry more than one distinct label is a candidate for
# cluster collapse. NOTE: the placeholder "<Unknown>" (address not found) counts as a label here.
clusters_with_more_than_one_label = []
for clus_id, address_labels in cluster_labels.items():
    label_counts = collections.Counter(address_labels.values())
    unique_count = len(label_counts)
    if unique_count <= 1:
        continue
    clusters_with_more_than_one_label.append(clus_id)
    # cluster id and number of distinct labels, then the labels ordered by frequency
    print(clus_id, unique_count)
    print(label_counts.most_common())
    print()

108873629 2
[('Huobi.com-2', 24), ('4f26173ef5c08d54', 1)]

105794406 2
[('00000e7f41300eb1', 24), ('9a190d275e58fe6d', 1)]

98223133 3
[('00e32d36bec3921d', 23), ('b023bdd19c8b1b4a', 1), ('1969d67a46186f5a', 1)]

23084303 2
[('00000146f9362b0e', 24), ('6996d88024a88118', 1)]

91784202 3
[('LocalBitcoins.com-old', 17), ('AnxPro.com', 7), ('79ad1c33693b26a0', 1)]



In [17]:
# Ids of the clusters whose sampled addresses carry more than one distinct label.
clusters_with_more_than_one_label

[108873629, 105794406, 98223133, 23084303, 91784202]