In [30]:
import blocksci

import sys, os, os.path, socket
import numpy as np
import numpy_indexed as npi
import zarr
import time
import pandas as pd

SYMBOLS = { 
            "BTC": "bitcoin",
            "LTC": "litecoin",
            "DOGE": "dogecoin",
            "BCH": "bitcoin_cash",
            "BSV": "bitcoin_sv",
            "LCH": "litecoin_cash",
            "FTC": "feathercoin",
            "MONA": "monacoin"
          } 

DIR_BCHAIN="/mnt/hdd_data/blockchain_data/"
DIR_PARSED="/mnt/hdd_data/blockchain_parsed/"

    
class AddressMapper():
    def __init__(self, chain):
        self.chain = chain

        self.__address_types = [blocksci.address_type.nonstandard, blocksci.address_type.pubkey,
                                blocksci.address_type.pubkeyhash, blocksci.address_type.multisig_pubkey,
                                blocksci.address_type.scripthash, blocksci.address_type.multisig,
                                blocksci.address_type.nulldata, blocksci.address_type.witness_pubkeyhash,
                                blocksci.address_type.witness_scripthash, blocksci.address_type.witness_unknown]

        self.__counter_addresses = { _:self.chain.address_count(_) for _ in self.__address_types }

        self.__offsets = {}
        offset = 0
        for _ in self.__address_types:
            self.__offsets[_] = offset
            offset += self.__counter_addresses[_]


        self.total_addresses = offset
        print(self.total_addresses)
        print(self.__counter_addresses)

# load everything

In [31]:
# load the chain

currency = "bitcoin"
heur = "heur_1"

cfg_file = f"/mnt/hdd_data/blockchain_parsed/{currency}.cfg" # later
# cfg_file = f"/mnt/hdd_data/blockchain-parsed/bitcoin.cfg" # abacus
t = time.time()
chain = blocksci.Blockchain(cfg_file)
print(f"cfg loading time = {time.time()-t}")

t = time.time()
cm = blocksci.cluster.ClusterManager(f"{DIR_PARSED}/{currency}/{heur}/", chain)
print(f"clusters loading time = {time.time()-t}")

t = time.time()
address_cluster_map = zarr.load(f"{DIR_PARSED}/{currency}/{heur}_data/address_cluster_map.zarr")
print(f"address cluster map loading time = {time.time()-t}")

t = time.time()
df = pd.read_csv(f"{DIR_PARSED}/bitcoin_darknet/ground_truth_id.csv")
print(f"ground truth loading time = {time.time()-t}")

t = time.time()
am = AddressMapper(chain)
print(f"am creation time = {time.time()-t}")


cfg loading time = 0.274371862411499
clusters loading time = 0.00021576881408691406
address cluster map loading time = 8.235018014907837
ground truth loading time = 17.9113986492157
4022303272
{address_type.nonstandard: 831942, address_type.pubkey: 857014675, address_type.pubkeyhash: 857014675, address_type.multisig_pubkey: 857014675, address_type.scripthash: 235784252, address_type.multisig: 74844649, address_type.nulldata: 46999459, address_type.witness_pubkeyhash: 857014675, address_type.witness_scripthash: 235784252, address_type.witness_unknown: 18}
am creation time = 0.00028228759765625


In [3]:
am._AddressMapper__offsets

{address_type.nonstandard: 0,
 address_type.pubkey: 831942,
 address_type.pubkeyhash: 857846617,
 address_type.multisig_pubkey: 1714861292,
 address_type.scripthash: 2571875967,
 address_type.multisig: 2807660219,
 address_type.nulldata: 2882504868,
 address_type.witness_pubkeyhash: 2929504327,
 address_type.witness_scripthash: 3786519002,
 address_type.witness_unknown: 4022303254}

## check
clusters are id from 1 to len(clusters)

In [4]:
len(cm.clusters())

591692716

```address_cluster_map``` goes from address(index) to cluster(cluster number - 1, or python index in clusters)

In [5]:
print(len(address_cluster_map))
print(np.max(address_cluster_map))

4022303272
591692715


```df``` rows have address, controlling entity, entity sector, and personal id(ignore this, it is obsolete)

In [6]:
df.head()

Unnamed: 0,address,entity,sector,id
0,1E421ChpHWEqQ2Kcy2JqEE8eDx7nFN4AXj,777Coin.com,Gambling,34916770.0
1,15uWGv4QFoa7Hjkfzs6YCDVGNhxZ4Yz2wW,777Coin.com,Gambling,26152409.0
2,19s8Jzb6fQ9kLu3Gcb46aX5h9uc93PULgC,777Coin.com,Gambling,29354407.0
3,1LfiL43VHwe2tRNjra5uhVAc4J5nFUay5X,777Coin.com,Gambling,29902318.0
4,1Bh6y3BkJVzodfGfLh9GCuuKPqgwoyRibF,777Coin.com,Gambling,29739794.0


# from adress to cluster


In [7]:
# this block load an adress from block 10000
# identify the corresponding cluster id in address_cluster map 
# and check that the identifier is the same of the internal blocksci identifier

for t in chain.blocks[10000].outputs:
    add = t.address
    num = add.address_num
    typ = add.type
    off = am._AddressMapper__offsets[typ]
    print(add)
    print(add.address_num)
    print(am._AddressMapper__offsets[typ])
    
c_num = address_cluster_map[off + num]-1
print(c_num)

c = cm.cluster_with_address(add)
print(c.index)

PubkeyAddress(1JoiKZz2QRd47ARtcYgvgxC9jhnre9aphv)
10055
831942
77270814
77270814


In [29]:
for t in chain.blocks[100000].txes:
    print('\n')
    for i in t.inputs:
        print(i)
        for o in chain.tx_with_index(i.spent_tx_index).outputs:
            print(o)






TxIn(spent_tx_index=213171, address=PubkeyHashAddress(1BNwxHGaFbeUBitpjy2AsKpJ29Ybxntqvb), value=5000000000)
TxOut(spending_tx_index=216574, address=PubkeyHashAddress(1BNwxHGaFbeUBitpjy2AsKpJ29Ybxntqvb), value=5000000000)


TxIn(spent_tx_index=205760, address=PubkeyHashAddress(15vScfMHNrXN4QvWe54q5hwfVoYwG79CS1), value=300000000)
TxOut(spending_tx_index=402825, address=PubkeyHashAddress(1JHJYGshG8Ds9XXHbXuTrDkf8XAXzNhi5c), value=27000000)
TxOut(spending_tx_index=216575, address=PubkeyHashAddress(15vScfMHNrXN4QvWe54q5hwfVoYwG79CS1), value=300000000)


TxIn(spent_tx_index=215898, address=PubkeyHashAddress(1JxDJCyWNakZ5kECKdCU9Zka6mh34mZ7B2), value=1000000)
TxOut(spending_tx_index=216576, address=PubkeyHashAddress(1JxDJCyWNakZ5kECKdCU9Zka6mh34mZ7B2), value=1000000)


In [8]:
# load a black address from df, identify the corresponding cluster and

add_string = df.address[0]  # black address string
add = chain.address_from_string(add_string)  # load blocksci address object
add_typ = add.type
add_id = add.address_num + am._AddressMapper__offsets[add_typ]  # compute address identifier in the addres_cluster_map

c_num = address_cluster_map[add_id]-1
c = cm.cluster_with_address(add)
c_id = c.index

print(add_string)
print(c_num)
print(c_id)

# print(c.address_count())

1E421ChpHWEqQ2Kcy2JqEE8eDx7nFN4AXj
79703656
87341931


```c_num``` and ```c_id``` differ and *it shouldnt happen*.

In [9]:
print(c.address_count())

5966


In [10]:
print(add in c.addresses.to_list())

True


## what happens if i pass inexisting address

In [9]:
try:
    a = chain.address_from_string('fake')
except:
    a = np.nan
print(a)

nan


In [12]:
def catch(a):
    try:
        return chain.address_from_string(a)
    except:
        return np.nan
    
a = [catch(a) for a in df.address[:10]]
print(a)

[PubkeyHashAddress(1E421ChpHWEqQ2Kcy2JqEE8eDx7nFN4AXj), PubkeyHashAddress(15uWGv4QFoa7Hjkfzs6YCDVGNhxZ4Yz2wW), PubkeyHashAddress(19s8Jzb6fQ9kLu3Gcb46aX5h9uc93PULgC), PubkeyHashAddress(1LfiL43VHwe2tRNjra5uhVAc4J5nFUay5X), PubkeyHashAddress(1Bh6y3BkJVzodfGfLh9GCuuKPqgwoyRibF), PubkeyHashAddress(1DraDSB5BRNtafEsQ1WzBfQ2n84pyD5o49), PubkeyHashAddress(1FJiPTa4q81H9yvPgatfqEfwngCfvQdxZg), PubkeyHashAddress(16JGPQbLXUf8KhFybG9AbMqmoTrok2xaKC), PubkeyHashAddress(1MPpXsEffG6XBueaNhDgazzAFu2s5zGrc6), PubkeyHashAddress(1FruhAC8DEtuYj7tCyBVKb3LqgkziAEyWU)]


In [45]:
c = np.array([1,2,3,4])
i = np.array([1,2, np.nan])
#print(c[i,])
nan_i= i[~np.isnan(i)].astype(int)
print(nan_i.dtype)
print(c[[nan_i]])

int64
[2 3]


  print(c[[nan_i]])


In [48]:
df.loc['id', 0] = np.nan

In [52]:
df.id[~np.isnan(df.id)].astype(int)

1           26152409
2           29354407
3           29902318
4           29739794
5           24795829
              ...   
28027926    32503538
28027927    29594299
28027928    34033368
28027929    26941191
28027930    26178131
Name: id, Length: 23369119, dtype: int64

# who got a black address

## has at least a black address?
here we count if a cluster got at least one black address

In [None]:
# v1
black_clusters = np.zeros(len(cm.clusters()), dtype=bool)
t = time.time()
for a in df.address:
    add = chain.address_from_string(a)
    add_typ = add.type
    add_id = add.address_num + am._AddressMapper__offsets[add_typ]
    c_index = address_cluster_map[add_id]
    black_clusters[c_index] = True
print(f"time = {time.time() - t}")

In [None]:
# v2
black_clusters_2 = np.zeros(len(cm.clusters()), dtype=bool)
c_indices = []
t = time.time()
for a in df.address:
    add = chain.address_from_string(a)
    add_typ = add.type
    c_indices.append(address_cluster_map[add.address_num + am._AddressMapper__offsets[add_typ]])
    
black_clusters_2[c_indices] = True
print(f"time = {time.time() - t}")

In [3]:
# v3
black_clusters_3 = np.zeros(len(cm.clusters()), dtype=bool)
c_indices = []
t = time.time()
c_indices = np.array([address_cluster_map[chain.address_from_string(a).address_num + am._AddressMapper__offsets[chain.address_from_string(a).type]] for a in df.address])
black_clusters_3[c_indices] = True
print(f"time = {time.time() - t}")

KeyboardInterrupt: 

## more elaborate version 3

In [20]:
df["test2"] = df.address[:10].applymap(chain.address_from_string)

AttributeError: 'Series' object has no attribute 'applymap'

In [6]:
bc = np.zeros(len(cm.clusters()), dtype=bool)
t = time.time()
df["num"] = [chain.address_from_string(a) for a in df.address]
print(f"time = {t - time.time()}")

KeyboardInterrupt: 

In [None]:
t = time.time()
df["offset"] = [chain.address_from_string(a).type for a in df.address]
print(f"time = {t - time.time()}")

In [None]:
t = time.time()
df["c_index"] = address_cluster_map[df.num+df.offset]
print(f"time = {t - time.time()}")
bc[c_indices] = True

# Code

In [45]:
a = chain.tx_with_index(10200851)

for inp in a.inputs:
    print(chain.tx_with_index(inp.spent_tx_index))


Tx(len(txins)=1, len(txouts)=2, size_bytes=257, block_height=212504, tx_index=9980900)
