In [3]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Web scraping
import requests
import re
from bs4 import BeautifulSoup 

import gc;

In [None]:
# Run a full garbage-collection pass; as the cell's last expression, the
# integer returned by gc.collect() is displayed as the cell output.
gc.collect()

In [7]:
def load_raw_outputs(path="data/outputs.csv"):
  """Load the raw transaction-outputs table from a header-less CSV file.

  Parameters
  ----------
  path : str or path-like, optional
      Location of the CSV file. Defaults to ``"data/outputs.csv"``, the
      location that used to be hard-coded.

  Returns
  -------
  pandas.DataFrame
      Columns ``addressID``, ``amount`` and ``scriptType``, indexed by the
      ``(txID, position)`` pair.
  """
  outputs_cname = ["txID", "position", "addressID", "amount", "scriptType"]

  outputs_dtype_spec = {
    "txID": "uint32",
    "position": "uint32",
    "addressID": "uint32",
    # NOTE(review): "uint32" tops out at 2**32 - 1; if `amount` can be
    # larger than that in the data, widen this to "uint64" -- TODO confirm.
    "amount": "uint32",
    "scriptType": "uint8",
  }

  return pd.read_csv(
    path,
    header=None,  # the file has no header row; names are supplied below
    names=outputs_cname,
    dtype=outputs_dtype_spec,
    index_col=["txID", "position"],
  )

def load_raw_transactions(path="data/transactions.csv"):
  """Load the raw transactions table from a header-less CSV file.

  Parameters
  ----------
  path : str or path-like, optional
      Location of the CSV file. Defaults to ``"data/transactions.csv"``,
      the location that used to be hard-coded.

  Returns
  -------
  pandas.DataFrame
      Columns ``timestamp``, ``blockID``, ``isCoinbase`` and ``fee``,
      indexed by ``txID``.
  """
  trans_cname = ["timestamp", "blockID", "txID", "isCoinbase", "fee"]
  trans_dtype_spec = {
    "timestamp": "uint32",
    "blockID": "uint32",
    "txID": "uint32",
    "isCoinbase": "bool",
    "fee": "uint32",
  }

  return pd.read_csv(
    path,
    header=None,  # the file has no header row; names are supplied below
    names=trans_cname,
    dtype=trans_dtype_spec,
    index_col="txID",
    engine='c',
  )

def load_raw_addresses(path="data/mapAddr2Ids.csv"):
  """Load the address <-> addressID mapping from a header-less CSV file.

  Parameters
  ----------
  path : str or path-like, optional
      Location of the CSV file. Defaults to ``"data/mapAddr2Ids.csv"``,
      the location that used to be hard-coded.

  Returns
  -------
  pandas.DataFrame
      A single ``address`` column (pandas ``string`` dtype), indexed by
      ``addressID``.
  """
  addr_map_cname = ["address", "addressID"]

  addr_map_dtype_spec = {
    "address": "string",
    # NOTE(review): load_raw_outputs reads addressID as "uint32" while this
    # loader uses "int32"; presumably both hold the same IDs -- confirm and
    # align the two if no negative IDs exist.
    "addressID": "int32",
  }

  return pd.read_csv(
    path,
    header=None,  # the file has no header row; names are supplied below
    names=addr_map_cname,
    dtype=addr_map_dtype_spec,
    index_col="addressID",
  )

In [8]:
# Read the outputs table from disk; it is indexed by (txID, position).
outputs = load_raw_outputs()

In [9]:
# Read the address mapping (indexed by addressID) and the transactions
# table (indexed by txID) from disk.
addresses = load_raw_addresses()
transactions = load_raw_transactions()

In [10]:
# Keep only the rows whose isCoinbase flag is set.
transactions = transactions.loc[transactions['isCoinbase']]

In [11]:
# Keep only the outputs whose txID is still present in `transactions`.
outputs = outputs.loc[
    outputs.index.get_level_values('txID').isin(transactions.index)
]

In [None]:
# Keep only the addresses that are referenced by the remaining outputs.
addresses = addresses.loc[addresses.index.isin(outputs['addressID'])]

In [None]:
# Root URL of the site that the scraping helpers below send requests to.
BASE_URL="http://www.walletexplorer.com"
# Wallet names whose address lists are fetched one by one via
# add_wallet_address (spaces are stripped when the URL is built).
KNOWN_WALLETS = ["DeepBit", "Eligius", "BTC Guild", "BitMinter"]

In [None]:
def get_page_number(soup, res): 
    """Return the total page count advertised by a paginated listing.

    Looks up the ``<div class="paging">`` element in ``soup`` and parses the
    second number out of its "Page X / Y" text.

    Parameters
    ----------
    soup : parsed document exposing ``find(tag, class_=...)``.
    res : object printed for debugging when the paging element is missing
        (callers in this notebook pass the HTTP response).

    Raises
    ------
    AssertionError
        If the paging element is absent or its text does not match the
        expected "Page X / Y" pattern.
    """
    pager = soup.find('div', class_='paging')
    if not pager:
        # Dump the caller-supplied object before the assertion below fires.
        print(res)
    assert pager

    found = re.search(r'Page (\d+) \/ (\d+)', pager.text.strip())
    assert found

    _current, total_pages = found.groups()
    return int(total_pages)
    
def add_wallet_address(name, addrMap, sleep=1, timeout=30):
    """Scrape every address listed for a wallet and record it in ``addrMap``.

    Walks the paginated ``{BASE_URL}/wallet/<name>/addresses`` listing and,
    for each ``/address/...`` link found, stores ``addrMap[address] = name``.

    Parameters
    ----------
    name : str
        Wallet name; spaces are removed when building the URL, but the
        original ``name`` is what gets stored in ``addrMap``.
    addrMap : dict
        Mutated in place: address string -> wallet name.
    sleep : int or float, optional
        Seconds to pause after each page (passed to ``time.sleep``).
    timeout : int or float, optional
        Per-request timeout in seconds, so a stalled connection raises
        instead of blocking the notebook forever.

    Raises
    ------
    AssertionError
        If a page has no paging element or contains no address links.
    """
    address_prefix = '/address/'
    curr_page = 0
    page_number = 1  # provisional; replaced by the real count after page 1

    formatted_name = name.replace(" ", "")
    url = f"{BASE_URL}/wallet/{formatted_name}/addresses"
    print(f"Loading wallet: {url}")
    while curr_page < page_number: 
        curr_page += 1
        
        params = {'page': curr_page}
        response = requests.get(url, params=params, timeout=timeout)
        
        # If the request ended up on a different location, adopt it as the
        # new base URL. Only the part before "?" is kept: storing the full
        # response.url (query string included) would make every later request
        # append another `page=` parameter and re-trigger this branch.
        final_url = response.url.split("?")[0]
        if final_url != url:
            url = final_url
            response = requests.get(url, params=params, timeout=timeout)

        soup = BeautifulSoup(response.content, 'html.parser')

        if curr_page == 1:
            page_number = get_page_number(soup, response)
        
        links = soup.find_all('a', href=lambda href: href and href.startswith(address_prefix))
        assert(links and len(links) > 0)
    
        for link in links: 
            # Drop the leading "/address/" to keep just the address itself.
            addr = link['href'][len(address_prefix):]
            addrMap[addr]=name
            
        print(f"Found {len(links)} links for {name} page: {curr_page}/{page_number}")
        time.sleep(sleep)

In [None]:
# Address string -> wallet name; populated by the add_wallet_address calls.
addrMap = {}

In [None]:
# Fetch all addresses of the first known wallet ("DeepBit").
add_wallet_address(KNOWN_WALLETS[0], addrMap)

In [None]:
# Fetch all addresses of the second known wallet ("Eligius").
add_wallet_address(KNOWN_WALLETS[1], addrMap)

In [None]:
# Fetch all addresses of the third known wallet ("BTC Guild"); the extra
# argument raises the pause between page requests to 3 seconds.
add_wallet_address(KNOWN_WALLETS[2], addrMap, 3)

In [None]:
# Fetch all addresses of the fourth known wallet ("BitMinter").
add_wallet_address(KNOWN_WALLETS[3], addrMap)

In [None]:
# Work on an independent copy: a plain assignment would only alias the
# filtered `addresses` frame, so adding the 'wallet' column later would also
# modify `addresses` and trigger pandas' SettingWithCopyWarning.
coinbase_addresses = addresses.copy()

In [None]:
# Label each address with the wallet name recorded in addrMap; addresses that
# are not a key of addrMap end up with a missing value.
coinbase_addresses['wallet'] = coinbase_addresses['address'].map(addrMap)

In [None]:
# Report how many addresses received a wallet label out of the total.
size = len(coinbase_addresses)
not_anon_size = int(coinbase_addresses['wallet'].notna().sum())

print(f"Deanonimized {not_anon_size}/{size}")

In [None]:
# Rank the still-unlabelled addresses by how many outputs they received.
tx_per_addr = (
    outputs.reset_index()[['amount', 'addressID']]
    .groupby('addressID')
    .count()                                  # 'amount' now holds the number of outputs per addressID
    .join(coinbase_addresses)                 # attach 'address' and 'wallet' via the addressID index
    .loc[lambda frame: frame['wallet'].isna()]  # keep addresses without a wallet label
    .sort_values('amount', ascending=False)
)

In [None]:
def get_addr_wallet(addr):
    """Return the wallet identifier shown on an address page.

    Requests ``{BASE_URL}/address/<addr>``, finds the first ``/wallet/...``
    link inside the ``<div class="walletnote">`` element and returns the
    part of that link after the ``/wallet/`` prefix.

    Raises
    ------
    AssertionError
        If the note element or the wallet link is missing, or if the
        extracted identifier is empty.
    """
    def _is_wallet_href(href):
        return href and href.startswith('/wallet/')

    page = requests.get(f"{BASE_URL}/address/{addr}")
    document = BeautifulSoup(page.content, 'html.parser')

    note = document.find('div', class_='walletnote')
    assert note

    anchor = note.find('a', href=_is_wallet_href)
    assert anchor

    # Skip the 8-character "/wallet/" prefix of the href.
    wallet = anchor['href'][8:]
    assert len(wallet) > 0

    return wallet

In [None]:
found_wallets = []
N_TOP = 4

prev_len = len(addrMap)

# Walk the unlabelled addresses from most to least frequent and resolve each
# one to its wallet until N_TOP distinct wallets have been scraped.
# Iterating over the frame itself (rather than a manual index that was not
# advanced on `continue`) guarantees progress on duplicate wallets and stops
# cleanly when the candidates run out.
# NOTE(review): the original read from `top_4_solo_miners`, which is not
# defined in the visible notebook; `tx_per_addr` (used by the original loop
# bound) is presumably the intended frame -- confirm.
for candidate_addr in tx_per_addr['address'].dropna():
    if len(found_wallets) >= N_TOP:
        break

    wallet = get_addr_wallet(candidate_addr)
    if wallet in found_wallets:
        # Another address of a wallet that was already scraped.
        continue

    add_wallet_address(wallet, addrMap)
    found_wallets.append(wallet)

assert len(found_wallets) == N_TOP, "Unexpeced error..."

print(f"Found {len(addrMap) - prev_len} new addresses")

In [None]:
# Re-apply the (now larger) addrMap; addresses that are still not a key of
# addrMap are given the placeholder label 'Others'.
coinbase_addresses['wallet'] = coinbase_addresses['address'].map(addrMap).fillna('Others')

In [None]:
not_anon_size_old = not_anon_size

# Unlabelled addresses carry the 'Others' placeholder at this point, so an
# isna() test would count every row as labelled. Count the rows whose label is
# a real wallet instead (missing values are treated like 'Others').
not_anon_size = int(coinbase_addresses['wallet'].fillna('Others').ne('Others').sum())
size = len(coinbase_addresses)

print(f"Deanonimized {not_anon_size}/{size}, prev: {not_anon_size_old}/{size}")

In [None]:
# Display the labelled address table as the cell output.
coinbase_addresses

In [None]:
# Persist the labelled addresses (index included) to disk.
coinbase_addresses.to_csv('data/deanonaddr.csv')

In [4]:
# Reload the labelled addresses from the CSV written by the to_csv cell,
# restoring addressID as the index.
# NOTE(review): presumably this lets the scraping cells be skipped on later
# runs -- confirm the intended execution order.
coinbase_addresses = pd.read_csv("data/deanonaddr.csv", index_col=['addressID'])


In [13]:
# Show the timestamp column of `transactions`; the double brackets keep the
# result a DataFrame rather than a Series.
transactions[['timestamp']]

Unnamed: 0_level_0,timestamp
txID,Unnamed: 1_level_1
0,1231006505
1,1231469665
2,1231469744
3,1231470173
4,1231470988
...,...
10571747,1356996072
10572165,1356996754
10572424,1356997908
10572425,1356997591
