In [1]:
import numpy as np
import pandas as pd
import json

from get_data import *
from get_financial_data import *

In [104]:
def diff_estimate(hash):
    """
     Count the amounts of 0s at the start of a hash to put an upper-bound on the difficulty
     Difficulty reevaluation is every 20160 minutes, so theoretically about 2016 blocks
     Hence, taking the min value of this over 2016 consecutive blocks can gives a reasonable difficulty estimate
     From this we can then reverse-engineer the approximate hashpower of the network 
     (TODO, will do so later)
    """
    return len(hash)-len(hash.lstrip('0'))

def sanitize_height(from_height, to_height):
    """ CQFD """
    if from_height > to_height: from_height, to_height = to_height, from_height
    return max(from_height, 0), min(to_height, getBlockLast()['height'])

def get_value(tx_out):
    """
     Get the total value of a transaction by counting what every output UTXO of the transaction recieves
    """
    return sum([out['value'] for out in tx_out])

def gather_statistics(from_height, to_height, interests, tx_interests):
    """
     gather block information for blocks ranging from from_height to to_height
     interests: statistics gathered (NOT resiliant to change, check code)
     tx_interests: statistics gathered for each transaction (NOT resiliant to change, check code)
     OUTPUT: Pandas DataFrame with data
        columns: the interests then count, mean, std, min, max for each tx_interests
        rows: 1 per block

     Statistics gathered can then be cross examined for possible correlation
     Time taken: 5 blocks per second, seems to be linear
        Limited by web requests (parallelise requests maybe? might hit server limits)
    """
    from_height, to_height = sanitize_height(from_height, to_height)
    interests = interests.split(' ')
    tx_interests = tx_interests.split(' ')
    interests = interests + [a+'_'+b for a in tx_interests for b in 'count mean std min max'.split(' ')]
    block_data = pd.DataFrame(columns=interests, index=range(from_height, to_height))
    #block_data = np.zeros((to_height-from_height, len(interests)))

    for height in range(from_height, to_height):
        block = getBlockN(height)
        
        hash = int(block['hash'], 16)
        difficulty = diff_estimate(block['hash'])

        # interests for the transactions: vin_sz vout_sz fee lock_time value
        transac_data = np.zeros((block['n_tx'], len(tx_interests)))
        for i, tx in enumerate(block['tx']):
            tx_data = [tx[param] for param in tx_interests[:4]]
            value = get_value(tx['out'])
            transac_data[i, :] = tx_data + [value]

        transac_data = pd.DataFrame(transac_data).describe(percentiles=[]).drop(index='50%')
        transac_data = list(transac_data.to_numpy().flatten())
        
        data = [hash] + [block[i] for i in interests[1:7]] + [difficulty] + transac_data
        block_data.loc[height, :] = data
        #block_data[height-from_height, :] = data

    # TODO add time taken to validate block (next_block["time"] - block["time"])
    # look at spread around 10 minutes

    return block_data

In [106]:
from_height, to_height = 0, 100
interests = 'hash ver time bits fee nonce n_tx difficulty'
tx_interests = 'vin_sz vout_sz fee lock_time value'

block_data = gather_statistics(from_height, to_height, interests, tx_interests)

print('Done')
    

Done


In [103]:
print(block_data.columns)
block_data

Index(['hash', 'ver', 'time', 'bits', 'fee', 'nonce', 'n_tx', 'difficulty',
       'vin_sz_count', 'vin_sz_mean', 'vin_sz_std', 'vin_sz_min', 'vin_sz_max',
       'vout_sz_count', 'vout_sz_mean', 'vout_sz_std', 'vout_sz_min',
       'vout_sz_max', 'fee_count', 'fee_mean', 'fee_std', 'fee_min', 'fee_max',
       'lock_time_count', 'lock_time_mean', 'lock_time_std', 'lock_time_min',
       'lock_time_max', 'value_count', 'value_mean', 'value_std', 'value_min',
       'value_max'],
      dtype='object')


Unnamed: 0,hash,ver,time,bits,fee,nonce,n_tx,difficulty,vin_sz_count,vin_sz_mean,...,lock_time_count,lock_time_mean,lock_time_std,lock_time_min,lock_time_max,value_count,value_mean,value_std,value_min,value_max
0,1062894486921856208405014351944454958038946459...,1,1231006505,486604799,0,2083236893,1,10,1.0,1.0,...,1.0,1.0,0.0,0.0,5000000000.0,1.0,1.0,0.0,0.0,5000000000.0
1,1385949097536080555105847587592198316001649390...,1,1231469665,486604799,0,2573394689,1,8,1.0,1.0,...,1.0,1.0,0.0,0.0,5000000000.0,1.0,1.0,0.0,0.0,5000000000.0
2,1120357047773602012812174058398316757397361004...,1,1231469744,486604799,0,1639830024,1,8,1.0,1.0,...,1.0,1.0,0.0,0.0,5000000000.0,1.0,1.0,0.0,0.0,5000000000.0
3,1376505914199784814833151514217235786942951357...,1,1231470173,486604799,0,1844305925,1,8,1.0,1.0,...,1.0,1.0,0.0,0.0,5000000000.0,1.0,1.0,0.0,0.0,5000000000.0
4,8291227170897828765976220166323524636529507549...,1,1231470988,486604799,0,2850094635,1,8,1.0,1.0,...,1.0,1.0,0.0,0.0,5000000000.0,1.0,1.0,0.0,0.0,5000000000.0
5,1637045987831499090910064889430409484403745466...,1,1231471428,486604799,0,2011431709,1,8,1.0,1.0,...,1.0,1.0,0.0,0.0,5000000000.0,1.0,1.0,0.0,0.0,5000000000.0
6,5075405992369223818941460045852607097887587139...,1,1231471789,486604799,0,2538380312,1,8,1.0,1.0,...,1.0,1.0,0.0,0.0,5000000000.0,1.0,1.0,0.0,0.0,5000000000.0
7,1196216919938235038677394048271033088508101150...,1,1231472369,486604799,0,2258412857,1,8,1.0,1.0,...,1.0,1.0,0.0,0.0,5000000000.0,1.0,1.0,0.0,0.0,5000000000.0
8,6797696584311561938253118532230587722665759031...,1,1231472743,486604799,0,1716931356,1,8,1.0,1.0,...,1.0,1.0,0.0,0.0,5000000000.0,1.0,1.0,0.0,0.0,5000000000.0
9,1491393585230935138742217404786629338326962121...,1,1231473279,486604799,0,1397702696,1,8,1.0,1.0,...,1.0,1.0,0.0,0.0,5000000000.0,1.0,1.0,0.0,0.0,5000000000.0
