## Output and transaction characteristics

Exports a variety of output and transaction characteristics, to be used in the Random Forest model

In [1]:
import blocksci

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [3]:
import collections
import random

In [4]:
import itertools

In [5]:
import utils

In [6]:
# Open the blockchain with the configuration returned by utils.blocksci_config().
chain = blocksci.Blockchain(utils.blocksci_config())

In [7]:
# Build the ground-truth object from whatever utils.latest_groundtruth()
# returns, bound to the chain opened above.
gt = blocksci.GroundTruth(utils.latest_groundtruth(), chain)

In [8]:
# Display the number of ground-truth transactions.
gt.transactions().size

35257428

### Transaction characteristics

In [9]:
# Fee of the enclosing transaction, looked up once per ground-truth output.
fee = gt.transactions().outputs.map(lambda out: out.tx.fee)

In [10]:
# Result of fee_per_byte() on the enclosing transaction, one entry per output.
fee_per_byte = gt.transactions().outputs.map(lambda out: out.tx.fee_per_byte())

In [11]:
# The enclosing transaction's `output_value`, one entry per output.
tx_value = gt.transactions().outputs.map(lambda out: out.tx.output_value)

In [12]:
# The enclosing transaction's `segwit` attribute, one entry per output.
is_segwit = gt.transactions().outputs.map(lambda out: out.tx.segwit)

In [13]:
# Result of the BlockSci has_locktime heuristic applied to the enclosing
# transaction, one entry per output.
has_locktime = gt.transactions().outputs.map(lambda out: blocksci.heuristics.has_locktime(out.tx))

In [14]:
# The enclosing transaction's `version`, one entry per output.
tx_version = gt.transactions().outputs.map(lambda out: out.tx.version)

In [15]:
# Height of the block containing the enclosing transaction, one entry per output.
block_height = gt.transactions().outputs.map(lambda out: out.tx.block.height)

In [16]:
# The enclosing transaction's `input_count`, one entry per output.
input_count = gt.transactions().outputs.map(lambda out: out.tx.input_count)

### Output characteristics

In [17]:
# `value` of every output of the ground-truth transactions.
output_value = gt.transactions().outputs.value

In [9]:
# True for an output whose index equals the index of the maximum-value output
# of its transaction. or_value(99) supplies 99 as the index when the max
# lookup has no result (presumably a sentinel that never matches a real
# output index here - verify).
is_larger_output = gt.transactions().outputs.map(
    lambda out: out.tx.outputs.max(lambda sibling: sibling.value)
    .map(lambda largest: largest.index)
    .or_value(99)
    == out.index
)

In [19]:
# An output is "fresh" when the first transaction of its address is the very
# transaction that contains the output.
is_fresh_output = gt.transactions().outputs.map(lambda out: out.address.first_tx == out.tx)

In [20]:
# Walk the freshness flags two at a time and emit each pair in swapped order,
# so every position ends up holding the flag of the other member of its pair.
# (Assumes utils.grouper(seq, 2) yields consecutive 2-tuples - verify.)
is_other_output_fresh = np.array(
    [
        flag
        for first, second in utils.grouper(is_fresh_output, 2)
        for flag in (second, first)
    ]
)

In [21]:
# Output value divided by the enclosing transaction's output value.
# NOTE(review): assumes both operands are equally long, per-output arrays that
# divide element-wise - confirm against the BlockSci return types.
output_value_ratio = output_value / tx_value

In [22]:
# `index` of every output of the ground-truth transactions.
output_index = gt.transactions().outputs.index

### Check and save

In [23]:
# Expected length of every feature column: two entries per ground-truth
# transaction (presumably each such transaction has exactly two outputs - verify).
output_count = 2 * gt.transactions().size

In [24]:
# Every feature column that goes into the DataFrame must contain exactly
# `output_count` entries; a mismatch would mean the columns are not aligned
# output-by-output.

# Output characteristics.
assert output_count == len(output_value)
assert output_count == len(is_larger_output)
assert output_count == len(output_value_ratio)
assert output_count == len(output_index)
assert output_count == len(is_fresh_output)
assert output_count == len(is_other_output_fresh)

# Transaction characteristics.
assert output_count == len(fee)
assert output_count == len(fee_per_byte)
assert output_count == len(tx_value)
# tx_version is exported as "ct_version" but was previously left unchecked.
assert output_count == len(tx_version)
assert output_count == len(is_segwit)
assert output_count == len(has_locktime)
assert output_count == len(block_height)
assert output_count == len(input_count)

In [25]:
# Assemble the feature table, one row per output. Columns prefixed "co_" hold
# the output characteristics, columns prefixed "ct_" hold the characteristics
# of the enclosing transaction. Column order here is the column order of the
# exported CSV.
df = pd.DataFrame({
    "co_output_value": output_value,
    "co_is_larger_output": is_larger_output,
    "co_output_value_ratio": output_value_ratio,
    "co_output_index": output_index,
    "co_fresh_output": is_fresh_output,
    "co_other_fresh": is_other_output_fresh,
    "ct_fee": fee,
    "ct_fee_per_byte": fee_per_byte,
    "ct_tx_value": tx_value,
    "ct_version": tx_version,
    "ct_segwit_tx": is_segwit,
    "ct_has_locktime": has_locktime,
    "ct_block_height": block_height,
    "ct_input_count": input_count,
})

In [26]:
# The assembled frame must hold two rows for every ground-truth transaction.
assert 2 * gt.transactions().size == len(df)

In [27]:
# Export the feature table. pandas defaults are used, so the DataFrame's row
# index is written as the first, unnamed column of the CSV.
df.to_csv("/home/ubuntu/Data/heuristics/20210720-output-features.csv")

In [31]:
# Drop the notebook's reference to the exported DataFrame.
del df