# Creating the feature table in BigQuery

## Configuration of Sample

In [2]:
import datetime

# Target sample size. The addresses are sampled inside SQL, so the number of
# addresses that end up in the sample is only approximately this value.
# Must be a positive integer: it is embedded in the SQL and in the view ids.
approximate_number_of_addresses = 5000

# (start, end), both inclusive, format: yyyy-mm-dd
# The strings are compared against DATE(block_timestamp) in the "traces" view
# and are embedded (with "-" replaced by "_") in every view id.
observation_period = ("2020-02-15", "2020-02-15")

# "random", "richest"
# random: selects random addresses that have been active within the observation period.
# richest: selects the accounts that hold the most ether (not implemented by the view cells yet).
address_selection = "random"

# Fail fast on malformed settings: they are interpolated into SQL text and view
# ids further down, where a typo would only surface as an opaque BigQuery error.
if not isinstance(approximate_number_of_addresses, int) or approximate_number_of_addresses <= 0:
    raise ValueError(
        "approximate_number_of_addresses must be a positive integer, got {!r}".format(
            approximate_number_of_addresses
        )
    )

if len(observation_period) != 2:
    raise ValueError("observation_period must be a (start, end) pair, got {!r}".format(observation_period))

# strptime raises ValueError for anything that is not a valid yyyy-mm-dd date.
_period_start, _period_end = [
    datetime.datetime.strptime(day, "%Y-%m-%d").date() for day in observation_period
]
if _period_start > _period_end:
    raise ValueError(
        "observation_period start {} is after its end {}".format(_period_start, _period_end)
    )

if address_selection not in ("random", "richest"):
    raise ValueError(
        "address_selection must be 'random' or 'richest', got {!r}".format(address_selection)
    )

## Creating Views

Setup

In [28]:
from google.cloud import bigquery
import simplejson as json
import os

# Point the Google client libraries at the service-account key file before the
# client is constructed.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS="../secrets/bigquery-service-account.json")

# One shared client for every API request issued by the cells below.
client = bigquery.Client()

# Dataset that holds all views created by this notebook.
dataset_id = "ethereum_us"

# Registries filled by the "Create view" cells:
#   view_names: logical view names in creation order
#   view_ids:   logical view name -> concrete BigQuery table id
view_names, view_ids = [], {}

Delete all views

In [29]:
# Collect the ids of the views to drop. Besides the name filter, require the
# object to really be a view: a regular table whose id merely contains "view"
# (e.g. "overview") must never be deleted by this cleanup cell.
view_ids_to_be_deleted = [
    t.table_id
    for t in client.list_tables(dataset_id)
    if t.table_type == "VIEW" and "view" in t.table_id
]

# not_found_ok=True keeps the cell re-runnable if a view vanished in the meantime.
for view_id in view_ids_to_be_deleted:
    client.delete_table("{}.{}".format(dataset_id, view_id), not_found_ok=True)

Create view "traces"

In [30]:
# Register the logical view name; the cells below look it up via view_ids.
current_view_name = "traces"
view_names.append(current_view_name)

# Successful traces (status = 1) whose block date lies inside the observation
# period, both bounds inclusive.
sql_query = """
      select * from `bigquery-public-data.crypto_ethereum.traces`
        where status = 1
            and DATE(block_timestamp) >= '{start}' 
            and DATE(block_timestamp) <= '{end}'
      """.format(
    start=observation_period[0],
    end=observation_period[1],
)

# View id layout: <name>_<selection>_<n>_<start>_<end>_view, dates with "_" instead of "-".
view_ids[current_view_name] = "_".join(
    [
        current_view_name,
        address_selection,
        str(approximate_number_of_addresses),
        observation_period[0].replace("-", "_"),
        observation_period[1].replace("-", "_"),
        "view",
    ]
)

# Build the view definition and create it (create_table issues the API request).
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'traces_random_5000_2020_02_15_2020_02_15_view'


Create view "addresses"

In [31]:
# Only the "random" strategy is implemented. Without this guard any other value
# would silently reuse the sql_query left over from the previous cell and create
# an "addresses" view that actually contains traces.
# todo: address_selection == "richest"
if address_selection != "random":
    raise NotImplementedError(
        "address_selection '{}' is not implemented yet; only 'random' is supported".format(
            address_selection
        )
    )

view_names.append("addresses")
current_view_name = view_names[-1]

# Sample of the addresses that appear as sender or receiver in the traces view.
#  * UNION DISTINCT: an address that both sends and receives must appear once,
#    otherwise every feature view that right-joins on this view gets duplicate rows.
#  * NULL addresses are dropped; they can never match in the downstream joins.
#  * The sample is drawn with a hash of the address instead of RAND(). A view is
#    re-evaluated on every query, so RAND() would hand each feature view a
#    different sample. The hash keeps roughly the same fraction
#    (approximate_number_of_addresses / total) but is stable across queries.
#    MOD is applied before ABS so ABS cannot overflow.
sql_query = """
            with addresses_traces as (
                select from_address as address
                from masterarbeit-245718.ethereum_us.{view_id_traces}
                where from_address is not null
                UNION DISTINCT
                select to_address as address
                from masterarbeit-245718.ethereum_us.{view_id_traces}
                where to_address is not null
            )
            select address
            from addresses_traces
            where ABS(MOD(FARM_FINGERPRINT(address), (SELECT COUNT(*) FROM addresses_traces))) < {approximate_number_of_addresses}
            """.format(
    view_id_traces=view_ids["traces"],
    approximate_number_of_addresses=approximate_number_of_addresses,
)

# View id layout: <name>_<selection>_<n>_<start>_<end>_view, dates with "_" instead of "-".
view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(
    current_view_name,
    address_selection,
    approximate_number_of_addresses,
    observation_period[0].replace("-", "_"),
    observation_period[1].replace("-", "_"),
)
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'addresses_random_5000_2020_02_15_2020_02_15_view'


Create view "wei"

In [32]:
view_names.append("wei")
current_view_name = view_names[-1]

# Per sampled address: total wei received and total wei sent within the traces view.
# Traces with call_type delegatecall / callcode / staticcall are excluded from both sums.
# The final right join keeps every sampled address, including addresses without
# any qualifying trace; for those the outer select must default the sums to 0
# (as the other feature views do), otherwise the feature table contains NULLs.
sql_query = """
            with weiView as (

              with weiReceivedView as (

                  -- debits
                  select to_address, sum(ifnull(value, 0)) as weiReceived
                  from masterarbeit-245718.ethereum_us.{view_id_traces}
                  where to_address is not null
                  and status = 1
                  and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
                  group by to_address

              ), weiSentView as (

                  -- credits
                  select from_address, sum(ifnull(value, 0)) as weiSent
                  from masterarbeit-245718.ethereum_us.{view_id_traces}
                  where from_address is not null
                  and status = 1
                  and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
                  group by from_address
              )
              select
                  CASE
                    when to_address is not null then to_address
                    when from_address is not null then from_address
                  end as address,
                  ifnull(weiReceived,0) as weiReceived,
                  ifnull(weiSent,0) as weiSent
              from weiReceivedView full outer join weiSentView on from_address = to_address
            )
            select
                address,
                ifnull(weiReceived, 0) as weiReceived,
                ifnull(weiSent, 0) as weiSent
            from weiView right join masterarbeit-245718.ethereum_us.{view_id_addresses} using(address)
            """.format(view_id_traces=view_ids["traces"], view_id_addresses=view_ids["addresses"])

# View id layout: <name>_<selection>_<n>_<start>_<end>_view, dates with "_" instead of "-".
view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(
    current_view_name,
    address_selection,
    approximate_number_of_addresses,
    observation_period[0].replace("-", "_"),
    observation_period[1].replace("-", "_"),
)
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'wei_random_5000_2020_02_15_2020_02_15_view'


Create view "tx"

In [33]:
view_names.append("tx")
current_view_name = view_names[-1]

# Per sampled address: number of traces sent and number of traces received.
# Traces with call_type delegatecall / callcode / staticcall are not counted.
# The final right join keeps every sampled address, including addresses without
# any qualifying trace; for those the outer select must default the counts to 0
# (as the other feature views do), otherwise the feature table contains NULLs.
sql_query = """
            with txView as (

              with txSent as (

                  SELECT from_address, count(*) as numberOfTranscationsSent FROM masterarbeit-245718.ethereum_us.{view_id_traces}
                  where to_address is not null and status = 1 and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
                  group by from_address

                ), txReceived as (

                  SELECT to_address, count(*) as numberOfTranscationsReceived FROM masterarbeit-245718.ethereum_us.{view_id_traces}
                  where to_address is not null and status = 1 and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
                  group by to_address
                )

                SELECT
                    CASE
                      WHEN to_address IS NOT NULL THEN to_address
                      WHEN from_address IS NOT NULL THEN from_address
                    END AS address,
                    IFNULL(numberOfTranscationsReceived, 0) as numberOfTranscationsReceived,
                    IFNULL(numberOfTranscationsSent, 0) as numberOfTranscationsSent
                from txReceived FULL OUTER JOIN txSent on to_address = from_address
            )

            select
                address,
                IFNULL(numberOfTranscationsReceived, 0) as numberOfTranscationsReceived,
                IFNULL(numberOfTranscationsSent, 0) as numberOfTranscationsSent
            from txView right join masterarbeit-245718.ethereum_us.{view_id_addresses} using(address)

            """.format(view_id_traces=view_ids["traces"], view_id_addresses=view_ids["addresses"])

# View id layout: <name>_<selection>_<n>_<start>_<end>_view, dates with "_" instead of "-".
view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(
    current_view_name,
    address_selection,
    approximate_number_of_addresses,
    observation_period[0].replace("-", "_"),
    observation_period[1].replace("-", "_"),
)
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'tx_random_5000_2020_02_15_2020_02_15_view'


Create view "avg_time_diff_received_tx"

In [34]:
# Register the logical view name; later cells look it up via view_ids.
current_view_name = "avg_time_diff_received_tx"
view_names.append(current_view_name)

# Per sampled address: average number of seconds between received traces,
# computed as (latest - earliest block timestamp) / (count - 1); 0 when the
# address received at most one trace or none at all.
sql_query = """
           with timeRecView as (

              with receivedTx as (
              
                SELECT to_address, count(*) as numberOfTranscationsReceived 
                FROM masterarbeit-245718.ethereum_us.{view_id_traces} 
                where to_address is not null 
                  and status = 1 
                  and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
                group by to_address
              
              ), timeStampDiffs as (
              
                SELECT to_address, TIMESTAMP_DIFF(MAX(block_timestamp), MIN( block_timestamp ), second ) as timestampDiff
                FROM masterarbeit-245718.ethereum_us.{view_id_traces}
                where to_address is not null 
                  and status = 1 
                  and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
                group by to_address
              
              ) 
              
              select to_address as address, 
                  CASE 
                    when (numberOfTranscationsReceived - 1)  > 0 then timestampDiff / (numberOfTranscationsReceived - 1) 
                    else 0
                  end as avgTimeDiffBetweenReceivedTransactions
              from receivedTx inner join  timeStampDiffs using(to_address)
            )
            
            select address, ifnull(avgTimeDiffBetweenReceivedTransactions,0) as avg_time_diff_received_tx from timeRecView right join masterarbeit-245718.ethereum_us.{view_id_addresses} using(address)         
            
            """.format(
    view_id_traces=view_ids["traces"],
    view_id_addresses=view_ids["addresses"],
)

# View id layout: <name>_<selection>_<n>_<start>_<end>_view, dates with "_" instead of "-".
view_ids[current_view_name] = "_".join(
    [
        current_view_name,
        address_selection,
        str(approximate_number_of_addresses),
        observation_period[0].replace("-", "_"),
        observation_period[1].replace("-", "_"),
        "view",
    ]
)

# Build the view definition and create it (create_table issues the API request).
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'avg_time_diff_received_tx_random_5000_2020_02_15_2020_02_15_view'


Create view "avg_time_diff_sent_tx"

In [35]:
# Register the logical view name; later cells look it up via view_ids.
current_view_name = "avg_time_diff_sent_tx"
view_names.append(current_view_name)

# Per sampled address: average number of seconds between sent traces,
# computed as (latest - earliest block timestamp) / (count - 1); 0 when the
# address sent at most one trace or none at all.
sql_query = """
            with timeSentView as (

              with sentTx as (
              
                SELECT from_address, count(*) as numberOfTranscationsSent 
                FROM masterarbeit-245718.ethereum_us.{view_id_traces}
                where to_address is not null 
                    and status = 1 
                    and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
                group by from_address
                
              ), timeStampDiffs as (
              
                SELECT from_address, TIMESTAMP_DIFF(MAX(block_timestamp), MIN( block_timestamp ), second ) as timestampDiff
                FROM masterarbeit-245718.ethereum_us.{view_id_traces}
                where to_address is not null 
                  and status = 1 
                  and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
                group by from_address
                
            ) 
              
            select from_address as address, 
                CASE 
                    when (numberOfTranscationsSent - 1)  > 0 then 
                        timestampDiff / (numberOfTranscationsSent - 1) 
                    else 
                        0
                end as avgTimeDiffBetweenSentTransactions
            from sentTx inner join  timeStampDiffs using(from_address)
            )
            
            select address, ifnull(avgTimeDiffBetweenSentTransactions,0) as avg_time_diff_sent_tx from timeSentView right join masterarbeit-245718.ethereum_us.{view_id_addresses} using(address)        
            
            """.format(
    view_id_traces=view_ids["traces"],
    view_id_addresses=view_ids["addresses"],
)

# View id layout: <name>_<selection>_<n>_<start>_<end>_view, dates with "_" instead of "-".
view_ids[current_view_name] = "_".join(
    [
        current_view_name,
        address_selection,
        str(approximate_number_of_addresses),
        observation_period[0].replace("-", "_"),
        observation_period[1].replace("-", "_"),
        "view",
    ]
)

# Build the view definition and create it (create_table issues the API request).
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'avg_time_diff_sent_tx_random_5000_2020_02_15_2020_02_15_view'


Create view "mined_blocks"

In [None]:
# Register the logical view name; later cells look it up via view_ids.
current_view_name = "mined_blocks"
view_names.append(current_view_name)

# Per sampled address: number of traces of type "reward" addressed to it
# (0 when there is none).
sql_query = """
            with minedBlocksView as (
                SELECT to_address as address, count(*) as mined_blocks 
                FROM masterarbeit-245718.ethereum_us.{view_id_traces}
                where trace_type = "reward"
                group by to_address
            )
            
            select address, ifnull(mined_blocks,0) as minedBlocks from minedBlocksView right join  masterarbeit-245718.ethereum_us.{view_id_addresses} using(address)

            """.format(
    view_id_traces=view_ids["traces"],
    view_id_addresses=view_ids["addresses"],
)

# View id layout: <name>_<selection>_<n>_<start>_<end>_view, dates with "_" instead of "-".
view_ids[current_view_name] = "_".join(
    [
        current_view_name,
        address_selection,
        str(approximate_number_of_addresses),
        observation_period[0].replace("-", "_"),
        observation_period[1].replace("-", "_"),
        "view",
    ]
)

# Build the view definition and create it (create_table issues the API request).
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

## Merge Views

## Estimate Costs

## Execute Query (WARNING: this operation costs money!)