# Creating the feature table in BigQuery

## Configuration of Sample

In [257]:
# Due to the nature of selecting random rows via sql the number of addresses is approximated
approximate_number_of_addresses = 100

# (start, end), format: dd.mm.yyyy
observation_period = ("2020-02-15", "2020-02-15") 

# "random", "richest"
# random: selects random addresses, that have been active within the observation period.
# richest: selects the accounts that have the most ether
address_selection = "random" 

max_bigquery_costs_usd = 2

## Creating Views

Setup

In [258]:
from google.cloud import bigquery
import simplejson as json
import os
from datetime import datetime
import pandas as pd

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../secrets/bigquery-service-account.json"
client = bigquery.Client()
dataset_id = "ethereum_us"
view_names = []
view_ids = {}

Delete all views and temporary tables

In [259]:
view_ids_to_be_deleted = [t.table_id for t in client.list_tables(dataset_id) if "view" in t.table_id or "tmp" in t.table_id]

for view_id in view_ids_to_be_deleted:
     client.delete_table("{}.{}".format(dataset_id, view_id), not_found_ok=True)  

Create view "traces"

In [260]:
view_names.append("traces")
current_view_name = view_names[-1]
sql_query = """
      select * from `bigquery-public-data.crypto_ethereum.traces`
        where status = 1
            and DATE(block_timestamp) >= '{start}' 
            and DATE(block_timestamp) <= '{end}'
      """.format(start=observation_period[0], end=observation_period[1]); 

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'traces_random_100_2020_02_15_2020_02_15_view'


Create view "addresses"

In [261]:
view_names.append("addresses")
current_table_name = view_names[-1]

# todo: address_selection == "richest"
if address_selection == "random":
    sql_query = """
                with addresses_traces as (
                    select distinct from_address as address from masterarbeit-245718.ethereum_us.{view_id_traces}
                    UNION ALL
                    select distinct to_address as address from masterarbeit-245718.ethereum_us.{view_id_traces}
                ) 
                select * from addresses_traces where RAND() < {approximate_number_of_addresses}/(SELECT COUNT(*) FROM addresses_traces)
                """.format(view_id_traces=view_ids["traces"], approximate_number_of_addresses=approximate_number_of_addresses)

view_ids[current_table_name] = "{}_{}_{}_{}_{}_tmp".format(current_table_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))

# view_ref = client.dataset(dataset_id).table(view_ids[current_table_name])
# view = bigquery.Table(view_ref)
# view.view_query = sql_query
# view = client.create_table(view)  # API request


job_config = bigquery.QueryJobConfig(destination="masterarbeit-245718.ethereum_us.{}".format(view_ids[current_table_name]))
query_job = client.query(sql_query, job_config=job_config)  # Make an API request.
query_job.result()

print("Successfully created table: '{}'".format(view_ids[current_table_name]))

Successfully created table: 'addresses_random_100_2020_02_15_2020_02_15_tmp'


Create view "wei"

In [262]:
view_names.append("wei")
current_view_name = view_names[-1]
sql_query = """
            with weiView as (

              with weiReceivedView as (
                    
                  -- debits
                  select to_address, sum(ifnull(value, 0)) as weiReceived
                  from masterarbeit-245718.ethereum_us.{view_id_traces}
                  where to_address is not null
                  and status = 1
                  and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
                  group by to_address
                    
              ), weiSentView as (
              
                  -- credits
                  select from_address, sum(ifnull(value, 0)) as weiSent
                  from  masterarbeit-245718.ethereum_us.{view_id_traces} 
                  where from_address is not null
                  and status = 1
                  and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
                  group by from_address
              ) 
              select 
                  CASE 
                    when to_address is not null then to_address
                    when from_address is not null then from_address
                  end as address, 
                  ifnull(weiReceived,0) as weiReceived, 
                  ifnull(weiSent,0) as weiSent
              from weiReceivedView full outer join weiSentView on from_address = to_address
            ) 
            select address, weiReceived, weiSent from weiView right join masterarbeit-245718.ethereum_us.{view_id_addresses} using(address)
            """.format(view_id_traces=view_ids["traces"], view_id_addresses=view_ids["addresses"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'wei_random_100_2020_02_15_2020_02_15_view'


Create view "tx"

In [263]:
view_names.append("tx")
current_view_name = view_names[-1]
sql_query = """
            with txView as (

              with txSent as (
              
                  SELECT from_address, count(*) as numberOfTranscationsSent FROM masterarbeit-245718.ethereum_us.{view_id_traces}
                  where to_address is not null and status = 1 and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
                  group by from_address
                  
                ), txReceived as (
                
                  SELECT to_address, count(*) as numberOfTranscationsReceived FROM masterarbeit-245718.ethereum_us.{view_id_traces}
                  where to_address is not null and status = 1 and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
                  group by to_address
                ) 
                
                SELECT 
                    CASE  
                      WHEN to_address IS NOT NULL THEN to_address
                      WHEN from_address IS NOT NULL THEN from_address
                    END AS address,
                    IFNULL(numberOfTranscationsReceived, 0) as numberOfTranscationsReceived, 
                    IFNULL(numberOfTranscationsSent, 0) as numberOfTranscationsSent
                from txReceived FULL OUTER JOIN txSent on to_address = from_address
            ) 
            
            select address, numberOfTranscationsReceived, numberOfTranscationsSent from txView right join masterarbeit-245718.ethereum_us.{view_id_addresses} using(address)
            
            """.format(view_id_traces=view_ids["traces"], view_id_addresses=view_ids["addresses"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'tx_random_100_2020_02_15_2020_02_15_view'


Create view "avg_time_diff_received_tx"

In [264]:
view_names.append("avg_time_diff_received_tx")
current_view_name = view_names[-1]
sql_query = """
           with timeRecView as (

              with receivedTx as (
              
                SELECT to_address, count(*) as numberOfTranscationsReceived 
                FROM masterarbeit-245718.ethereum_us.{view_id_traces} 
                where to_address is not null 
                  and status = 1 
                  and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
                group by to_address
              
              ), timeStampDiffs as (
              
                SELECT to_address, TIMESTAMP_DIFF(MAX(block_timestamp), MIN( block_timestamp ), second ) as timestampDiff
                FROM masterarbeit-245718.ethereum_us.{view_id_traces}
                where to_address is not null 
                  and status = 1 
                  and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
                group by to_address
              
              ) 
              
              select to_address as address, 
                  CASE 
                    when (numberOfTranscationsReceived - 1)  > 0 then timestampDiff / (numberOfTranscationsReceived - 1) 
                    else 0
                  end as avgTimeDiffBetweenReceivedTransactions
              from receivedTx inner join  timeStampDiffs using(to_address)
            )
            
            select address, ifnull(avgTimeDiffBetweenReceivedTransactions,0) as avg_time_diff_received_tx from timeRecView right join masterarbeit-245718.ethereum_us.{view_id_addresses} using(address)         
            
            """.format(view_id_traces=view_ids["traces"], view_id_addresses=view_ids["addresses"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'avg_time_diff_received_tx_random_100_2020_02_15_2020_02_15_view'


Create view "avg_time_diff_sent_tx"

In [265]:
view_names.append("avg_time_diff_sent_tx")
current_view_name = view_names[-1]
sql_query = """
            with timeSentView as (

              with sentTx as (
              
                SELECT from_address, count(*) as numberOfTranscationsSent 
                FROM masterarbeit-245718.ethereum_us.{view_id_traces}
                where to_address is not null 
                    and status = 1 
                    and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
                group by from_address
                
              ), timeStampDiffs as (
              
                SELECT from_address, TIMESTAMP_DIFF(MAX(block_timestamp), MIN( block_timestamp ), second ) as timestampDiff
                FROM masterarbeit-245718.ethereum_us.{view_id_traces}
                where to_address is not null 
                  and status = 1 
                  and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
                group by from_address
                
            ) 
              
            select from_address as address, 
                CASE 
                    when (numberOfTranscationsSent - 1)  > 0 then 
                        timestampDiff / (numberOfTranscationsSent - 1) 
                    else 
                        0
                end as avgTimeDiffBetweenSentTransactions
            from sentTx inner join  timeStampDiffs using(from_address)
            )
            
            select address, ifnull(avgTimeDiffBetweenSentTransactions,0) as avg_time_diff_sent_tx from timeSentView right join masterarbeit-245718.ethereum_us.{view_id_addresses} using(address)        
            
            """.format(view_id_traces=view_ids["traces"], view_id_addresses=view_ids["addresses"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'avg_time_diff_sent_tx_random_100_2020_02_15_2020_02_15_view'


Create view "mined_blocks"

In [266]:
view_names.append("mined_blocks")
current_view_name = view_names[-1]
sql_query = """
            with minedBlocksView as (
                SELECT to_address as address, count(*) as mined_blocks 
                FROM masterarbeit-245718.ethereum_us.{view_id_traces}
                where trace_type = "reward"
                group by to_address
            )
            
            select address, ifnull(mined_blocks,0) as minedBlocks from minedBlocksView right join  masterarbeit-245718.ethereum_us.{view_id_addresses} using(address)

            """.format(view_id_traces=view_ids["traces"], view_id_addresses=view_ids["addresses"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'mined_blocks_random_100_2020_02_15_2020_02_15_view'


Create view "stddev_received_tx"

In [267]:
view_names.append("stddev_received_tx")
current_view_name = view_names[-1]
sql_query = """
            with timestamp_var as (
                
                with timestamps_diffs as (
                    
                    with timestamps_preceding_tx as (
                        
                        with timestamps_received_tx as (
                        
                            select to_address, block_timestamp 
                            from masterarbeit-245718.ethereum_us.{view_id_traces}
                            where to_address is not null 
                                and status = 1 
                                and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
                        )
                        
                        select to_address, block_timestamp,
                            lag(block_timestamp) OVER (partition by to_address order by block_timestamp asc) as preceding_block_timestamp 
                        from timestamps_received_tx
                    )
                    
                    select to_address, block_timestamp, preceding_block_timestamp, 
                        TIMESTAMP_DIFF(block_timestamp, preceding_block_timestamp, second) as timestampdiff
                    from timestamps_preceding_tx
                )
                
                select to_address as address, STDDEV_SAMP(timestampdiff) as stddev_received_tx  
                from timestamps_diffs group by to_address 
            ) 
            
            select address, stddev_received_tx from timestamp_var right join masterarbeit-245718.ethereum_us.{view_id_addresses} using(address)
            
            """.format(view_id_traces=view_ids["traces"], view_id_addresses=view_ids["addresses"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'stddev_received_tx_random_100_2020_02_15_2020_02_15_view'


Create view "stddev_sent_tx"

In [268]:
view_names.append("stddev_sent_tx")
current_view_name = view_names[-1]
sql_query = """
            with timestamp_var as (
            
                with timestamps_diffs as (
                    
                    with timestamps_preceding_tx as (
                        
                        with timestamps_sent_tx as (
                        
                            select from_address, block_timestamp 
                            from masterarbeit-245718.ethereum_us.{view_id_traces}
                            where from_address is not null 
                                and status = 1 
                                and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
                        )
                        
                        select from_address, block_timestamp,
                            lag(block_timestamp) OVER (partition by from_address order by block_timestamp asc) as preceding_block_timestamp 
                        from timestamps_sent_tx
                    )
                    
                    select from_address, block_timestamp, preceding_block_timestamp, 
                        TIMESTAMP_DIFF(block_timestamp, preceding_block_timestamp, second) as timestampdiff
                    from timestamps_preceding_tx
                )
                
                select from_address as address, STDDEV_SAMP(timestampdiff) as stddev_sent_tx  
                from timestamps_diffs group by from_address 
            ) 
            
            select address, stddev_sent_tx from timestamp_var right join masterarbeit-245718.ethereum_us.{view_id_addresses} using(address)
            
            """.format(view_id_traces=view_ids["traces"], view_id_addresses=view_ids["addresses"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'stddev_sent_tx_random_100_2020_02_15_2020_02_15_view'


Create view "active_months"

In [269]:
view_names.append("active_months")
current_view_name = view_names[-1]
sql_query = """
            with tx_received as (
            
                select 
                    TIMESTAMP_TRUNC(block_timestamp, MONTH, 'UTC') as month, 
                    to_address,
                    count(*) as number_tx_received
                from `masterarbeit-245718.ethereum_us.{view_id_traces}`
                group by TIMESTAMP_TRUNC(block_timestamp, MONTH, 'UTC'), to_address
                
            ), tx_sent as (
            
                select 
                    TIMESTAMP_TRUNC(block_timestamp, MONTH, 'UTC') as month, 
                    from_address,
                    count(*) as number_tx_sent
                from `masterarbeit-245718.ethereum_us.{view_id_traces}`
                group by TIMESTAMP_TRUNC(block_timestamp, MONTH, 'UTC'), from_address
                
            ), monthly_tx as (
            
                select 
                    CASE  
                      WHEN tx_sent.from_address IS NOT NULL THEN tx_sent.from_address
                      WHEN tx_received.to_address IS NOT NULL THEN tx_received.to_address
                    END AS address,
                    CASE  
                      WHEN tx_sent.month IS NOT NULL THEN tx_sent.month
                      WHEN tx_received.month IS NOT NULL THEN tx_received.month
                    END AS month,
                    ifnull(number_tx_sent,0) as number_tx_sent, 
                    ifnull(number_tx_received,0) as number_tx_received 
                from tx_sent full join tx_received 
                    on (tx_sent.from_address = tx_received.to_address and tx_sent.month = tx_received.month)
            
            ), active_months_view as (
            
                select 
                    address, 
                    countif(number_tx_sent > 0 or number_tx_received > 0) as active_months 
                from monthly_tx group by address order by address ASC 
            )
            
            select 
                address, 
                ifnull(active_months,0) as active_months 
            from `masterarbeit-245718.ethereum_us.{view_id_addresses}` left join active_months_view using(address)    
            
            """.format(view_id_addresses=view_ids["addresses"],view_id_traces=view_ids["traces"], view_id_tx=view_ids["tx"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'active_months_random_100_2020_02_15_2020_02_15_view'


Create table "traces_usd"

In [270]:
from urllib.request import urlopen, Request
import csv 

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3"}
url = 'https://etherscan.io/chart/etherprice?output=csv'
req = Request(url=url, headers=headers) 
html = urlopen(req).read().decode('utf-8')

with open('../data/test.csv', 'w') as file:
        file.write(html)
        
eth_usd_res = pd.read_csv('../data/test.csv')  
eth_usd = eth_usd_res.copy()
eth_usd["Date(UTC)"] = [datetime.strptime(ts, "%m/%d/%Y") for ts in eth_usd_res["Date(UTC)"]]
eth_usd = eth_usd.drop(columns=["UnixTimeStamp"])
eth_usd.columns = ["usd_eth_timestamp", "usd_eth"]
eth_usd["usd_eth"] = pd.to_numeric(eth_usd["usd_eth"])
eth_usd.to_gbq('ethereum_us.usd_eth_table', if_exists="replace")

1it [00:03,  3.81s/it]


In [271]:
view_names.append("traces_usd")
current_view_name = view_names[-1]
sql_query = """
             select 
                 from_address, 
                 to_address, 
                 value, 
                 status, 
                 call_type, 
                 trace_type, 
                 block_timestamp, 
                 usd_eth * value as value_usd 
             from `masterarbeit-245718.ethereum_us.{view_id_traces}` as traces_view
             left join `masterarbeit-245718.ethereum_us.usd_eth_table` as usd_eth_table 
                on (TIMESTAMP_TRUNC(usd_eth_table.usd_eth_timestamp, DAY, 'UTC') = TIMESTAMP_TRUNC(traces_view.block_timestamp, DAY, 'UTC'))
            """.format(view_id_traces=view_ids["traces"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'traces_usd_random_100_2020_02_15_2020_02_15_view'


Create table "usd"

In [272]:
view_names.append("usd")
current_view_name = view_names[-1]
sql_query = """
            with usdView as (

              with usdReceivedView as (
                    
                  -- debits
                  select to_address, sum(ifnull(value_usd, 0)) as usdReceived
                  from masterarbeit-245718.ethereum_us.{view_id_traces_usd}
                  where to_address is not null
                  and status = 1
                  and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
                  group by to_address
                    
              ), usdSentView as (
              
                  -- credits
                  select from_address, sum(ifnull(value_usd, 0)) as usdSent
                  from  masterarbeit-245718.ethereum_us.{view_id_traces_usd} 
                  where from_address is not null
                  and status = 1
                  and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
                  group by from_address
              ) 
              select 
                  CASE 
                    when to_address is not null then to_address
                    when from_address is not null then from_address
                  end as address, 
                  ifnull(usdReceived,0) as usdReceived, 
                  ifnull(usdSent,0) as usdSent
              from usdReceivedView full outer join usdSentView on from_address = to_address
            ) 
            select address, usdReceived, usdSent from usdView right join masterarbeit-245718.ethereum_us.{view_id_addresses} using(address)
            """.format(view_id_traces_usd=view_ids["traces_usd"], view_id_addresses=view_ids["addresses"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'usd_random_100_2020_02_15_2020_02_15_view'


Create View "wei_avg"

In [273]:
view_names.append("wei_avg")
current_view_name = view_names[-1]
sql_query = """
            with features as (
                select * from `masterarbeit-245718.ethereum_us.{view_id_wei}` 
                inner join `masterarbeit-245718.ethereum_us.{view_id_tx}` using(address) 
            )
            select 
                address,
                CASE 
                    when numberOfTranscationsSent > 0 THEN weiSent / numberOfTranscationsSent 
                    when numberOfTranscationsSent = 0 THEN 0 
                END as avg_wei_sent,
                CASE 
                    when numberOfTranscationsReceived > 0 THEN weiReceived / numberOfTranscationsReceived 
                    when numberOfTranscationsReceived = 0 THEN 0 
                END as avg_wei_received
            from features        
            """.format(view_id_wei=view_ids["wei"], view_id_tx=view_ids["tx"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'wei_avg_random_100_2020_02_15_2020_02_15_view'


Create View "usd_avg"

In [274]:
view_names.append("usd_avg")
current_view_name = view_names[-1]
sql_query = """
            with features as (
                select * from `masterarbeit-245718.ethereum_us.{view_id_usd}` 
                inner join `masterarbeit-245718.ethereum_us.{view_id_tx}` using(address) 
            )
            select 
                address, 
                CASE 
                    when numberOfTranscationsSent > 0 THEN usdSent / numberOfTranscationsSent 
                    when numberOfTranscationsSent = 0 THEN 0 
                END as avg_usd_sent,
                CASE 
                    when numberOfTranscationsReceived > 0 THEN usdReceived / numberOfTranscationsReceived 
                    when numberOfTranscationsReceived = 0 THEN 0 
                END as avg_usd_received
            from features        
            """.format(view_id_usd=view_ids["usd"], view_id_tx=view_ids["tx"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'usd_avg_random_100_2020_02_15_2020_02_15_view'


Create View "wei_avg_monthly"

In [275]:
view_names.append("wei_avg_monthly")
current_view_name = view_names[-1]
sql_query = """
            with features as (
                select * from `masterarbeit-245718.ethereum_us.{view_id_wei}` 
                inner join `masterarbeit-245718.ethereum_us.{view_id_active_months}` using(address) 
            )
            select address,
                CASE 
                    when active_months > 0 THEN weiSent / active_months 
                    when active_months = 0 THEN 0 
                END as monthly_wei_sent,
                CASE 
                     when active_months > 0 THEN weiReceived / active_months 
                     when active_months = 0 THEN 0 
                END as monthly_wei_received
                from features
            """.format(view_id_wei=view_ids["wei"], view_id_active_months=view_ids["active_months"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'wei_avg_monthly_random_100_2020_02_15_2020_02_15_view'


Create View "usd_avg_monthly"

In [276]:
view_names.append("usd_avg_monthly")
current_view_name = view_names[-1]
sql_query = """
            with features as (
                select * from `masterarbeit-245718.ethereum_us.{view_id_usd}` 
                inner join `masterarbeit-245718.ethereum_us.{view_id_active_months}` using(address) 
            )
            select address,
                CASE 
                    when active_months > 0 THEN usdSent / active_months 
                    when active_months = 0 THEN 0 
                END as monthly_usd_sent,
                CASE 
                     when active_months > 0 THEN usdReceived / active_months 
                     when active_months = 0 THEN 0 
                END as monthly_usd_received
                from features
            """.format(view_id_usd=view_ids["usd"], view_id_active_months=view_ids["active_months"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'usd_avg_monthly_random_100_2020_02_15_2020_02_15_view'


Create View "tx_avg_monthly"

In [277]:
view_names.append("tx_avg_monthly")
current_view_name = view_names[-1]
sql_query = """
            with features as (
                select * from `masterarbeit-245718.ethereum_us.{view_id_tx}` 
                inner join `masterarbeit-245718.ethereum_us.{view_id_active_months}` using(address) 
            )
            select address,
                CASE 
                    when active_months > 0 THEN numberOfTranscationsSent / active_months 
                    when active_months = 0 THEN 0 
                END as monthly_outgoing_txns,
                CASE 
                    when active_months > 0 THEN numberOfTranscationsReceived / active_months 
                    when active_months = 0 THEN 0 
                END as monthly_incoming_txns
            from features
            """.format(view_id_tx=view_ids["tx"], view_id_active_months=view_ids["active_months"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'tx_avg_monthly_random_100_2020_02_15_2020_02_15_view'


Create View "contracts_created"

In [278]:
view_names.append("contracts_created")
current_view_name = view_names[-1]

sql_query = """
            with contract_created_view as (
            select 
                from_address as address, 
                count(*) as number_of_contracts_created
            from `masterarbeit-245718.ethereum_us.{view_id_traces}` 
                where
                    trace_type = "create"
            group by from_address
            order by number_of_contracts_created DESC
            )
            select 
                address, 
                ifnull(number_of_contracts_created,0) as number_of_contracts_created 
            from contract_created_view right join `masterarbeit-245718.ethereum_us.{view_id_addresses}` using(address)
            """.format(view_id_traces=view_ids["traces"], view_id_addresses = view_ids["addresses"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'contracts_created_random_100_2020_02_15_2020_02_15_view'


Create View "contract_tx"

In [279]:
view_names.append("contract_tx")
current_view_name = view_names[-1]
sql_query = """
            with contract_tx_view as (
            select 
                from_address as address, 
                count(*) as contract_tx
            from `masterarbeit-245718.ethereum_us.{view_id_traces}` 
                where
                    (input != "None" and input != "0x" and trace_type = "call") or 
                    trace_type = "create" or
                    trace_type = "suicide"
            group by from_address
            order by from_address
            )
            select 
                address, 
                ifnull(contract_tx,0) as contract_tx 
            from contract_tx_view right join `masterarbeit-245718.ethereum_us.{view_id_addresses}` using(address)
            """.format(view_id_traces=view_ids["traces"], view_id_addresses = view_ids["addresses"])

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'contract_tx_random_100_2020_02_15_2020_02_15_view'


Create View "diff_token_used"

In [280]:
view_names.append("diff_token_used")
current_view_name = view_names[-1]
sql_query = """
            with token_transfers as (
                select * from bigquery-public-data.crypto_ethereum.token_transfers 
                where DATE(block_timestamp) >= '{start}' and DATE(block_timestamp) <= '{end}'
            ) 
            select 
                address, 
                count(DISTINCT token_address) as diff_token_used
            from token_transfers right join `masterarbeit-245718.ethereum_us.{view_id_addresses}` on from_address = address 
            group by address
            order by diff_token_used DESC
            """.format(start=observation_period[0], end=observation_period[1], view_id_addresses=view_ids["addresses"]); 

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'diff_token_used_random_100_2020_02_15_2020_02_15_view'


## Merge Views

In [281]:
# Remove non-feature views
for name in ["traces","addresses", "traces_usd"]:
    view_names.remove(name)

In [282]:
current_view_name = "features"
sql_query = "select * from "

for index, view_name in enumerate(view_names):
    sql_query += "masterarbeit-245718.ethereum_us.{}".format(view_ids[view_name])
    if index != 0:
        sql_query += " using(address)"
    if index != (len(view_names) - 1):
        sql_query += " inner join "
        
# sql_query

view_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))
view_ref = client.dataset(dataset_id).table(view_ids[current_view_name])
view = bigquery.Table(view_ref)
view.view_query = sql_query
view = client.create_table(view)  # API request
print("Successfully created view: '{}'".format(view_ids[current_view_name]))

Successfully created view: 'features_random_100_2020_02_15_2020_02_15_view'


## Estimate Costs

In [283]:
table_name = "features"
table_id = "{}_{}_{}_{}_{}".format(table_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))

job_config = bigquery.QueryJobConfig(destination="masterarbeit-245718.ethereum_us.{table_id}".format(table_id = table_id))
job_config.dry_run = True
job_config.use_query_cache = False

sql = """
    SELECT *
    FROM `masterarbeit-245718.ethereum_us.{view_id_features}` 
""".format(view_id_features = view_ids["features"])

# Start the query, passing in the extra configuration.
query_job = client.query(sql, job_config=job_config)  # Make an API request.

cost_dollars = (query_job.total_bytes_processed / 10 ** 12) * 5

print("{} Mega-Bytes will be processed".format(round(query_job.total_bytes_processed / 10 ** 6)))
print("It will cost ${}.".format(cost_dollars))

692 Mega-Bytes will be processed
It will cost $0.003462201575.


## Execute Query (WARNING: this operation costs money!)

In [284]:
table_name = "features"
table_id = "{}_{}_{}_{}_{}".format(table_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))

job_config = bigquery.QueryJobConfig(destination="masterarbeit-245718.ethereum_us.{table_id}".format(table_id = table_id))

sql = """
    SELECT *
    FROM `masterarbeit-245718.ethereum_us.{view_id_features}` 
""".format(view_id_features = view_ids["features"])

# Start the query, passing in the extra configuration.
if (cost_dollars > max_bigquery_costs_usd):
    raise Exception("Warning: This operation costs $ {}.".format(cost_dollars))

query_job = client.query(sql, job_config=job_config)  # Make an API request.
data = query_job.result().to_dataframe(); 

data

Unnamed: 0,address,weiReceived,weiSent,numberOfTranscationsReceived,numberOfTranscationsSent,avg_time_diff_received_tx,avg_time_diff_sent_tx,minedBlocks,stddev_received_tx,stddev_sent_tx,...,avg_usd_received,monthly_wei_sent,monthly_wei_received,monthly_usd_sent,monthly_usd_received,monthly_outgoing_txns,monthly_incoming_txns,number_of_contracts_created,contract_tx,diff_token_used
0,0x8e38949dfc506a96dd89af09dd33721deca23c0f,,,,,0.000000,0.000000,0,,,...,,,,,,,,0,1,0
1,0x53f8382456ed6a4b5f88690415165dc4bfb135f5,0,0,0.0,1.0,0.000000,0.000000,0,,,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,1.0,0.0,0,1,0
2,0x142066eeba654e744cb4089c584062e3cfb0066a,0,0,0.0,1.0,0.000000,0.000000,0,,,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,1.0,0.0,0,1,0
3,0xb96376d80a16af6700dcbaba2a459dd7856f103a,0,0,0.0,1.0,0.000000,0.000000,0,,,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,1.0,0.0,0,1,0
4,0x56f02a19356fce67135fbe026a4141b69dac0cca,0,0,0.0,1.0,0.000000,0.000000,0,,,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,1.0,0.0,0,1,0
5,0xc7ddf05806f12272ca9fa913148efa6bd3f369c2,0,0,0.0,1.0,0.000000,0.000000,0,,,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,1.0,0.0,0,1,1
6,0x2bb7c1cf3eda8ec2d416c1b1860bfcf82dd58100,0,0,0.0,1.0,0.000000,0.000000,0,,,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,1.0,0.0,0,1,0
7,0x801081f7c63457b6dff73fae4c354e82d534453c,0,0,0.0,3.0,0.000000,5271.500000,0,,3619.679613,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,3.0,0.0,0,3,0
8,0x45df22651d1d4f8a5b1bad59aa55031f33e9f633,0,0,0.0,1.0,0.000000,0.000000,0,,,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,1.0,0.0,0,1,1
9,0xca3c36df2d995754a5cd862869bae7a152b83733,0,0,0.0,2.0,0.000000,210.000000,0,,,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,2.0,0.0,0,2,2


# Temporary Scripts

In [285]:
# table_id = "{}_{}_{}_{}_{}_tmp".format(current_view_name, address_selection, approximate_number_of_addresses, observation_period[0].replace("-","_"),observation_period[1].replace("-","_"))

# job_config = bigquery.QueryJobConfig(destination="masterarbeit-245718.ethereum_us.{table_id}".format(table_id = table_id))

# sql = """
#     SELECT *
#     FROM `masterarbeit-245718.ethereum_us.{view_id_diff_token_used}` 
# """.format(view_id_diff_token_used = view_ids["diff_token_used"])

# # Start the query, passing in the extra configuration.
# query_job = client.query(sql, job_config=job_config)  # Make an API request.
# data = query_job.result().to_dataframe(); 

# data