# Setup

## Configuration

In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.cloud import bigquery
import time
from IPython.core.interactiveshell import InteractiveShell
import simplejson as json
import os
import re
import datetime

In [129]:
# Point the Google client libraries at the service-account key file.
%env GOOGLE_APPLICATION_CREDENTIALS=../secrets/bigquery-service-account.json 
# Load the BigQuery IPython extension (provides the %%bigquery cell magic used further down).
%load_ext google.cloud.bigquery

# BigQuery client used by the query cells of this notebook.
client = bigquery.Client()
# Show the value of every top-level expression of a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

env: GOOGLE_APPLICATION_CREDENTIALS=../secrets/bigquery-service-account.json
The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


## Create Test-Tables in BigQuery

In [130]:
import papermill as pm
from enum import Enum

In [131]:
# Size of the address sample; it is part of every view name queried in the tests below.
number_of_addresses = 100

# Observation window as timestamp strings of the form "YYYY-MM-DD HH:MM:SS+00".
observation_period_start = "2020-02-01 05:30:00+00"
observation_period_end = "2020-02-01 05:30:30+00"

class ADDRESS_SELECTION(Enum):
    """Strategies for picking the sample addresses; the numeric value is what is handed to features.ipynb."""
    RANDOM = 1 # selects random addresses that have been active within the observation period.
    RICHEST = 2 # selects the accounts that have the most ether # not yet implemented
    HIGHEST_TURNOVER = 3 # selects the accounts that have the most ether received + sent

address_selection = ADDRESS_SELECTION.HIGHEST_TURNOVER.value

# maximum USD amount to spend on executing SQL queries
max_bigquery_costs_usd = 2

# Delete old tables (passed to features.ipynb as `reset`)
reset = True

In [132]:
%%capture
pm.execute_notebook(
   './features.ipynb',
   './build/features.build.ipynb',
   parameters = dict(observation_period_start=observation_period_start,observation_period_end=observation_period_end, address_selection=address_selection,max_bigquery_costs_usd=max_bigquery_costs_usd, reset = reset)
)

Todo: Only execute notebook when tables are not yet uploaded to BigQuery

# Tests

In [133]:
# Maps a feature-view name to the id of the corresponding BigQuery view.
table_ids = {}

## feature view "wei" 

In [134]:
# Derive the id of the "wei" view from the sample configuration.
current_view_name = "wei"
table_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(
    current_view_name,
    ADDRESS_SELECTION(address_selection).name,
    number_of_addresses,
    re.sub(r'[-.+: ]', '_', observation_period_start),
    re.sub(r'[-.+: ]', '_', observation_period_end),
)

# Download the complete view and index it by address.
sql = """
    SELECT *
    FROM `masterarbeit-245718.ethereum_us.{table_id_features}` 
""".format(table_id_features=table_ids["wei"])

query_job = client.query(sql)
wei_sql = query_job.result().to_dataframe().set_index("address")

In [135]:
# Load the raw traces of the sample, either from BigQuery (first run) or from a
# local JSON cache that is keyed by the view name.
current_view_name = "traces"
table_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(
    current_view_name,
    ADDRESS_SELECTION(address_selection).name,
    number_of_addresses,
    re.sub(r'[-.+: ]', '_', observation_period_start),
    re.sub(r'[-.+: ]', '_', observation_period_end),
)

filename = '{}.json'.format(table_ids[current_view_name])
data_dir = "../data"
# Build the cache path once so that writing and reading use the same location.
cache_path = os.path.join(data_dir, filename)

if not os.path.isfile(cache_path):

    print("Loading data from bigquery ...")

    sql = """
        SELECT *
        FROM `masterarbeit-245718.ethereum_us.{table_id_features}` 
        """.format(table_id_features=table_ids["traces"])

    query_job = client.query(sql)

    start = time.time()
    traces_sql = query_job.result().to_dataframe()
    elapsed = round(time.time() - start)

    print("Time to retrieve data from BigQuery: {} Minutes.".format(round(elapsed/60)))

    # A fresh checkout may not have the data directory yet.
    os.makedirs(data_dir, exist_ok=True)
    with open(cache_path, 'w') as json_file:
        json.dump(traces_sql.to_dict(), json_file, use_decimal=True, default=str)

else:

    print("Loading data from local cache ...")

    start = time.time()
    with open(cache_path, "r") as file:
        file_content_json = json.load(file)
    elapsed = round(time.time() - start)

    print("Time to retrieve data from local cache: {} Seconds.".format(elapsed))

    # NOTE(review): the cache round-trips through JSON (default=str), so column dtypes
    # of a cached frame can differ from a freshly downloaded one; block_timestamp is
    # re-parsed in a later cell.
    traces_sql = pd.DataFrame(file_content_json)

Loading data from local cache ...
Time to retrieve data from local cache: 0 Seconds.


In [136]:
# Report the in-memory size of the traces frame (1 Megabyte = 10**6 bytes here).
m = traces_sql.memory_usage()
totalBytes = m.sum()
totalMegabytes = totalBytes / 10**6
size_message = "Size of traces dataframe: {} Megabytes."
print(size_message.format(round(totalMegabytes)))

Size of traces dataframe: 0.0 Megabytes.


In [137]:
# Filter traces: keep every row whose call_type is not delegatecall/callcode/staticcall.
# Rows without a call_type (None/NaN) are kept as well, because isin() is False for
# missing values. A boolean mask replaces the row-by-row iterrows() rebuild; it is
# faster and keeps the columns even when no row survives the filter.
excluded_call_types = ['delegatecall', 'callcode', 'staticcall']
traces = traces_sql[~traces_sql["call_type"].isin(excluded_call_types)].copy()
# Format traces: parse the timestamps so that later cells can do datetime arithmetic.
traces["block_timestamp"] = pd.to_datetime(traces["block_timestamp"])

In [138]:
# Recompute the wei sent/received totals per address in pandas (reference for the SQL view).
# The index is built from the unfiltered traces_sql, the sums from the filtered `traces`.
wei_py = pd.DataFrame(index=set(traces_sql["from_address"].unique()) | set(traces_sql["to_address"].unique()))
wei_py["wei_received"] = traces.groupby("to_address").apply(lambda row: row["value"].sum())
wei_py["wei_sent"] = traces.groupby("from_address").apply(lambda row: row["value"].sum())
# Normalise: addresses without a sum get 0, the index is named "address" and the
# rows are aligned with (and restricted to) the addresses of the SQL result.
wei_py["wei_received"] = wei_py["wei_received"].fillna(0.)
wei_py["wei_sent"] = wei_py["wei_sent"].fillna(0.)
wei_py.index = wei_py.index.rename("address")
wei_py = wei_py.reindex(wei_sql.index, fill_value=0.)

In [139]:
# Preview the pandas result and the SQL result one after the other.
wei_py.head()
wei_sql.head()

Unnamed: 0_level_0,wei_received,wei_sent
address,Unnamed: 1_level_1,Unnamed: 2_level_1
0x0b076760c3eb9e28e2cff7b6b17d177e0f0d1dd8,0,87999958000000000000
0x6f48a3e70f0251d1e83a989e62aaa2281a6d5380,87999958000000000000,0
0xfdac42da5632d9083dc132fb52451524f9b9551e,10571854500000000000,0
0xc098b2a3aa256d2140208c3de6543aaef5cd3a94,0,10000000000000000000
0xcadc64f9f974f810b68a714e9001ce4800719b86,10000000000000000000,0


Unnamed: 0_level_0,wei_received,wei_sent
address,Unnamed: 1_level_1,Unnamed: 2_level_1
0x0b076760c3eb9e28e2cff7b6b17d177e0f0d1dd8,0,87999958000000000000
0x6f48a3e70f0251d1e83a989e62aaa2281a6d5380,87999958000000000000,0
0xfdac42da5632d9083dc132fb52451524f9b9551e,10571854500000000000,0
0xc098b2a3aa256d2140208c3de6543aaef5cd3a94,0,10000000000000000000
0xcadc64f9f974f810b68a714e9001ce4800719b86,10000000000000000000,0


In [140]:
# Raises an AssertionError if the two frames differ; the message is only printed on success.
pd.testing.assert_frame_equal(wei_py, wei_sql)
print("weiSent, weiReceived Test succeeded!!")

weiSent, weiReceived Test succeeded!!


### feature view "tx"

In [141]:
# Derive the id of the "tx" view from the sample configuration.
current_view_name = "tx"
table_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(
    current_view_name,
    ADDRESS_SELECTION(address_selection).name,
    number_of_addresses,
    re.sub(r'[-.+: ]', '_', observation_period_start),
    re.sub(r'[-.+: ]', '_', observation_period_end),
)

# Download the complete view and index it by address.
sql = """
    SELECT *
    FROM `masterarbeit-245718.ethereum_us.{table_id_features}` 
""".format(table_id_features=table_ids["tx"])

query_job = client.query(sql)
tx_sql = query_job.result().to_dataframe().set_index("address")

In [142]:
# res3 = res2.copy()
# res3 = res3.fillna(0.)
# res3 = res3.sort_values(by="address", ascending=False)
# tx_sent_received_result_sql = res3.set_index("address", drop=True)

In [143]:
# Recompute the number of received/sent traces per address in pandas.
tx_py = pd.DataFrame(index=set(traces["from_address"].unique()) | set(traces["to_address"].unique()))
# count() counts the non-null entries of the `value` column within each group.
tx_py["number_of_tx_received"] = traces.groupby('to_address').count().value
tx_py["number_of_tx_sent"] = traces.groupby('from_address').count().value
# Addresses that only appear on one side have no count on the other side -> 0.
tx_py = tx_py.fillna(0)
tx_py = tx_py.astype("int")
tx_py.index = tx_py.index.rename("address")
# NOTE(review): rows are aligned with wei_sql.index (not tx_sql.index) and fill_value is a
# float on integer columns -- confirm both are intended.
tx_py = tx_py.reindex(wei_sql.index, fill_value=0.)

In [144]:
# Preview the pandas result and the SQL result one after the other.
tx_py.head()
tx_sql.head()

Unnamed: 0_level_0,number_of_tx_received,number_of_tx_sent
address,Unnamed: 1_level_1,Unnamed: 2_level_1
0x0b076760c3eb9e28e2cff7b6b17d177e0f0d1dd8,0,1
0x6f48a3e70f0251d1e83a989e62aaa2281a6d5380,1,0
0xfdac42da5632d9083dc132fb52451524f9b9551e,2,0
0xc098b2a3aa256d2140208c3de6543aaef5cd3a94,0,1
0xcadc64f9f974f810b68a714e9001ce4800719b86,1,0


Unnamed: 0_level_0,number_of_tx_received,number_of_tx_sent
address,Unnamed: 1_level_1,Unnamed: 2_level_1
0x0b076760c3eb9e28e2cff7b6b17d177e0f0d1dd8,0,1
0x6f48a3e70f0251d1e83a989e62aaa2281a6d5380,1,0
0xfdac42da5632d9083dc132fb52451524f9b9551e,2,0
0xc098b2a3aa256d2140208c3de6543aaef5cd3a94,0,1
0xcadc64f9f974f810b68a714e9001ce4800719b86,1,0


In [145]:
# Raises an AssertionError if the two frames differ; the message is only printed on success.
pd.testing.assert_frame_equal(tx_py, tx_sql)
print("txSent, txReceived Test succeeded !!")

txSent, txReceived Test succeeded !!


In [146]:
# features = balance_result_sql.join(tx_sent_received_result_sql,how="left")
# features = features.reset_index()
# features.to_gbq('ethereum_us.sample_features', if_exists="replace")

### feature view avg_wei

Benötigt keine Tests, da der SQL Befehl sehr einfach aufgebaut ist.

### feature view avg_time_diff_sent_tx

In [147]:
# Derive the id of the "avg_time_diff_sent_tx" view from the sample configuration.
current_view_name = "avg_time_diff_sent_tx"
table_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(
    current_view_name,
    ADDRESS_SELECTION(address_selection).name,
    number_of_addresses,
    re.sub(r'[-.+: ]', '_', observation_period_start),
    re.sub(r'[-.+: ]', '_', observation_period_end),
)

# Download the complete view and index it by address.
sql = """
    SELECT *
    FROM `masterarbeit-245718.ethereum_us.{table_id_features}` 
""".format(table_id_features=table_ids["avg_time_diff_sent_tx"])

query_job = client.query(sql)
avg_time_diff_sent_tx_sql = query_job.result().to_dataframe().set_index("address")

In [148]:
# Recompute the average time between sent transactions per address in pandas.
# `traces` is already filtered by call_type; the additional condition here is status == 1.
traces_tmp = [row for (index, row) in traces.iterrows() if (row.call_type not in ['delegatecall', 'callcode', 'staticcall'] or row.call_type == None) and row.status == 1]
traces_tmp = pd.DataFrame(traces_tmp)
# Latest and earliest block_timestamp per sender, and the span between them.
tmp1 = traces_tmp.groupby("from_address").max().block_timestamp
tmp2 = traces_tmp.groupby("from_address").min().block_timestamp
tmp3 = tmp1 - tmp2
tmp3 = tmp3.rename("seconds_diff")
# The right join keeps exactly the senders that occur in tmp3.
# NOTE(review): number_of_tx_sent comes from tx_py, which counts traces regardless of
# status, while the time span only uses status == 1 traces -- confirm this matches the view.
avg_time_diff_sent_tx_py = tx_py.join(tmp3, how="right").drop("number_of_tx_received", axis=1)
avg_time_diff_sent_tx_py = avg_time_diff_sent_tx_py.fillna(0.)
# Average gap = total span / (number of sent transactions - 1). The denominator is 0 for
# a sender with a single transaction; the missing result is replaced by a zero timedelta below.
avg_time_diff_sent_tx_py["avg_time_diff_sent_tx"] = avg_time_diff_sent_tx_py["seconds_diff"] / (avg_time_diff_sent_tx_py["number_of_tx_sent"] - 1)
avg_time_diff_sent_tx_py["avg_time_diff_sent_tx"] = avg_time_diff_sent_tx_py["avg_time_diff_sent_tx"].fillna(datetime.timedelta(0))
# Align with the addresses of the SQL sample and keep only the feature column.
avg_time_diff_sent_tx_py = avg_time_diff_sent_tx_py.reindex(wei_sql.index, fill_value=0.)
avg_time_diff_sent_tx_py.index = avg_time_diff_sent_tx_py.index.rename("address")
avg_time_diff_sent_tx_py = avg_time_diff_sent_tx_py.drop(["number_of_tx_sent", "seconds_diff"],axis=1)
# Convert the timedeltas to seconds (float) for the comparison with the SQL result.
avg_time_diff_sent_tx_py["avg_time_diff_sent_tx"] = [ts.total_seconds() for ts in avg_time_diff_sent_tx_py["avg_time_diff_sent_tx"]]

  result = self._data / other


In [149]:
# Preview the pandas result and the SQL result one after the other.
avg_time_diff_sent_tx_py.head()
avg_time_diff_sent_tx_sql.head()

Unnamed: 0_level_0,avg_time_diff_sent_tx
address,Unnamed: 1_level_1
0x0b076760c3eb9e28e2cff7b6b17d177e0f0d1dd8,0.0
0x6f48a3e70f0251d1e83a989e62aaa2281a6d5380,0.0
0xfdac42da5632d9083dc132fb52451524f9b9551e,0.0
0xc098b2a3aa256d2140208c3de6543aaef5cd3a94,0.0
0xcadc64f9f974f810b68a714e9001ce4800719b86,0.0


Unnamed: 0_level_0,avg_time_diff_sent_tx
address,Unnamed: 1_level_1
0x0b076760c3eb9e28e2cff7b6b17d177e0f0d1dd8,0.0
0x6f48a3e70f0251d1e83a989e62aaa2281a6d5380,0.0
0xfdac42da5632d9083dc132fb52451524f9b9551e,0.0
0xc098b2a3aa256d2140208c3de6543aaef5cd3a94,0.0
0xcadc64f9f974f810b68a714e9001ce4800719b86,0.0


In [150]:
# Raises an AssertionError if the two frames differ; the message is only printed on success.
pd.testing.assert_frame_equal(avg_time_diff_sent_tx_py, avg_time_diff_sent_tx_sql)
print("avg_time_diff_sent_tx Test succeeded !!")

avg_time_diff_sent_tx Test succeeded !!


# feature view "mined_blocks"

In [151]:
# Derive the id of the "mined_blocks" view from the sample configuration.
current_view_name = "mined_blocks"
table_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(
    current_view_name,
    ADDRESS_SELECTION(address_selection).name,
    number_of_addresses,
    re.sub(r'[-.+: ]', '_', observation_period_start),
    re.sub(r'[-.+: ]', '_', observation_period_end),
)

# Download the complete view and index it by address.
sql = """
    SELECT *
    FROM `masterarbeit-245718.ethereum_us.{table_id_features}` 
""".format(table_id_features=table_ids[current_view_name])

query_job = client.query(sql)
mined_blocks_sql = query_job.result().to_dataframe().set_index("address")

# feature view "stddev_received_tx"

In [152]:
# Derive the id of the "stddev_received_tx" view from the sample configuration.
current_view_name = "stddev_received_tx"
table_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(
    current_view_name,
    ADDRESS_SELECTION(address_selection).name,
    number_of_addresses,
    re.sub(r'[-.+: ]', '_', observation_period_start),
    re.sub(r'[-.+: ]', '_', observation_period_end),
)

# Download the complete view and index it by address.
sql = """
    SELECT *
    FROM `masterarbeit-245718.ethereum_us.{table_id_features}` 
""".format(table_id_features=table_ids[current_view_name])

query_job = client.query(sql)
stddev_received_tx_sql = query_job.result().to_dataframe().set_index("address")
# handle 0 values: every 0.0 is replaced by the largest value of the column
stddev_received_tx_sql[current_view_name] = stddev_received_tx_sql[current_view_name].replace(
    to_replace=0.0,
    value=max(stddev_received_tx_sql[current_view_name]),
)

In [153]:
# Recompute the standard deviation of the gaps between received transactions in pandas.
# Sorting by timestamp makes diff() within each group run in chronological order.
traces_tmp = traces.sort_values(by="block_timestamp", ascending=True)
# NOTE(review): block_timestamp was already parsed when `traces` was built; this looks redundant.
traces_tmp["block_timestamp"] = pd.to_datetime(traces["block_timestamp"])
# Per receiving address: standard deviation of the differences between consecutive timestamps.
stddev_received_tx_py = traces_tmp.groupby("to_address").apply(lambda df: df["block_timestamp"].diff().std() )
stddev_received_tx_py = pd.DataFrame(stddev_received_tx_py, columns=["stddev_received_tx"])
stddev_received_tx_py = stddev_received_tx_py.reindex(wei_sql.index)
# Missing values are filled with the column maximum, then converted to seconds.
stddev_received_tx_py = stddev_received_tx_py.fillna(stddev_received_tx_py.max())
stddev_received_tx_py["stddev_received_tx"] = [td.total_seconds() for td in stddev_received_tx_py["stddev_received_tx"]]
# Zero values are replaced by the maximum as well, mirroring the treatment of the SQL result.
stddev_received_tx_py = stddev_received_tx_py.replace(to_replace=0.0, value=stddev_received_tx_py.max())

In [154]:
# Preview the pandas result and the SQL result one after the other.
stddev_received_tx_py.head()
stddev_received_tx_sql.head()

Unnamed: 0_level_0,stddev_received_tx
address,Unnamed: 1_level_1
0x0b076760c3eb9e28e2cff7b6b17d177e0f0d1dd8,8.020806
0x6f48a3e70f0251d1e83a989e62aaa2281a6d5380,8.020806
0xfdac42da5632d9083dc132fb52451524f9b9551e,8.020806
0xc098b2a3aa256d2140208c3de6543aaef5cd3a94,8.020806
0xcadc64f9f974f810b68a714e9001ce4800719b86,8.020806


Unnamed: 0_level_0,stddev_received_tx
address,Unnamed: 1_level_1
0x0b076760c3eb9e28e2cff7b6b17d177e0f0d1dd8,8.020806
0x6f48a3e70f0251d1e83a989e62aaa2281a6d5380,8.020806
0xfdac42da5632d9083dc132fb52451524f9b9551e,8.020806
0xc098b2a3aa256d2140208c3de6543aaef5cd3a94,8.020806
0xcadc64f9f974f810b68a714e9001ce4800719b86,8.020806


In [155]:
# Compare the values only (as lists, so the index is not checked) to two decimal places.
np.testing.assert_almost_equal(list(stddev_received_tx_py["stddev_received_tx"]), list(stddev_received_tx_sql["stddev_received_tx"]), decimal=2)
print("stddev Test succeeded !!")

stddev Test succeeded !!


# feature view "active_months"

In [156]:
# Derive the id of the "active_months" view from the sample configuration.
current_view_name = "active_months"
table_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(
    current_view_name,
    ADDRESS_SELECTION(address_selection).name,
    number_of_addresses,
    re.sub(r'[-.+: ]', '_', observation_period_start),
    re.sub(r'[-.+: ]', '_', observation_period_end),
)

# Download the complete view and index it by address.
sql = """
    SELECT *
    FROM `masterarbeit-245718.ethereum_us.{table_id_features}` 
""".format(table_id_features=table_ids[current_view_name])

query_job = client.query(sql)
active_months_sql = query_job.result().to_dataframe().set_index("address")

In [178]:
from datetime import timedelta

def address_was_active(address, month):
    """Tell whether `address` sent or received a trace in the calendar month of `month`.

    Reads the notebook-level `traces` frame (columns `from_address`, `to_address`
    and `block_timestamp`).

    Parameters
    ----------
    address : value compared against `from_address` / `to_address`.
    month : object with a `strftime` method; only its year and month are used.

    Returns
    -------
    bool
        True if at least one matching trace falls into that year-month.
    """
    if traces.empty:
        return False
    # Select the rows of this address with one vectorised comparison instead of
    # walking the whole frame row by row on every call.
    involved = (traces["from_address"] == address) | (traces["to_address"] == address)
    month_key = month.strftime("%Y-%m")
    return any(
        ts.strftime("%Y-%m") == month_key
        for ts in traces.loc[involved, "block_timestamp"]
    )

In [None]:
# Count, for every sampled address, the months in which it appears in `traces`.
active_months_py = {}
min_ts = traces["block_timestamp"].min().to_pydatetime()
max_ts = traces["block_timestamp"].max().to_pydatetime()

# Month starts ("MS") between the first trace and 31 days after the last one.
month_starts = pd.date_range(start=min_ts, end=(max_ts + timedelta(days=31)), freq="MS")
months = [month.to_pydatetime() for month in month_starts]

for address in wei_sql.index:
    for month in months:
        if address_was_active(address, month):
            active_months_py[address] = active_months_py.get(address, 0) + 1

In [None]:
# Turn the {address: count} mapping into a one-column frame aligned with the SQL result.
active_months_py = pd.Series(active_months_py, name="active_months").to_frame()
active_months_py = active_months_py.reindex(active_months_sql.index)
# Addresses without any activity are missing after reindex(); the NaNs turn the column
# into floats, so cast back to integers after filling (as the deprecated version of this
# test does). NOTE(review): assumes the SQL view returns an int64 column -- confirm.
active_months_py = active_months_py.fillna(0).astype("int64")

In [None]:
# Preview the pandas result and the SQL result one after the other.
active_months_py.head()
active_months_sql.head()

In [None]:
# Raises an AssertionError if the two frames differ; the message is only printed on success.
pd.testing.assert_frame_equal(active_months_py, active_months_sql)
print("active months Test succeeded!!")

# Deprecated

# Erstellen der traces_usd Tabelle

In [65]:
import os 
from datetime import datetime

In [66]:
# Daily ETH prices; the average price of a day is approximated by the midpoint
# of the day's high and low.
eth_usd_res = pd.read_csv('../data/eth_usd.csv')
eth_usd = pd.DataFrame({
    # Vectorised parsing; does not depend on which `datetime` name is currently in scope.
    "usd_eth_timestamp": pd.to_datetime(eth_usd_res["Date"], format="%Y-%m-%d"),
    "avg_usd_eth": (eth_usd_res["High"] + eth_usd_res["Low"]) / 2,
})
eth_usd.tail()

Unnamed: 0,usd_eth_timestamp,avg_usd_eth
1645,2020-02-07,217.722564
1646,2020-02-08,220.98719
1647,2020-02-09,226.423241
1648,2020-02-10,223.632332
1649,2020-02-11,222.864899


In [67]:
# Upload the price table to BigQuery, replacing an existing table of the same name.
eth_usd.to_gbq('ethereum_us.usd_eth', if_exists="replace")

1it [00:04,  4.10s/it]


In [68]:
%%bigquery traces_usd --project masterarbeit-245718 --verbose 

-- Attach the daily average ETH/USD price to every trace by matching on the UTC calendar day.
-- NOTE(review): value_usd multiplies the raw `value` by the price; if `value` is denominated
-- in wei, the product still has to be scaled to ether to be a USD amount -- confirm.
select from_address, to_address, value, status, call_type, trace_type, block_timestamp, avg_usd_eth * value as value_usd from `masterarbeit-245718.ethereum_us.traces` as traces left join `masterarbeit-245718.ethereum_us.usd_eth` as usd_eth 
  on (TIMESTAMP_TRUNC(usd_eth.usd_eth_timestamp, DAY, 'UTC') = TIMESTAMP_TRUNC(traces.block_timestamp, DAY, 'UTC'))

Executing query with job ID: fae8d5bf-6b53-487c-98c8-115b771f39c6
Query executing: 1.70s
Query complete after 2.64s


In [69]:
# Upload the joined traces to BigQuery, replacing an existing table of the same name.
traces_usd.to_gbq('ethereum_us.traces_usd', if_exists="replace")

1it [00:04,  4.12s/it]


# Total USD received / sent 

Analog zu wei.

# AVG USD received / sent

Analog zu wei.

# Monthly USD received / sent

Analog zu wei.

# Active Months

In [70]:
%%bigquery active_months_result_sql --project masterarbeit-245718 --verbose 

-- Number of calendar months (UTC) in which each sample address sent or received a trace.
-- traces_clean: successful traces, excluding delegatecall/callcode/staticcall.
with traces_clean as (
    select * from `masterarbeit-245718.ethereum_us.traces` where 
    status = 1 and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
    
-- tx_received: received traces per (month, receiver).
), tx_received as (
    select 
        TIMESTAMP_TRUNC(block_timestamp, MONTH, 'UTC') as month, 
        to_address,
        count(*) as number_tx_received
    from traces_clean
    group by TIMESTAMP_TRUNC(block_timestamp, MONTH, 'UTC'), to_address
    
-- tx_sent: sent traces per (month, sender).
), tx_sent as (
    select 
        TIMESTAMP_TRUNC(block_timestamp, MONTH, 'UTC') as month, 
        from_address,
        count(*) as number_tx_sent
    from traces_clean
    group by TIMESTAMP_TRUNC(block_timestamp, MONTH, 'UTC'), from_address
    
-- monthly_tx: one row per (address, month); the full join keeps months in which an
-- address only sent or only received, and the CASE expressions pick the non-null side.
), monthly_tx as (
    select 
        CASE  
          WHEN tx_sent.from_address IS NOT NULL THEN tx_sent.from_address
          WHEN tx_received.to_address IS NOT NULL THEN tx_received.to_address
        END AS address,
        CASE  
          WHEN tx_sent.month IS NOT NULL THEN tx_sent.month
          WHEN tx_received.month IS NOT NULL THEN tx_received.month
        END AS month,
        ifnull(number_tx_sent,0) as number_tx_sent, 
        ifnull(number_tx_received,0) as number_tx_received 
    from tx_sent full join tx_received 
        on (tx_sent.from_address = tx_received.to_address and tx_sent.month = tx_received.month)

-- active_months_view: months with at least one sent or received trace, per address.
), active_months_view as (
    select 
        address, 
        countif(number_tx_sent > 0 or number_tx_received > 0) as active_months 
    from monthly_tx group by address order by address ASC 
)

-- Sample addresses without any activity get 0 via the left join + ifnull.
select 
    address, 
    ifnull(active_months,0) as active_months 
from ethereum_us.sample_addresses left join active_months_view using(address)



Executing query with job ID: 80656125-d495-4b9d-97fc-73b9f9618a8d
Query executing: 1.07s
Query complete after 2.12s


#### Tests für active_months feature

In [71]:
# Keep only successful traces (status == 1) whose call_type is not
# delegatecall/callcode/staticcall; rows without a call_type are kept (isin() is
# False for missing values). The boolean mask replaces the row-by-row iterrows()
# rebuild and keeps the columns even when no row survives.
traces = traces[
    ~traces["call_type"].isin(['delegatecall', 'callcode', 'staticcall'])
    & (traces["status"] == 1)
].copy()

In [72]:
def address_was_active(address, month):
    """Tell whether `address` sent or received a trace in the calendar month of `month`.

    Reads the notebook-level `traces` frame (columns `from_address`, `to_address`
    and `block_timestamp`).

    Parameters
    ----------
    address : value compared against `from_address` / `to_address`.
    month : object with a `strftime` method; only its year and month are used.

    Returns
    -------
    bool
        True if at least one matching trace falls into that year-month.
    """
    if traces.empty:
        return False
    # Select the rows of this address with one vectorised comparison instead of
    # walking the whole frame row by row on every call.
    involved = (traces["from_address"] == address) | (traces["to_address"] == address)
    month_key = month.strftime("%Y-%m")
    return any(
        ts.strftime("%Y-%m") == month_key
        for ts in traces.loc[involved, "block_timestamp"]
    )
        

In [73]:
from datetime import timedelta

# Count, for every sample address, the months in which it appears in `traces`.
# NOTE(review): `sample_addresses` is not defined in any cell visible here -- confirm its origin.
active_months = {}

min_ts = traces["block_timestamp"].min().to_pydatetime()
max_ts = traces["block_timestamp"].max().to_pydatetime()

# Month starts ("MS") between the first trace and 31 days after the last one.
month_starts = pd.date_range(start=min_ts, end=(max_ts + timedelta(days=31)), freq="MS")
months = [month.to_pydatetime() for month in month_starts]

for index, address in sample_addresses.iterrows():
    for month in months:
        if address_was_active(address[0], month):
            active_months[address[0]] = active_months.get(address[0], 0) + 1

In [74]:
# Shape the {address: count} mapping like the SQL result: columns "address" and
# "active_months", rows in the order of wei_sql.index.
s = pd.Series(active_months)
df = pd.DataFrame(s)
df = df.reindex(wei_sql.index)
df = df.reset_index()
df.columns=["address", "active_months"]
# Addresses without activity are NaN after reindex(); fill with 0 and cast back to int.
df = df.fillna(0.)
df["active_months"] = df["active_months"].astype(int)
active_months_result_py  = df
# Raises an AssertionError if the frames differ; the message is only printed on success.
pd.testing.assert_frame_equal(active_months_result_py, active_months_result_sql)
print("active months Test succeeded!!")

active months Test succeeded!!


# monthly_wei_sent, monthly_wei_recd

In [75]:
# Index the active-months result by address and attach it to the balance features.
# NOTE(review): `balance_result_sql` is not defined in any cell visible here -- confirm its origin.
# Re-running this cell fails because "address" is already the index after the first run.
active_months_result_sql = active_months_result_sql.set_index("address")
features = balance_result_sql.join(active_months_result_sql, how="left")

In [76]:
# Upload the joined features, replacing an existing table of the same name.
features.to_gbq('ethereum_us.sample_features', if_exists="replace")

1it [00:03,  3.18s/it]


In [77]:
%%bigquery --project masterarbeit-245718 --verbose 

-- Average wei sent/received per active month; addresses without an active month get 0.
-- The CASE expressions have no ELSE branch, so any other active_months value yields NULL.
select 
    *,
    CASE 
        when active_months > 0 THEN weiSent / active_months 
        when active_months = 0 THEN 0 
    END as monthly_wei_sent,
    CASE 
        when active_months > 0 THEN weiReceived / active_months 
        when active_months = 0 THEN 0 
    END as monthly_wei_recd
from `ethereum_us.sample_features`

Executing query with job ID: aad97238-74fb-4913-bd16-80202f60496a
Query executing: 0.71s
Query complete after 1.22s


Unnamed: 0,weiReceived,weiSent,balance,active_months,monthly_wei_sent,monthly_wei_recd
0,0.0,0.0,0.0,0,0.0,0.0
1,0.0,0.0,0.0,0,0.0,0.0
2,0.0,0.0,0.0,0,0.0,0.0
3,0.0,0.0,0.0,0,0.0,0.0
4,0.0,0.0,0.0,0,0.0,0.0
5,0.0,0.0,0.0,0,0.0,0.0
6,0.0,0.0,0.0,0,0.0,0.0
7,0.0,0.0,0.0,0,0.0,0.0
8,0.0,0.0,0.0,0,0.0,0.0
9,0.0,0.0,0.0,0,0.0,0.0


# monthly_outgoing_txns, monthly_incoming_txns

In [78]:
# Attach the active-months result to the transaction-count features.
# NOTE(review): `tx_sent_received_result_sql` is only assigned in commented-out code in this notebook -- confirm its origin.
features = tx_sent_received_result_sql.join(active_months_result_sql, how="left")

In [79]:
# Upload the joined features, replacing an existing table of the same name.
features.to_gbq('ethereum_us.sample_features', if_exists="replace")

1it [00:05,  5.54s/it]


In [80]:
%%bigquery --project masterarbeit-245718 --verbose 

-- Average number of sent/received transactions per active month; addresses without an
-- active month get 0. The CASE expressions have no ELSE branch, so any other
-- active_months value yields NULL.
-- NOTE(review): the stored output of this cell shows differently named count columns
-- than the ones referenced here -- confirm the column names of sample_features.
select 
    *,
    CASE 
        when active_months > 0 THEN number_of_tx_sent / active_months 
        when active_months = 0 THEN 0 
    END as monthly_outgoing_txns,
    CASE 
        when active_months > 0 THEN number_of_tx_received / active_months 
        when active_months = 0 THEN 0 
    END as monthly_incoming_txns
from `ethereum_us.sample_features`

Executing query with job ID: 681517e1-7c08-460b-9eae-4c0a3489df44
Query executing: 0.89s
Query complete after 1.42s


Unnamed: 0,numberOfTranscationsReceived,numberOfTranscationsSent,active_months,monthly_outgoing_txns,monthly_incoming_txns
0,0.0,0.0,0,0.0,0.0
1,0.0,0.0,0,0.0,0.0
2,0.0,0.0,0,0.0,0.0
3,0.0,0.0,0,0.0,0.0
4,0.0,0.0,0,0.0,0.0
5,0.0,0.0,0,0.0,0.0
6,0.0,0.0,0,0.0,0.0
7,0.0,0.0,0,0.0,0.0
8,0.0,0.0,0,0.0,0.0
9,0.0,0.0,0,0.0,0.0


# number_of_contracts_created

In [81]:
%%bigquery number_of_contracts_created_res_sql --project masterarbeit-245718 --verbose 

-- Number of successful (status = 1) traces of type "create" per sending address,
-- ordered by address.
select 
    from_address, count(*) as number_of_contracts_created
from `masterarbeit-245718.ethereum_us.traces`
    where
        status = 1 and
        trace_type = "create"
group by from_address
order by from_address

Executing query with job ID: 22d5a532-8447-4226-93f3-9ac0754473ba
Query executing: 1.14s
Query complete after 1.74s


Anmerkung: Es werden sowohl internal als auch external Transaktionen berücksichtigt, welche einen contract erstellt haben. 

In [82]:
# Recompute the number of successful "create" traces per sender in pandas.
# The boolean mask keeps the columns even when no create trace exists, so the
# groupby below no longer fails on an empty selection.
create_traces = traces[(traces["trace_type"] == "create") & (traces["status"] == 1)]
number_of_contracts_created_res_python = (
    create_traces.groupby("from_address")
    .size()
    .reset_index(name="number_of_contracts_created")
    .sort_values(by="from_address")
)
# Only announce success after the comparison with the SQL result has actually passed.
pd.testing.assert_frame_equal(number_of_contracts_created_res_sql, number_of_contracts_created_res_python)
print("Tests für Merkmal 'number_of_contracts_created' erfolgreich ausgeführt.")

Tests für Merkmal 'number_of_contracts_created' erfolgreich ausgeführt.


In [83]:
# Raises an AssertionError if the SQL result and the pandas result differ.
pd.testing.assert_frame_equal(number_of_contracts_created_res_sql, number_of_contracts_created_res_python)

# sample_oneday_traces view

In [64]:
import random
import time

def random_date(start="2018-1-1", end="2020-1-1", format='%Y-%m-%d', prop=None):
    """Return a date between `start` and `end`, formatted with `format`.

    Parameters
    ----------
    start, end : str
        Interval bounds, parsed with `format`.
    format : str
        strptime/strftime pattern used for parsing the bounds and formatting the result.
    prop : float, optional
        Position inside the interval (0 -> start, 1 -> end). When omitted, a new
        random position is drawn on every call. A default of `random.random()` in
        the signature would be evaluated only once, when the function is defined,
        and every call would return the same date.

    Returns
    -------
    str
        The selected point in time, formatted with `format` (local time).
    """
    if prop is None:
        prop = random.random()

    stime = time.mktime(time.strptime(start, format))
    etime = time.mktime(time.strptime(end, format))

    ptime = stime + prop * (etime - stime)

    return time.strftime(format, time.localtime(ptime))

In [65]:
from google.cloud import bigquery
import simplejson as json

# Create a view over one randomly chosen day of the public Ethereum traces table,
# unless a view of that name already exists in the destination dataset.
client = bigquery.Client()

destination_dataset_id = "ethereum_us"
destination_table_id = "sample_oneday_traces_view"

source_project = "bigquery-public-data"
source_dataset_id = "crypto_ethereum"
source_table_id = "traces"

# `dataset_id` was never defined; the tables to check are those of the destination dataset.
tables = client.list_tables(destination_dataset_id)  # Make an API request.

# Separate name, so that the `table_ids` dict of the test section is not overwritten by a list.
existing_table_ids = [t.table_id for t in tables]

if destination_table_id not in existing_table_ids:

    view_ref = client.dataset(destination_dataset_id).table(destination_table_id)
    view = bigquery.Table(view_ref)

    view.view_query = """SELECT * from `{}.{}.{}` 
        where DATE(block_timestamp) = '{}'""".format(source_project, source_dataset_id, source_table_id, random_date())

    view = client.create_table(view)  # API request

    print("Successfully created view at {}".format(view.full_table_id))
else:
    print("View '{}' already exists.".format(destination_table_id))


View 'sample_oneday_traces_view' already exists.


Successfully created view at masterarbeit-245718:ethereum_us.sample_oneday_traces_view


# contract_tx feature

In [66]:
# Load the BigQuery IPython extension and point the Google client libraries at the
# service-account key file.
# NOTE(review): both steps are already done in the setup cell at the top of the notebook.
%load_ext google.cloud.bigquery
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../secrets/bigquery-service-account.json"

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [67]:
%%bigquery contract_tx_res_sql --project masterarbeit-245718 --verbose 

-- Number of successful contract-related traces per sending address: calls that carry
-- input data, contract creations and self-destructs ("suicide").
-- AND binds tighter than OR, so the trace-type alternatives are parenthesised;
-- without the parentheses `status = 1` only applied to the "call" branch and failed
-- create/suicide traces were counted as well.
select 
    from_address, count(*) as contract_tx
from `masterarbeit-245718.ethereum_us.sample_oneday_traces_view`
    where
        status = 1 and (
            (input != "None" and input != "0x" and trace_type = "call") or 
            trace_type = "create" or
            trace_type = "suicide"
        )
group by from_address
order by from_address

Executing query with job ID: 7fc03aaf-f9b4-499c-a0be-59255b42d812
Query executing: 4.27s
Query complete after 5.19s


# test: contract_tx feature

Todo: Beim Erstellen des views die Daten lokal speichern, so dass ich im Zuge der Tests schnell darauf zugreifen kann.

# sample_daily_token_transfers view

In [71]:
import random
import time

def random_date(start="2018-1-1", end="2020-1-1", format='%Y-%m-%d', prop=None):
    """Return a date between `start` and `end`, formatted with `format`.

    Parameters
    ----------
    start, end : str
        Interval bounds, parsed with `format`.
    format : str
        strptime/strftime pattern used for parsing the bounds and formatting the result.
    prop : float, optional
        Position inside the interval (0 -> start, 1 -> end). When omitted, a new
        random position is drawn on every call. A default of `random.random()` in
        the signature would be evaluated only once, when the function is defined,
        and every call would return the same date.

    Returns
    -------
    str
        The selected point in time, formatted with `format` (local time).
    """
    if prop is None:
        prop = random.random()

    stime = time.mktime(time.strptime(start, format))
    etime = time.mktime(time.strptime(end, format))

    ptime = stime + prop * (etime - stime)

    return time.strftime(format, time.localtime(ptime))

In [72]:
from google.cloud import bigquery
import simplejson as json

# Create a view over one randomly chosen day of the public Ethereum token_transfers
# table, unless a view of that name already exists in the destination dataset.
client = bigquery.Client()

destination_dataset_id = "ethereum_us"
destination_table_id = "sample_daily_token_transfers_view"

source_project = "bigquery-public-data"
source_dataset_id = "crypto_ethereum"
source_table_id = "token_transfers"

# `dataset_id` was never defined; the tables to check are those of the destination dataset.
tables = client.list_tables(destination_dataset_id)  # Make an API request.

# Separate name, so that the `table_ids` dict of the test section is not overwritten by a list.
existing_table_ids = [t.table_id for t in tables]

if destination_table_id not in existing_table_ids:

    view_ref = client.dataset(destination_dataset_id).table(destination_table_id)
    view = bigquery.Table(view_ref)

    view.view_query = """SELECT * from `{}.{}.{}` 
        where DATE(block_timestamp) = '{}'""".format(source_project, source_dataset_id, source_table_id, random_date())

    view = client.create_table(view)  # API request

    print("Successfully created view at {}".format(view.full_table_id))
else:
    print("View '{}' already exists.".format(destination_table_id))

Successfully created view at masterarbeit-245718:ethereum_us.sample_daily_token_transfers_view


# diff_token_used feature

In [73]:
# Load the BigQuery IPython extension and point the Google client libraries at the
# service-account key file.
# NOTE(review): both steps are already done in the setup cell at the top of the notebook.
%load_ext google.cloud.bigquery
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../secrets/bigquery-service-account.json"

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [76]:
%%bigquery diff_token_used_res_sql --project masterarbeit-245718 --verbose 

-- Number of distinct token contracts each sending address transferred, largest first.
select 
    from_address, count(DISTINCT token_address) as diff_token_used
from `masterarbeit-245718.ethereum_us.sample_daily_token_transfers_view`
group by from_address
order by diff_token_used DESC

Executing query with job ID: eca0f375-30e9-40db-983c-bd198f3f54b2
Query executing: 6.58s
Query complete after 7.29s


# test: diff_token_used feature

# tmp

In [None]:
from google.cloud import bigquery

# Materialise one randomly chosen day of the public traces table into a project table.
client = bigquery.Client()

# Fully qualified id of the destination table.
table_id = "masterarbeit-245718.ethereum_us.sample_oneday_traces"

# Write the query result into the destination table instead of a temporary one.
job_config = bigquery.QueryJobConfig(destination=table_id)

sql = """SELECT * from `bigquery-public-data.crypto_ethereum.traces` 
    where DATE(block_timestamp) = '{}'""".format(random_date()); 

# Start the query, passing in the extra configuration.
# NOTE(review): no write disposition is configured -- confirm how a re-run behaves when the table already exists.
query_job = client.query(sql, job_config=job_config)  # Make an API request.
query_job.result()  # Wait for the job to complete.

print("Query results loaded to the table {}".format(table_id))

In [None]:
import os
from google.cloud import bigquery
import simplejson as json
import pandas as pd
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../secrets/bigquery-service-account.json"

# Load one random day of traces: download it from BigQuery on the first run (and
# cache it locally as JSON), otherwise read the local cache.
filename = "sample_oneday_traces"
if filename not in os.listdir("../data"):
    client = bigquery.Client()
    tableName = 'bigquery-public-data.crypto_ethereum.traces'
    query = "SELECT * from `{}` where DATE(block_timestamp) = '{}'".format(tableName, random_date())
    print("query", query)
    query_job = client.query(query)
    start = time.time()
    # Was the truncated name `query_j`; fetch the result as a DataFrame like the other cells do.
    data = query_job.result().to_dataframe()
    done = time.time()
    elapsed = round(done - start)
    print("Time to retrieve data from BigQuery: {} Seconds.".format(elapsed))
    with open('../data/{}'.format(filename), 'w') as json_file:
        json.dump(data.to_dict(), json_file, default=str)
    # `sample_oneday_traces` is only assigned at the end of the cell, so upload `data`.
    data.to_gbq('ethereum_us.sample_oneday_traces', if_exists="replace")
else:
    print("Loading data from local cache.")
    with open('../data/{}'.format(filename), "r") as file:
        file_content_json = json.load(file)
    data = pd.DataFrame(file_content_json)
    # Timestamps come back from the JSON cache as strings.
    data["block_timestamp"] = pd.to_datetime(data["block_timestamp"])

sample_oneday_traces = data

In [41]:
# Take the first rows of the frame as a small sample and convert it to a nested dict.
test_data = data.head()
test_data = test_data.to_dict()

In [42]:
# Serialise the sample to a JSON string; values JSON cannot encode are written via str().
json_file = json.dumps(test_data, default=str)

In [43]:
import pandas as pd

# Round trip: parse the JSON string back into a DataFrame and convert the
# "block_timestamp" column with pd.to_datetime, then display the result.
res = pd.DataFrame(json.loads(json_file))
res["block_timestamp"] = pd.to_datetime(res["block_timestamp"])
res

Unnamed: 0,transaction_hash,transaction_index,from_address,to_address,value,input,output,trace_type,call_type,reward_type,gas,gas_used,subtraces,trace_address,error,status,block_timestamp,block_number,block_hash,trace_id
0,,,,0x52bc44d5378309ee2abf1539bf71de1b7d7be3b5,2625000000000000000,,,reward,,uncle,,,0,,,1,2018-05-18 10:19:29+00:00,5634392,0x31a5c98db131327f8c34900645b7a771d8984ae441e1...,reward_5634392_1
1,0x13c9ed7b907b1ab9fdeef7375e69799db218184e525a...,44.0,0x52bc44d5378309ee2abf1539bf71de1b7d7be3b5,0x89b254afbc2ac7abcf78b815f503bd3796b996eb,50597595273870200,0x,0x,call,call,,29000.0,0.0,0,,,1,2018-05-18 17:19:42+00:00,5635987,0x8375a74ccc536f250b154e924fdc73a410f5ceddb257...,call_0x13c9ed7b907b1ab9fdeef7375e69799db218184...
2,0xdb12099a815093b87508faf7c3addbd1dded119c215a...,237.0,0xe57a18783640c9fa3c5e8e4d4b4443e2024a7ff9,0xd416bf739a7d148a7fe77b9de3ed28c4fd731167,500000000000000000,0x,0x,call,call,,0.0,0.0,0,,,1,2018-05-18 11:30:27+00:00,5634674,0x2dd210cc1c1d11c68814852fc80ecf9fa1dba839dc1b...,call_0xdb12099a815093b87508faf7c3addbd1dded119...
3,0x254eb95cd7432691cc810cd876bd64b6761d29f91537...,79.0,0x8b86ddcc02fc8b31553ab80189e0b5f670adf35a,0x4f878c0852722b0976a955d68b376e4cd4ae99e5,0,0xa9059cbb000000000000000000000000fde8da4ad2c8...,0x00000000000000000000000000000000000000000000...,call,call,,39218.0,28891.0,0,,,1,2018-05-18 10:26:18+00:00,5634419,0x3d67534cf57e5d441b6ce3fa5c06311c5b5a53fb7ae9...,call_0x254eb95cd7432691cc810cd876bd64b6761d29f...
4,0xb1c49535d277b3b6427e68b5a8cc93fd8d8033e72d8c...,179.0,0x7400ebe97cff0eaa103d1a51f4c1e25445ef5976,0x74eab9dab2c4b453cf5d0cb13ea33ea683e47bc9,2184563270000000000,0x,0x,call,call,,0.0,0.0,0,,,1,2018-05-18 01:09:48+00:00,5632239,0x2f22d60c43faa818febeb55898f6e1badc93e0595b5b...,call_0xb1c49535d277b3b6427e68b5a8cc93fd8d8033e...


In [44]:
import simplejson as json

# Write the whole `data` frame as JSON to the data directory; values the
# encoder cannot handle natively are stringified via default=str.
with open('../data/sample_onedays_traces', 'w') as json_file:
    json.dump(data.to_dict(), json_file, default=str)

In [46]:
# Read the JSON dump back and rebuild the DataFrame; the file handle is only
# needed for parsing, so the frame is constructed after it has been closed.
with open('../data/sample_onedays_traces', "r") as file:
    file_content_json = json.load(file)

data = pd.DataFrame(file_content_json)
data["block_timestamp"] = pd.to_datetime(data["block_timestamp"])

In [49]:
# Display the distinct values of the "input" column as a Python list.
list(data["input"].unique())

[None,
 '0x',
 '0x3f83acff73763a7464656d75727261676500000000000000000000000000000000000000',
 '0xa9059cbb00000000000000000000000032428bb3643542fa00c17a38c79b7730e6754fb80000000000000000000000000000000000000000000000000000000219c426e0',
 '0xa9059cbb00000000000000000000000022a32113f50b89137bad81211b0886722cd3e00c0000000000000000000000000000000000000000000000000000007759574600',
 '0x9fa0f763',
 '0xa9059cbb000000000000000000000000fdb16996831753d5331ff813c29a93c76834a0ad0000000000000000000000000000000000000000000000914878a8c05ee00000',
 '0x8da5cb5b',
 '0x0000000000000000000000000000000000000000',
 '0xa9059cbb000000000000000000000000af211d6629ec2f86f5eea38a04a26f8050ea3a880000000000000000000000000000000000000000000000056bc75e2d63100000',
 '0xa9059cbb000000000000000000000000d09423077580628ddcdda342337280e8de34aa8000000000000000000000000000000000000000000000007a3c1269c2be8de7be',
 '0x2295115b0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

# End of tmp

In [85]:
# %%bigquery sample_onedays_traces --project masterarbeit-245718 --verbose --params $params

# SELECT * from `bigquery-public-data.crypto_ethereum.traces` 
#       where DATE(block_timestamp) = @random_date

Executing query with job ID: e2c11717-058e-4025-bbe4-3962f1335475
Query executing: 61.43s
Query complete after 61.90s


In [87]:
%%bigquery contract_tx_res_sql --project masterarbeit-245718 --verbose 

-- Per from_address, count the traces with status = 1 that are either
--   * a "call" carrying input data (input neither NULL nor the empty "0x"),
--   * a "create", or
--   * a "suicide".
-- SQL has no `None`; NULL must be tested with IS [NOT] NULL, and the two
-- input conditions have to be joined with AND.
-- NOTE(review): the recorded run failed with "Unrecognized name: input";
-- confirm that ethereum_us.traces actually contains an `input` column.
select 
    from_address, count(*) as contract_tx
from `masterarbeit-245718.ethereum_us.traces`
    where
        status = 1 and
        ((input is not null and input != "0x" and trace_type = "call") or 
        trace_type = "create" or
        trace_type = "suicide")
group by from_address
order by from_address

Executing query with job ID: 9d311de5-3192-4e02-b84f-e9b71820b172
Query executing: 0.62s

BadRequest: 400 Unrecognized name: input at [7:11]

In [None]:
# Display the result that the %%bigquery cell above stores under this name.
contract_tx_res_sql

Analogous to traces = [t for i,t in traces_tmp_1.iterrows() if t.status == 1 and 
          (t.input != None and t.trace_type == "call") or 
          t.trace_type == "create" or 
          t.trace_type == "suicide"]

# Deprecated

In [2]:
%%bigquery res --project masterarbeit-245718 --verbose 

-- Fetch 20 rows with trace_type "create" from the top40k_traces table.
SELECT *
FROM `masterarbeit-245718.ethereum_us.top40k_traces`
WHERE trace_type = "create"
LIMIT 20

UsageError: Cell magic `%%bigquery` not found.


In [None]:
%%bigquery traces_tmp --project masterarbeit-245718 --verbose 

-- All public traces whose block date lies in the closed range
-- ['2019-1-1', '2019-1-1'] (both bounds are the same day).
SELECT *
FROM `bigquery-public-data.crypto_ethereum.traces`
WHERE DATE(block_timestamp) >= '2019-1-1'
  AND DATE(block_timestamp) <= '2019-1-1'

In [None]:
# Display the result that the %%bigquery cell above stores under this name.
traces_tmp

In [None]:
%%bigquery traces_tmp_1 --project masterarbeit-245718 --verbose 

-- All public traces with a block timestamp between 18:00:00 and 18:15:00
-- (inclusive) on 2020-01-01.
SELECT *
FROM `bigquery-public-data.crypto_ethereum.traces`
WHERE DATETIME(block_timestamp) >= DATETIME(2020, 01, 01, 18, 00, 00)
  AND DATETIME(block_timestamp) <= DATETIME(2020, 01, 01, 18, 15, 00)

In [None]:
# Keep the rows of traces_tmp_1 with status == 1 that are either a "call"
# with non-null input, a "create" or a "suicide".
# `and` binds tighter than `or`, so the whole trace-type alternative must be
# parenthesised; otherwise the status check only applies to the "call" branch
# and "create"/"suicide" rows pass regardless of status.
traces = [
    t for _, t in traces_tmp_1.iterrows()
    if t.status == 1 and (
        # pd.notna treats both None and NaN as "no input".
        (pd.notna(t.input) and t.trace_type == "call")
        or t.trace_type == "create"
        or t.trace_type == "suicide"
    )
]

In [None]:
# Build a DataFrame from the rows collected in the previous cell.
# Note: this rebinds `traces` to a different kind of object.
traces = pd.DataFrame(traces)

In [None]:
# Transaction hash of the "call" trace whose index label is 43.
traces[traces.trace_type == "call"].loc[43,"transaction_hash"]

In [None]:
# Transaction hash of the trace whose index label is 45.
traces.loc[45,"transaction_hash"]

# Delete Tables

In [None]:
# %%bigquery res6 --project masterarbeit-245718 --verbose 

# SELECT
# * 
# FROM
# `masterarbeit-245718.ethereum_us.INFORMATION_SCHEMA.TABLES`

In [None]:
# tablesToDelete = [tn for tn in res6["current_view_name"] if (not "features" in tn and not "usd_eth" in tn)]

# print(tablesToDelete)

# for t in tablesToDelete:
#      client.delete_table("ethereum_us.{}".format(t), not_found_ok=True)  