# Setup

## Configuration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.cloud import bigquery
import time
from IPython.core.interactiveshell import InteractiveShell
import simplejson as json
import os
import re
import datetime

In [2]:
# Set the credentials environment variable before the BigQuery client is created.
# Comments are kept on their own lines so they cannot become part of a magic's argument.
%env GOOGLE_APPLICATION_CREDENTIALS=../secrets/bigquery-service-account.json 
# Load the BigQuery IPython extension.
%load_ext google.cloud.bigquery

# BigQuery client used by the query cells of this notebook.
client = bigquery.Client()
# Display the value of every top-level expression of a cell, not only the last one;
# the paired `*_py.head()` / `*_sql.head()` preview cells rely on this.
InteractiveShell.ast_node_interactivity = "all"

env: GOOGLE_APPLICATION_CREDENTIALS=../secrets/bigquery-service-account.json


## Create Test-Tables in BigQuery

In [3]:
import papermill as pm
from enum import Enum

In [4]:
# Number of sampled addresses; it is part of every table id built in the test cells.
number_of_addresses = 100

# Observation window, format: "YYYY-MM-DD HH:MM:SS+00" (timestamp with UTC offset)
observation_period_start = "2020-02-01 05:30:00+00"
observation_period_end = "2020-02-01 05:40:30+00"

# Strategies for choosing the sampled addresses. The member name is part of every
# table id; the integer value is what gets passed to features.ipynb.
class ADDRESS_SELECTION(Enum):
    RANDOM = 1 # selects random addresses that have been active within the observation period
    RICHEST = 2 # selects the accounts that hold the most ether (not yet implemented)
    HIGHEST_TURNOVER = 3 # selects the accounts with the highest ether received + sent

# Stored as the plain integer value; the test cells map it back to the member name
# with ADDRESS_SELECTION(address_selection).name.
address_selection = ADDRESS_SELECTION.HIGHEST_TURNOVER.value

# Maximum amount in USD to spend on executing SQL queries
max_bigquery_costs_usd = 2

# Passed to features.ipynb as `reset`; presumably makes it delete previously
# created tables - TODO confirm in features.ipynb
reset = True

In [5]:
%%capture
pm.execute_notebook(
   './features.ipynb',
   './build/features.build.ipynb',
   parameters = dict(observation_period_start=observation_period_start,observation_period_end=observation_period_end, address_selection=address_selection,max_bigquery_costs_usd=max_bigquery_costs_usd, reset = reset)
)

TODO: Only execute the notebook when the tables have not yet been uploaded to BigQuery.

# Tests

In [6]:
# Maps a feature-view name to the id of its BigQuery table/view.
table_ids = {}

## feature view "wei" 

In [7]:
# Build the id of the "wei" view from the sampling parameters.
current_view_name = "wei"
view_id_parts = [
    current_view_name,
    ADDRESS_SELECTION(address_selection).name,
    number_of_addresses,
    re.sub(r'[-.+: ]', '_', observation_period_start),
    re.sub(r'[-.+: ]', '_', observation_period_end),
]
table_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(*view_id_parts)

# Fetch the complete view and index it by address.
sql = """
    SELECT *
    FROM `masterarbeit-245718.ethereum_us.{table_id_features}` 
""".format(table_id_features=table_ids[current_view_name])

query_job = client.query(sql)
wei_sql = query_job.result().to_dataframe().set_index("address")

In [8]:
# Load the raw traces of the sample: from BigQuery on the first run, afterwards
# from a JSON file cached under data_dir.
current_view_name = "traces"
table_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(current_view_name, ADDRESS_SELECTION(address_selection).name, number_of_addresses, re.sub(r'[-.+: ]', '_', observation_period_start),re.sub(r'[-.+: ]', '_', observation_period_end))

filename = '{}.json'.format(table_ids[current_view_name])
data_dir = "../data"
# Single cache location used for both writing and reading, so the two branches
# cannot drift apart when data_dir is changed.
cache_path = os.path.join(data_dir, filename)

# isfile() also covers a missing cache directory, which os.listdir() would
# answer with FileNotFoundError.
if not os.path.isfile(cache_path):

    print("Loading data from bigquery ...")

    sql = """
        SELECT *
        FROM `masterarbeit-245718.ethereum_us.{table_id_features}` 
        """.format(table_id_features=table_ids[current_view_name])

    query_job = client.query(sql)

    start = time.time()

    traces_sql = query_job.result().to_dataframe()

    done = time.time()
    elapsed = round(done - start)

    print("Time to retrieve data from BigQuery: {} Minutes.".format(round(elapsed/60)))

    # Create the cache directory on first use instead of failing on open().
    os.makedirs(data_dir, exist_ok=True)

    # NOTE(review): values that JSON cannot represent are written via str()
    # (default=str) and row labels become string keys, so a frame restored from
    # the cache may differ in dtypes/index from a freshly queried one - confirm
    # that the tests below are insensitive to this.
    with open(cache_path, 'w') as json_file:
        json.dump(traces_sql.to_dict(), json_file, use_decimal=True, default=str)

else:

    print("Loading data from local cache ...")

    start = time.time()

    with open(cache_path, "r") as file:
        file_content_json = json.load(file)

    done = time.time()
    elapsed = round(done - start)

    print("Time to retrieve data from local cache: {} Seconds.".format(elapsed))

    traces_sql = pd.DataFrame(file_content_json)

Loading data from bigquery ...
Time to retrieve data from BigQuery: 0 Minutes.


In [9]:
# Sum the per-column byte counts reported by pandas and report them in megabytes.
m = traces_sql.memory_usage()
totalBytes = m.sum()
totalMegabytes = totalBytes / 10**6
size_message = "Size of traces dataframe: {} Megabytes.".format(round(totalMegabytes))
print(size_message)

Size of traces dataframe: 2.0 Megabytes.


In [10]:
# filter traces
traces = [row for (index, row) in traces_sql.iterrows() if (row.call_type not in ['delegatecall', 'callcode', 'staticcall'] or row.call_type == None)]
traces = pd.DataFrame(traces)
# format traces
traces["block_timestamp"] = pd.to_datetime(traces["block_timestamp"])

In [11]:
# calculate wei send received values for each address
wei_py = pd.DataFrame(index=set(traces_sql["from_address"].unique()) | set(traces_sql["to_address"].unique()))
wei_py["wei_received"] = traces.groupby("to_address").apply(lambda row: row["value"].sum())
wei_py["wei_sent"] = traces.groupby("from_address").apply(lambda row: row["value"].sum())
# refactoring
wei_py["wei_received"] = wei_py["wei_received"].fillna(0.)
wei_py["wei_sent"] = wei_py["wei_sent"].fillna(0.)
wei_py.index = wei_py.index.rename("address")
wei_py = wei_py.reindex(wei_sql.index, fill_value=0.)

In [12]:
# Preview the Python result and the SQL result one after the other; both are
# rendered because ast_node_interactivity is set to "all" in the setup cell.
wei_py.head()
wei_sql.head()

Unnamed: 0_level_0,wei_received,wei_sent
address,Unnamed: 1_level_1,Unnamed: 2_level_1
0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,1117745663359893508278,257079910000000000
0x9cc5be83b211620cfe23e25bc65d6da6751bcf09,0,400002942480000000000
0xd8a83b72377476d0a66683cde20a8aad0b628713,306897425157000000000,14490000000000000000
0x310962d9a743f1d9f153743b90b2121e040b42d8,0,316002360680000000000
0xc1c3257c6123f644e359d3efce3c07e9ec75ec36,0,300001025557000000000


Unnamed: 0_level_0,wei_received,wei_sent
address,Unnamed: 1_level_1,Unnamed: 2_level_1
0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,1117745663359893508278,257079910000000000
0x9cc5be83b211620cfe23e25bc65d6da6751bcf09,0,400002942480000000000
0xd8a83b72377476d0a66683cde20a8aad0b628713,306897425157000000000,14490000000000000000
0x310962d9a743f1d9f153743b90b2121e040b42d8,0,316002360680000000000
0xc1c3257c6123f644e359d3efce3c07e9ec75ec36,0,300001025557000000000


In [13]:
# Compare the pandas result with the BigQuery view; the success message is
# only reached when assert_frame_equal does not raise.
pd.testing.assert_frame_equal(wei_py, wei_sql)
print("weiSent, weiReceived Test succeeded!!")

weiSent, weiReceived Test succeeded!!


## feature view "tx"

In [14]:
# Build the id of the "tx" view from the sampling parameters.
current_view_name = "tx"
view_id_parts = [
    current_view_name,
    ADDRESS_SELECTION(address_selection).name,
    number_of_addresses,
    re.sub(r'[-.+: ]', '_', observation_period_start),
    re.sub(r'[-.+: ]', '_', observation_period_end),
]
table_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(*view_id_parts)

# Fetch the complete view and index it by address.
sql = """
    SELECT *
    FROM `masterarbeit-245718.ethereum_us.{table_id_features}` 
""".format(table_id_features=table_ids[current_view_name])

query_job = client.query(sql)
tx_sql = query_job.result().to_dataframe().set_index("address")

In [15]:
# res3 = res2.copy()
# res3 = res3.fillna(0.)
# res3 = res3.sort_values(by="address", ascending=False)
# tx_sent_received_result_sql = res3.set_index("address", drop=True)

In [16]:
# Count incoming and outgoing traces per address from the filtered traces.
# The row universe is every address that occurs as sender or receiver.
tx_py = pd.DataFrame(index=set(traces["from_address"].unique()) | set(traces["to_address"].unique()))
# count() only counts rows whose `value` is not missing.
tx_py["number_of_tx_received"] = traces.groupby('to_address')["value"].count()
tx_py["number_of_tx_sent"] = traces.groupby('from_address')["value"].count()
# Addresses that only sent (or only received) have no entry in the other count.
tx_py = tx_py.fillna(0)
tx_py = tx_py.astype("int")
tx_py.index = tx_py.index.rename("address")
# Bring the rows into the order of the SQL result. The fill value is an integer:
# a float fill value (0.) upcasts the integer count columns to float64 as soon
# as one address is missing, which would make the dtype-sensitive frame
# comparison fail for the wrong reason.
tx_py = tx_py.reindex(wei_sql.index, fill_value=0)

In [17]:
# Preview the Python result and the SQL result one after the other; both are
# rendered because ast_node_interactivity is set to "all" in the setup cell.
tx_py.head()
tx_sql.head()

Unnamed: 0_level_0,number_of_tx_received,number_of_tx_sent
address,Unnamed: 1_level_1,Unnamed: 2_level_1
0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,41,11
0x9cc5be83b211620cfe23e25bc65d6da6751bcf09,0,1
0xd8a83b72377476d0a66683cde20a8aad0b628713,4,2
0x310962d9a743f1d9f153743b90b2121e040b42d8,0,1
0xc1c3257c6123f644e359d3efce3c07e9ec75ec36,0,1


Unnamed: 0_level_0,number_of_tx_received,number_of_tx_sent
address,Unnamed: 1_level_1,Unnamed: 2_level_1
0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,41,11
0x9cc5be83b211620cfe23e25bc65d6da6751bcf09,0,1
0xd8a83b72377476d0a66683cde20a8aad0b628713,4,2
0x310962d9a743f1d9f153743b90b2121e040b42d8,0,1
0xc1c3257c6123f644e359d3efce3c07e9ec75ec36,0,1


In [18]:
# Compare the pandas result with the BigQuery view; the success message is
# only reached when assert_frame_equal does not raise.
pd.testing.assert_frame_equal(tx_py, tx_sql)
print("txSent, txReceived Test succeeded !!")

txSent, txReceived Test succeeded !!


In [19]:
# features = balance_result_sql.join(tx_sent_received_result_sql,how="left")
# features = features.reset_index()
# features.to_gbq('ethereum_us.sample_features', if_exists="replace")

## feature view "avg_time_diff_sent_tx"

In [20]:
# Build the id of the "avg_time_diff_sent_tx" view from the sampling parameters.
current_view_name = "avg_time_diff_sent_tx"
view_id_parts = [
    current_view_name,
    ADDRESS_SELECTION(address_selection).name,
    number_of_addresses,
    re.sub(r'[-.+: ]', '_', observation_period_start),
    re.sub(r'[-.+: ]', '_', observation_period_end),
]
table_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(*view_id_parts)

# Fetch the complete view and index it by address.
sql = """
    SELECT *
    FROM `masterarbeit-245718.ethereum_us.{table_id_features}` 
""".format(table_id_features=table_ids[current_view_name])

query_job = client.query(sql)
avg_time_diff_sent_tx_sql = query_job.result().to_dataframe().set_index("address")

In [21]:
# Python reference for the "avg_time_diff_sent_tx" view.
# Keep only traces with status == 1 (the call-type condition repeats the filter
# that was already applied when `traces` was built).
traces_tmp = [row for (index, row) in traces.iterrows() if (row.call_type not in ['delegatecall', 'callcode', 'staticcall'] or row.call_type == None) and row.status == 1]
traces_tmp = pd.DataFrame(traces_tmp)
# Latest and earliest block timestamp per sending address ...
tmp1 = traces_tmp.groupby("from_address").max().block_timestamp
tmp2 = traces_tmp.groupby("from_address").min().block_timestamp
# ... and the time span between them.
tmp3 = tmp1 - tmp2
tmp3 = tmp3.rename("seconds_diff")
# Right join: keep exactly the senders present in tmp3 and attach their sent count.
# NOTE(review): number_of_tx_sent comes from tx_py, which counts all filtered
# traces, while the time span above only uses traces with status == 1 - confirm
# this matches the SQL view.
avg_time_diff_sent_tx_py = tx_py.join(tmp3, how="right").drop("number_of_tx_received", axis=1)
avg_time_diff_sent_tx_py = avg_time_diff_sent_tx_py.fillna(0.)
# Time span divided by (number of sent traces - 1); for a single sent trace the
# divisor is 0.
avg_time_diff_sent_tx_py["avg_time_diff_sent_tx"] = avg_time_diff_sent_tx_py["seconds_diff"] / (avg_time_diff_sent_tx_py["number_of_tx_sent"] - 1)
# Replace missing results (presumably those from the zero-divisor case) with a zero timedelta.
avg_time_diff_sent_tx_py["avg_time_diff_sent_tx"] = avg_time_diff_sent_tx_py["avg_time_diff_sent_tx"].fillna(datetime.timedelta(0))
# Bring the rows into the order of the SQL result.
# NOTE(review): rows added here are filled with the float 0., on which the
# total_seconds() call below would presumably fail - verify with an address
# that has no sent traces.
avg_time_diff_sent_tx_py = avg_time_diff_sent_tx_py.reindex(wei_sql.index, fill_value=0.)
avg_time_diff_sent_tx_py.index = avg_time_diff_sent_tx_py.index.rename("address")
# Keep only the feature column and convert it from timedelta to seconds.
avg_time_diff_sent_tx_py = avg_time_diff_sent_tx_py.drop(["number_of_tx_sent", "seconds_diff"],axis=1)
avg_time_diff_sent_tx_py["avg_time_diff_sent_tx"] = [ts.total_seconds() for ts in avg_time_diff_sent_tx_py["avg_time_diff_sent_tx"]]

  result = self._data / other


In [22]:
# Preview the Python result and the SQL result one after the other; both are
# rendered because ast_node_interactivity is set to "all" in the setup cell.
avg_time_diff_sent_tx_py.head()
avg_time_diff_sent_tx_sql.head()

Unnamed: 0_level_0,avg_time_diff_sent_tx
address,Unnamed: 1_level_1
0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,57.0
0x9cc5be83b211620cfe23e25bc65d6da6751bcf09,0.0
0xd8a83b72377476d0a66683cde20a8aad0b628713,189.0
0x310962d9a743f1d9f153743b90b2121e040b42d8,0.0
0xc1c3257c6123f644e359d3efce3c07e9ec75ec36,0.0


Unnamed: 0_level_0,avg_time_diff_sent_tx
address,Unnamed: 1_level_1
0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,57.0
0x9cc5be83b211620cfe23e25bc65d6da6751bcf09,0.0
0xd8a83b72377476d0a66683cde20a8aad0b628713,189.0
0x310962d9a743f1d9f153743b90b2121e040b42d8,0.0
0xc1c3257c6123f644e359d3efce3c07e9ec75ec36,0.0


In [23]:
# Compare the pandas result with the BigQuery view; the success message is
# only reached when assert_frame_equal does not raise.
pd.testing.assert_frame_equal(avg_time_diff_sent_tx_py, avg_time_diff_sent_tx_sql)
print("avg_time_diff_sent_tx Test succeeded !!")

avg_time_diff_sent_tx Test succeeded !!


## feature view "stddev_received_tx"

In [25]:
# Build the id of the "stddev_received_tx" view from the sampling parameters.
current_view_name = "stddev_received_tx"
view_id_parts = [
    current_view_name,
    ADDRESS_SELECTION(address_selection).name,
    number_of_addresses,
    re.sub(r'[-.+: ]', '_', observation_period_start),
    re.sub(r'[-.+: ]', '_', observation_period_end),
]
table_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(*view_id_parts)

# Fetch the complete view and index it by address.
sql = """
    SELECT *
    FROM `masterarbeit-245718.ethereum_us.{table_id_features}` 
""".format(table_id_features=table_ids[current_view_name])

query_job = client.query(sql)
stddev_received_tx_sql = query_job.result().to_dataframe().set_index("address")

# handle 0 values: every 0.0 is replaced by the largest value of the column
stddev_column = stddev_received_tx_sql[current_view_name]
largest_stddev = max(stddev_column)
stddev_received_tx_sql[current_view_name] = stddev_column.replace(to_replace=0.0, value=largest_stddev)

In [26]:
# Python reference for the "stddev_received_tx" view.
# Sort chronologically so that diff() below yields the gap between consecutive traces.
traces_tmp = traces.sort_values(by="block_timestamp", ascending=True)
# Re-parse the timestamps taken from `traces`; the assignment aligns on the row labels.
traces_tmp["block_timestamp"] = pd.to_datetime(traces["block_timestamp"])
# Per receiving address: standard deviation of the gaps between its traces.
stddev_received_tx_py = traces_tmp.groupby("to_address").apply(lambda df: df["block_timestamp"].diff().std() )
stddev_received_tx_py = pd.DataFrame(stddev_received_tx_py, columns=["stddev_received_tx"])
# Bring the rows into the order of the SQL result.
stddev_received_tx_py = stddev_received_tx_py.reindex(wei_sql.index)
# Missing values are replaced by the column maximum.
stddev_received_tx_py = stddev_received_tx_py.fillna(stddev_received_tx_py.max())
# Convert from timedelta to seconds.
stddev_received_tx_py["stddev_received_tx"] = [td.total_seconds() for td in stddev_received_tx_py["stddev_received_tx"]]
# Replace 0.0 by the column maximum, mirroring the 0-handling applied to stddev_received_tx_sql.
stddev_received_tx_py = stddev_received_tx_py.replace(to_replace=0.0, value=stddev_received_tx_py.max())

In [27]:
# Preview the Python result and the SQL result one after the other; both are
# rendered because ast_node_interactivity is set to "all" in the setup cell.
stddev_received_tx_py.head()
stddev_received_tx_sql.head()

Unnamed: 0_level_0,stddev_received_tx
address,Unnamed: 1_level_1
0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,24.813342
0x9cc5be83b211620cfe23e25bc65d6da6751bcf09,143.074573
0xd8a83b72377476d0a66683cde20a8aad0b628713,100.271299
0x310962d9a743f1d9f153743b90b2121e040b42d8,143.074573
0xc1c3257c6123f644e359d3efce3c07e9ec75ec36,143.074573


Unnamed: 0_level_0,stddev_received_tx
address,Unnamed: 1_level_1
0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,24.813342
0x9cc5be83b211620cfe23e25bc65d6da6751bcf09,143.074573
0xd8a83b72377476d0a66683cde20a8aad0b628713,100.271299
0x310962d9a743f1d9f153743b90b2121e040b42d8,143.074573
0xc1c3257c6123f644e359d3efce3c07e9ec75ec36,143.074573


In [28]:
# Compare the values position by position with a tolerance of two decimals
# (decimal=2); unlike the frame comparisons, the index labels are not checked here.
np.testing.assert_almost_equal(list(stddev_received_tx_py["stddev_received_tx"]), list(stddev_received_tx_sql["stddev_received_tx"]), decimal=2)
print("stddev Test succeeded !!")

stddev Test succeeded !!


## feature view "active_months"

In [29]:
# Build the id of the "active_months" view from the sampling parameters.
current_view_name = "active_months"
view_id_parts = [
    current_view_name,
    ADDRESS_SELECTION(address_selection).name,
    number_of_addresses,
    re.sub(r'[-.+: ]', '_', observation_period_start),
    re.sub(r'[-.+: ]', '_', observation_period_end),
]
table_ids[current_view_name] = "{}_{}_{}_{}_{}_view".format(*view_id_parts)

# Fetch the complete view and index it by address.
sql = """
    SELECT *
    FROM `masterarbeit-245718.ethereum_us.{table_id_features}` 
""".format(table_id_features=table_ids[current_view_name])

query_job = client.query(sql)
active_months_sql = query_job.result().to_dataframe().set_index("address")

In [30]:
from datetime import timedelta

def address_was_active(address, month):
    """Return True if `address` sent or received a trace in the calendar month of `month`.

    Reads the notebook-level `traces` frame (columns `from_address`, `to_address`
    and a datetime `block_timestamp`).

    Parameters
    ----------
    address : str
        Address to look for as sender or receiver.
    month : datetime-like
        Any point in time inside the month of interest; only its year and month are used.

    Returns
    -------
    bool
    """
    # Select the address' traces with a boolean mask instead of scanning all rows
    # with iterrows() on every call.
    involved = traces[(traces["from_address"] == address) | (traces["to_address"] == address)]
    timestamps = involved["block_timestamp"]
    # Same year and same month is equivalent to comparing the "%Y-%m" strings.
    in_month = (timestamps.dt.year == month.year) & (timestamps.dt.month == month.month)
    return bool(in_month.any())

In [31]:
# Python reference for the "active_months" view: for every sampled address,
# count the calendar months in which it appears in `traces`.
active_months_py = {}
min_ts = traces["block_timestamp"].min()
max_ts = traces["block_timestamp"].max()
min_ts = min_ts.to_pydatetime()
max_ts = max_ts.to_pydatetime()

# Anchor the range at the first instant of the month that contains min_ts.
# date_range(freq="MS") only yields month starts at or after `start`, so starting
# at min_ts itself skips that first month whenever min_ts is not on day 1.
# With the anchor at midnight of day 1, the month containing max_ts is always
# included, so no padding of the end date is needed.
first_month = min_ts.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
months = [month.to_pydatetime() for month in pd.date_range(start=first_month, end=max_ts, freq="MS")]

for address in wei_sql.index:
    for month in months:
        if address_was_active(address, month):
            active_months_py[address] = active_months_py.get(address, 0) + 1

# refactoring: one column named like the view, rows in the order of the SQL
# result, addresses without any active month set to 0
active_months_py = pd.DataFrame(pd.Series(active_months_py), columns=["active_months"])
active_months_py = active_months_py.reindex(active_months_sql.index)
active_months_py = active_months_py.fillna(0.)

In [32]:
# Preview the Python result and the SQL result one after the other; both are
# rendered because ast_node_interactivity is set to "all" in the setup cell.
active_months_py.head()
active_months_sql.head()

Unnamed: 0_level_0,active_months
address,Unnamed: 1_level_1
0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,1
0x9cc5be83b211620cfe23e25bc65d6da6751bcf09,1
0xd8a83b72377476d0a66683cde20a8aad0b628713,1
0x310962d9a743f1d9f153743b90b2121e040b42d8,1
0xc1c3257c6123f644e359d3efce3c07e9ec75ec36,1


Unnamed: 0_level_0,active_months
address,Unnamed: 1_level_1
0x3f5ce5fbfe3e9af3971dd833d26ba9b5c936f0be,1
0x9cc5be83b211620cfe23e25bc65d6da6751bcf09,1
0xd8a83b72377476d0a66683cde20a8aad0b628713,1
0x310962d9a743f1d9f153743b90b2121e040b42d8,1
0xc1c3257c6123f644e359d3efce3c07e9ec75ec36,1


In [33]:
# Compare the pandas result with the BigQuery view; the success message is
# only reached when assert_frame_equal does not raise.
pd.testing.assert_frame_equal(active_months_py, active_months_sql)
print("active months Test succeeded!!")

active months Test succeeded!!


## feature view "mined_blocks"

In [50]:
# Todo

## feature view "stddev_sent_tx"

In [34]:
# Todo

## feature view "usd_received"

In [35]:
# Todo

## feature view "usd_sent"

In [36]:
# Todo

## feature view "avg_wei_sent"

In [37]:
# Todo

## feature view "avg_wei_received"

In [38]:
# Todo

## feature view "avg_usd_received"

In [39]:
# Todo

## feature view "monthly_wei_sent"

In [40]:
# Todo

## feature view "monthly_wei_received"

In [41]:
# Todo

## feature view "monthly_wei_received"

In [42]:
# Todo

## feature view "monthly_usd_sent"

In [43]:
# Todo

## feature view "monthly_usd_received"

In [44]:
# Todo

## feature view "monthly_outgoing_txns"

In [45]:
# Todo

## feature view "monthly_incoming_txns"

In [46]:
# Todo

## feature view "number_of_contracts_created"

In [47]:
# Todo

## feature view "contract_tx"

In [48]:
# Todo

## feature view "diff_token_used"

In [49]:
# Todo