In [308]:
# Analysis stack plus the BigQuery client library used throughout the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.cloud import bigquery
import time
# Point the Google client libraries at the service-account key file.
# The path is relative, so it presumably depends on the notebook's working directory.
%env GOOGLE_APPLICATION_CREDENTIALS=../secrets/bigquery-service-account.json 
# Created only after the credentials variable has been set.
client = bigquery.Client()
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

env: GOOGLE_APPLICATION_CREDENTIALS=../secrets/bigquery-service-account.json


In [309]:
# Register the %%bigquery cell magic used by the query cells of this notebook.
%load_ext google.cloud.bigquery

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [310]:
%%bigquery metadata --project masterarbeit-245718 --verbose 
-- Column metadata of the public Ethereum `traces` table; the result is stored in `metadata`.
SELECT * FROM `bigquery-public-data`.crypto_ethereum.INFORMATION_SCHEMA.COLUMNS where table_name = "traces"

Executing query with job ID: bfab4507-307d-4368-8439-fc69dc593bce
Query executing: 1.55s
Query complete after 2.37s


In [311]:
# Number of synthetic trace rows to generate; the address counts and the
# number of timestamps below are derived from this value.
size_sample_data = 1000

### Generate sample data for "call_type"

In [312]:
# %%bigquery call_types_sql_result --project masterarbeit-245718 --verbose 
# select DISTINCT call_type from `bigquery-public-data.crypto_ethereum.traces`
# where DATE(block_timestamp) >= '2019-07-6' AND DATE(block_timestamp) <= '2019-7-7'

In [313]:
# (call type, sampling probability) pairs; None stands for traces without a call type.
_call_type_weights = [
    ('call', 0.7),
    ('delegatecall', 0.05),
    ('staticcall', 0.025),
    ('callcode', 0.025),
    (None, 0.2),
]
call_types = [call_type for call_type, _weight in _call_type_weights]
prob_call_types = [_weight for _call_type, _weight in _call_type_weights]

In [314]:
# Draw one call type per sample row according to the configured probabilities.
call_type_sample = np.random.choice(
    a=call_types,
    size=size_sample_data,
    p=prob_call_types,
)

### Generate sample data for "status"

In [315]:
# %%bigquery status_values --project masterarbeit-245718 --verbose 
# select DISTINCT status from `bigquery-public-data.crypto_ethereum.traces`
# where DATE(block_timestamp) >= '2019-07-6' AND DATE(block_timestamp) <= '2019-7-7'

In [316]:
# (status, sampling probability) pairs; the SQL queries below keep only status = 1.
_status_weights = [
    (0, 0.05),
    (1, 0.95),
]
status_values = [status for status, _weight in _status_weights]
probs_status_values = [_weight for _status, _weight in _status_weights]

In [317]:
# Redundant: repeats the assignment made in the previous cell with the same values.
probs_status_values = [0.05, 0.95]

In [318]:
# Draw one status flag per sample row according to the configured probabilities.
status_sample = np.random.choice(
    a=status_values,
    size=size_sample_data,
    p=probs_status_values,
)

### Generate sample accounts 

In [319]:
# Exchange accounts make up 5% of the sample size; ids are 1-based.
_exchange_id_stop = int(0.05*size_sample_data + 1)
exchanges = list(map("exchange_{}".format, range(1, _exchange_id_stop)))

In [320]:
# Speculator accounts make up 95% of the sample size; ids are 1-based.
_speculator_id_stop = int(0.95*size_sample_data + 1)
speculators = list(map("speculator_{}".format, range(1, _speculator_id_stop)))

In [321]:
# All synthetic addresses: speculators first, then exchanges.
addresses_testdata = [*speculators, *exchanges]

In [322]:
# Wrap the address list in a one-column frame for upload.
# Note: this rebinds the name from a list to a DataFrame, so the cell is not re-runnable on its own.
addresses_testdata = pd.DataFrame({"address": addresses_testdata})

## Upload 'addresses' table to bigquery 

In [323]:
# Upload the address list, replacing any existing table of that name.
addresses_testdata.to_gbq(
    destination_table='ethereum_us.addresses_testdata',
    if_exists="replace",
)

1it [00:04,  4.65s/it]


### Generate sample transactions (speculators to exchanges)

In [324]:
# First half of the sample: transfers from a random speculator to a random exchange.
n_spec_to_ex = int(size_sample_data/2)
from_spec_addresses = [np.random.choice(speculators) for _ in range(n_spec_to_ex)]
to_ex_addresses = [np.random.choice(exchanges) for _ in range(n_spec_to_ex)]
# Transfer values are integers in [1, 20).
values_spec_to_ex = np.random.randint(low=1, high=20, size=n_spec_to_ex)

In [325]:
# One row per speculator -> exchange transfer.
_spec_to_ex_rows = list(zip(from_spec_addresses, to_ex_addresses, values_spec_to_ex))
txdata1 = pd.DataFrame(
    _spec_to_ex_rows,
    columns=["from_address", "to_address", "value"],
)

### Generate sample transactions (exchanges to speculators)

In [326]:
# Second half of the sample: transfers from a random exchange to a random speculator.
n_ex_to_spec = int(size_sample_data/2)
to_spec_addresses = [np.random.choice(speculators) for _ in range(n_ex_to_spec)]
from_ex_addresses = [np.random.choice(exchanges) for _ in range(n_ex_to_spec)]
# Transfer values are integers in [1, 5).
values_ex_to_spec = np.random.randint(low=1, high=5, size=n_ex_to_spec)

In [327]:
# One row per exchange -> speculator transfer.
_ex_to_spec_rows = list(zip(from_ex_addresses, to_spec_addresses, values_ex_to_spec))
txdata2 = pd.DataFrame(
    _ex_to_spec_rows,
    columns=["from_address", "to_address", "value"],
)

Note: the speculators send more money to the exchanges than the other way round (transfer values of 1–19 versus 1–4 per transaction).

In [328]:
# Stack both transfer directions into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
txdata = pd.concat([txdata1, txdata2], ignore_index=True)

### Generate sample "block_timestamps"

In [329]:
import datetime

# Timezone-aware "now" in UTC; datetime.utcnow() is deprecated and returns a naive value.
base = datetime.datetime.now(datetime.timezone.utc)
# One timestamp per sample row, stepping 12 seconds back in time from `base`.
block_timestamps = [
    base - datetime.timedelta(seconds=offset)
    for offset in range(0, 12*size_sample_data, 12)
]

### Merge data to sample "traces" table

In [330]:
# Start the synthetic traces table from an independent copy of the transfers.
traces_sampleData = txdata.copy(deep=True)

In [331]:
# Attach the sampled status flag to every trace row.
traces_sampleData = traces_sampleData.assign(status=status_sample)

In [332]:
# Attach the sampled call type to every trace row.
traces_sampleData = traces_sampleData.assign(call_type=call_type_sample)

In [333]:
# Attach the generated block timestamp to every trace row.
traces_sampleData = traces_sampleData.assign(block_timestamp=block_timestamps)

In [334]:
# Preview the first rows of the assembled sample table.
traces_sampleData.head(n=5)

Unnamed: 0,from_address,to_address,value,status,call_type,block_timestamp
0,speculator_326,exchange_8,7,1,staticcall,2020-01-31 18:00:27.607920
1,speculator_869,exchange_42,5,1,call,2020-01-31 18:00:15.607920
2,speculator_599,exchange_20,12,1,call,2020-01-31 18:00:03.607920
3,speculator_253,exchange_34,12,1,,2020-01-31 17:59:51.607920
4,speculator_266,exchange_15,3,1,call,2020-01-31 17:59:39.607920


### Upload "traces" table to bigquery

In [335]:
# Upload the synthetic traces, replacing any existing table of that name.
traces_sampleData.to_gbq(
    destination_table='ethereum_us.traces_sampleData',
    if_exists="replace",
)

1it [00:04,  4.91s/it]


### Test SQL command for retrieving "weiReceived", "weiSent"

In [336]:
# %%bigquery res1 --project masterarbeit-245718 --verbose 

#   with weiReceivedView as (
      
#     -- debits
#     select to_address, sum(ifnull(value, 0)) as weiReceived
#     from ethereum_us.traces_sampleData
#     where to_address is not null
#     and status = 1
#     and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
#     group by to_address
      
# ), weiSentView as (
      
#     -- credits
#     select from_address, sum(ifnull(value, 0)) as weiSent
#     from ethereum_us.traces_sampleData
#     where from_address is not null
#     and status = 1
#     and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
#     group by from_address
# ) 
# select 
# CASE 
#   when to_address is not null then to_address
#   when from_address is not null then from_address
# end as address, 
# ifnull(weiReceived,0) as weiReceived, 
# ifnull(weiSent,0) as weiSent
# from (weiReceivedView full outer join weiSentView on from_address = to_address)

In [337]:
%%bigquery res1 --project masterarbeit-245718 --verbose 
-- Wei received and wei sent per address over the sample traces; the result is stored in `res1`.
-- The final right join against the address table keeps addresses without any counted
-- trace; their weiReceived / weiSent come back as NULL.
with weiView as (

  with weiReceivedView as (
        
      -- debits: total value per receiving address, counting only successful traces
      -- (status = 1) whose call_type is not delegatecall/callcode/staticcall (NULL is kept)
      select to_address, sum(ifnull(value, 0)) as weiReceived
      from `ethereum_us.traces_sampleData` 
      where to_address is not null
      and status = 1
      and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
      group by to_address
        
  ), weiSentView as (
  
      -- credits: total value per sending address, same trace filter as above
      select from_address, sum(ifnull(value, 0)) as weiSent
      from  `ethereum_us.traces_sampleData`
      where from_address is not null
      and status = 1
      and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
      group by from_address
  ) 
  -- after the full outer join one side may be NULL: take whichever address is present
  -- and treat a missing sum as 0
  select 
  CASE 
    when to_address is not null then to_address
    when from_address is not null then from_address
  end as address, 
  ifnull(weiReceived,0) as weiReceived, 
  ifnull(weiSent,0) as weiSent
  from weiReceivedView full outer join weiSentView on from_address = to_address
) 
select address, weiReceived, weiSent from weiView right join `ethereum_us.addresses_testdata` using(address)

Executing query with job ID: c686579a-08cb-4bc7-9dfe-0728a3df4282
Query executing: 3.21s
Query complete after 4.24s


In [338]:
# Post-process the SQL result: add the balance, index by address (descending),
# cast everything to float and turn missing sums into 0.
res2 = (
    res1.copy()
    .assign(balance=lambda frame: frame.weiReceived - frame.weiSent)
    .set_index("address")
    .sort_values(by="address", ascending=False)
    .astype("float")
    .fillna(0.)
)
balance_result_sql = res2

In [339]:
%%bigquery traces_sampleData --project masterarbeit-245718 --verbose 
-- Read the uploaded table back; this overwrites the local `traces_sampleData` variable.
select * from `ethereum_us.traces_sampleData`

Executing query with job ID: 4bf3f5c9-15d0-42e4-bff2-69a889e7ca0a
Query executing: 0.72s
Query complete after 1.31s


In [340]:
# Call types excluded from the value flow (same list as in the SQL filter).
_excluded_call_types = ['delegatecall', 'callcode', 'staticcall']


def _is_counted_trace(row):
    """Return a truthy value for successful traces whose call type is not excluded."""
    call_type_ok = row.call_type not in _excluded_call_types or row.call_type == None
    return call_type_ok and row.status == 1


# Keep only the traces that count, then rebuild a frame from the selected rows.
data1 = pd.DataFrame(
    [row for _label, row in traces_sampleData.iterrows() if _is_counted_trace(row)]
)

In [341]:
# Python-side recomputation of weiReceived / weiSent / balance, to be compared with the SQL result.
# Start from every address that takes part in at least one counted trace.
data2 = pd.DataFrame(index=set(data1["from_address"].unique()) | set(data1["to_address"].unique()))
# Select "value" before aggregating so that the address and timestamp columns are never summed.
data2["weiReceived"] = data1.groupby('to_address')["value"].sum()
data2["weiSent"] = data1.groupby('from_address')["value"].sum()
# Addresses that only sent (or only received) have no entry on the other side.
data2["weiSent"] = data2["weiSent"].fillna(0.)
data2["weiReceived"] = data2["weiReceived"].fillna(0.)
data2["balance"] = data2["weiReceived"] - data2["weiSent"]
# Expand to the full uploaded address list so that addresses without any counted trace
# get an all-zero row, matching the right join against the address table in the SQL query.
data2 = data2.reindex(addresses_testdata["address"], fill_value=0.)
# Same index name and dtype as the SQL-side frame, which is also cast to float.
data2 = data2.rename_axis("address").astype("float")
data2 = data2.sort_values(by="address", ascending=False)
balance_result_py = data2

In [342]:
# Show the first rows of both results for comparison (Python first, then SQL).
balance_result_py.head(n=5)
balance_result_sql.head(n=5)

Unnamed: 0_level_0,weiReceived,weiSent,balance
address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
speculator_99,3.0,0.0,3.0
speculator_98,0.0,0.0,0.0
speculator_97,0.0,0.0,0.0
speculator_96,0.0,7.0,-7.0
speculator_950,6.0,0.0,6.0


Unnamed: 0_level_0,weiReceived,weiSent,balance
address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
speculator_99,3.0,0.0,3.0
speculator_98,0.0,0.0,0.0
speculator_97,0.0,0.0,0.0
speculator_96,0.0,7.0,-7.0
speculator_950,6.0,0.0,6.0


In [343]:
# Raises AssertionError unless the Python and SQL balance frames are equal.
pd.testing.assert_frame_equal(
    left=balance_result_py,
    right=balance_result_sql,
)
print("weiSent, weiReceived Test succeeded!!")

weiSent, weiReceived Test succeeded!!


### Test SQL command for retrieving "txSent" and "txReceived"

In [78]:
%%bigquery res2 --project masterarbeit-245718 --verbose 
-- Number of counted traces sent and received per address; the result is stored in `res2`.
-- There is no join against the address table here, so only addresses with at least one
-- counted trace appear in the result.
with txSent as (
  -- NOTE(review): filters on to_address being non-null while grouping by from_address; confirm this is intended.
  SELECT from_address, count(*) as numberOfTranscationsSent FROM `masterarbeit-245718.ethereum_us.traces_sampleData` 
  where to_address is not null and status = 1 and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
  group by from_address
), txReceived as (
  SELECT to_address, count(*) as numberOfTranscationsReceived FROM `masterarbeit-245718.ethereum_us.traces_sampleData` 
  where to_address is not null and status = 1 and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
  group by to_address
) SELECT 
-- after the full outer join one side may be NULL: take whichever address is present
CASE  
  WHEN to_address IS NOT NULL THEN to_address
  WHEN from_address IS NOT NULL THEN from_address
END AS address,
IFNULL(numberOfTranscationsReceived, 0) as numberOfTranscationsReceived, 
IFNULL(numberOfTranscationsSent, 0) as numberOfTranscationsSent
from txReceived FULL OUTER JOIN txSent on to_address = from_address

Executing query with job ID: 8ebb850a-35e1-4aa8-8ea1-f75165d083f4
Query executing: 1.32s
Query complete after 1.81s


In [79]:
# Order the SQL counts by address (descending), then index the result by address.
res3 = res2.sort_values(by="address", ascending=False)
tx_sent_received_result_sql = res3.set_index(keys="address", drop=True)

In [80]:
# Python-side recomputation of the sent/received transaction counts.
_tx_addresses = set(data1["from_address"].unique()) | set(data1["to_address"].unique())
_received_counts = data1.groupby('to_address').count().value
_sent_counts = data1.groupby('from_address').count().value
data3 = (
    pd.DataFrame(index=_tx_addresses)
    .assign(
        numberOfTranscationsReceived=_received_counts,
        numberOfTranscationsSent=_sent_counts,
    )
    .fillna(0)          # addresses that never sent / never received
    .astype("int")
    .rename_axis("address")
    .sort_values(by="address", ascending=False)
)
tx_sent_received_result_py = data3

In [81]:
# Raises AssertionError unless the Python and SQL transaction counts are equal.
pd.testing.assert_frame_equal(
    left=tx_sent_received_result_py,
    right=tx_sent_received_result_sql,
)
print("txSent, txReceived Test succeeded !!")

txSent, txReceived Test succeeded !!


### Test avg time between tx

In [82]:
%%bigquery res4 --project masterarbeit-245718 --verbose 
-- Average number of seconds between received transactions per address; stored in `res4`.
-- Computed as (latest - earliest block_timestamp) / (count - 1), and 0 when the address
-- received only one counted trace.
with receivedTx as (
  -- NOTE(review): the alias says "Sent", but this counts traces per to_address (received).
  SELECT to_address, count(*) as numberOfTranscationsSent FROM `masterarbeit-245718.ethereum_us.traces_sampleData` 
  where to_address is not null 
    and status = 1 
    and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
  group by to_address),
timeStampDiffs as (
  -- seconds between the first and the last counted trace of each receiver
  SELECT to_address, TIMESTAMP_DIFF(MAX(block_timestamp), MIN( block_timestamp ), second ) as timestampDiff
  FROM `masterarbeit-245718.ethereum_us.traces_sampleData`
  where to_address is not null 
    and status = 1 
    and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
  group by to_address
) select to_address as address, 
CASE 
  when (numberOfTranscationsSent - 1)  > 0 then timestampDiff / (numberOfTranscationsSent - 1) 
  else 0
end as avgTimeDiffBetweenReceivedTransactions
   from receivedTx inner join  timeStampDiffs using(to_address)

Executing query with job ID: e806ef83-d8f2-4324-a5fc-ac951677fe14
Query executing: 1.24s
Query complete after 3.01s


In [83]:
# Index the received-side averages by address and order them by address.
# `res6` is a scratch name that a later cell rebinds.
res6 = (
    res4
    .set_index(keys="address", drop=True)
    .sort_values(by="address")
)
avg_time_diff_receivedtx_result_sql = res6
avg_time_diff_receivedtx_result_sql

Unnamed: 0_level_0,avgTimeDiffBetweenReceivedTransactions
address,Unnamed: 1_level_1
exchange_1,1384.000000
exchange_10,744.000000
exchange_11,437.000000
exchange_12,357.600000
exchange_13,972.000000
exchange_14,612.000000
exchange_15,475.636364
exchange_16,588.000000
exchange_17,495.600000
exchange_18,540.000000


In [84]:
%%bigquery res5 --project masterarbeit-245718 --verbose 
-- Average number of seconds between sent transactions per address; stored in `res5`.
-- Computed as (latest - earliest block_timestamp) / (count - 1), and 0 when the address
-- sent only one counted trace.
with sentTx as (
  -- number of counted traces per sender
  SELECT from_address, count(*) as numberOfTranscationsSent FROM `masterarbeit-245718.ethereum_us.traces_sampleData` 
  where to_address is not null 
    and status = 1 
    and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
  group by from_address),
timeStampDiffs as (
  -- seconds between the first and the last counted trace of each sender
  SELECT from_address, TIMESTAMP_DIFF(MAX(block_timestamp), MIN( block_timestamp ), second ) as timestampDiff
  FROM `masterarbeit-245718.ethereum_us.traces_sampleData`
  where to_address is not null 
    and status = 1 
    and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
  group by from_address
) select from_address as address, 
CASE 
  when (numberOfTranscationsSent - 1)  > 0 then timestampDiff / (numberOfTranscationsSent - 1) 
  else 0
end as avgTimeDiffBetweenSentTransactions
   from sentTx inner join  timeStampDiffs using(from_address)

Executing query with job ID: 93204570-b535-40eb-b1f9-5af72a69f8b7
Query executing: 0.86s
Query complete after 2.56s


In [85]:
%%bigquery --project masterarbeit-245718 --verbose 
-- Same query as the previous cell, run without a destination variable so that the
-- result is displayed inline instead of being stored.
with sentTx as (
  -- number of counted traces per sender
  SELECT from_address, count(*) as numberOfTranscationsSent FROM `masterarbeit-245718.ethereum_us.traces_sampleData` 
  where to_address is not null 
    and status = 1 
    and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
  group by from_address),
timeStampDiffs as (
  -- seconds between the first and the last counted trace of each sender
  SELECT from_address, TIMESTAMP_DIFF(MAX(block_timestamp), MIN( block_timestamp ), second ) as timestampDiff
  FROM `masterarbeit-245718.ethereum_us.traces_sampleData`
  where to_address is not null 
    and status = 1 
    and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
  group by from_address
) select from_address as address, 
CASE 
  when (numberOfTranscationsSent - 1)  > 0 then timestampDiff / (numberOfTranscationsSent - 1) 
  else 0
end as avgTimeDiffBetweenSentTransactions
   from sentTx inner join  timeStampDiffs using(from_address)

Executing query with job ID: 03afe4ad-6103-4de2-8fe8-ad2b1c77c7cc
Query executing: 2.35s
Query complete after 4.72s


Unnamed: 0,address,avgTimeDiffBetweenSentTransactions
0,speculator_813,0.000000
1,speculator_193,0.000000
2,speculator_415,0.000000
3,speculator_848,0.000000
4,speculator_361,0.000000
5,exchange_31,625.500000
6,exchange_13,461.454545
7,exchange_19,548.400000
8,exchange_20,652.500000
9,exchange_12,780.000000


In [86]:
# Index the sent-side averages by address and order them by address.
# `res6` is rebound here; an earlier cell used the same name for the received side.
res6 = (
    res5
    .set_index(keys="address", drop=True)
    .sort_values(by="address")
)
avg_time_diff_senttx_result_sql = res6
avg_time_diff_senttx_result_sql

Unnamed: 0_level_0,avgTimeDiffBetweenSentTransactions
address,Unnamed: 1_level_1
exchange_1,336.000000
exchange_10,931.200000
exchange_11,430.000000
exchange_12,780.000000
exchange_13,461.454545
exchange_14,496.800000
exchange_15,509.333333
exchange_16,629.142857
exchange_17,774.000000
exchange_18,710.400000


In [87]:
# Python re-implementation of the "average time between sent transactions" query,
# used to cross-check the SQL result.
# Keep successful traces whose call type is not delegatecall/callcode/staticcall.
res7 = [row for (index, row) in traces_sampleData.iterrows() if (row.call_type not in ['delegatecall', 'callcode', 'staticcall'] or row.call_type == None) and row.status == 1]
res7 = pd.DataFrame(res7)
# Latest and earliest block timestamp per sender.
res8 = res7.groupby("from_address").max().block_timestamp
res9 = res7.groupby("from_address").min().block_timestamp
# Span between a sender's first and last counted trace.
# Despite the name, "seconds_diff" presumably holds timedeltas (see total_seconds() below).
res10 = res8 - res9
res10 = res10.rename("seconds_diff")
# Attach the per-address sent count; the right join keeps exactly the senders present in res10.
res10 = tx_sent_received_result_py.join(res10, how="right").drop("numberOfTranscationsReceived", axis=1)
res10 = res10.fillna(0.)
# span / (count - 1); a sender with a single trace divides by zero here.
res10["avgTimeDiffBetweenSentTransactions"] = res10["seconds_diff"] / (res10["numberOfTranscationsSent"] - 1)
# Missing results of that division become a zero duration, mirroring the SQL "else 0" branch.
res10["avgTimeDiffBetweenSentTransactions"] = res10["avgTimeDiffBetweenSentTransactions"].fillna(datetime.timedelta(0))
res10.index =res10.index.rename("address")
res10 = res10.sort_values(by="address")
# Keep only the final feature column.
res10 = res10.drop(["numberOfTranscationsSent", "seconds_diff"],axis=1)
# Convert the durations to float seconds so they are comparable with the SQL output.
res10["avgTimeDiffBetweenSentTransactions"] = [ts.total_seconds() for ts in res10["avgTimeDiffBetweenSentTransactions"]]
avg_time_diff_senttx_result_py = res10
avg_time_diff_senttx_result_py

Unnamed: 0_level_0,avgTimeDiffBetweenSentTransactions
address,Unnamed: 1_level_1
exchange_1,336.000000
exchange_10,931.200000
exchange_11,430.000000
exchange_12,780.000000
exchange_13,461.454545
exchange_14,496.800000
exchange_15,509.333333
exchange_16,629.142857
exchange_17,774.000000
exchange_18,710.400000


In [88]:
# Raises AssertionError unless the SQL and Python sent-side averages are equal.
pd.testing.assert_frame_equal(
    left=avg_time_diff_senttx_result_sql,
    right=avg_time_diff_senttx_result_py,
)
print("avgTimeDiffBetweenSentTransactions Test succeeded !!")

avgTimeDiffBetweenSentTransactions Test succeeded !!


In [89]:
# Combine all SQL-derived per-address results into one feature table, ordered by
# balance (largest first); values missing after the joins become 0.
features = (
    balance_result_sql
    .join(tx_sent_received_result_sql, how="left")
    .join(avg_time_diff_senttx_result_sql)
    .join(avg_time_diff_receivedtx_result_sql)
    .sort_values(by="balance", ascending=False)
    .fillna(0.0)
)
import sys
# addresses that have sent/received only one transaction get the avg time max * 2
for _avg_column in ("avgTimeDiffBetweenSentTransactions", "avgTimeDiffBetweenReceivedTransactions"):
    _placeholder = 2 * max(features[_avg_column])
    features[_avg_column] = features[_avg_column].replace(to_replace=0.0, value=_placeholder)

In [90]:
# Move the address index back into a regular column.
# Not idempotent: re-running this cell adds another index column.
features = features.reset_index(drop=False)

In [91]:
# Preview the first rows of the feature table.
features.head(n=5)

Unnamed: 0,address,weiReceived,weiSent,balance,numberOfTranscationsReceived,numberOfTranscationsSent,avgTimeDiffBetweenSentTransactions,avgTimeDiffBetweenReceivedTransactions
0,exchange_45,149.0,20.0,129.0,15,7,736.0,426.0
1,exchange_29,140.0,15.0,125.0,10,5,1161.0,605.333333
2,exchange_23,137.0,13.0,124.0,13,7,470.0,388.0
3,exchange_30,121.0,12.0,109.0,11,7,908.0,512.4
4,exchange_31,123.0,19.0,104.0,13,9,625.5,420.0


In [92]:
# Upload the feature table, replacing any existing table of that name.
features.to_gbq(
    destination_table='ethereum_us.features_sampleData',
    if_exists="replace",
)

1it [00:06,  6.30s/it]


# Deprecated