In [149]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.cloud import bigquery
import time
%env GOOGLE_APPLICATION_CREDENTIALS=../secrets/bigquery-service-account.json 
client = bigquery.Client()
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

env: GOOGLE_APPLICATION_CREDENTIALS=../secrets/bigquery-service-account.json


In [150]:
%load_ext google.cloud.bigquery

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [151]:
%%bigquery metadata --project masterarbeit-245718 --verbose 
SELECT * FROM `bigquery-public-data`.crypto_ethereum.INFORMATION_SCHEMA.COLUMNS where table_name = "traces"

Executing query with job ID: 1924ec4b-b3ff-4d7e-aa4e-f5f8fa318b47
Query executing: 0.49s
Query complete after 1.25s


In [152]:
size_sample_data = 500

In [153]:
metadata.to_html("./test.html")

### Generate sample data for "call_type"

In [154]:
# %%bigquery call_types_sql_result --project masterarbeit-245718 --verbose 
# select DISTINCT call_type from `bigquery-public-data.crypto_ethereum.traces`
# where DATE(block_timestamp) >= '2019-07-6' AND DATE(block_timestamp) <= '2019-7-7'

In [155]:
call_types = ['call', 'delegatecall', 'staticcall', 'callcode', None]
prob_call_types = [0.7, 0.05, 0.025, 0.025, 0.2]

In [156]:
call_type_sample = np.random.choice(call_types, 2*size_sample_data, p=prob_call_types)

# Generate sample data for "trace_type"

In [158]:
additional_trace_types = ["create", "suicide", "reward", "genesis", "daofork"]
prob_additional_trace_types = [0.0625, 0.0125, 0.9, 0.0125, 0.0125]

trace_type_sample = []
for ct in call_type_sample:
    if ct == None:
        trace_type_sample.append(np.random.choice(additional_trace_types, p=prob_additional_trace_types))
    else:
        trace_type_sample.append("call")

### Generate sample data for "status"

In [159]:
# %%bigquery status_values --project masterarbeit-245718 --verbose 
# select DISTINCT status from `bigquery-public-data.crypto_ethereum.traces`
# where DATE(block_timestamp) >= '2019-07-6' AND DATE(block_timestamp) <= '2019-7-7'

In [160]:
status_values = [0, 1]
probs_status_values = [0.05, 0.95]

In [161]:
probs_status_values = [0.05, 0.95]

In [162]:
status_sample = np.random.choice(status_values, 2*size_sample_data, p=probs_status_values)

### Generate sample accounts 

In [163]:
exchanges = ["exchange_{}".format(i) for i in range(1,int(0.05*size_sample_data + 1))]

In [164]:
speculators = ["speculator_{}".format(i) for i in range(1,int(0.95*size_sample_data + 1))]

In [165]:
sample_addresses = []
sample_addresses.extend(speculators)
sample_addresses.extend(exchanges)

In [166]:
sample_addresses = pd.DataFrame(sample_addresses, columns=["address"])

## Upload 'addresses' table to bigquery 

In [167]:
sample_addresses.to_gbq('ethereum_us.sample_addresses', if_exists="replace")

1it [00:09,  9.56s/it]


### Generate sample transactions (speculators to exchanges)

In [168]:
from_spec_addresses = [np.random.choice(speculators) for i in range(int(2*size_sample_data/2))]
to_ex_addresses = [np.random.choice(exchanges) for i in range(int(2*size_sample_data/2))]
values_spec_to_ex = np.random.randint(1, 20, int(2*size_sample_data/2))

In [169]:
txdata1 = pd.DataFrame(zip(from_spec_addresses, to_ex_addresses, values_spec_to_ex), columns=["from_address", "to_address", "value"])

### Generate sample transactions (exchanges to speculators)

In [170]:
to_spec_addresses = [np.random.choice(speculators) for i in range(int(2*size_sample_data/2))]
from_ex_addresses = [np.random.choice(exchanges) for i in range(int(2*size_sample_data/2))]
values_ex_to_spec = np.random.randint(1, 5, int(2*size_sample_data/2))

In [171]:
txdata2 = pd.DataFrame(zip(from_ex_addresses, to_spec_addresses, values_ex_to_spec), columns=["from_address", "to_address", "value"])

Anmerkung: Die speculators schicken mehr Geld zu den Börsen als umgekehrt.

In [172]:
txdata = txdata1.append(txdata2).reset_index(drop=True)
txdata = txdata.sample(frac=1)

### Generate sample "block_timestamps"

In [173]:
import datetime as datetime

base = datetime.datetime.strptime("2020-02-02", '%Y-%m-%d')
block_timestamps = [base - datetime.timedelta(seconds=x) for x in range(0, 12*size_sample_data, 12)]
base = base - datetime.timedelta(days=31)
block_timestamps.extend([base - datetime.timedelta(seconds=x) for x in range(0, 12*size_sample_data, 12)])

### Merge data to sample "traces" table

In [174]:
sample_traces = txdata.copy()

In [175]:
sample_traces["status"] = status_sample 

In [176]:
sample_traces["call_type"] = call_type_sample 

In [177]:
sample_traces["trace_type"] = trace_type_sample 

In [178]:
sample_traces["block_timestamp"] = block_timestamps 

In [179]:
sample_traces.sort_values(by="from_address")

Unnamed: 0,from_address,to_address,value,status,call_type,trace_type,block_timestamp
959,exchange_1,speculator_339,2,1,call,call,2020-01-01 23:49:24
992,exchange_1,speculator_188,1,1,call,call,2020-01-01 22:43:00
624,exchange_1,speculator_31,2,1,call,call,2020-01-01 22:45:36
606,exchange_1,speculator_234,3,1,call,call,2020-02-01 23:27:12
930,exchange_1,speculator_188,2,1,,reward,2020-01-01 22:56:00
544,exchange_1,speculator_324,3,1,call,call,2020-02-01 23:16:12
947,exchange_1,speculator_39,3,1,call,call,2020-01-01 23:04:36
968,exchange_1,speculator_387,1,1,,reward,2020-02-01 22:58:00
537,exchange_1,speculator_398,2,1,call,call,2020-01-01 22:26:12
890,exchange_1,speculator_140,1,1,call,call,2020-01-01 23:16:00


### Upload "traces" table to bigquery

In [180]:
sample_traces.to_gbq('ethereum_us.sample_traces', if_exists="replace")

1it [00:04,  4.44s/it]


### Test SQL command for retrieving "weiReceived", "weiSent"

In [181]:
%%bigquery res1 --project masterarbeit-245718 --verbose 
with weiView as (

  with weiReceivedView as (
        
      -- debits
      select to_address, sum(ifnull(value, 0)) as weiReceived
      from `ethereum_us.sample_traces` 
      where to_address is not null
      and status = 1
      and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
      group by to_address
        
  ), weiSentView as (
  
      -- credits
      select from_address, sum(ifnull(value, 0)) as weiSent
      from  `ethereum_us.sample_traces` 
      where from_address is not null
      and status = 1
      and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
      group by from_address
  ) 
  select 
  CASE 
    when to_address is not null then to_address
    when from_address is not null then from_address
  end as address, 
  ifnull(weiReceived,0) as weiReceived, 
  ifnull(weiSent,0) as weiSent
  from weiReceivedView full outer join weiSentView on from_address = to_address
) 
select address, weiReceived, weiSent from weiView right join `ethereum_us.sample_addresses`  using(address)

Executing query with job ID: 917f4aab-7383-4eb3-a8db-a20ea53e1d81
Query executing: 0.55s
Query complete after 3.35s


In [182]:
res2 = res1.copy()
res2["balance"] = res1.weiReceived - res1.weiSent
res2 = res2.set_index("address")
res2 = res2.sort_values(by="address", ascending=False)
res2 = res2.astype("float")
res2 = res2.fillna(0.)
balance_result_sql = res2

In [183]:
%%bigquery sample_traces --project masterarbeit-245718 --verbose 
select * from `ethereum_us.sample_traces`

Executing query with job ID: 0dcddbe9-f161-4f12-a671-2fa31d5e7c55
Query executing: 0.50s
Query complete after 1.29s


In [184]:
data1 = [row for (index, row) in sample_traces.iterrows() if (row.call_type not in ['delegatecall', 'callcode', 'staticcall'] or row.call_type == None) and row.status == 1]
data1 = pd.DataFrame(data1)

In [185]:
data2 = pd.DataFrame(index=set(data1["from_address"].unique()) | set(data1["to_address"].unique()))
data2["weiReceived"] = data1.groupby('to_address').sum().value
data2["weiSent"] = data1.groupby('from_address').sum().value
data2["weiSent"] = data2["weiSent"].fillna(0.)
data2["weiReceived"] = data2["weiReceived"].fillna(0.)
data2["balance"] = data2["weiReceived"] - data2["weiSent"]
data2.index = data2.index.rename("address")
data2 = data2.reindex(sample_addresses["address"], fill_value=0.)
data2 = data2.sort_values(by="address", ascending=False)
balance_result_py = data2

In [186]:
balance_result_py.head()
balance_result_sql.head()

Unnamed: 0_level_0,weiReceived,weiSent,balance
address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
speculator_99,6.0,0.0,6.0
speculator_98,2.0,21.0,-19.0
speculator_97,0.0,11.0,-11.0
speculator_96,3.0,16.0,-13.0
speculator_95,3.0,23.0,-20.0


Unnamed: 0_level_0,weiReceived,weiSent,balance
address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
speculator_99,6.0,0.0,6.0
speculator_98,2.0,21.0,-19.0
speculator_97,0.0,11.0,-11.0
speculator_96,3.0,16.0,-13.0
speculator_95,3.0,23.0,-20.0


In [187]:
pd.testing.assert_frame_equal(balance_result_py, balance_result_sql)
print("weiSent, weiReceived Test succeeded!!")

weiSent, weiReceived Test succeeded!!


### Test SQL command for retrieving "txSent" and "txReceived"

In [188]:
%%bigquery res2 --project masterarbeit-245718 --verbose 
with txView as (

  with txSent as (
      SELECT from_address, count(*) as numberOfTranscationsSent FROM `masterarbeit-245718.ethereum_us.sample_traces` 
      where to_address is not null and status = 1 and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
      group by from_address
    ), txReceived as (
      SELECT to_address, count(*) as numberOfTranscationsReceived FROM `masterarbeit-245718.ethereum_us.sample_traces` 
      where to_address is not null and status = 1 and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
      group by to_address
    ) 
    SELECT 
    CASE  
      WHEN to_address IS NOT NULL THEN to_address
      WHEN from_address IS NOT NULL THEN from_address
    END AS address,
    IFNULL(numberOfTranscationsReceived, 0) as numberOfTranscationsReceived, 
    IFNULL(numberOfTranscationsSent, 0) as numberOfTranscationsSent
    from txReceived FULL OUTER JOIN txSent on to_address = from_address
) 

    select address, numberOfTranscationsReceived, numberOfTranscationsSent from txView right join `ethereum_us.sample_addresses` using(address)

Executing query with job ID: 91c962cc-299b-4b44-ada5-5919f49969ef
Query executing: 0.60s
Query complete after 2.41s


In [189]:
res3 = res2.copy()
res3 = res3.fillna(0.)
res3 = res3.sort_values(by="address", ascending=False)
tx_sent_received_result_sql = res3.set_index("address", drop=True)

In [190]:
data3 = pd.DataFrame(index=set(data1["from_address"].unique()) | set(data1["to_address"].unique()))
data3["numberOfTranscationsReceived"] = data1.groupby('to_address').count().value
data3["numberOfTranscationsSent"] = data1.groupby('from_address').count().value
data3 = data3.fillna(0)
data3 = data3.astype("int")
data3.index = data3.index.rename("address")
data3 = data3.reindex(sample_addresses["address"], fill_value=0.)
data3 = data3.sort_values(by="address", ascending=False)
tx_sent_received_result_py = data3

In [191]:
pd.testing.assert_frame_equal(tx_sent_received_result_py, tx_sent_received_result_sql)
print("txSent, txReceived Test succeeded !!")

txSent, txReceived Test succeeded !!


In [192]:
features = balance_result_sql.join(tx_sent_received_result_sql,how="left")
features = features.reset_index()
features.to_gbq('ethereum_us.sample_features', if_exists="replace")

1it [00:03,  3.68s/it]


# avg_wei_sent, avg_wei_recd

In [193]:
%%bigquery --project masterarbeit-245718 --verbose 

select 
    address, weiReceived, weiSent, balance, numberOfTranscationsReceived, numberOfTranscationsSent,
    CASE 
        when numberOfTranscationsSent > 0 THEN weiSent / numberOfTranscationsSent 
        when numberOfTranscationsSent = 0 THEN 0 
    END as agv_wei_sent,
    CASE 
        when numberOfTranscationsReceived > 0 THEN weiReceived / numberOfTranscationsReceived 
        when numberOfTranscationsReceived = 0 THEN 0 
    END as avg_wei_recd
from `ethereum_us.sample_features`

Executing query with job ID: 9cf79560-a911-4fe0-aab3-e87bae78aaa4
Query executing: 0.51s
Query complete after 1.26s


Unnamed: 0,address,weiReceived,weiSent,balance,numberOfTranscationsReceived,numberOfTranscationsSent,agv_wei_sent,avg_wei_recd
0,speculator_82,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
1,speculator_74,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
2,speculator_72,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
3,speculator_58,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
4,speculator_5,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
5,speculator_467,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
6,speculator_46,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
7,speculator_457,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
8,speculator_456,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
9,speculator_455,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000


#### Test: avg_wei_sent, avg_wei_recd

Benötigt keine Tests, da der SQL Befehl sehr einfach aufgebaut ist.

### Test avg time between tx

In [194]:
%%bigquery res4 --project masterarbeit-245718 --verbose 
with timeRecView as (

  with receivedTx as (
    SELECT to_address, count(*) as numberOfTranscationsReceived FROM `masterarbeit-245718.ethereum_us.sample_traces` 
    where to_address is not null 
      and status = 1 
      and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
    group by to_address),
  
  timeStampDiffs as (
    SELECT to_address, TIMESTAMP_DIFF(MAX(block_timestamp), MIN( block_timestamp ), second ) as timestampDiff
    FROM `masterarbeit-245718.ethereum_us.sample_traces`
    where to_address is not null 
      and status = 1 
      and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
    group by to_address
  
  ) select to_address as address, 
  CASE 
    when (numberOfTranscationsReceived - 1)  > 0 then timestampDiff / (numberOfTranscationsReceived - 1) 
    else 0
  end as avgTimeDiffBetweenReceivedTransactions
  from receivedTx inner join  timeStampDiffs using(to_address)
)

select address, ifnull(avgTimeDiffBetweenReceivedTransactions,0) as avgTimeDiffBetweenReceivedTransactions from timeRecView right join `ethereum_us.sample_addresses` using(address)

Executing query with job ID: c883424a-35ff-4bd5-b489-9cb3f95a61c2
Query executing: 1.36s
Query complete after 2.15s


In [195]:
res6 = res4.set_index("address", drop=True)
res6 = res6.fillna(0.)
res6 = res6.sort_values(by="address")
avg_time_diff_receivedtx_result_sql = res6

In [196]:
%%bigquery res5 --project masterarbeit-245718 --verbose 
with timeSentView as (

  with sentTx as (
    SELECT from_address, count(*) as numberOfTranscationsSent FROM `masterarbeit-245718.ethereum_us.sample_traces` 
    where to_address is not null 
      and status = 1 
      and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
    group by from_address),
  timeStampDiffs as (
    SELECT from_address, TIMESTAMP_DIFF(MAX(block_timestamp), MIN( block_timestamp ), second ) as timestampDiff
    FROM `masterarbeit-245718.ethereum_us.sample_traces`
    where to_address is not null 
      and status = 1 
      and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
    group by from_address
  ) select from_address as address, 
  CASE 
    when (numberOfTranscationsSent - 1)  > 0 then timestampDiff / (numberOfTranscationsSent - 1) 
    else 0
  end as avgTimeDiffBetweenSentTransactions
     from sentTx inner join  timeStampDiffs using(from_address)
)

select address, ifnull(avgTimeDiffBetweenSentTransactions,0) as avgTimeDiffBetweenSentTransactions from timeSentView right join `ethereum_us.sample_addresses` using(address)

Executing query with job ID: ab3fe5e6-43c2-4549-90b1-f6ac92bf451d
Query executing: 0.58s
Query complete after 2.71s


In [197]:
res6 = res5.set_index("address", drop=True)
res6 = res6.fillna(0.)
res6 = res6.sort_values(by="address")
avg_time_diff_senttx_result_sql = res6

In [198]:
res7 = [row for (index, row) in sample_traces.iterrows() if (row.call_type not in ['delegatecall', 'callcode', 'staticcall'] or row.call_type == None) and row.status == 1]
res7 = pd.DataFrame(res7)
res8 = res7.groupby("from_address").max().block_timestamp
res9 = res7.groupby("from_address").min().block_timestamp
res10 = res8 - res9
res10 = res10.rename("seconds_diff")
res10 = tx_sent_received_result_py.join(res10, how="right").drop("numberOfTranscationsReceived", axis=1)
res10 = res10.fillna(0.)
res10["avgTimeDiffBetweenSentTransactions"] = res10["seconds_diff"] / (res10["numberOfTranscationsSent"] - 1)
res10["avgTimeDiffBetweenSentTransactions"] = res10["avgTimeDiffBetweenSentTransactions"].fillna(datetime.timedelta(0))
res10 = res10.reindex(sample_addresses["address"], fill_value=0.)
res10.index = res10.index.rename("address")
res10 = res10.sort_values(by="address")
res10 = res10.drop(["numberOfTranscationsSent", "seconds_diff"],axis=1)
res10["avgTimeDiffBetweenSentTransactions"] = [ts.total_seconds() for ts in res10["avgTimeDiffBetweenSentTransactions"]]
avg_time_diff_senttx_result_py = res10

  result = self._data / other


In [199]:
pd.testing.assert_frame_equal(avg_time_diff_senttx_result_sql, avg_time_diff_senttx_result_py)
print("avgTimeDiffBetweenSentTransactions Test succeeded !!")

avgTimeDiffBetweenSentTransactions Test succeeded !!


In [200]:
features = balance_result_sql.join(tx_sent_received_result_sql,how="left")
features = features.join(avg_time_diff_senttx_result_sql)
features = features.join(avg_time_diff_receivedtx_result_sql)
features = features.sort_values(by="balance", ascending=False)
features = features.fillna(0.0)
import sys
# addresses that have sent/received only one transaction get the avg time max * 2
features["avgTimeDiffBetweenSentTransactions"] = features["avgTimeDiffBetweenSentTransactions"].replace(to_replace=0.0, value=2 * max(features["avgTimeDiffBetweenSentTransactions"]))
features["avgTimeDiffBetweenReceivedTransactions"] = features["avgTimeDiffBetweenReceivedTransactions"].replace(to_replace=0.0, value=2 * max(features["avgTimeDiffBetweenReceivedTransactions"]))

In [201]:
features = features.reset_index()

In [202]:
features.to_gbq('ethereum_us.sample_features', if_exists="replace")

1it [00:03,  3.29s/it]


# Test mined Blocks

In [203]:
%%bigquery sql_res_minedblocks --project masterarbeit-245718 --verbose 

with minedBlocksView as (
    SELECT to_address as address, count(*) as mined_blocks FROM `masterarbeit-245718.ethereum_us.sample_traces`
        where trace_type = "reward"
        group by to_address
    )
select address, ifnull(mined_blocks,0) as minedBlocks from minedBlocksView right join `ethereum_us.sample_addresses` using(address)


Executing query with job ID: 685f4cc8-3c82-4398-b06a-43333a6d304e
Query executing: 0.51s
Query complete after 2.67s


In [204]:
features.head()

Unnamed: 0,address,weiReceived,weiSent,balance,numberOfTranscationsReceived,numberOfTranscationsSent,avgTimeDiffBetweenSentTransactions,avgTimeDiffBetweenReceivedTransactions
0,exchange_17,281.0,62.0,219.0,24.0,22.0,127748.571429,116706.26087
1,exchange_4,253.0,38.0,215.0,22.0,16.0,178944.0,127774.285714
2,exchange_6,224.0,27.0,197.0,21.0,11.0,268226.4,134136.6
3,exchange_2,218.0,44.0,174.0,22.0,18.0,157698.352941,127784.571429
4,exchange_8,207.0,34.0,173.0,21.0,15.0,191635.714286,134148.6


# Test var(avgTimeDiff...)

In [205]:
%%bigquery res_sql_stddev --project masterarbeit-245718 --verbose 

with timestamp_var as (
    with timestamps_diffs as (
        
        with timestamps_preceding_tx as (
            
            with timestamps_sent_tx as (
                select from_address, block_timestamp from `masterarbeit-245718.ethereum_us.sample_traces`
                    where from_address is not null 
                      and status = 1 
                      and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
            )
            
            select from_address, block_timestamp,
                lag(block_timestamp) OVER (partition by from_address order by block_timestamp asc) as preceding_block_timestamp 
            from timestamps_sent_tx
        )
        
        select from_address, block_timestamp, preceding_block_timestamp, 
            TIMESTAMP_DIFF(block_timestamp, preceding_block_timestamp, second) as timestampdiff
        from timestamps_preceding_tx
    )
    select from_address as address, STDDEV_SAMP(timestampdiff) as stddev_sample  
    from timestamps_diffs group by from_address 
) 
select * from timestamp_var right join `ethereum_us.sample_addresses` using(address)

Executing query with job ID: 2862505e-b39d-44e5-bff7-11e7bd57ca02
Query executing: 1.05s
Query complete after 1.48s


In [206]:
stddev_result_sql = res_sql_stddev.set_index("address")
stddev_result_sql = stddev_result_sql.reindex(sample_addresses["address"])
stddev_result_sql = stddev_result_sql.fillna(res_sql_stddev["stddev_sample"].max())
stddev_result_sql = stddev_result_sql.sort_values(by="address")

In [207]:
%%bigquery res_sql_stddev --project masterarbeit-245718 --verbose 

with timestamp_var as (
    
    with timestamps_diffs as (
        
        with timestamps_preceding_tx as (
            
            with timestamps_sent_tx as (
                select to_address, block_timestamp from `masterarbeit-245718.ethereum_us.sample_traces`
                    where to_address is not null 
                      and status = 1 
                      and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null)
            )
            
            select to_address, block_timestamp,
                lag(block_timestamp) OVER (partition by to_address order by block_timestamp asc) as preceding_block_timestamp 
            from timestamps_sent_tx
        )
        
        select to_address, block_timestamp, preceding_block_timestamp, 
            TIMESTAMP_DIFF(block_timestamp, preceding_block_timestamp, second) as timestampdiff
        from timestamps_preceding_tx
    )
    select to_address as address, STDDEV_SAMP(timestampdiff) as stddev_sample  
    from timestamps_diffs group by to_address 
) 
select * from timestamp_var right join `ethereum_us.sample_addresses` using(address)

Executing query with job ID: 76e93553-ff8c-4c6c-9620-a0091798fec6
Query executing: 0.51s
Query complete after 3.14s


In [208]:
stddev_result_sql = res_sql_stddev.set_index("address")
stddev_result_sql = stddev_result_sql.reindex(sample_addresses["address"])
stddev_result_sql = stddev_result_sql.fillna(res_sql_stddev["stddev_sample"].max())
stddev_result_sql = stddev_result_sql.sort_values(by="stddev_sample")

# Verifikation der var(avgTimeDiff...) Tabelle via Python

In [209]:
res7 = [row for (index, row) in sample_traces.iterrows() if (row.call_type not in ['delegatecall', 'callcode', 'staticcall'] or row.call_type == None) and row.status == 1]
res7 = pd.DataFrame(res7)
res7 = res7.sort_values(by="block_timestamp", ascending=True)

In [210]:
def calculateDifference(df):
#     from_address = df["from_address"].reset_index(drop=True)[0]
    variance = df["block_timestamp"].diff().std()
    return variance
#     return pd.DataFrame({"variance": [variance]})

res8 = res7.groupby("to_address").apply(lambda df: df["block_timestamp"].diff().std() )
res8 = pd.DataFrame(res8, columns=["stddev_sample"])
res8 = res8.reindex(sample_addresses["address"])
res8 = res8.fillna(res8.max())
res8["stddev_sample"] = res8["stddev_sample"] / np.timedelta64(1, 's')
res8 = res8.sort_values(by="stddev_sample")
stddev_result_py = res8

In [211]:
# pd.testing.assert_frame_equal(stddev_result_py, stddev_result_sql)
np.testing.assert_almost_equal(list(stddev_result_py["stddev_sample"]), list(stddev_result_sql["stddev_sample"]), decimal=2)
print("stddev Test succeeded !!")

stddev Test succeeded !!


# Erstellen der traces_usd Tabelle

In [212]:
import os 
from datetime import datetime

In [213]:
eth_usd_res = pd.read_csv('../data/eth_usd.csv') 
# Preview the first 5 lines of the loaded data 
eth_usd = eth_usd_res.copy()
eth_usd["avg_usd_eth"] = (eth_usd_res["High"] + eth_usd_res["Low"]) / 2
eth_usd["usd_eth_timestamp"] = [datetime.strptime(ts, "%Y-%m-%d") for ts in eth_usd_res["Date"]]
eth_usd = eth_usd.loc[:, ["usd_eth_timestamp", "avg_usd_eth"]]
# data.sort_values(by="avg", ascending=False)
# data.iloc[data["avg_usd_eth"].idxmax(),:]
eth_usd.tail()

Unnamed: 0,usd_eth_timestamp,avg_usd_eth
1645,2020-02-07,217.722564
1646,2020-02-08,220.98719
1647,2020-02-09,226.423241
1648,2020-02-10,223.632332
1649,2020-02-11,222.864899


In [214]:
eth_usd.to_gbq('ethereum_us.usd_eth', if_exists="replace")

1it [00:03,  3.64s/it]


In [215]:
%%bigquery traces_usd --project masterarbeit-245718 --verbose 

select from_address, to_address, value, status, call_type, trace_type, block_timestamp, avg_usd_eth * value as value_usd from `masterarbeit-245718.ethereum_us.sample_traces` as traces left join `masterarbeit-245718.ethereum_us.usd_eth` as usd_eth 
  on (TIMESTAMP_TRUNC(usd_eth.usd_eth_timestamp, DAY, 'UTC') = TIMESTAMP_TRUNC(traces.block_timestamp, DAY, 'UTC'))

Executing query with job ID: 4abba0a6-9ba0-40c9-ace3-360ec194020f
Query executing: 0.58s
Query complete after 1.35s


In [216]:
traces_usd.to_gbq('ethereum_us.sample_traces_usd', if_exists="replace")

1it [00:06,  6.32s/it]


# Total USD received / sent 

Analog zu wei.

# AVG USD received / sent

Analog zu wei.

# Montly USD reveived / sent

Analog zu wei.

# Active Months

In [217]:
%%bigquery active_months_result_sql --project masterarbeit-245718 --verbose 

with traces_clean as (
    select * from `masterarbeit-245718.ethereum_us.sample_traces` where 
    status = 1 and (call_type not in ('delegatecall', 'callcode', 'staticcall') or call_type is null) 
    
), tx_received as (
    select 
        TIMESTAMP_TRUNC(block_timestamp, MONTH, 'UTC') as month, 
        to_address,
        count(*) as number_tx_received
    from traces_clean
    group by TIMESTAMP_TRUNC(block_timestamp, MONTH, 'UTC'), to_address
    
), tx_sent as (
    select 
        TIMESTAMP_TRUNC(block_timestamp, MONTH, 'UTC') as month, 
        from_address,
        count(*) as number_tx_sent
    from traces_clean
    group by TIMESTAMP_TRUNC(block_timestamp, MONTH, 'UTC'), from_address
    
), monthly_tx as (
    select 
        CASE  
          WHEN tx_sent.from_address IS NOT NULL THEN tx_sent.from_address
          WHEN tx_received.to_address IS NOT NULL THEN tx_received.to_address
        END AS address,
        CASE  
          WHEN tx_sent.month IS NOT NULL THEN tx_sent.month
          WHEN tx_received.month IS NOT NULL THEN tx_received.month
        END AS month,
        ifnull(number_tx_sent,0) as number_tx_sent, 
        ifnull(number_tx_received,0) as number_tx_received 
    from tx_sent full join tx_received 
        on (tx_sent.from_address = tx_received.to_address and tx_sent.month = tx_received.month)

), active_months_view as (
    select 
        address, 
        countif(number_tx_sent > 0 or number_tx_received > 0) as active_months 
    from monthly_tx group by address order by address ASC 
)

select 
    address, 
    ifnull(active_months,0) as active_months 
from ethereum_us.sample_addresses left join active_months_view using(address)



Executing query with job ID: 0e2a1e8b-88a3-411e-a561-ffc5a92e3d42
Query executing: 0.55s
Query complete after 3.06s


#### Tests für active_months feature

In [218]:
traces = [row for (index, row) in sample_traces.iterrows() if (row.call_type not in ['delegatecall', 'callcode', 'staticcall'] or row.call_type == None) and row.status == 1]
traces = pd.DataFrame(traces)

In [219]:
def address_was_active(address, month):
    for i,t in traces.iterrows():
        if (t["from_address"] == address or t["to_address"] == address) and (month.strftime("%Y-%m") == t["block_timestamp"].strftime("%Y-%m")):
            return True
    return False
        

In [220]:
from datetime import timedelta

active_months = {}

min_ts = traces["block_timestamp"].min()
max_ts = traces["block_timestamp"].max()
min_ts = min_ts.to_pydatetime()
max_ts = max_ts.to_pydatetime()

         
months = [month.to_pydatetime() for month in pd.date_range(start=min_ts, end=(max_ts + timedelta(days=31)), freq="MS")]

for index,address in sample_addresses.iterrows():
    for month in months:
        if (address_was_active(address[0], month)):
            active_months[address[0]] = active_months[address[0]] + 1 if address[0] in active_months else 1 

In [221]:
s = pd.Series(active_months)
df = pd.DataFrame(s)
df = df.reindex(sample_addresses["address"])
df = df.reset_index()
df.columns=["address", "active_months"]
df = df.fillna(0.)
df["active_months"] = df["active_months"].astype(int)
active_months_result_py  = df
pd.testing.assert_frame_equal(active_months_result_py, active_months_result_sql)
print("active months Test succeeded!!")

active months Test succeeded!!


# montly_wei_sent, monthly_wei_recd

In [222]:
active_months_result_sql = active_months_result_sql.set_index("address")
features = balance_result_sql.join(active_months_result_sql, how="left")

In [223]:
features.to_gbq('ethereum_us.sample_features', if_exists="replace")

1it [00:04,  4.65s/it]


In [224]:
%%bigquery --project masterarbeit-245718 --verbose 

select 
    *,
    CASE 
        when active_months > 0 THEN weiSent / active_months 
        when active_months = 0 THEN 0 
    END as monthly_wei_sent,
    CASE 
        when active_months > 0 THEN weiReceived / active_months 
        when active_months = 0 THEN 0 
    END as monthly_wei_recd
from `ethereum_us.sample_features`

Executing query with job ID: d5d62d01-e0ac-495c-9e6c-fa5baeba4ec6
Query executing: 0.47s
Query complete after 0.96s


Unnamed: 0,weiReceived,weiSent,balance,active_months,monthly_wei_sent,monthly_wei_recd
0,0.0,0.0,0.0,0,0.0,0.0
1,0.0,0.0,0.0,0,0.0,0.0
2,0.0,0.0,0.0,0,0.0,0.0
3,0.0,0.0,0.0,0,0.0,0.0
4,0.0,0.0,0.0,0,0.0,0.0
5,0.0,0.0,0.0,0,0.0,0.0
6,0.0,0.0,0.0,0,0.0,0.0
7,0.0,0.0,0.0,0,0.0,0.0
8,0.0,0.0,0.0,0,0.0,0.0
9,0.0,0.0,0.0,0,0.0,0.0


# monthly_outgoing_txns, monthly_incoming_txns

In [225]:
features = tx_sent_received_result_sql.join(active_months_result_sql, how="left")

In [226]:
features.to_gbq('ethereum_us.sample_features', if_exists="replace")

1it [00:08,  8.94s/it]


In [227]:
%%bigquery --project masterarbeit-245718 --verbose 

select 
    *,
    CASE 
        when active_months > 0 THEN numberOfTranscationsSent / active_months 
        when active_months = 0 THEN 0 
    END as monthly_outgoing_txns,
    CASE 
        when active_months > 0 THEN numberOfTranscationsReceived / active_months 
        when active_months = 0 THEN 0 
    END as monthly_incoming_txns
from `ethereum_us.sample_features`

Executing query with job ID: d1c870ea-3fee-4748-9109-db029bc7e865
Query executing: 0.46s
Query complete after 1.21s


Unnamed: 0,numberOfTranscationsReceived,numberOfTranscationsSent,active_months,monthly_outgoing_txns,monthly_incoming_txns
0,0.0,0.0,0,0.0,0.0
1,0.0,0.0,0,0.0,0.0
2,0.0,0.0,0,0.0,0.0
3,0.0,0.0,0,0.0,0.0
4,0.0,0.0,0,0.0,0.0
5,0.0,0.0,0,0.0,0.0
6,0.0,0.0,0,0.0,0.0
7,0.0,0.0,0,0.0,0.0
8,0.0,0.0,0,0.0,0.0
9,0.0,0.0,0,0.0,0.0


# number_of_contracts_created

In [228]:
%%bigquery number_of_contracts_created_res_sql --project masterarbeit-245718 --verbose 

select 
    from_address, count(*) as number_of_contracts_created
from `masterarbeit-245718.ethereum_us.sample_traces`
    where
        status = 1 and
        trace_type = "create"
group by from_address
order by from_address

Executing query with job ID: 0a487927-33d2-401f-950d-a5a4874034e1
Query executing: 0.41s
Query complete after 1.14s


Anmerkung: Es werden sowohl internal als auch external Transaktionen berücksichtigt, welche einen contract erstellt haben. 

In [230]:
number_of_contracts_created_res_python = [row for (index, row) in sample_traces.iterrows() if row.trace_type == "create" and row.status == 1]
number_of_contracts_created_res_python = pd.DataFrame(number_of_contracts_created_res_python)
number_of_contracts_created_res_python = pd.DataFrame(number_of_contracts_created_res_python.groupby("from_address").size(), columns=["number_of_contracts_created"])
number_of_contracts_created_res_python = number_of_contracts_created_res_python.reset_index()
number_of_contracts_created_res_python = number_of_contracts_created_res_python.sort_values(by="from_address")
print("Tests für Merkmal 'number_of_contracts_created' erfolgreich ausgeführt.")

In [231]:
pd.testing.assert_frame_equal(number_of_contracts_created_res_sql, number_of_contracts_created_res_python)

# Deprecated

In [232]:
%%bigquery res --project masterarbeit-245718 --verbose 

SELECT * FROM `masterarbeit-245718.ethereum_us.top40k_traces`
where 
    trace_type = "create"
LIMIT 20

Executing query with job ID: 6e8f8c56-4eac-41e4-9e88-8986e4c174ad
Query executing: 0.52s
Query complete after 0.98s


In [236]:
# %%bigquery res6 --project masterarbeit-245718 --verbose 

# SELECT
# * 
# FROM
# `masterarbeit-245718.ethereum_us.INFORMATION_SCHEMA.TABLES`


In [237]:
# tablesToDelete = [tn for tn in res6["table_name"] if tn.startswith("top40k_19_20")]

# for t in tablesToDelete:
#      client.delete_table("ethereum_us.{}".format(t), not_found_ok=True)  

In [238]:
# view_ref = shared_dataset_ref.table("my_shared_view")
# view = bigquery.Table(view_ref)
# sql_template = 'SELECT name, post_abbr FROM `{}.{}.{}` WHERE name LIKE "W%"'
# view.view_query = sql_template.format(project, source_dataset_id, source_table_id)
# view = client.create_table(view)  # API request

# print("Successfully created view at {}".format(view.full_table_id))