**Elliptic++ Data Preparation for Model**

-----------------

Purpose of this notebook:
- Preprocess the data (remove duplicates and join tables) in advance of the transaction classification, sub-network and ranking models.
- This step is separate from the pre-processing that is performed as part of model development.
- Determine the linearity of data

Two tables are edited:
- txn_features_clean (additional features added)
- wallets_features_clean (duplicates are removed)

The table 'txn_features_clean' is a key table for the txn classification exercise and 'wallets_features_clean' is used for address classification and address analysis.


In [30]:
# Data cleaning and manipulation
import pandas as pd
import numpy as np
import math
import time
import matplotlib.pyplot as plt

# linearity of data
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import mutual_info_regression

# GCP libraries
from pandas_gbq import to_gbq # write pandas df to a GCP BigQuery table
import gcsfs
import importlib.util
import os
import inspect

# Display configuration: show every column and every row, and render floats with four decimals
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.float_format', lambda value: f'{value:.4f}'),
):
    pd.set_option(option_name, option_value)

# Silence FutureWarning messages for the rest of the session
import warnings
warnings.simplefilter('ignore', FutureWarning)


----------------------

Pre-process 'txn_features' df by adding classes and additional features to the transaction features dataset.

----------------------

Add classes to the transaction dataset. These classes are from the original Elliptic dataset.

In [4]:
%%bigquery df_txn_classes
-- Read in txn_classes table; the query result is stored in the DataFrame `df_txn_classes`.
-- The %%bigquery cell magic must be the very first line of the cell, so this note is a
-- SQL comment below the magic rather than a Python comment above it.
select * from `extreme-torch-467913-m6.txn.txn_classes`;

Query is running:   0%|          |

Downloading:   0%|          |

In [5]:
%%bigquery df_txn_features
-- Read in txn_features table; the query result is stored in the DataFrame `df_txn_features`.
-- The %%bigquery cell magic must be the very first line of the cell, so this note is a
-- SQL comment below the magic rather than a Python comment above it.
select * from `extreme-torch-467913-m6.txn.txn_features`;

Query is running:   0%|          |

Downloading:   0%|          |

In [6]:
# Report (rows, columns) for each of the two freshly loaded tables
for loaded_frame in (df_txn_classes, df_txn_features):
    print(loaded_frame.shape)

(203769, 2)
(203769, 184)


In [7]:
# Map the numeric class codes to readable names
df_txn_classes['class_label'] = df_txn_classes['class'].map({1: 'Illicit', 2: 'Licit', 3: 'Unknown'})

# Add the class code and label to every transaction (left join keeps all feature rows).
# validate='many_to_one' makes pandas raise a MergeError if a txId occurs more than once in
# df_txn_classes; without it such a repeat would silently duplicate feature rows.
df_txn_features_clean = pd.merge(
    df_txn_features,
    df_txn_classes,
    how='left',
    on='txId',
    validate='many_to_one',
)

In [8]:
# Relocate the class code and its label so they start at column index 2 (the third column)
columns_to_move = ['class', 'class_label']
target_position = 2  # 0-based index of the first relocated column

# Each column is removed from its current place and re-inserted, in list order, at
# consecutive positions starting from target_position
for offset, col in enumerate(columns_to_move):
    df_txn_features_clean.insert(target_position + offset, col, df_txn_features_clean.pop(col))

Add additional features to the dataset

In [9]:
# Ratio of total incoming BTC to outgoing BTC
# A high ratio suggests funds are being accumulated rather than spent
# A low ratio suggests funds are quickly moving through
# A zero denominator is mapped to NaN (as for the other ratio features in this cell) so the
# result is missing instead of +/-inf
df_txn_features_clean["in_out_ratio"] = (
    df_txn_features_clean["in_BTC_total"]
    / df_txn_features_clean["out_BTC_total"].replace(0, np.nan)
)

# Ratio of input addresses to output addresses
# High values could indicate consolidation of funds
# Low values may suggest distribution across multiple recipients
# Zero output addresses give NaN rather than +/-inf
df_txn_features_clean["input_output_ratio"] = (
    df_txn_features_clean["num_input_addresses"]
    / df_txn_features_clean["num_output_addresses"].replace(0, np.nan)
)

# Dispersion of outgoing BTC values
# High dispersion might suggest mixing transactions
df_txn_features_clean["BTC_dispersion"] = (
    df_txn_features_clean["out_BTC_max"] - df_txn_features_clean["out_BTC_min"]
)

# Flag transactions with fees higher than the 95th percentile
# High fees may indicate urgency or attempts to prioritize transactions
# The flag is 1.0 / 0.0, and stays NaN where the fee itself is missing
fees_p95 = df_txn_features_clean["fees"].quantile(0.95)
df_txn_features_clean["high_fees_flag"] = np.where(
    df_txn_features_clean["fees"].isna(),
    np.nan,
    (df_txn_features_clean["fees"] > fees_p95).astype(float),
)

# Flag transactions with extremely small BTC amounts
# Microtransactions may be used in dust attacks or laundering
# The flag is 1.0 / 0.0, and stays NaN where total_BTC is missing
df_txn_features_clean["micro_txn_flag"] = np.where(
    df_txn_features_clean["total_BTC"].isna(),
    np.nan,
    (df_txn_features_clean["total_BTC"] < 0.001).astype(float),
)

# Transaction density: size of transaction per address
# 'size' represents the transaction size in bytes, which affects fees and complexity
# Larger transactions indicate more inputs/outputs and can suggest mixing services or high-volume transactions
# Higher values may indicate complex transactions
# .replace(0, np.nan) applies to the summed address count, so zero addresses give NaN
total_addresses = (
    df_txn_features_clean["num_input_addresses"] + df_txn_features_clean["num_output_addresses"]
)
df_txn_features_clean["txn_density"] = (
    df_txn_features_clean["size"] / total_addresses.replace(0, np.nan)
)

# Fee-related features
# Measures efficiency of fees paid per transaction size or inputs
df_txn_features_clean["fees_per_byte"] = (
    df_txn_features_clean["fees"] / df_txn_features_clean["size"].replace(0, np.nan)
)
df_txn_features_clean["fees_ratio"] = (
    df_txn_features_clean["fees"] / df_txn_features_clean["total_BTC"].replace(0, np.nan)
)
df_txn_features_clean["fees_per_input"] = (
    df_txn_features_clean["fees"] / df_txn_features_clean["num_input_addresses"].replace(0, np.nan)
)

# Flag transactions where either the total input or total output BTC is a whole number
# Rounded amounts can indicate laundering or structuring behavior
# The flag is NaN when either total is missing
df_txn_features_clean["rounded_amount_flag"] = np.where(
    df_txn_features_clean[["in_BTC_total", "out_BTC_total"]].isna().any(axis=1),
    np.nan,
    (
        (df_txn_features_clean["in_BTC_total"] % 1 == 0)
        | (df_txn_features_clean["out_BTC_total"] % 1 == 0)
    ).astype(float),
)

# Percentile rank of the number of input addresses within the dataset
# Higher percentile suggests larger transactions with multiple inputs
df_txn_features_clean["input_address_percentile"] = df_txn_features_clean["num_input_addresses"].rank(pct=True)

# Percentile rank of the number of output addresses within the dataset
# Higher percentile suggests transactions distributing funds across multiple addresses
df_txn_features_clean["output_address_percentile"] = df_txn_features_clean["num_output_addresses"].rank(pct=True)


In [10]:
# Print the frame's dimensions after feature engineering, then preview its first row
print(df_txn_features_clean.shape)
df_txn_features_clean.iloc[:1]

(203769, 198)


Unnamed: 0,txId,Time step,class,class_label,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,Local_feature_8,Local_feature_9,Local_feature_10,Local_feature_11,Local_feature_12,Local_feature_13,Local_feature_14,Local_feature_15,Local_feature_16,Local_feature_17,Local_feature_18,Local_feature_19,Local_feature_20,Local_feature_21,Local_feature_22,Local_feature_23,Local_feature_24,Local_feature_25,Local_feature_26,Local_feature_27,Local_feature_28,Local_feature_29,Local_feature_30,Local_feature_31,Local_feature_32,Local_feature_33,Local_feature_34,Local_feature_35,Local_feature_36,Local_feature_37,Local_feature_38,Local_feature_39,Local_feature_40,Local_feature_41,Local_feature_42,Local_feature_43,Local_feature_44,Local_feature_45,Local_feature_46,Local_feature_47,Local_feature_48,Local_feature_49,Local_feature_50,Local_feature_51,Local_feature_52,Local_feature_53,Local_feature_54,Local_feature_55,Local_feature_56,Local_feature_57,Local_feature_58,Local_feature_59,Local_feature_60,Local_feature_61,Local_feature_62,Local_feature_63,Local_feature_64,Local_feature_65,Local_feature_66,Local_feature_67,Local_feature_68,Local_feature_69,Local_feature_70,Local_feature_71,Local_feature_72,Local_feature_73,Local_feature_74,Local_feature_75,Local_feature_76,Local_feature_77,Local_feature_78,Local_feature_79,Local_feature_80,Local_feature_81,Local_feature_82,Local_feature_83,Local_feature_84,Local_feature_85,Local_feature_86,Local_feature_87,Local_feature_88,Local_feature_89,Local_feature_90,Local_feature_91,Local_feature_92,Local_feature_93,Aggregate_feature_1,Aggregate_feature_2,Aggregate_feature_3,Aggregate_feature_4,Aggregate_feature_5,Aggregate_feature_6,Aggregate_feature_7,Aggregate_feature_8,Aggregate_feature_9,Aggregate_feature_10,Aggregate_feature_11,Aggregate_feature_12,Aggregate_feature_13,Aggregate_feature_14,Aggregate_feature_15,Aggregate_feature_16,Aggregate_feature_17,Aggregate_feature_18,Aggregate_featu
re_19,Aggregate_feature_20,Aggregate_feature_21,Aggregate_feature_22,Aggregate_feature_23,Aggregate_feature_24,Aggregate_feature_25,Aggregate_feature_26,Aggregate_feature_27,Aggregate_feature_28,Aggregate_feature_29,Aggregate_feature_30,Aggregate_feature_31,Aggregate_feature_32,Aggregate_feature_33,Aggregate_feature_34,Aggregate_feature_35,Aggregate_feature_36,Aggregate_feature_37,Aggregate_feature_38,Aggregate_feature_39,Aggregate_feature_40,Aggregate_feature_41,Aggregate_feature_42,Aggregate_feature_43,Aggregate_feature_44,Aggregate_feature_45,Aggregate_feature_46,Aggregate_feature_47,Aggregate_feature_48,Aggregate_feature_49,Aggregate_feature_50,Aggregate_feature_51,Aggregate_feature_52,Aggregate_feature_53,Aggregate_feature_54,Aggregate_feature_55,Aggregate_feature_56,Aggregate_feature_57,Aggregate_feature_58,Aggregate_feature_59,Aggregate_feature_60,Aggregate_feature_61,Aggregate_feature_62,Aggregate_feature_63,Aggregate_feature_64,Aggregate_feature_65,Aggregate_feature_66,Aggregate_feature_67,Aggregate_feature_68,Aggregate_feature_69,Aggregate_feature_70,Aggregate_feature_71,Aggregate_feature_72,in_txs_degree,out_txs_degree,total_BTC,fees,size,num_input_addresses,num_output_addresses,in_BTC_min,in_BTC_max,in_BTC_mean,in_BTC_median,in_BTC_total,out_BTC_min,out_BTC_max,out_BTC_mean,out_BTC_median,out_BTC_total,in_out_ratio,input_output_ratio,BTC_dispersion,high_fees_flag,micro_txn_flag,txn_density,fees_per_byte,fees_ratio,fees_per_input,rounded_amount_flag,input_address_percentile,output_address_percentile
0,30549576,8,3,Unknown,-0.17,-0.1805,1.0186,-0.122,-0.0439,-0.113,-0.0616,-0.1606,-0.1665,-0.0497,-0.1629,-0.0287,-0.0354,-0.043,-0.0133,-0.0542,-0.1684,-0.17,-0.1728,-1.3737,-1.3715,-0.1397,-0.1489,-0.0801,-0.1557,-0.0108,-0.0121,-0.1397,-0.1489,-0.0801,-0.1557,-0.0107,-0.012,-0.0247,-0.0313,-0.023,-0.0262,0.0014,0.0015,-0.2272,-0.2394,-0.0753,-0.235,0.0375,0.0434,-0.2272,-0.2432,-0.0979,-0.2359,0.0366,0.0423,-0.4138,-0.4882,-0.2326,-0.4674,0.0488,0.053,-0.0392,-0.1729,-0.1631,-0.1609,-1.3163,-1.3154,-0.0391,-0.1729,-0.1631,-0.1609,-1.3163,-1.3154,-0.017,-0.03,-0.0176,-0.0151,-0.1408,-0.1403,-0.0954,-0.2644,-0.2506,-0.2638,-0.1691,-0.1672,-0.059,-0.2624,-0.2552,-0.2593,-0.1872,-0.1853,-0.293,-0.7516,-0.6821,-0.7115,1.1355,1.1353,-0.1233,-0.1656,-0.1168,-0.1466,-0.0147,-0.0188,-1.0061,-1.0346,-0.0835,-1.0277,-0.0888,-0.0904,0.0446,-0.056,-0.1345,0.0069,-0.0032,-0.0041,-1.0963,-1.2673,-0.3499,-1.2304,-0.0044,-0.0042,-0.1164,-0.1766,-0.1373,-0.1525,-0.0261,-0.0277,-0.0376,-0.1111,-0.0977,-0.0807,0.0031,0.0024,-0.1243,-0.1991,-0.1872,-0.2136,-1.1596,-1.1601,-0.9502,-0.964,-0.2502,-0.9792,1.342,1.3407,-0.1729,-0.4576,-0.4225,-0.4408,-1.016,-1.0162,-0.9689,0.147,1.3663,-0.4648,-1.1169,-1.1169,-0.2168,-0.6179,-0.5771,-0.6136,0.2411,0.2414,0.0183,-0.0875,-0.1312,-0.0975,-0.1206,-0.1198,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [32]:
# Count transactions for each (class code, class label) pair
class_groups = df_txn_features_clean.groupby(["class", "class_label"])
df_counts = class_groups.size().rename("count").reset_index()

df_counts

Unnamed: 0,class,class_label,count
0,1,Illicit,4545
1,2,Licit,42019
2,3,Unknown,157205


----------------------

Remove duplicates in the 'df_wallet' dataset.

----------------------

In [11]:
%%bigquery df_wallet
-- Read in combined wallet features table from BigQuery; the query result is stored in the
-- DataFrame `df_wallet`.
-- The %%bigquery cell magic must be the very first line of the cell, so this note is a
-- SQL comment below the magic rather than a Python comment above it.
select * from `extreme-torch-467913-m6.actor.wallets_features`

Query is running:   0%|          |

Downloading:   0%|          |

The dataset below shows the aggregated transactions sent and received, with their values, across multiple time steps. However, this dataset contains duplicates: the aggregated values per address are repeated due to the way the dataset was joined, which has resulted in the address information being multiplied by the number of transactions the address has made across all time steps.

We can see that there are 1.2m rows across 57 columns. However, there are only 823k unique addresses. This means that there are many duplicate rows we need to remove.

In [12]:
# Show every raw row held for one example address, ordered by time step
example_address = '1H1WS2tFx5yCsxtefs9PSTdqVX2mduyf3b'
is_example_address = df_wallet['address'] == example_address
df_wallet.loc[is_example_address].sort_values(by='Time step')

Unnamed: 0,address,Time step,num_txs_as_sender,num_txs_as receiver,first_block_appeared_in,last_block_appeared_in,lifetime_in_blocks,total_txs,first_sent_block,first_received_block,num_timesteps_appeared_in,btc_transacted_total,btc_transacted_min,btc_transacted_max,btc_transacted_mean,btc_transacted_median,btc_sent_total,btc_sent_min,btc_sent_max,btc_sent_mean,btc_sent_median,btc_received_total,btc_received_min,btc_received_max,btc_received_mean,btc_received_median,fees_total,fees_min,fees_max,fees_mean,fees_median,fees_as_share_total,fees_as_share_min,fees_as_share_max,fees_as_share_mean,fees_as_share_median,blocks_btwn_txs_total,blocks_btwn_txs_min,blocks_btwn_txs_max,blocks_btwn_txs_mean,blocks_btwn_txs_median,blocks_btwn_input_txs_total,blocks_btwn_input_txs_min,blocks_btwn_input_txs_max,blocks_btwn_input_txs_mean,blocks_btwn_input_txs_median,blocks_btwn_output_txs_total,blocks_btwn_output_txs_min,blocks_btwn_output_txs_max,blocks_btwn_output_txs_mean,blocks_btwn_output_txs_median,num_addr_transacted_multiple,transacted_w_address_total,transacted_w_address_min,transacted_w_address_max,transacted_w_address_mean,transacted_w_address_median
1219877,1H1WS2tFx5yCsxtefs9PSTdqVX2mduyf3b,1,12.0,14.0,391200.0,485959.0,94759.0,26.0,391200.0,399266.0,14.0,218.5086,0.0053,50.0,8.4042,0.2479,66.348,0.0,50.0,2.5518,0.0,152.1606,0.0,50.0,5.8523,0.0125,0.016,0.0002,0.0023,0.0006,0.0005,0.0071,0.0,0.0012,0.0003,0.0002,94759.0,0.0,22178.0,3790.36,2011.0,92743.0,4.0,24189.0,8431.1818,6042.0,86693.0,0.0,30242.0,6668.6923,2016.0,5.0,62.0,1.0,4.0,1.1481,1.0
1219878,1H1WS2tFx5yCsxtefs9PSTdqVX2mduyf3b,1,12.0,14.0,391200.0,485959.0,94759.0,26.0,391200.0,399266.0,14.0,218.5086,0.0053,50.0,8.4042,0.2479,66.348,0.0,50.0,2.5518,0.0,152.1606,0.0,50.0,5.8523,0.0125,0.016,0.0002,0.0023,0.0006,0.0005,0.0071,0.0,0.0012,0.0003,0.0002,94759.0,0.0,22178.0,3790.36,2011.0,92743.0,4.0,24189.0,8431.1818,6042.0,86693.0,0.0,30242.0,6668.6923,2016.0,5.0,62.0,1.0,4.0,1.1481,1.0
692648,1H1WS2tFx5yCsxtefs9PSTdqVX2mduyf3b,5,12.0,14.0,391200.0,485959.0,94759.0,26.0,391200.0,399266.0,14.0,218.5086,0.0053,50.0,8.4042,0.2479,66.348,0.0,50.0,2.5518,0.0,152.1606,0.0,50.0,5.8523,0.0125,0.016,0.0002,0.0023,0.0006,0.0005,0.0071,0.0,0.0012,0.0003,0.0002,94759.0,0.0,22178.0,3790.36,2011.0,92743.0,4.0,24189.0,8431.1818,6042.0,86693.0,0.0,30242.0,6668.6923,2016.0,5.0,62.0,1.0,4.0,1.1481,1.0
403558,1H1WS2tFx5yCsxtefs9PSTdqVX2mduyf3b,5,12.0,14.0,391200.0,485959.0,94759.0,26.0,391200.0,399266.0,14.0,218.5086,0.0053,50.0,8.4042,0.2479,66.348,0.0,50.0,2.5518,0.0,152.1606,0.0,50.0,5.8523,0.0125,0.016,0.0002,0.0023,0.0006,0.0005,0.0071,0.0,0.0012,0.0003,0.0002,94759.0,0.0,22178.0,3790.36,2011.0,92743.0,4.0,24189.0,8431.1818,6042.0,86693.0,0.0,30242.0,6668.6923,2016.0,5.0,62.0,1.0,4.0,1.1481,1.0
1181102,1H1WS2tFx5yCsxtefs9PSTdqVX2mduyf3b,5,12.0,14.0,391200.0,485959.0,94759.0,26.0,391200.0,399266.0,14.0,218.5086,0.0053,50.0,8.4042,0.2479,66.348,0.0,50.0,2.5518,0.0,152.1606,0.0,50.0,5.8523,0.0125,0.016,0.0002,0.0023,0.0006,0.0005,0.0071,0.0,0.0012,0.0003,0.0002,94759.0,0.0,22178.0,3790.36,2011.0,92743.0,4.0,24189.0,8431.1818,6042.0,86693.0,0.0,30242.0,6668.6923,2016.0,5.0,62.0,1.0,4.0,1.1481,1.0
1109071,1H1WS2tFx5yCsxtefs9PSTdqVX2mduyf3b,5,12.0,14.0,391200.0,485959.0,94759.0,26.0,391200.0,399266.0,14.0,218.5086,0.0053,50.0,8.4042,0.2479,66.348,0.0,50.0,2.5518,0.0,152.1606,0.0,50.0,5.8523,0.0125,0.016,0.0002,0.0023,0.0006,0.0005,0.0071,0.0,0.0012,0.0003,0.0002,94759.0,0.0,22178.0,3790.36,2011.0,92743.0,4.0,24189.0,8431.1818,6042.0,86693.0,0.0,30242.0,6668.6923,2016.0,5.0,62.0,1.0,4.0,1.1481,1.0
130863,1H1WS2tFx5yCsxtefs9PSTdqVX2mduyf3b,7,12.0,14.0,391200.0,485959.0,94759.0,26.0,391200.0,399266.0,14.0,218.5086,0.0053,50.0,8.4042,0.2479,66.348,0.0,50.0,2.5518,0.0,152.1606,0.0,50.0,5.8523,0.0125,0.016,0.0002,0.0023,0.0006,0.0005,0.0071,0.0,0.0012,0.0003,0.0002,94759.0,0.0,22178.0,3790.36,2011.0,92743.0,4.0,24189.0,8431.1818,6042.0,86693.0,0.0,30242.0,6668.6923,2016.0,5.0,62.0,1.0,4.0,1.1481,1.0
966575,1H1WS2tFx5yCsxtefs9PSTdqVX2mduyf3b,7,12.0,14.0,391200.0,485959.0,94759.0,26.0,391200.0,399266.0,14.0,218.5086,0.0053,50.0,8.4042,0.2479,66.348,0.0,50.0,2.5518,0.0,152.1606,0.0,50.0,5.8523,0.0125,0.016,0.0002,0.0023,0.0006,0.0005,0.0071,0.0,0.0012,0.0003,0.0002,94759.0,0.0,22178.0,3790.36,2011.0,92743.0,4.0,24189.0,8431.1818,6042.0,86693.0,0.0,30242.0,6668.6923,2016.0,5.0,62.0,1.0,4.0,1.1481,1.0
1043639,1H1WS2tFx5yCsxtefs9PSTdqVX2mduyf3b,7,12.0,14.0,391200.0,485959.0,94759.0,26.0,391200.0,399266.0,14.0,218.5086,0.0053,50.0,8.4042,0.2479,66.348,0.0,50.0,2.5518,0.0,152.1606,0.0,50.0,5.8523,0.0125,0.016,0.0002,0.0023,0.0006,0.0005,0.0071,0.0,0.0012,0.0003,0.0002,94759.0,0.0,22178.0,3790.36,2011.0,92743.0,4.0,24189.0,8431.1818,6042.0,86693.0,0.0,30242.0,6668.6923,2016.0,5.0,62.0,1.0,4.0,1.1481,1.0
403557,1H1WS2tFx5yCsxtefs9PSTdqVX2mduyf3b,10,12.0,14.0,391200.0,485959.0,94759.0,26.0,391200.0,399266.0,14.0,218.5086,0.0053,50.0,8.4042,0.2479,66.348,0.0,50.0,2.5518,0.0,152.1606,0.0,50.0,5.8523,0.0125,0.016,0.0002,0.0023,0.0006,0.0005,0.0071,0.0,0.0012,0.0003,0.0002,94759.0,0.0,22178.0,3790.36,2011.0,92743.0,4.0,24189.0,8431.1818,6042.0,86693.0,0.0,30242.0,6668.6923,2016.0,5.0,62.0,1.0,4.0,1.1481,1.0


Remove the 'Time step' field and then remove duplicates. The expected result is a de-duplicated dataset with 822,942 rows, representing 1 row per address.

In [13]:
# Drop 'Time step' first: in the sample shown above it is the only column that differs between
# an address's repeated rows, so without it those rows become exact duplicates
df_wallet_clean = df_wallet.drop(columns=['Time step']).drop_duplicates()

# drop_duplicates() only collapses rows that match on every column, so confirm the result
# really is one row per address before it is used downstream
assert df_wallet_clean['address'].is_unique, "expected exactly one row per address after de-duplication"

df_wallet_clean.shape

(822942, 56)

Now we can see that the 'Time step' field and the duplicates have been removed. The outcome is 1 row per address, resulting in 822,942 rows and 56 columns.

In [14]:
# Re-check the same example address in the de-duplicated frame
df_wallet_clean.loc[df_wallet_clean['address'].eq('1H1WS2tFx5yCsxtefs9PSTdqVX2mduyf3b')]

Unnamed: 0,address,num_txs_as_sender,num_txs_as receiver,first_block_appeared_in,last_block_appeared_in,lifetime_in_blocks,total_txs,first_sent_block,first_received_block,num_timesteps_appeared_in,btc_transacted_total,btc_transacted_min,btc_transacted_max,btc_transacted_mean,btc_transacted_median,btc_sent_total,btc_sent_min,btc_sent_max,btc_sent_mean,btc_sent_median,btc_received_total,btc_received_min,btc_received_max,btc_received_mean,btc_received_median,fees_total,fees_min,fees_max,fees_mean,fees_median,fees_as_share_total,fees_as_share_min,fees_as_share_max,fees_as_share_mean,fees_as_share_median,blocks_btwn_txs_total,blocks_btwn_txs_min,blocks_btwn_txs_max,blocks_btwn_txs_mean,blocks_btwn_txs_median,blocks_btwn_input_txs_total,blocks_btwn_input_txs_min,blocks_btwn_input_txs_max,blocks_btwn_input_txs_mean,blocks_btwn_input_txs_median,blocks_btwn_output_txs_total,blocks_btwn_output_txs_min,blocks_btwn_output_txs_max,blocks_btwn_output_txs_mean,blocks_btwn_output_txs_median,num_addr_transacted_multiple,transacted_w_address_total,transacted_w_address_min,transacted_w_address_max,transacted_w_address_mean,transacted_w_address_median
6335,1H1WS2tFx5yCsxtefs9PSTdqVX2mduyf3b,12.0,14.0,391200.0,485959.0,94759.0,26.0,391200.0,399266.0,14.0,218.5086,0.0053,50.0,8.4042,0.2479,66.348,0.0,50.0,2.5518,0.0,152.1606,0.0,50.0,5.8523,0.0125,0.016,0.0002,0.0023,0.0006,0.0005,0.0071,0.0,0.0012,0.0003,0.0002,94759.0,0.0,22178.0,3790.36,2011.0,92743.0,4.0,24189.0,8431.1818,6042.0,86693.0,0.0,30242.0,6668.6923,2016.0,5.0,62.0,1.0,4.0,1.1481,1.0


-------------------
View shape of txn data (linear v non-linear)

-------------------

In [15]:
# Preview the first row of the transaction feature frame before the linearity scan
df_txn_features_clean.iloc[:1]

Unnamed: 0,txId,Time step,class,class_label,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,Local_feature_8,Local_feature_9,Local_feature_10,Local_feature_11,Local_feature_12,Local_feature_13,Local_feature_14,Local_feature_15,Local_feature_16,Local_feature_17,Local_feature_18,Local_feature_19,Local_feature_20,Local_feature_21,Local_feature_22,Local_feature_23,Local_feature_24,Local_feature_25,Local_feature_26,Local_feature_27,Local_feature_28,Local_feature_29,Local_feature_30,Local_feature_31,Local_feature_32,Local_feature_33,Local_feature_34,Local_feature_35,Local_feature_36,Local_feature_37,Local_feature_38,Local_feature_39,Local_feature_40,Local_feature_41,Local_feature_42,Local_feature_43,Local_feature_44,Local_feature_45,Local_feature_46,Local_feature_47,Local_feature_48,Local_feature_49,Local_feature_50,Local_feature_51,Local_feature_52,Local_feature_53,Local_feature_54,Local_feature_55,Local_feature_56,Local_feature_57,Local_feature_58,Local_feature_59,Local_feature_60,Local_feature_61,Local_feature_62,Local_feature_63,Local_feature_64,Local_feature_65,Local_feature_66,Local_feature_67,Local_feature_68,Local_feature_69,Local_feature_70,Local_feature_71,Local_feature_72,Local_feature_73,Local_feature_74,Local_feature_75,Local_feature_76,Local_feature_77,Local_feature_78,Local_feature_79,Local_feature_80,Local_feature_81,Local_feature_82,Local_feature_83,Local_feature_84,Local_feature_85,Local_feature_86,Local_feature_87,Local_feature_88,Local_feature_89,Local_feature_90,Local_feature_91,Local_feature_92,Local_feature_93,Aggregate_feature_1,Aggregate_feature_2,Aggregate_feature_3,Aggregate_feature_4,Aggregate_feature_5,Aggregate_feature_6,Aggregate_feature_7,Aggregate_feature_8,Aggregate_feature_9,Aggregate_feature_10,Aggregate_feature_11,Aggregate_feature_12,Aggregate_feature_13,Aggregate_feature_14,Aggregate_feature_15,Aggregate_feature_16,Aggregate_feature_17,Aggregate_feature_18,Aggregate_featu
re_19,Aggregate_feature_20,Aggregate_feature_21,Aggregate_feature_22,Aggregate_feature_23,Aggregate_feature_24,Aggregate_feature_25,Aggregate_feature_26,Aggregate_feature_27,Aggregate_feature_28,Aggregate_feature_29,Aggregate_feature_30,Aggregate_feature_31,Aggregate_feature_32,Aggregate_feature_33,Aggregate_feature_34,Aggregate_feature_35,Aggregate_feature_36,Aggregate_feature_37,Aggregate_feature_38,Aggregate_feature_39,Aggregate_feature_40,Aggregate_feature_41,Aggregate_feature_42,Aggregate_feature_43,Aggregate_feature_44,Aggregate_feature_45,Aggregate_feature_46,Aggregate_feature_47,Aggregate_feature_48,Aggregate_feature_49,Aggregate_feature_50,Aggregate_feature_51,Aggregate_feature_52,Aggregate_feature_53,Aggregate_feature_54,Aggregate_feature_55,Aggregate_feature_56,Aggregate_feature_57,Aggregate_feature_58,Aggregate_feature_59,Aggregate_feature_60,Aggregate_feature_61,Aggregate_feature_62,Aggregate_feature_63,Aggregate_feature_64,Aggregate_feature_65,Aggregate_feature_66,Aggregate_feature_67,Aggregate_feature_68,Aggregate_feature_69,Aggregate_feature_70,Aggregate_feature_71,Aggregate_feature_72,in_txs_degree,out_txs_degree,total_BTC,fees,size,num_input_addresses,num_output_addresses,in_BTC_min,in_BTC_max,in_BTC_mean,in_BTC_median,in_BTC_total,out_BTC_min,out_BTC_max,out_BTC_mean,out_BTC_median,out_BTC_total,in_out_ratio,input_output_ratio,BTC_dispersion,high_fees_flag,micro_txn_flag,txn_density,fees_per_byte,fees_ratio,fees_per_input,rounded_amount_flag,input_address_percentile,output_address_percentile
0,30549576,8,3,Unknown,-0.17,-0.1805,1.0186,-0.122,-0.0439,-0.113,-0.0616,-0.1606,-0.1665,-0.0497,-0.1629,-0.0287,-0.0354,-0.043,-0.0133,-0.0542,-0.1684,-0.17,-0.1728,-1.3737,-1.3715,-0.1397,-0.1489,-0.0801,-0.1557,-0.0108,-0.0121,-0.1397,-0.1489,-0.0801,-0.1557,-0.0107,-0.012,-0.0247,-0.0313,-0.023,-0.0262,0.0014,0.0015,-0.2272,-0.2394,-0.0753,-0.235,0.0375,0.0434,-0.2272,-0.2432,-0.0979,-0.2359,0.0366,0.0423,-0.4138,-0.4882,-0.2326,-0.4674,0.0488,0.053,-0.0392,-0.1729,-0.1631,-0.1609,-1.3163,-1.3154,-0.0391,-0.1729,-0.1631,-0.1609,-1.3163,-1.3154,-0.017,-0.03,-0.0176,-0.0151,-0.1408,-0.1403,-0.0954,-0.2644,-0.2506,-0.2638,-0.1691,-0.1672,-0.059,-0.2624,-0.2552,-0.2593,-0.1872,-0.1853,-0.293,-0.7516,-0.6821,-0.7115,1.1355,1.1353,-0.1233,-0.1656,-0.1168,-0.1466,-0.0147,-0.0188,-1.0061,-1.0346,-0.0835,-1.0277,-0.0888,-0.0904,0.0446,-0.056,-0.1345,0.0069,-0.0032,-0.0041,-1.0963,-1.2673,-0.3499,-1.2304,-0.0044,-0.0042,-0.1164,-0.1766,-0.1373,-0.1525,-0.0261,-0.0277,-0.0376,-0.1111,-0.0977,-0.0807,0.0031,0.0024,-0.1243,-0.1991,-0.1872,-0.2136,-1.1596,-1.1601,-0.9502,-0.964,-0.2502,-0.9792,1.342,1.3407,-0.1729,-0.4576,-0.4225,-0.4408,-1.016,-1.0162,-0.9689,0.147,1.3663,-0.4648,-1.1169,-1.1169,-0.2168,-0.6179,-0.5771,-0.6136,0.2411,0.2414,0.0183,-0.0875,-0.1312,-0.0975,-0.1206,-0.1198,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [19]:
# Start timer (perf_counter is a monotonic clock intended for measuring elapsed time)
start_time = time.perf_counter()

# Thresholds used to label the relationship between each feature and the target
PEARSON_LINEAR_MIN = 0.5       # |pearson| above this (together with R2_LINEAR_MIN) -> "linear"
R2_LINEAR_MIN = 0.3            # linear-regression R² above this -> "linear"
PEARSON_NONLINEAR_MAX = 0.3    # |pearson| below this is required for "nonlinear"
SPEARMAN_NONLINEAR_MIN = 0.5   # |spearman| above this -> "nonlinear"
MI_NONLINEAR_MIN = 0.1         # mutual information above this -> "nonlinear"
MI_RANDOM_STATE = 42           # fixes the estimator's random noise so reruns reproduce the table

# NOTE(review): every column except 'class' and 'class_label' is scored, including the
# identifier 'txId' — confirm whether it should be excluded.
# NOTE(review): the target is the numeric class code, where 3 is mapped to 'Unknown' earlier
# in this notebook; it is treated here as a numeric value — confirm this is intended.
X = df_txn_features_clean.drop(columns=['class', 'class_label'])
y = df_txn_features_clean['class']

results = []

for col in X.columns:
    feature = X[col]

    # Keep rows where feature and target are both present and the feature is finite.
    # +/-inf values would make Pearson NaN and make the regression and mutual-information
    # fits raise, which would hide the feature's real relationship.
    mask = ~(feature.isna() | feature.isin([np.inf, -np.inf]) | y.isna())
    x_valid = feature.loc[mask].to_numpy(dtype=float)
    y_valid = y.loc[mask].to_numpy(dtype=float)
    x_2d = x_valid.reshape(-1, 1)  # scikit-learn expects a 2-D feature matrix

    if len(y_valid) < 5:  # skip if too few values
        continue

    # Each statistic is best-effort: a failure is recorded as NaN so that one problematic
    # column does not abort the whole scan.

    # Pearson
    try:
        pearson = np.corrcoef(x_valid, y_valid)[0, 1]
    except Exception:
        pearson = np.nan

    # Spearman
    try:
        spearman, _ = spearmanr(x_valid, y_valid)
    except Exception:
        spearman = np.nan

    # Linear regression R²
    try:
        model = LinearRegression().fit(x_2d, y_valid)
        r2 = model.score(x_2d, y_valid)
    except Exception:
        r2 = np.nan

    # Mutual info (seeded, because the estimator adds random noise to the data)
    try:
        mi = mutual_info_regression(
            x_2d, y_valid, discrete_features=False, random_state=MI_RANDOM_STATE
        )[0]
    except Exception:
        mi = np.nan

    # Classify relationship
    # Comparisons against NaN are False, so a feature whose statistics could not be computed
    # falls through to "weak"
    if abs(pearson) > PEARSON_LINEAR_MIN and r2 > R2_LINEAR_MIN:
        relation = "linear"
    elif abs(pearson) < PEARSON_NONLINEAR_MAX and (
        abs(spearman) > SPEARMAN_NONLINEAR_MIN or mi > MI_NONLINEAR_MIN
    ):
        relation = "nonlinear"
    else:
        relation = "weak"

    results.append((col, pearson, spearman, r2, mi, relation))

df_datashape_summary = pd.DataFrame(
    results,
    columns=['feature', 'pearson', 'spearman', 'r2', 'mutual_info', 'relation']
)

# End timer
end_time = time.perf_counter()
elapsed_minutes = (end_time - start_time) / 60
print(f"Execution time: {elapsed_minutes:.2f} minutes")


Execution time: 6.74 minutes


In [24]:
# Preview the first five rows of the per-feature relationship summary
df_datashape_summary.iloc[:5]

Unnamed: 0,feature,pearson,spearman,r2,mutual_info,relation
0,txId,-0.0274,-0.0183,0.0008,0.188,nonlinear
1,Time step,-0.0273,-0.0235,0.0007,0.0187,weak
2,Local_feature_1,0.0326,0.0747,0.0011,0.0357,weak
3,Local_feature_2,-0.074,-0.1226,0.0055,0.1603,nonlinear
4,Local_feature_3,-0.0804,-0.0903,0.0065,0.0151,weak


In [29]:
# Tally how many features received each relationship label
relation_labels = df_datashape_summary.loc[:, 'relation']
relation_counts_df = relation_labels.value_counts().reset_index()

relation_counts_df

Unnamed: 0,relation,count
0,weak,153
1,nonlinear,43


-------------------

Save as a new table in BigQuery

-------------------

In [None]:
# BigQuery destination: the project ID, then the IDs of the two output tables
project_id = 'extreme-torch-467913-m6'
table1_id, table2_id = 'txn.txn_features_clean', 'actor.wallets_features_clean'

In [None]:
# Save DataFrame to BigQuery
# Writing is opt-in: if_exists='replace' overwrites the destination tables, so the uploads run
# only when the flag below is set to True. This replaces toggling the calls by commenting them
# in and out; with the default of False the cell does nothing.
WRITE_TO_BIGQUERY = False

if WRITE_TO_BIGQUERY:
    to_gbq(df_txn_features_clean, table1_id, project_id=project_id, if_exists='replace')
    to_gbq(df_wallet_clean, table2_id, project_id=project_id, if_exists='replace')

100%|██████████| 1/1 [00:00<00:00, 6842.26it/s]
100%|██████████| 1/1 [00:00<00:00, 10155.70it/s]
