### POISON HUNTER IMPLEMENTATION 

## DATA PREP

In [39]:
%%bigquery step_1_df
SELECT
  block_number,
  block_timestamp,
  token_address,
  from_address,
  to_address,
  value,
  CAST(value AS NUMERIC)/1000000 AS value_usd,
  transaction_hash
FROM `bigquery-public-data.crypto_ethereum.token_transfers`
WHERE token_address IN (
    '0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48', -- USDC
    '0xdac17f958d2ee523a2206206994597c13d831ec7'  -- USDT
)
AND block_timestamp >= TIMESTAMP("2024-07-01")
AND block_timestamp < TIMESTAMP("2024-08-01")
AND from_address != to_address
ORDER BY block_timestamp

Query is running:   0%|          |

Downloading:   0%|          |

In [40]:
step_1_df.head()

Unnamed: 0,block_number,block_timestamp,token_address,from_address,to_address,value,value_usd,transaction_hash
0,20207949,2024-07-01 00:00:11+00:00,0xdac17f958d2ee523a2206206994597c13d831ec7,0x3ec6be020a96510719c608b966a6a9c4d8451e2d,0x6f0bdcbeb74b568171d14884151cefdfe6e67e82,10000000,10.0,0xdc043489abbb019e2f01c05b21c84954c8462bf4e725...
1,20207949,2024-07-01 00:00:11+00:00,0xdac17f958d2ee523a2206206994597c13d831ec7,0xb921807735d83f0d93c0395f1a10edb06e016da9,0x7563758243a262e96880f178aee7817dcf47ab0f,340573867,340.573867,0x9a7a83cb918efa64d905867865bc7d6d46357d97225a...
2,20207949,2024-07-01 00:00:11+00:00,0xdac17f958d2ee523a2206206994597c13d831ec7,0xa9d1e08c7793af67e9d92fe308d5697fb81d3e43,0x5181ef4d130be083624aadb2229ffcf440a53f97,497411600,497.4116,0x43c631eaa109f6957376cb67fd4dc0692e737bc89c94...
3,20207949,2024-07-01 00:00:11+00:00,0xdac17f958d2ee523a2206206994597c13d831ec7,0xc05352bd44fb0d0beab927645470a27f460a106f,0x4f9fc4e0b79c1cbf16e68863ad5e9de6a94a346c,758456785,758.456785,0x654f452c914a1afa7ba5155faaf60e5f5b1f61c82ee5...
4,20207949,2024-07-01 00:00:11+00:00,0xdac17f958d2ee523a2206206994597c13d831ec7,0xba340a75c9398d9d99c1fe16c6500713cedbf6a7,0x7563758243a262e96880f178aee7817dcf47ab0f,266890000,266.89,0x70171e21c55003637f9352a8be75f37ee28a6964830e...


In [41]:
print(f"number of txns: {len(step_1_df)}")

number of txns: 7291135


In [44]:
import pandas as pd

min_block = step_1_df['block_number'].min()
max_block = step_1_df['block_number'].max()

def filter_by_block(start_block: int, end_block: int, df: pd.DataFrame = step_1_df) -> pd.DataFrame:
    """
    Filters a pandas DataFrame by block_number range and returns a new DataFrame.

    Args:
        df (pd.DataFrame): Input DataFrame containing a 'block_number' column.
        start_block (int): Minimum block_number (inclusive).
        end_block (int): Maximum block_number (inclusive).

    Returns:
        pd.DataFrame: Filtered DataFrame.
    """
    # Ensure block_number column exists
    if 'block_number' not in df.columns:
        raise ValueError("DataFrame must contain a 'block_number' column.")
    
    filtered_df = df[(df['block_number'] >= start_block) & (df['block_number'] <= end_block)]
    return filtered_df

In [45]:
print(f"min block: {min_block}")
print(f"max block: {max_block}")

min block: 20207949
max block: 20429972


In [72]:
BLOCK_WDOW = 100
# transaction_set = step_1_df
import time

start = time.time()  # record start time
sus_txns = []
print(f"started at {start}")
for block_num in range(min_block,min_block+100):
    vimtim_block = filter_by_block(block_num,block_num)
    search_window = filter_by_block(block_num+1,block_num+BLOCK_WDOW+1)
    new_df = step2_vectorized(search_window, vimtim_block)
    if not new_df.empty:
        sus_txns.append(new_df)
end = time.time()    # record end time
combined_df_vector = pd.concat(sus_txns, ignore_index=True)
print(f"end time: {end}")

started at 1764382344.7650928
end time: 1764382352.366266


In [67]:
BLOCK_WDOW = 100
# transaction_set = step_1_df
import time

start = time.time()  # record start time
sus_txns = []
print(f"started at {start}")
for block_num in range(min_block,min_block+100):
    vimtim_block = filter_by_block(block_num,block_num)
    search_window = filter_by_block(block_num+1,block_num+BLOCK_WDOW+1)
    new_df = step2_unvectorized(search_window, vimtim_block)
    if not new_df.empty:
        sus_txns.append(new_df)
end = time.time()    # record end time
combined_df_unvector = pd.concat(sus_txns, ignore_index=True)
print(f"end time: {end}")

started at 1764381956.5140414
end time: 1764381979.9795208


In [68]:
print(combined_df_vector.equals(combined_df_unvector))
# combined_df_unvector

False


In [None]:
combined_df_vector

In [65]:
# Shows differences side by side
combined_df_vector.compare(combined_df_unvector)

# # Keep equal values too
# combined_df_vector.compare(combined_df_unvector, keep_equal=True)

# # Keep shape (show all rows)
# combined_df_vector.compare(combined_df_unvector, keep_shape=True)

In [71]:
import pandas as pd
import numpy as np

def step2_vectorized(window_dataframe, block_1):
    
    victim_set = set(block_1['from_address'])
    
    # Vectorized conditions
    mask_zero = (window_dataframe['value_usd'] == 0) & (window_dataframe['from_address'].isin(victim_set))
    mask_dust = (window_dataframe['value_usd'] > 0) & (window_dataframe['value_usd'] <= 10) & (window_dataframe['to_address'].isin(victim_set))
    
    combined_mask = mask_zero | mask_dust
    
    # Early return if no matches
    if not combined_mask.any():
        return pd.DataFrame(columns=list(window_dataframe.columns) + ['attacker_address', 'victim_address', 'attack_type'])
    
    # Filter matching rows
    attacking_transactions = window_dataframe[combined_mask].copy()
    
    # Set attacker address based on which condition matched
    attacking_transactions['attacker_address'] = np.where(
        mask_zero[combined_mask],
        attacking_transactions['to_address'],
        attacking_transactions['from_address']
    )
    
    # Set victim address (opposite of attacker)
    attacking_transactions['victim_address'] = np.where(
        mask_zero[combined_mask],
        attacking_transactions['from_address'],
        attacking_transactions['to_address']
    )
    
    # Set attack type
    attacking_transactions['attack_type'] = np.where(
        mask_zero[combined_mask],
        'zero',
        'dust'
    )
    
    return attacking_transactions


def step2_unvectorized(window_dataframe, block_1):

    victim_set = set(block_1['from_address'])

    rows = []

    for _idx, tx in window_dataframe.iterrows():
        attacker_address = None
        victim_address = None
        attack_type = None
        
        if tx.value_usd == 0 and tx.from_address in victim_set: 
            attacker_address = tx.to_address
            victim_address = tx.from_address
            attack_type = 'zero'
            
        elif 0 < tx.value_usd <= 1 and tx.to_address in victim_set:
            attacker_address = tx.from_address
            victim_address = tx.to_address
            attack_type = 'dust'
        
        if attacker_address is not None:
            row = tx.to_dict()
            row['attacker_address'] = attacker_address
            row['victim_address'] = victim_address
            row['attack_type'] = attack_type
            rows.append(row)

    # Handle empty result
    if not rows:
        return pd.DataFrame(columns=list(window_dataframe.columns) + ['attacker_address', 'victim_address', 'attack_type'])
    
    return pd.DataFrame(rows)
