# POISON HUNTER IMPLEMENTATION

## DATA PREP

In [68]:
%%bigquery step_1_df
-- Pull every USDC / USDT transfer event on Ethereum mainnet for July 2024
-- into the pandas DataFrame `step_1_df`.
--   * value_usd scales the raw token amount by 1e6 and treats one token as
--     one USD (assumes both tokens use 6 decimals -- TODO confirm).
--   * The timestamp filter is half-open: [2024-07-01, 2024-08-01).
--   * Self-transfers (from_address = to_address) are excluded.
SELECT
  block_number,
  block_timestamp,
  token_address,
  from_address,
  to_address,
  value,
  CAST(value AS NUMERIC)/1000000 AS value_usd,
  transaction_hash
FROM `bigquery-public-data.crypto_ethereum.token_transfers`
WHERE token_address IN (
    '0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48', -- USDC
    '0xdac17f958d2ee523a2206206994597c13d831ec7'  -- USDT
)
AND block_timestamp >= TIMESTAMP("2024-07-01")
AND block_timestamp < TIMESTAMP("2024-08-01")
AND from_address != to_address
ORDER BY block_timestamp

Query is running:   0%|          |

Downloading:   0%|          |

In [84]:
# Preview the first rows of the query result to sanity-check columns and values.
step_1_df.head()

Unnamed: 0,block_number,block_timestamp,token_address,from_address,to_address,value,value_usd,transaction_hash
0,20207949,2024-07-01 00:00:11+00:00,0xdac17f958d2ee523a2206206994597c13d831ec7,0x3ec6be020a96510719c608b966a6a9c4d8451e2d,0x6f0bdcbeb74b568171d14884151cefdfe6e67e82,10000000,10.0,0xdc043489abbb019e2f01c05b21c84954c8462bf4e725...
1,20207949,2024-07-01 00:00:11+00:00,0xdac17f958d2ee523a2206206994597c13d831ec7,0xb921807735d83f0d93c0395f1a10edb06e016da9,0x7563758243a262e96880f178aee7817dcf47ab0f,340573867,340.573867,0x9a7a83cb918efa64d905867865bc7d6d46357d97225a...
2,20207949,2024-07-01 00:00:11+00:00,0xdac17f958d2ee523a2206206994597c13d831ec7,0xa9d1e08c7793af67e9d92fe308d5697fb81d3e43,0x5181ef4d130be083624aadb2229ffcf440a53f97,497411600,497.4116,0x43c631eaa109f6957376cb67fd4dc0692e737bc89c94...
3,20207949,2024-07-01 00:00:11+00:00,0xdac17f958d2ee523a2206206994597c13d831ec7,0xc05352bd44fb0d0beab927645470a27f460a106f,0x4f9fc4e0b79c1cbf16e68863ad5e9de6a94a346c,758456785,758.456785,0x654f452c914a1afa7ba5155faaf60e5f5b1f61c82ee5...
4,20207949,2024-07-01 00:00:11+00:00,0xdac17f958d2ee523a2206206994597c13d831ec7,0xba340a75c9398d9d99c1fe16c6500713cedbf6a7,0x7563758243a262e96880f178aee7817dcf47ab0f,266890000,266.89,0x70171e21c55003637f9352a8be75f37ee28a6964830e...


In [76]:
# Total number of rows in the extract (one row per token transfer event).
print(f"number of txns: {len(step_1_df)}")

number of txns: 7291135


In [77]:
import pandas as pd

# Lowest and highest block numbers present in the extract; later cells use
# min_block as the starting point of the block ranges they scan.
min_block = step_1_df['block_number'].min()
max_block = step_1_df['block_number'].max()

def filter_by_block(start_block: int, end_block: int, df: pd.DataFrame = None) -> pd.DataFrame:
    """
    Return the rows whose 'block_number' lies in [start_block, end_block].

    Args:
        start_block (int): Minimum block_number (inclusive).
        end_block (int): Maximum block_number (inclusive).
        df (pd.DataFrame, optional): Frame to filter; must contain a
            'block_number' column. When omitted, the notebook-level
            ``step_1_df`` is used and is looked up at call time, so
            re-running the query cell is picked up without having to
            re-define this function.

    Returns:
        pd.DataFrame: The matching rows, in their original order. Empty when
        no block falls in the range (including start_block > end_block).

    Raises:
        ValueError: If the frame has no 'block_number' column.
    """
    if df is None:
        # A default of `df=step_1_df` would be evaluated once, when the `def`
        # statement runs, and would keep pointing at that (possibly stale)
        # frame. Resolving the global here always uses the current one.
        df = step_1_df

    # Ensure block_number column exists
    if 'block_number' not in df.columns:
        raise ValueError("DataFrame must contain a 'block_number' column.")

    in_range = (df['block_number'] >= start_block) & (df['block_number'] <= end_block)
    return df[in_range]

In [78]:
# Show the block range covered by the extract.
print(f"min block: {min_block}")
print(f"max block: {max_block}")

min block: 20207949
max block: 20429972


In [52]:
import pandas as pd
import numpy as np
from typing import Set, Any
def step2_vectorized(window_dataframe, block_1, dust_threshold=1):
    """
    Flag zero-value and dust transfers in a search window that involve a victim.

    A "victim" is any address that appears as ``from_address`` in ``block_1``.
    A row of ``window_dataframe`` is flagged when either:

    * zero: ``value_usd == 0`` and the row's ``from_address`` is a victim
      (the counterparty in ``to_address`` is recorded as the attacker), or
    * dust: ``0 < value_usd <= dust_threshold`` and the row's ``to_address``
      is a victim (the sender in ``from_address`` is recorded as the attacker).

    A row that satisfies both conditions is labelled 'zero'; that cannot
    happen for a single value_usd, since the two value ranges are disjoint.

    Args:
        window_dataframe (pd.DataFrame): Candidate transfers. Must contain
            'value_usd', 'from_address' and 'to_address'.
        block_1 (pd.DataFrame): Transfers whose senders are treated as
            victims. Must contain 'from_address' and 'transaction_hash'.
        dust_threshold: Largest ``value_usd`` (inclusive) still considered
            dust. Defaults to 1, the previously hard-coded value.

    Returns:
        pd.DataFrame: A copy of the flagged rows with four extra columns:
        'attacker_address', 'victim_address', 'victim_tx_hash' (the set of
        the victim's transaction hashes in ``block_1``) and 'attack_type'
        ('zero' or 'dust'). When nothing matches, an empty frame with the
        same column names is returned.
    """
    extra_columns = ['attacker_address', 'victim_address', 'victim_tx_hash', 'attack_type']

    victim_set = set(block_1['from_address'])

    # victim address -> set of that victim's transaction hashes in block_1
    victim_to_tx_hashes = block_1.groupby('from_address')['transaction_hash'].apply(set).to_dict()

    value_usd = window_dataframe['value_usd']
    mask_zero = (value_usd == 0) & window_dataframe['from_address'].isin(victim_set)
    mask_dust = (
        (value_usd > 0)
        & (value_usd <= dust_threshold)
        & window_dataframe['to_address'].isin(victim_set)
    )
    combined_mask = mask_zero | mask_dust

    # Early return if no matches
    if not combined_mask.any():
        return pd.DataFrame(columns=list(window_dataframe.columns) + extra_columns)

    suspicious_transactions = window_dataframe[combined_mask].copy()

    # Restrict the zero-value mask to the flagged rows so it lines up
    # position-by-position with suspicious_transactions.
    is_zero = mask_zero[combined_mask].to_numpy()

    sender = suspicious_transactions['from_address']
    receiver = suspicious_transactions['to_address']

    # zero-value: victim is the sender, attacker the receiver; dust: the reverse
    suspicious_transactions['attacker_address'] = np.where(is_zero, receiver, sender)
    suspicious_transactions['victim_address'] = np.where(is_zero, sender, receiver)

    # Attach the victim's original transaction hashes (empty set if unknown)
    suspicious_transactions['victim_tx_hash'] = suspicious_transactions['victim_address'].map(
        lambda addr: victim_to_tx_hashes.get(addr, set())
    )

    suspicious_transactions['attack_type'] = np.where(is_zero, 'zero', 'dust')

    return suspicious_transactions

In [91]:
# Sequential baseline: for each of the first 1000 blocks, treat the senders in
# that block as victims and scan the following blocks for zero-value / dust
# transfers that involve them.
BLOCK_WDOW = 100
import time

start = time.time()  # record start time
sus_txns = []
print(f"started at {start}")
for block_num in range(min_block, min_block + 1000):
    victim_block = filter_by_block(block_num, block_num)
    # NOTE(review): filter_by_block is inclusive on both ends, so this window
    # covers BLOCK_WDOW + 1 blocks (block_num+1 .. block_num+BLOCK_WDOW+1).
    # Confirm that is intended.
    search_window = filter_by_block(block_num + 1, block_num + BLOCK_WDOW + 1)
    new_df = step2_vectorized(search_window, victim_block)
    if not new_df.empty:
        sus_txns.append(new_df)
end = time.time()    # record end time
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no suspicious transfer was found.
combined_df_vector = pd.concat(sus_txns, ignore_index=True) if sus_txns else pd.DataFrame()
print(f"end time: {end}")

started at 1764439369.4527228
end time: 1764439449.824726


In [94]:
# Display (rows, columns) of the sequential result. The sort by
# transaction_hash (compared as strings) and the index reset reorder rows
# only; they do not change the reported shape.
combined_df_vector.sort_values(
    by="transaction_hash",
    key=lambda col: col.astype(str)
).reset_index(drop=True).shape

(1701, 12)

In [None]:
import numpy as np
import pandas as pd
import time

# Bulk variant of the per-block scan: instead of filtering step_1_df twice per
# victim block, pair every search block with its victim blocks once, merge, and
# run step2_vectorized once per victim block.
# NOTE: BLOCK_WDOW, min_block, step_1_df and step2_vectorized are not defined
# in this cell; they must already exist in the kernel.

# -------------------------------
# 1. DEFINE THE BLOCK RANGE
# -------------------------------
start = time.time()  # record start time
NUM_BLOCKS = 1000
victim_blocks = np.arange(min_block, min_block + NUM_BLOCKS)

# Victim df = all transactions in those blocks
victim_df = step_1_df[step_1_df['block_number'].isin(victim_blocks)].copy()

# Window df = all transactions that could appear in any search window:
# from the block after the first victim block up to BLOCK_WDOW + 1 blocks
# after the last victim block (both bounds inclusive).
search_min = min_block + 1
search_max = (min_block + NUM_BLOCKS - 1) + BLOCK_WDOW + 1 

window_df = step_1_df[
    (step_1_df['block_number'] >= search_min) &
    (step_1_df['block_number'] <= search_max)
].copy()


# ----------------------------------------
# 2. BUILD MAPPING: search_block → victim_block(s)
# ----------------------------------------
# For each unique search block in window_df, compute which victim blocks it should pair with.
search_unique_blocks = window_df['block_number'].unique()

# For each search block B_s, valid victims satisfy:
#   B_v ∈ [B_s - BLOCK_WDOW - 1, B_s - 1]
#   and B_v is within our victim range
mapping_rows = []
for b_s in search_unique_blocks:
    # Lower bound includes the extra "- 1" so that a victim block v is paired
    # with search blocks v+1 .. v+BLOCK_WDOW+1 inclusive.
    v_low = b_s - BLOCK_WDOW - 1  # changed from b_s - BLOCK_WDOW
    v_high = b_s - 1

    # intersect with our victim blocks
    valid_victims = victim_blocks[
        (victim_blocks >= v_low) & (victim_blocks <= v_high)
    ]

    for v in valid_victims:
        mapping_rows.append((b_s, v))

block_map = pd.DataFrame(mapping_rows, columns=['search_block', 'victim_block'])

# ----------------------------------------
# 3. MERGE THE MAPPING WITH window_df
# ----------------------------------------
# Right join on block_map: each window_df row is repeated once per victim
# block its block_number is paired with, and gains a 'victim_block' column.
window_df2 = window_df.merge(
    block_map,
    left_on='block_number',
    right_on='search_block',
    how='right'
)

# Clean up helper col (it duplicates block_number)
window_df2 = window_df2.drop(columns=['search_block'])

# ----------------------------------------
# 4. FOR EACH victim_block, extract its victims
# ----------------------------------------
# Group victim_df once up front so the loop below does a dict lookup instead
# of re-filtering victim_df for every victim block.
victim_groups = {
    blk: grp for blk, grp in victim_df.groupby('block_number')
}

# ----------------------------------------
# 5. RUN step2_vectorized in BULK
# ----------------------------------------
# We now have a single giant window table, but rows know their victim_block.
results = []

for v_block, win_grp in window_df2.groupby('victim_block'):
    victim_block_df = victim_groups.get(v_block)
    # Victim blocks with no rows in victim_df have nothing to match against.
    if victim_block_df is None:
        continue

    out = step2_vectorized(win_grp, victim_block_df)
    if not out.empty:
        # Same value as the 'victim_block' column carried over from the merge.
        out['victim_block_number'] = v_block
        results.append(out)

# Final output = merged suspicious transactions across ALL blocks
# (empty frame when nothing was flagged, since pd.concat rejects an empty list)
step_2_df = pd.concat(results, ignore_index=True) if results else pd.DataFrame()
end = time.time()  # record end time
print(f"time elapsed for {NUM_BLOCKS} blocks: {end-start} seconds")

time elapsed for 1000 blocks: 8.994812488555908 seconds


In [None]:
# Display (rows, columns) of the bulk result.
step_2_df.shape

(1701, 14)