# 3.4 Whale Risk & Market Insights

In this notebook, we summarize **whale-related risks and market insights** based on:

- Value-based metrics
- Graph centrality metrics
- Flow analysis
- Time-series behavior

The goal is not to predict price, but to characterize **structural and behavioral risk factors** associated with large on-chain players.

## 1. Imports + Load Data

We re-use:

- The ETH transaction dataset
- The heterogeneous graph `G`
- The whale detection table from **3.1**


In [5]:
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import pickle
from pathlib import Path

# Notebook-wide plotting defaults: 10x6 figures with gridlines enabled.
plt.rcParams.update({
    "figure.figsize": (10, 6),
    "axes.grid": True,
})

# Project root is taken to be two directories above the current working
# directory; it is put on sys.path so the `src` package can be imported.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
# Guard the append so re-running this cell does not keep adding duplicate
# entries to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.data.load_data import load_clean_transactions


In [7]:
# Transaction table, as returned by the project loader.
tx = load_clean_transactions()
print("Transactions:", len(tx))

# Both processed artifacts live under <PROJECT_ROOT>/data/processed.
processed_dir = os.path.join(PROJECT_ROOT, "data", "processed")

# Heterogeneous graph, stored as a pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it was produced by a trusted step of this project.
HETERO_GRAPH_PATH = os.path.join(processed_dir, "heterogeneous_graph.gpickle")
with open(HETERO_GRAPH_PATH, "rb") as graph_file:
    G = pickle.load(graph_file)

# Whale detection table (value-based and centrality-based flags).
WHALE_PATH = os.path.join(processed_dir, "whale_detection_value_and_centrality.parquet")
whales = pd.read_parquet(WHALE_PATH)

print("Whale table shape:", whales.shape)
whales.head()


Transactions: 13268
Whale table shape: (7796, 20)


Unnamed: 0,total_out_value,n_out_tx,total_in_value,n_in_tx,n_total_tx,net_flow,is_out_whale,is_in_whale,is_whale_value,in_degree,out_degree,degree,pagerank,hub_score,authority_score,is_whale_degree,is_whale_pagerank,is_whale_authority,is_whale_centrality,is_whale
0x0000000000000068f116a894984e2db1123eb395,0.0,0.0,9.49143e+19,15.0,15.0,9.49143e+19,False,False,False,14,0,14,0.000286,-0.0,2.2747760000000003e-17,False,False,False,False,False
0x0000000000001ff3684f28c67538d4d072c22734,0.0,0.0,6.716182e+20,47.0,47.0,6.716182e+20,False,False,False,40,0,40,0.000513,-0.0,0.000671533,False,False,False,False,False
0x0000000000a39bb272e79075ade125fd351887ac,0.0,0.0,1.401e+20,23.0,23.0,1.401e+20,False,False,False,20,0,20,0.000374,-0.0,8.615999e-06,False,False,False,False,False
0x00000000219ab540356cbb839cbe05303d7705fa,0.0,0.0,1.853817e+22,347.0,347.0,1.853817e+22,False,False,False,311,0,311,0.0062,-0.0,3.90484e-09,True,True,False,True,True
0x00000047bb99ea4d791bb749d970de71ee0b1a34,0.0,0.0,1.306812e+20,15.0,15.0,1.306812e+20,False,False,False,10,2,12,0.000195,3.6e-05,0.0002554334,False,False,False,False,False


## 2. Concentration of Volume and Activity

We first quantify **how concentrated** activity and volume are:

- Share of total volume captured by whales vs non-whales
- Share of volume captured by the top-k whales
- Simple inequality index (e.g., Gini) based on total volume


In [8]:
# Determine whale addresses
whale_addrs = whales.index[whales["is_whale"] == True]
non_whale_addrs = whales.index[whales["is_whale"] == False]

print("Number of whales:", len(whale_addrs))
print("Number of non-whale addresses:", len(non_whale_addrs))


Number of whales: 21
Number of non-whale addresses: 7775


In [9]:
# "Activity volume" per address is approximated as inflow plus outflow, using
# the value-based columns already present in the whale table.
address_volume = whales["total_in_value"].add(whales["total_out_value"])
whales["total_volume"] = address_volume

# Aggregate over every address, then over the whale / non-whale subsets.
total_volume_all = address_volume.sum()
total_volume_whales = address_volume.loc[whale_addrs].sum()
total_volume_non_whales = address_volume.loc[non_whale_addrs].sum()

print("Total volume (all addresses):", total_volume_all)
print("Total volume (whales):", total_volume_whales)
print("Total volume (non-whales):", total_volume_non_whales)

# Fall back to 0 when there is no volume at all, to avoid dividing by zero.
if total_volume_all > 0:
    whale_volume_share = total_volume_whales / total_volume_all
else:
    whale_volume_share = 0
print("Share of volume by whales: {:.2%}".format(whale_volume_share))


Total volume (all addresses): 3.1611048982403844e+24
Total volume (whales): 1.2764332383057146e+24
Total volume (non-whales): 1.8846716599346703e+24
Share of volume by whales: 40.38%


In [10]:
# Rank whale addresses by total volume and keep the k largest.
top_k = 20
whale_rows = whales.loc[whale_addrs]
ranked_whales = whale_rows.sort_values("total_volume", ascending=False)
top_whales_by_volume = ranked_whales.head(top_k).copy()

# Fraction of all address-level volume held by those top-k whales
# (0 when there is no volume at all).
if total_volume_all > 0:
    share_top_k = top_whales_by_volume["total_volume"].sum() / total_volume_all
else:
    share_top_k = 0

print(f"Top {top_k} whales (by total volume):")
summary_columns = ["total_volume", "total_in_value", "total_out_value", "n_total_tx"]
display(top_whales_by_volume[summary_columns])

print(f"\nShare of volume captured by top {top_k} whales: {share_top_k:.2%}")


Top 20 whales (by total volume):


Unnamed: 0,total_volume,total_in_value,total_out_value,n_total_tx
0x28c6c06298d514db089934071355e5743bf21d60,3.252496e+23,1.050025e+23,2.202472e+23,900.0
0xe5c248d8d3f3871bd0f68e9c4743459c43bb4e4c,1.822601e+23,9.113012e+22,9.112997e+22,6.0
0xeae7380dd4cef6fbd1144f49e4d1e6964258a4f4,1.124284e+23,4.545459e+22,6.697376e+22,103.0
0xd01607c3c5ecaba394d8be377a08590149325722,1.047843e+23,1.047843e+23,0.0,129.0
0xdfd5293d8e347dfe59e90efd55b2956a1343963d,8.507224e+22,4.385644e+22,4.12158e+22,116.0
0x9696f59e4d72e237be84ffd425dcad154bf96976,8.01513e+22,4.105621e+22,3.909509e+22,110.0
0xf30ba13e4b04ce5dc4d254ae5fa95477800f0eb0,6.308343e+22,2.725734e+22,3.582609e+22,404.0
0xa9d1e08c7793af67e9d92fe308d5697fb81d3e43,5.948422e+22,5.948422e+22,0.0,131.0
0xb5d85cbf7cb3ee0d56b3bb207d5fc4b82f43f511,4.818137e+22,2.381821e+22,2.436315e+22,392.0
0xf8191d98ae98d2f7abdfb63a9b0b812b93c873aa,4.576406e+22,1.816923e+22,2.759483e+22,42.0



Share of volume captured by top 20 whales: 40.36%


## 2.1 Simple Gini Coefficient of Volume

We compute a simple Gini coefficient on `total_volume` to quantify inequality.

> Note: This is an approximation based on observed on-chain volume, not actual balances.

In [11]:
def gini_coefficient(x):
    """
    Compute the Gini coefficient of a collection of non-negative values.

    Parameters
    ----------
    x : array-like
        Values to measure inequality over (list, NumPy array, or Series).
        The input is converted to a flat float array. Entries that fail the
        ``>= 0`` test (negative values and NaN) are dropped before the
        coefficient is computed.

    Returns
    -------
    float
        ``np.nan`` when no non-negative values remain, ``0.0`` when every
        remaining value is zero, otherwise
        ``(2 * sum(i * x_i) - (n + 1) * sum(x)) / (n * sum(x))`` with
        ``x_i`` sorted ascending and ``i`` running from 1 to ``n``.
    """
    values = np.asarray(x, dtype=float).ravel()
    # Boolean indexing returns a fresh array, so the caller's data is never
    # modified by the in-place sort below.
    values = values[values >= 0]
    n = values.size
    if n == 0:
        return np.nan

    # Only the grand total is needed, so use a single sum rather than
    # building a full cumulative-sum array.
    total = values.sum()
    if total == 0:
        # All remaining values are zero (they are non-negative by construction).
        return 0.0

    values.sort()
    ranks = np.arange(1, n + 1)
    return (2 * np.dot(ranks, values) - (n + 1) * total) / (n * total)

# Inequality of total volume across every address, then among whales only.
volume_by_address = whales["total_volume"]
gini_all = gini_coefficient(volume_by_address)
gini_whales_only = gini_coefficient(volume_by_address.loc[whale_addrs])

print("Gini (all addresses, based on total_volume):", gini_all)
print("Gini (whales only, based on total_volume):", gini_whales_only)


Gini (all addresses, based on total_volume): 0.9425902030478785
Gini (whales only, based on total_volume): 0.5731660050611003


## 3. Behavioral Risk Indicators

We define a few simple behavioral indicators:

- **Dump risk**: whales with large recent net outflows  
- **Accumulation risk**: whales with persistent net inflows  
- **Counterparty concentration**: whales relying on a small set of counterparties

These indicators are *heuristic* and should be interpreted qualitatively, not as trading signals.


## 3.1 Daily Net Flow per Whale (Hourly temporal resolution)

We compute daily net flow for each whale:

- Per whale, per day: inflow - outflow
- This enables:
  - Detecting recent dumps
  - Detecting accumulation periods


In [15]:
# Time-series indicators need a timestamp on every transaction.
if "block_timestamp" not in tx.columns:
    raise ValueError("Transaction table must contain 'block_timestamp' for time-series risk indicators.")

# Calendar date of each transaction.
block_times = pd.to_datetime(tx["block_timestamp"])
tx["date"] = block_times.dt.date

# Keep only transactions that touch a whale on either side; the copy makes
# the subset independent of `tx`.
sent_by_whale = tx["from_address"].isin(whale_addrs)
received_by_whale = tx["to_address"].isin(whale_addrs)
whale_related_tx = tx.loc[sent_by_whale | received_by_whale].copy()

whale_related_tx.head()


Unnamed: 0,hash,from_address,to_address,block_number,value,block_timestamp,date,datetime,hour
3,0xa1b7caf05dd498111a40ffe269fefb2ae574dde53da0...,0xe40d548eb4fa4d9188fd21723f2fd377456c0876,0x28c6c06298d514db089934071355e5743bf21d60,23772292,7.999922e+18,2025-11-11 00:00:47+00:00,2025-11-11,2025-11-11 00:00:47+00:00,2025-11-11 00:00:00+00:00
10,0xb46a4f265d89e36dd807074893fe7f5d6eaba6a6826b...,0x06fd4ba7973a0d39a91734bbc35bc2bcaa99e3b0,0x28c6c06298d514db089934071355e5743bf21d60,23772296,1.168418e+19,2025-11-11 00:01:35+00:00,2025-11-11,2025-11-11 00:01:35+00:00,2025-11-11 00:00:00+00:00
19,0x5205bb26ee81af297b28c6144f901730397eeb37d885...,0xa9c61fe59b5702b1d382fd1d5e495887ff34c21d,0x28c6c06298d514db089934071355e5743bf21d60,23772297,6.483657e+19,2025-11-11 00:01:47+00:00,2025-11-11,2025-11-11 00:01:47+00:00,2025-11-11 00:00:00+00:00
33,0x86599b678aa64cbac46faeccaf434ecd44dc227eb57f...,0x0f3c2476fbf0ed09dff00ea7f4ef252dcc72e6f1,0xba3cb449bd2b4adddbc894d8697f5170800eadec,23772306,8.253377e+18,2025-11-11 00:03:35+00:00,2025-11-11,2025-11-11 00:03:35+00:00,2025-11-11 00:00:00+00:00
36,0xf191aab5a4606485f75cdfe57b3d6a8306615688c0f3...,0x6ceb55fecbe094a84c791a4bbf937b6a1928821c,0xba3cb449bd2b4adddbc894d8697f5170800eadec,23772308,1e+19,2025-11-11 00:03:59+00:00,2025-11-11,2025-11-11 00:03:59+00:00,2025-11-11 00:00:00+00:00


In [16]:
# Flag each transaction by whether its sender / receiver is a detected whale.
# `whale_addrs` is the index of rows with is_whale == True, so it is reused
# here instead of rebuilding that same index once per column.
tx["is_sender_whale"] = tx["from_address"].isin(whale_addrs)
tx["is_receiver_whale"] = tx["to_address"].isin(whale_addrs)


In [13]:
# Hourly aggregation needs a timestamp on every transaction.
if "block_timestamp" not in tx.columns:
    raise ValueError("Transaction table must contain 'block_timestamp' for time series analysis.")

# Parse the raw timestamp column into pandas datetimes.
tx["datetime"] = pd.to_datetime(tx["block_timestamp"])

# Truncate each timestamp to the start of its hour (YYYY-MM-DD HH:00:00).
# The lowercase "h" alias is used because the uppercase "H" alias is
# deprecated in recent pandas releases and emits a FutureWarning.
tx["hour"] = tx["datetime"].dt.floor("h")

tx[["block_timestamp", "datetime", "hour"]].head()


  tx["hour"] = tx["datetime"].dt.floor("H")


Unnamed: 0,block_timestamp,datetime,hour
0,2025-11-11 00:00:11+00:00,2025-11-11 00:00:11+00:00,2025-11-11 00:00:00+00:00
1,2025-11-11 00:00:47+00:00,2025-11-11 00:00:47+00:00,2025-11-11 00:00:00+00:00
2,2025-11-11 00:00:47+00:00,2025-11-11 00:00:47+00:00,2025-11-11 00:00:00+00:00
3,2025-11-11 00:00:47+00:00,2025-11-11 00:00:47+00:00,2025-11-11 00:00:00+00:00
4,2025-11-11 00:00:47+00:00,2025-11-11 00:00:47+00:00,2025-11-11 00:00:00+00:00


In [17]:
# Value received by whale addresses, summed per hour.
received_by_whales = tx.loc[tx["is_receiver_whale"]]
hourly_inflow = received_by_whales.groupby("hour")["value"].sum().rename("hourly_inflow")

# Value sent by whale addresses, summed per hour.
sent_by_whales = tx.loc[tx["is_sender_whale"]]
hourly_outflow = sent_by_whales.groupby("hour")["value"].sum().rename("hourly_outflow")

# Align both series on the hour index; an hour missing from one side counts as 0.
hourly_flow = pd.concat([hourly_inflow, hourly_outflow], axis=1).fillna(0)
# Positive net flow means whales received more than they sent in that hour.
hourly_flow["net_flow"] = hourly_flow["hourly_inflow"].sub(hourly_flow["hourly_outflow"])

hourly_flow.head()


Unnamed: 0_level_0,hourly_inflow,hourly_outflow,net_flow
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-11-11 00:00:00+00:00,2.808461e+22,1.352357e+22,1.456104e+22
2025-11-11 01:00:00+00:00,1.267335e+22,8.895051e+21,3.778301e+21
2025-11-11 02:00:00+00:00,5.704697e+22,6.610195e+22,-9.054978e+21
2025-11-11 03:00:00+00:00,6.596419e+22,6.553592e+22,4.282639e+20
2025-11-11 04:00:00+00:00,1.314578e+22,1.248204e+22,6.637377e+20


## 3.2 Plot: Daily (Hourly) Inflow vs Outflow