# Trader Profiling Features Analysis

## Objective
Calculate comprehensive trader-level features using SQL queries to support ML framework for new coin trading.

## Feature Categories
1. **Volume & Scale Features** - Trading volume patterns and sizing
2. **Diversification & Specialization Features** - Multi-coin vs focused trading
3. **Timing & Behavioral Features** - Trading frequency and patterns
4. **Bot-like Behavior Features** - Automated trading indicators
5. **SOL PNL Analysis** - Profit/loss tracking per coin
6. **Non-SOL Trade Analysis** - Token-to-token trades


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Import our Solana data analysis utilities
from solana_eda_utils import SolanaDataAnalyzer, format_large_number, truncate_address

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

# Initialize the data analyzer
print("Initializing Solana Data Analyzer...")
analyzer = SolanaDataAnalyzer()
print("✅ Connected to database!")


Initializing Solana Data Analyzer...
Connected to database: /Volumes/Extreme SSD/DuckDB/solana.duckdb
✅ Connected to database!


## Data Overview
First, let's understand our data structure and scope.


In [2]:
# Data overview query
data_overview_query = """
SELECT 
    COUNT(*) as total_trades,
    COUNT(DISTINCT swapper) as unique_traders,
    COUNT(DISTINCT mint) as unique_coins,
    SUM(CASE WHEN swap_from_mint = 'So11111111111111111111111111111111111111112' THEN 1 ELSE 0 END) as sol_to_token_trades,
    SUM(CASE WHEN swap_to_mint = 'So11111111111111111111111111111111111111112' THEN 1 ELSE 0 END) as token_to_sol_trades,
    COUNT(*) - SUM(CASE WHEN swap_from_mint = 'So11111111111111111111111111111111111111112' OR swap_to_mint = 'So11111111111111111111111111111111111111112' THEN 1 ELSE 0 END) as non_sol_trades
FROM first_day_trades;
"""

print("=== DATA OVERVIEW ===")
print("Executing query...")
data_overview = analyzer.execute_query(data_overview_query)

if data_overview is not None:
    print("\n📊 Dataset Summary:")
    for col in data_overview.columns:
        value = data_overview[col].iloc[0]
        print(f"  {col}: {format_large_number(value)}")
    
    print(f"\n📈 Trade Distribution:")
    total = data_overview['total_trades'].iloc[0]
    sol_to_token = data_overview['sol_to_token_trades'].iloc[0]
    token_to_sol = data_overview['token_to_sol_trades'].iloc[0]
    non_sol = data_overview['non_sol_trades'].iloc[0]
    
    print(f"  SOL → Token: {format_large_number(sol_to_token)} ({sol_to_token/total*100:.1f}%)")
    print(f"  Token → SOL: {format_large_number(token_to_sol)} ({token_to_sol/total*100:.1f}%)")
    print(f"  Token → Token: {format_large_number(non_sol)} ({non_sol/total*100:.1f}%)")
    
    display(data_overview)
else:
    print("❌ Failed to execute query")


=== DATA OVERVIEW ===
Executing query...

📊 Dataset Summary:
  total_trades: 325.2M
  unique_traders: 10.1M
  unique_coins: 5.9K
  sol_to_token_trades: 171.4M
  token_to_sol_trades: 150.0M
  non_sol_trades: 3.8M

📈 Trade Distribution:
  SOL → Token: 171.4M (52.7%)
  Token → SOL: 150.0M (46.1%)
  Token → Token: 3.8M (1.2%)


Unnamed: 0,total_trades,unique_traders,unique_coins,sol_to_token_trades,token_to_sol_trades,non_sol_trades
0,325171663,10060971,5877,171410295.0,150006872.0,3754496.0


## 1. Volume & Scale Features
Calculate volume-based trader characteristics.


In [3]:
volume_scale_features_query = """
WITH trader_volume_stats AS (
    SELECT 
        swapper,
        COUNT(*) as total_trades_count,
        
        -- SOL volume features
        SUM(CASE WHEN swap_from_mint = 'So11111111111111111111111111111111111111112' THEN swap_from_amount ELSE 0 END) as total_sol_spent,
        SUM(CASE WHEN swap_to_mint = 'So11111111111111111111111111111111111111112' THEN swap_to_amount ELSE 0 END) as total_sol_received,
        
        AVG(CASE WHEN swap_from_mint = 'So11111111111111111111111111111111111111112' THEN swap_from_amount ELSE NULL END) as avg_sol_trade_size,
        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY CASE WHEN swap_from_mint = 'So11111111111111111111111111111111111111112' THEN swap_from_amount ELSE NULL END) as median_sol_trade_size,
        MAX(CASE WHEN swap_from_mint = 'So11111111111111111111111111111111111111112' THEN swap_from_amount ELSE 0 END) as max_single_sol_trade,
        MIN(CASE WHEN swap_from_mint = 'So11111111111111111111111111111111111111112' AND swap_from_amount > 0 THEN swap_from_amount ELSE NULL END) as min_sol_trade_size,
        STDDEV(CASE WHEN swap_from_mint = 'So11111111111111111111111111111111111111112' THEN swap_from_amount ELSE NULL END) as sol_trade_size_std_dev
        
    FROM first_day_trades 
    GROUP BY swapper
)
SELECT 
    swapper,
    total_trades_count,
    total_sol_spent,
    total_sol_received,
    ROUND(avg_sol_trade_size, 4) as avg_sol_trade_size,
    ROUND(median_sol_trade_size, 4) as median_sol_trade_size,
    ROUND(max_single_sol_trade, 4) as max_single_sol_trade,
    ROUND(min_sol_trade_size, 4) as min_sol_trade_size,
    ROUND(sol_trade_size_std_dev, 4) as sol_trade_size_std_dev,
    ROUND(sol_trade_size_std_dev / NULLIF(avg_sol_trade_size, 0), 4) as trade_size_coefficient_variation,
    ROUND((total_sol_received - total_sol_spent), 4) as net_sol_pnl
FROM trader_volume_stats
ORDER BY total_sol_spent DESC;
"""

print("=== VOLUME & SCALE FEATURES ===")
print("Features: total_trades, total_sol_spent/received, avg/median/max/min trade sizes, std_dev, coefficient_variation, net_sol_pnl")
print("\nExecuting query...")

volume_features = analyzer.execute_query(volume_scale_features_query)

if volume_features is not None:
    print(f"\n📊 Volume & Scale Analysis Complete!")
    print(f"  Total traders analyzed: {format_large_number(len(volume_features))}")
    
    # Show summary statistics
    print(f"\n💰 SOL Volume Statistics:")
    print(f"  Max total SOL spent: {volume_features['total_sol_spent'].max():.2f} SOL")
    print(f"  Avg total SOL spent: {volume_features['total_sol_spent'].mean():.2f} SOL")
    print(f"  Median total SOL spent: {volume_features['total_sol_spent'].median():.2f} SOL")
    
    print(f"\n📈 Trading Activity:")
    print(f"  Max trades per trader: {format_large_number(volume_features['total_trades_count'].max())}")
    print(f"  Avg trades per trader: {volume_features['total_trades_count'].mean():.1f}")
    
    # Show top 10 traders by volume
    print(f"\n🏆 Top 10 Traders by SOL Volume:")
    top_volume = volume_features.head(10).copy()
    top_volume['swapper'] = top_volume['swapper'].apply(lambda x: truncate_address(x))
    display(top_volume)
    
    # Show distribution of trader types by volume
    volume_ranges = pd.cut(volume_features['total_sol_spent'], 
                          bins=[0, 1, 10, 100, 1000, float('inf')], 
                          labels=['<1 SOL', '1-10 SOL', '10-100 SOL', '100-1K SOL', '>1K SOL'])
    
    print(f"\n📊 Trader Distribution by Volume:")
    vol_dist = volume_ranges.value_counts().sort_index()
    for range_label, count in vol_dist.items():
        pct = count / len(volume_features) * 100
        print(f"  {range_label}: {format_large_number(count)} traders ({pct:.1f}%)")
        
else:
    print("❌ Failed to execute volume features query")


=== VOLUME & SCALE FEATURES ===
Features: total_trades, total_sol_spent/received, avg/median/max/min trade sizes, std_dev, coefficient_variation, net_sol_pnl

Executing query...

📊 Volume & Scale Analysis Complete!
  Total traders analyzed: 10.1M

💰 SOL Volume Statistics:
  Max total SOL spent: 3165753.88 SOL
  Avg total SOL spent: 36.16 SOL
  Median total SOL spent: 0.50 SOL

📈 Trading Activity:
  Max trades per trader: 1.6M
  Avg trades per trader: 32.3

🏆 Top 10 Traders by SOL Volume:


Unnamed: 0,swapper,total_trades_count,total_sol_spent,total_sol_received,avg_sol_trade_size,median_sol_trade_size,max_single_sol_trade,min_sol_trade_size,sol_trade_size_std_dev,trade_size_coefficient_variation,net_sol_pnl
0,arsc4jbD...Zh2y,151632,3165754.0,3209894.0,41.7557,15.4261,2970.0,0.0001,107.4546,2.5734,44139.79
1,HV1KXxWF...qP7K,1279901,1562690.0,24395.16,1.3016,0.1759,2330.7915,0.0,7.563,5.8106,-1538295.0
2,8MqRTAQn...G2VW,105378,1221408.0,1237385.0,23.2609,9.4538,2024.633,0.0001,54.173,2.3289,15976.95
3,AD65fgYt...5WFS,68131,1027367.0,1043267.0,30.1528,10.3493,2970.0,0.0,96.0312,3.1848,15900.14
4,D4zVhwuU...yWhL,74918,859792.8,857947.2,22.5348,9.9653,1295.2484,0.0,48.0948,2.1342,-1845.559
5,4DbAcLDy...zAAe,3203,702116.8,350761.8,326.2624,250.0,1000.0,0.0056,241.3691,0.7398,-351354.9
6,7dGrdJRY...uuUu,697640,637618.7,721256.3,1.8932,0.3019,1189.6604,0.0,7.2112,3.8091,83637.63
7,6LXutJvK...guFx,328211,525590.4,426697.1,3.4634,0.5,919.2716,0.0,13.4353,3.8792,-98893.26
8,4xDsmeTW...du71,328739,519065.2,422216.1,3.4232,0.5,1446.581,0.0,13.633,3.9825,-96849.11
9,6U91aKa8...2tbB,328732,518963.8,435603.9,3.4265,0.5,1411.1939,0.0,13.6375,3.98,-83359.86



📊 Trader Distribution by Volume:
  <1 SOL: 4.9M traders (49.0%)
  1-10 SOL: 2.0M traders (20.3%)
  10-100 SOL: 1.8M traders (18.2%)
  100-1K SOL: 274.6K traders (2.7%)
  >1K SOL: 34.4K traders (0.3%)


In [28]:
volume_features.head()

Unnamed: 0,swapper,total_trades_count,total_sol_spent,total_sol_received,avg_sol_trade_size,median_sol_trade_size,max_single_sol_trade,min_sol_trade_size,sol_trade_size_std_dev,trade_size_coefficient_variation,net_sol_pnl
0,arsc4jbDnzaqcCLByyGo7fg7S2SmcFsWUzQuDtLZh2y,151632,3165754.0,3209894.0,41.7557,15.4261,2970.0,0.0001,107.4546,2.5734,44139.79
1,HV1KXxWFaSeriyFvXyx48FqG9BoFbfinB8njCJonqP7K,1279901,1562690.0,24395.16,1.3016,0.1759,2330.7915,0.0,7.563,5.8106,-1538295.0
2,8MqRTAQnjhDYH7TWS1b1DjFog4CLZfySWE5cZeotG2VW,105378,1221408.0,1237385.0,23.2609,9.4538,2024.633,0.0001,54.173,2.3289,15976.95
3,AD65fgYti96iSSzSPaNazV9Bs29m7JbNomGjG4Cp5WFS,68131,1027367.0,1043267.0,30.1528,10.3493,2970.0,0.0,96.0312,3.1848,15900.14
4,D4zVhwuUsFbcaty7wJhNEZ7VEwPHXQ5d2heXPxM5yWhL,74918,859792.8,857947.2,22.5348,9.9653,1295.2484,0.0,48.0948,2.1342,-1845.559


## 2. Diversification & Specialization Features
Analyze multi-coin trading patterns and specialization.


In [4]:
diversification_features_query = """
       WITH trader_diversification_stats AS (
           SELECT 
               swapper,
               MAX(trade_count) as max_trades_on_single_coin
           FROM (
               SELECT 
                   swapper,
                   mint,
                   COUNT(*) as trade_count
               FROM first_day_trades 
               GROUP BY swapper, mint
           ) coin_trades
           GROUP BY swapper
       ),
       trader_coin_concentration AS (
           SELECT 
               swapper,
               COUNT(DISTINCT mint) as unique_coins_traded,
               COUNT(*) as total_trades,
               SUM(CASE WHEN swap_from_mint = 'So11111111111111111111111111111111111111112' THEN swap_from_amount ELSE 0 END) as total_sol_buy_volume,
               SUM(CASE WHEN swap_to_mint = 'So11111111111111111111111111111111111111112' THEN swap_to_amount ELSE 0 END) as total_sol_sell_volume
           FROM first_day_trades
           GROUP BY swapper
       )
       SELECT 
           t1.swapper,
           t1.unique_coins_traded,
           t1.total_trades,
           ROUND(t1.total_trades::FLOAT / t1.unique_coins_traded, 2) as avg_trades_per_coin,
           ROUND(t2.max_trades_on_single_coin::FLOAT / t1.total_trades, 4) as trade_concentration_ratio
           
       FROM trader_coin_concentration t1 
       INNER JOIN trader_diversification_stats t2 ON t1.swapper = t2.swapper
       ORDER BY t1.unique_coins_traded DESC;
       """

print("=== DIVERSIFICATION & SPECIALIZATION FEATURES ===")
print("Features: unique_coins_traded, avg_trades_per_coin, trade_concentration_ratio")
print("\nExecuting query...")

diversification_features = analyzer.execute_query(diversification_features_query)

if diversification_features is not None:
           print(f"\n📊 Diversification Analysis Complete!")
           print(f"  Total traders analyzed: {format_large_number(len(diversification_features))}")

           # Basic statistics without arbitrary categories
           print(f"\n📈 Diversification Statistics:")
           print(f"  Max coins traded: {diversification_features['unique_coins_traded'].max()}")
           print(f"  Mean coins traded: {diversification_features['unique_coins_traded'].mean():.2f}")
           print(f"  Median coins traded: {diversification_features['unique_coins_traded'].median():.2f}")

           print(f"\n📊 Trade Concentration Statistics:")
           print(f"  Mean concentration ratio: {diversification_features['trade_concentration_ratio'].mean():.4f}")
           print(f"  Median concentration ratio: {diversification_features['trade_concentration_ratio'].median():.4f}")

           # Show top diversified traders
           print(f"\n🏆 Top 10 Most Diversified Traders:")
           top_diversified = diversification_features.head(10).copy()
           top_diversified['swapper'] = top_diversified['swapper'].apply(lambda x: truncate_address(x))
           display(top_diversified[['swapper', 'unique_coins_traded', 'total_trades', 'avg_trades_per_coin', 'trade_concentration_ratio']])

           # Distribution without arbitrary categories - just raw numbers
           print(f"\n📊 Coins Traded Distribution:")
           coins_ranges = pd.cut(diversification_features['unique_coins_traded'],
                                bins=[0, 1, 5, 10, 50, 100, float('inf')],
                                labels=['1 coin', '2-5 coins', '6-10 coins', '11-50 coins', '51-100 coins', '>100 coins'])

           coins_dist = coins_ranges.value_counts().sort_index()
           total_traders = len(diversification_features)
           for range_label, count in coins_dist.items():
               pct = count / total_traders * 100
               print(f"  {range_label}: {format_large_number(count)} traders ({pct:.1f}%)")

else:
           print("❌ Failed to execute diversification features query")

=== DIVERSIFICATION & SPECIALIZATION FEATURES ===
Features: unique_coins_traded, avg_trades_per_coin, trade_concentration_ratio

Executing query...

📊 Diversification Analysis Complete!
  Total traders analyzed: 10.1M

📈 Diversification Statistics:
  Max coins traded: 3321
  Mean coins traded: 4.32
  Median coins traded: 1.00

📊 Trade Concentration Statistics:
  Mean concentration ratio: 0.7374
  Median concentration ratio: 1.0000

🏆 Top 10 Most Diversified Traders:


Unnamed: 0,swapper,unique_coins_traded,total_trades,avg_trades_per_coin,trade_concentration_ratio
0,HV1KXxWF...qP7K,3321,1279901,385.399994,0.023
1,DQeJQ91U...gLLs,3260,5715,1.75,0.0451
2,CaShxDq2...i4xU,3212,7109,2.21,0.0402
3,ZG98FUCj...wmPd,2987,56698,18.98,0.0098
4,5iywveQK...uXWs,2645,6755,2.55,0.0018
5,5YET3Yap...zoPX,2548,284350,111.599998,0.0134
6,ATomG2gR...YoCq,2488,4607,1.85,0.1962
7,2j3MGgjT...142z,2162,13298,6.15,0.0487
8,9nnLbotN...Exn8,1998,327147,163.740005,0.0369
9,6LXutJvK...guFx,1995,328211,164.520004,0.0374



📊 Coins Traded Distribution:
  1 coin: 5.7M traders (56.7%)
  2-5 coins: 2.2M traders (22.2%)
  6-10 coins: 1.1M traders (10.5%)
  11-50 coins: 1.0M traders (10.1%)
  51-100 coins: 39.4K traders (0.4%)
  >100 coins: 15.6K traders (0.2%)


In [29]:
diversification_features.head()

Unnamed: 0,swapper,unique_coins_traded,total_trades,avg_trades_per_coin,trade_concentration_ratio
0,HV1KXxWFaSeriyFvXyx48FqG9BoFbfinB8njCJonqP7K,3321,1279901,385.399994,0.023
1,DQeJQ91Uzcuyk4iAtpQ9FwD8Ddwr62NeWd8hoE1kgLLs,3260,5715,1.75,0.0451
2,CaShxDq2Vbdp2XryjDdUZthbTzwYsvKuH6Knn9pPi4xU,3212,7109,2.21,0.0402
3,ZG98FUCjb8mJ824Gbs6RsgVmr1FhXb2oNiJHa2dwmPd,2987,56698,18.98,0.0098
4,5iywveQKkidqPDKt2CExJcWKex2EXz9kbGcYiZvhuXWs,2645,6755,2.55,0.0018


## 3. Timing & Behavioral Features
Analyze trading frequency and timing patterns.


In [5]:
timing_behavioral_features_query = """
WITH trader_timing_analysis AS (
    SELECT 
        swapper,
        COUNT(*) as total_trades,
        MIN(block_timestamp) as first_trade_time,
        MAX(block_timestamp) as last_trade_time,
        
        -- Trading span and frequency
        EXTRACT(EPOCH FROM (MAX(block_timestamp) - MIN(block_timestamp))) / 86400.0 as trading_span_days,
        CASE WHEN COUNT(*) > 1 THEN 
            EXTRACT(EPOCH FROM (MAX(block_timestamp) - MIN(block_timestamp))) / (COUNT(*) - 1) / 3600.0
        ELSE NULL END as avg_hours_between_trades,
        
        -- Activity concentration
        COUNT(DISTINCT DATE_TRUNC('hour', block_timestamp)) as active_hours,
        COUNT(DISTINCT DATE_TRUNC('day', block_timestamp)) as active_days
    FROM first_day_trades 
    GROUP BY swapper
)
SELECT 
    swapper,
    total_trades,
    ROUND(trading_span_days, 2) as trading_span_days,
    ROUND(total_trades / NULLIF(trading_span_days, 0), 2) as trades_per_day,
    ROUND(avg_hours_between_trades, 4) as avg_hours_between_trades,
    active_hours,
    active_days,
    ROUND(total_trades::FLOAT / active_hours, 2) as trades_per_active_hour
    
FROM trader_timing_analysis
ORDER BY avg_hours_between_trades ASC;
"""

print("=== TIMING & BEHAVIORAL FEATURES ===")
print("Features: trading_span_days, trades_per_day, avg_hours_between_trades, trades_per_active_hour, overall_success_rate")
print("\nExecuting query...")

timing_features = analyzer.execute_query(timing_behavioral_features_query)

if timing_features is not None:
    print(f"\n📊 Timing & Behavioral Analysis Complete!")
    print(f"  Total traders analyzed: {format_large_number(len(timing_features))}")
    
    # Basic timing statistics without arbitrary categories
    print(f"\n⏱️ Timing Statistics:")
    print(f"  Mean hours between trades: {timing_features['avg_hours_between_trades'].mean():.2f}")
    print(f"  Median hours between trades: {timing_features['avg_hours_between_trades'].median():.2f}")
    print(f"  Min hours between trades: {timing_features['avg_hours_between_trades'].min():.2f}")
    print(f"  Max hours between trades: {timing_features['avg_hours_between_trades'].max():.2f}")
    
    print(f"\n📅 Trading Span Statistics:")
    print(f"  Max trading span: {timing_features['trading_span_days'].max():.1f} days")
    print(f"  Mean trading span: {timing_features['trading_span_days'].mean():.1f} days")
    print(f"  Median trading span: {timing_features['trading_span_days'].median():.1f} days")
    
    print(f"\n📊 Activity Statistics:")
    print(f"  Mean trades per active hour: {timing_features['trades_per_active_hour'].mean():.2f}")
    print(f"  Median trades per active hour: {timing_features['trades_per_active_hour'].median():.2f}")
    print(f"  Max trades per active hour: {timing_features['trades_per_active_hour'].max():.0f}")
    
    # Show top traders by activity metrics
    print(f"\n🏆 Top 10 Most Active Traders (by trades per hour):")
    top_active = timing_features.nlargest(10, 'trades_per_active_hour').copy()
    top_active['swapper'] = top_active['swapper'].apply(lambda x: truncate_address(x))
    display(top_active[['swapper', 'total_trades', 'avg_hours_between_trades', 'trades_per_active_hour']])
    
    # Distribution analysis without arbitrary categories
    print(f"\n📊 Hours Between Trades Distribution:")
    hour_ranges = pd.cut(timing_features['avg_hours_between_trades'], 
                        bins=[0, 0.1, 1, 6, 24, float('inf')], 
                        labels=['<0.1h (6min)', '0.1-1h', '1-6h', '6-24h', '>24h'])
    
    hour_dist = hour_ranges.value_counts().sort_index()
    total_traders = len(timing_features)
    for range_label, count in hour_dist.items():
        pct = count / total_traders * 100
        print(f"  {range_label}: {format_large_number(count)} traders ({pct:.1f}%)")
    
    # Trading activity distribution
    print(f"\n📊 Trades Per Day Distribution:")
    tpd_ranges = pd.cut(timing_features['trades_per_day'], 
                       bins=[0, 1, 10, 100, 1000, float('inf')], 
                       labels=['<1/day', '1-10/day', '10-100/day', '100-1K/day', '>1K/day'])
    
    tpd_dist = tpd_ranges.value_counts().sort_index()
    for range_label, count in tpd_dist.items():
        pct = count / total_traders * 100
        print(f"  {range_label}: {format_large_number(count)} traders ({pct:.1f}%)")
        
else:
    print("❌ Failed to execute timing features query")

=== TIMING & BEHAVIORAL FEATURES ===
Features: trading_span_days, trades_per_day, avg_hours_between_trades, trades_per_active_hour, overall_success_rate

Executing query...

📊 Timing & Behavioral Analysis Complete!
  Total traders analyzed: 10.1M

⏱️ Timing Statistics:
  Mean hours between trades: 136.62
  Median hours between trades: 11.49
  Min hours between trades: 0.00
  Max hours between trades: 35824.25

📅 Trading Span Statistics:
  Max trading span: 1509.5 days
  Mean trading span: 30.6 days
  Median trading span: 0.1 days

📊 Activity Statistics:
  Mean trades per active hour: 2.46
  Median trades per active hour: 1.17
  Max trades per active hour: 36027

🏆 Top 10 Most Active Traders (by trades per hour):


Unnamed: 0,swapper,total_trades,avg_hours_between_trades,trades_per_active_hour
469013,XcqrBC8t...FQCW,36027,0.0,36027.0
129150,3KaM5Emu...iyrV,71568,0.0,35784.0
273420,7yHVix14...H3eF,67836,0.0,33918.0
369269,BWdmnN9o...FBMm,199512,0.0,33252.0
171888,FaAzWH3e...Faoz,749525,0.0,32588.039062
12576,FySmHczi...M3g1,149144,0.0,29828.800781
381004,4YqaqpNA...D8KU,113297,0.0,28324.25
27222,BFbwwfnY...vPd9,56197,0.0,28098.5
639685,F9rjfoq3...qBT2,1033028,0.0016,26487.900391
526218,E5xR917V...ZgHs,50537,0.0,25268.5



📊 Hours Between Trades Distribution:
  <0.1h (6min): 752.4K traders (7.5%)
  0.1-1h: 915.7K traders (9.1%)
  1-6h: 841.5K traders (8.4%)
  6-24h: 929.4K traders (9.2%)
  >24h: 2.9M traders (28.9%)

📊 Trades Per Day Distribution:
  <1/day: 2.8M traders (28.0%)
  1-10/day: 1.3M traders (13.1%)
  10-100/day: 997.3K traders (9.9%)
  100-1K/day: 726.4K traders (7.2%)
  >1K/day: 490.7K traders (4.9%)


In [30]:
timing_features.head()

Unnamed: 0,swapper,total_trades,trading_span_days,trades_per_day,avg_hours_between_trades,active_hours,active_days,trades_per_active_hour
0,osjLrEkw3w8g5sniGP4vJTyesvCFZuyut3FTQc9wMGr,2,0.0,,0.0,1,1,2.0
1,9872P9Hp2C3KdPVt8oHe6o335AvQAgs5gLkVtSpMwC6m,2,0.0,,0.0,1,1,2.0
2,E2PUoXMzzaWwVQ7q5hc82p6B1uSjyxNWormusTKrqLt2,3,0.0,,0.0,1,1,3.0
3,6zJRGVm7BTmnHoQHwaqHZ15ZqvWZT3rjqXLEuJhFxudC,3,0.0,,0.0,1,1,3.0
4,HKSUvKsg6mFfQ2i1Vb7xTVj72g3E2GQfUqphioTE8iEr,3,0.0,,0.0,1,1,3.0


In [6]:
bot_behavior_features_query = """
WITH trader_bot_analysis AS (
    SELECT 
        swapper,
        COUNT(*) as total_trades,
        
        -- Round number preferences (hardcoded common round amounts)
  SUM(CASE WHEN swap_from_mint = 'So11111111111111111111111111111111111111112'
            AND swap_from_amount IN (
                -- Small round amounts
                0.01, 0.05, 0.1, 0.2, 0.25, 0.5, 0.75,
                -- Whole numbers 1-10
                1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0,
                -- Common trading amounts
                15.0, 20.0, 25.0, 50.0, 75.0, 100.0,
                -- Larger round amounts  
                150.0, 200.0, 250.0, 500.0, 750.0, 1000.0,
                -- Very large round amounts
                1500.0, 2000.0, 2500.0, 5000.0, 10000.0
            )
      THEN 1 ELSE 0 END)::FLOAT /
  NULLIF(SUM(CASE WHEN swap_from_mint = 'So11111111111111111111111111111111111111112' THEN 1 ELSE 0 END), 0) as round_number_preference
        
    FROM first_day_trades 
    GROUP BY swapper
)
SELECT 
    swapper,
    total_trades,
    round_number_preference
FROM trader_bot_analysis
ORDER BY total_trades DESC;
"""

print("=== BOT-LIKE BEHAVIOR FEATURES ===")
print("Features: round_number_preference, trade_size_cv")
print("\nExecuting query...")

bot_features = analyzer.execute_query(bot_behavior_features_query)

if bot_features is not None:
    print(f"\n📊 Bot Behavior Analysis Complete!")
    print(f"  Total traders analyzed: {format_large_number(len(bot_features))}")
    print(bot_features.head)
        
else:
    print("❌ Failed to execute bot behavior features query")


=== BOT-LIKE BEHAVIOR FEATURES ===
Features: round_number_preference, trade_size_cv

Executing query...

📊 Bot Behavior Analysis Complete!
  Total traders analyzed: 10.1M
<bound method NDFrame.head of                                                swapper  total_trades  \
0         HzuK5PCN6gi8gaKHZwRMhXS4sJiHyUFM3dtBHXLykVQU       1599922   
1         HV1KXxWFaSeriyFvXyx48FqG9BoFbfinB8njCJonqP7K       1279901   
2         F9rjfoq3YAz1e7Sn2Q5p64u9BTosqQEvjFNg8RgnqBT2       1033028   
3         FaAzWH3ehYrH84B5zPS1wtmf1jLDNFFimpTMbUD8Faoz        749525   
4         7dGrdJRYtsNR8UYxZ3TnifXGjGc9eRYLq9sELwYpuuUu        697640   
...                                                ...           ...   
10060967  DChiTnL1yyCcu545GAVQXtpgQA1tkRz9GkVmUCVbd38w             1   
10060968  CdQFEEaTPdm4rpNHzW3mNZi1FVKf5YjTcaFENYBvnKdQ             1   
10060969  3nDEBj15EmyjWvVpFdBUoyS6HjHtksqNbuh6pRbQDxAy             1   
10060970  FvhGyXwYwmZnxCWdhn9C1u9JRuT2zWzBH5RSFA2mXGzp             1   
1006097

In [31]:
bot_features.head()

Unnamed: 0,swapper,total_trades,round_number_preference
0,HzuK5PCN6gi8gaKHZwRMhXS4sJiHyUFM3dtBHXLykVQU,1599922,0.0
1,HV1KXxWFaSeriyFvXyx48FqG9BoFbfinB8njCJonqP7K,1279901,0.00183
2,F9rjfoq3YAz1e7Sn2Q5p64u9BTosqQEvjFNg8RgnqBT2,1033028,0.0
3,FaAzWH3ehYrH84B5zPS1wtmf1jLDNFFimpTMbUD8Faoz,749525,3e-06
4,7dGrdJRYtsNR8UYxZ3TnifXGjGc9eRYLq9sELwYpuuUu,697640,0.000609


In [7]:
sol_pnl_per_coin_query = """
  -- SOL PNL tracking per trader per coin
  WITH trader_coin_flows AS (
      SELECT
          swapper,
          mint,

          -- SOL flows per coin
          SUM(CASE WHEN swap_from_mint = 'So11111111111111111111111111111111111111112' THEN swap_from_amount ELSE 0 END) as sol_spent_on_coin,
          SUM(CASE WHEN swap_to_mint = 'So11111111111111111111111111111111111111112' THEN swap_to_amount ELSE 0 END) as sol_received_from_coin,

          -- Trade counts per coin
          COUNT(CASE WHEN swap_from_mint = 'So11111111111111111111111111111111111111112' THEN 1 END) as buy_trades,
          COUNT(CASE WHEN swap_to_mint = 'So11111111111111111111111111111111111111112' THEN 1 END) as sell_trades,
          COUNT(*) as total_coin_trades,

          -- Timing on this coin
          MIN(block_timestamp) as first_trade_on_coin,
          MAX(block_timestamp) as last_trade_on_coin

      FROM first_day_trades
      GROUP BY swapper, mint
  )
  SELECT
      swapper,
      mint,
      ROUND(sol_spent_on_coin, 4) as sol_spent_on_coin,
      ROUND(sol_received_from_coin, 4) as sol_received_from_coin,
      buy_trades,
      sell_trades,
      total_coin_trades,
      ROUND((sol_received_from_coin - sol_spent_on_coin), 4) as net_sol_pnl_per_coin,
      CASE WHEN sol_spent_on_coin > 0 THEN
          ROUND((sol_received_from_coin - sol_spent_on_coin) / sol_spent_on_coin, 4)
      ELSE NULL END as roi_on_coin,
      ROUND(EXTRACT(EPOCH FROM (last_trade_on_coin - first_trade_on_coin)) / 3600.0, 2) as hours_active_on_coin

  FROM trader_coin_flows
  ORDER BY ABS(sol_received_from_coin - sol_spent_on_coin) DESC;
  """

print("=== SOL PNL PER COIN ANALYSIS ===")
print("Features: sol_spent/received per coin, buy/sell trades, net_pnl, roi, trading_pattern, hours_active")
print("\nExecuting query (this may take a moment for large datasets)...")


sol_pnl_features = analyzer.execute_query(sol_pnl_per_coin_query)

if sol_pnl_features is not None:
      print(f"\n📊 SOL PNL Per Coin Analysis Complete!")
      print(f"  Total trader-coin combinations analyzed: {format_large_number(len(sol_pnl_features))}")

      # Overall PNL statistics
      total_pnl = sol_pnl_features['net_sol_pnl_per_coin'].sum()
      positive_pnl = sol_pnl_features[sol_pnl_features['net_sol_pnl_per_coin'] > 0]
      negative_pnl = sol_pnl_features[sol_pnl_features['net_sol_pnl_per_coin'] < 0]

      print(f"\n💰 Overall PNL Statistics:")
      print(f"  Total net PNL: {total_pnl:.2f} SOL")
      print(f"  Profitable positions: {format_large_number(len(positive_pnl))} ({len(positive_pnl)/len(sol_pnl_features)*100:.1f}%)")
      print(f"  Losing positions: {format_large_number(len(negative_pnl))} ({len(negative_pnl)/len(sol_pnl_features)*100:.1f}%)")

      if len(positive_pnl) > 0:
          print(f"  Mean profit per winning position: {positive_pnl['net_sol_pnl_per_coin'].mean():.4f} SOL")
          print(f"  Max single position profit: {positive_pnl['net_sol_pnl_per_coin'].max():.2f} SOL")

      if len(negative_pnl) > 0:
          print(f"  Mean loss per losing position: {negative_pnl['net_sol_pnl_per_coin'].mean():.4f} SOL")
          print(f"  Max single position loss: {negative_pnl['net_sol_pnl_per_coin'].min():.2f} SOL")

      # ROI analysis (for positions with spending)
      roi_data = sol_pnl_features[sol_pnl_features['roi_on_coin'].notna()]
      if len(roi_data) > 0:
          print(f"\n📊 ROI Analysis (positions with SOL spent):")
          print(f"  Positions with ROI data: {format_large_number(len(roi_data))}")
          print(f"  Mean ROI: {roi_data['roi_on_coin'].mean()*100:.2f}%")
          print(f"  Median ROI: {roi_data['roi_on_coin'].median()*100:.2f}%")
          print(f"  Positive ROI positions: {len(roi_data[roi_data['roi_on_coin'] > 0])} ({len(roi_data[roi_data['roi_on_coin'] > 
  0])/len(roi_data)*100:.1f}%)")

      # Top profit/loss positions
      print(f"\n🏆 Top 5 Most Profitable Positions:")
      top_profits = sol_pnl_features.nlargest(5, 'net_sol_pnl_per_coin').copy()
      top_profits['swapper'] = top_profits['swapper'].apply(lambda x: truncate_address(x))
      top_profits['mint'] = top_profits['mint'].apply(lambda x: truncate_address(x))
      display(top_profits[['swapper', 'mint', 'sol_spent_on_coin', 'sol_received_from_coin', 'net_sol_pnl_per_coin', 'roi_on_coin']])

      print(f"\n💸 Top 5 Biggest Losses:")
      top_losses = sol_pnl_features.nsmallest(5, 'net_sol_pnl_per_coin').copy()
      top_losses['swapper'] = top_losses['swapper'].apply(lambda x: truncate_address(x))
      top_losses['mint'] = top_losses['mint'].apply(lambda x: truncate_address(x))
      display(top_losses[['swapper', 'mint', 'sol_spent_on_coin', 'sol_received_from_coin', 'net_sol_pnl_per_coin', 'roi_on_coin']])

else:
      print("❌ Failed to execute SOL PNL features query")

=== SOL PNL PER COIN ANALYSIS ===
Features: sol_spent/received per coin, buy/sell trades, net_pnl, roi, trading_pattern, hours_active

Executing query (this may take a moment for large datasets)...

📊 SOL PNL Per Coin Analysis Complete!
  Total trader-coin combinations analyzed: 43.4M

💰 Overall PNL Statistics:
  Total net PNL: -5829123.14 SOL
  Profitable positions: 13.7M (31.5%)
  Losing positions: 27.9M (64.2%)
  Mean profit per winning position: 4.8752 SOL
  Max single position profit: 51111.85 SOL
  Mean loss per losing position: -2.5984 SOL
  Max single position loss: -147471.01 SOL

📊 ROI Analysis (positions with SOL spent):
  Positions with ROI data: 40.6M
  Mean ROI: 1859374.56%
  Median ROI: -40.04%
  Positive ROI positions: 11337309 (27.9%)

🏆 Top 5 Most Profitable Positions:


Unnamed: 0,swapper,mint,sol_spent_on_coin,sol_received_from_coin,net_sol_pnl_per_coin,roi_on_coin
4,AupTbxAr...jrWU,xyzR4s6H...8M1P,6000.0,57111.8536,51111.8536,8.5186
7,DLcw9YVY...ZV7R,5z3EqYQo...mrRC,0.0,37248.1004,37248.1004,
9,7dGrdJRY...uuUu,Bo9jh3ws...vUsU,27915.4176,57644.267,29728.8494,1.065
11,MfDuWeqS...GVWa,SonicxvL...dDES,996.5677,26818.633,25822.0653,25.911
13,AMd5bXpf...zsjT,A8bcY1eS...CNDL,2715.0,26003.6578,23288.6578,8.5778



💸 Top 5 Biggest Losses:


Unnamed: 0,swapper,mint,sol_spent_on_coin,sol_received_from_coin,net_sol_pnl_per_coin,roi_on_coin
0,4DbAcLDy...zAAe,PSG1RJpL...Yfeo,294671.0287,147200.0172,-147471.0115,-0.5005
1,4DbAcLDy...zAAe,LiNgojrW...Dqof,228000.0,113909.8603,-114090.1397,-0.5004
2,4DbAcLDy...zAAe,Pain8Ljd...j86J,179445.7268,89651.955,-89793.7718,-0.5004
3,HV1KXxWF...qP7K,ECY31gWw...pump,58609.7887,1716.4313,-56893.3574,-0.9707
5,HV1KXxWF...qP7K,Bo9jh3ws...vUsU,48527.5693,298.3224,-48229.2469,-0.9939


In [32]:
sol_pnl_features.head()

Unnamed: 0,swapper,mint,sol_spent_on_coin,sol_received_from_coin,buy_trades,sell_trades,total_coin_trades,net_sol_pnl_per_coin,roi_on_coin,hours_active_on_coin
0,4DbAcLDyhCLX7rKPx55xTQA6D8w2poSg3xwW6NzozAAe,PSG1RJpLVmHPwNZm7kP7UrDByYPUHzh6Q4ffA3TYfeo,294671.0287,147200.0172,1286,614,1900,-147471.0115,-0.5005,2.02
1,4DbAcLDyhCLX7rKPx55xTQA6D8w2poSg3xwW6NzozAAe,LiNgojrWAuWjsLTHghPwi23b46bMDUQhwmg5aWkDqof,228000.0,113909.8603,228,134,362,-114090.1397,-0.5004,0.37
2,4DbAcLDyhCLX7rKPx55xTQA6D8w2poSg3xwW6NzozAAe,Pain8LjdMXzL1CLMwy2H6cvdUNCDiWgCWwYBYyrj86J,179445.7268,89651.955,638,303,941,-89793.7718,-0.5004,0.67
3,HV1KXxWFaSeriyFvXyx48FqG9BoFbfinB8njCJonqP7K,ECY31gWwxy4s2VnMkYhmqDkrV75KrwR2yTtsnrnSpump,58609.7887,1716.4313,16263,248,22900,-56893.3574,-0.9707,22.69
4,AupTbxArPau5H97izWurgska1hEvFNrYM1U8Yy9ijrWU,xyzR4s6H724bUq6q7MTqWxUnhi8LM5fiKKUq38h8M1P,6000.0,57111.8536,1,19,20,51111.8536,8.5186,0.64


In [8]:
non_sol_trades_query = """
-- Non-SOL trades analysis (as requested)
WITH trader_trade_classification AS (
    SELECT 
        swapper,
        COUNT(*) as total_trades,
        
        -- Trade type classification
        COUNT(CASE WHEN swap_from_mint = 'So11111111111111111111111111111111111111112' THEN 1 END) as sol_to_token_trades,
        COUNT(CASE WHEN swap_to_mint = 'So11111111111111111111111111111111111111112' THEN 1 END) as token_to_sol_trades,
        COUNT(CASE WHEN swap_from_mint != 'So11111111111111111111111111111111111111112' 
                     AND swap_to_mint != 'So11111111111111111111111111111111111111112' THEN 1 END) as token_to_token_trades,
        
        -- Token diversity in non-SOL trades
        COUNT(DISTINCT CASE WHEN swap_from_mint != 'So11111111111111111111111111111111111111112' 
                             AND swap_to_mint != 'So11111111111111111111111111111111111111112' 
                        THEN swap_from_mint END) as unique_from_tokens_non_sol,
        COUNT(DISTINCT CASE WHEN swap_from_mint != 'So11111111111111111111111111111111111111112' 
                             AND swap_to_mint != 'So11111111111111111111111111111111111111112' 
                        THEN swap_to_mint END) as unique_to_tokens_non_sol,

        
    FROM first_day_trades
    GROUP BY swapper
)
SELECT 
    swapper,
    total_trades,
    sol_to_token_trades,
    token_to_sol_trades,
    token_to_token_trades,
    unique_from_tokens_non_sol,
    unique_to_tokens_non_sol,
    
    -- Calculate trade distribution percentages
    ROUND(sol_to_token_trades::FLOAT / total_trades, 4) as sol_to_token_percentage,
    ROUND(token_to_sol_trades::FLOAT / total_trades, 4) as token_to_sol_percentage,
    ROUND(token_to_token_trades::FLOAT / total_trades, 4) as token_to_token_percentage,
    
    -- Buy/sell balance
    CASE WHEN token_to_sol_trades > 0 THEN
        ROUND(sol_to_token_trades::FLOAT / token_to_sol_trades, 4) 
    ELSE NULL END as buy_sell_ratio

FROM trader_trade_classification
ORDER BY token_to_token_trades DESC;
"""

print("=== NON-SOL TRADE ANALYSIS ===") 
print("Features: trade type counts/percentages, buy_sell_ratio, success rates by type, trading_style, arbitrage indicators")
print("\nExecuting query...")

non_sol_features = analyzer.execute_query(non_sol_trades_query)

if non_sol_features is not None:
      print(f"\n📊 Non-SOL Trade Analysis Complete!")
      print(f"  Total traders analyzed: {format_large_number(len(non_sol_features))}")

      # Trade type statistics
      total_sol_to_token = non_sol_features['sol_to_token_trades'].sum()
      total_token_to_sol = non_sol_features['token_to_sol_trades'].sum()
      total_token_to_token = non_sol_features['token_to_token_trades'].sum()
      total_all_trades = total_sol_to_token + total_token_to_sol + total_token_to_token

      print(f"\n📊 Overall Trade Type Distribution:")
      print(f"  SOL → Token: {format_large_number(total_sol_to_token)} ({total_sol_to_token/total_all_trades*100:.1f}%)")
      print(f"  Token → SOL: {format_large_number(total_token_to_sol)} ({total_token_to_sol/total_all_trades*100:.1f}%)")
      print(f"  Token → Token: {format_large_number(total_token_to_token)} ({total_token_to_token/total_all_trades*100:.1f}%)")

      # Token arbitrage analysis
      arbitrageurs = non_sol_features[non_sol_features['token_to_token_trades'] > 0]
      high_diversity = non_sol_features[non_sol_features['unique_from_tokens_non_sol'] + non_sol_features['unique_to_tokens_non_sol'] > 10]

      print(f"\n🔄 Token Trading Activity:")
      print(f"  Token arbitrageurs: {format_large_number(len(arbitrageurs))} ({len(arbitrageurs)/len(non_sol_features)*100:.1f}%)")
      print(f"  High token diversity: {format_large_number(len(high_diversity))} ({len(high_diversity)/len(non_sol_features)*100:.1f}%)")

      # Token diversity analysis
      print(f"\n📊 Token Diversity Statistics:")
      print(f"  Mean unique tokens (from): {non_sol_features['unique_from_tokens_non_sol'].mean():.2f}")
      print(f"  Mean unique tokens (to): {non_sol_features['unique_to_tokens_non_sol'].mean():.2f}")
      print(f"  Max unique tokens (from): {non_sol_features['unique_from_tokens_non_sol'].max()}")
      print(f"  Max unique tokens (to): {non_sol_features['unique_to_tokens_non_sol'].max()}")

      # Top token traders
      if len(arbitrageurs) > 0:
          print(f"\n🏆 Top 5 Token-to-Token Traders:")
          top_arb = arbitrageurs.nlargest(5, 'token_to_token_trades').copy()
          top_arb['swapper'] = top_arb['swapper'].apply(lambda x: truncate_address(x))
          display(top_arb[['swapper', 'total_trades', 'token_to_token_trades', 'token_to_token_percentage', 'unique_from_tokens_non_sol',
  'unique_to_tokens_non_sol']])

else:
      print("❌ Failed to execute non-SOL trades features query")


=== NON-SOL TRADE ANALYSIS ===
Features: trade type counts/percentages, buy_sell_ratio, success rates by type, trading_style, arbitrage indicators

Executing query...

📊 Non-SOL Trade Analysis Complete!
  Total traders analyzed: 10.1M

📊 Overall Trade Type Distribution:
  SOL → Token: 171.4M (52.7%)
  Token → SOL: 150.0M (46.1%)
  Token → Token: 3.8M (1.2%)

🔄 Token Trading Activity:
  Token arbitrageurs: 631.2K (6.3%)
  High token diversity: 6.8K (0.1%)

📊 Token Diversity Statistics:
  Mean unique tokens (from): 0.10
  Mean unique tokens (to): 0.09
  Max unique tokens (from): 685
  Max unique tokens (to): 659

🏆 Top 5 Token-to-Token Traders:


Unnamed: 0,swapper,total_trades,token_to_token_trades,token_to_token_percentage,unique_from_tokens_non_sol,unique_to_tokens_non_sol
0,CapuXNQo...LVps,332370,70277,0.2114,550,565
1,GGztQqQ6...jgSJ,329347,69567,0.2112,553,550
2,2MFoS3MP...Cj3h,328425,69331,0.2111,550,560
3,6U91aKa8...2tbB,328732,69033,0.21,551,558
4,BQ72nSv9...GQDV,328435,68965,0.21,551,555


In [33]:
non_sol_features.head()

Unnamed: 0,swapper,total_trades,sol_to_token_trades,token_to_sol_trades,token_to_token_trades,unique_from_tokens_non_sol,unique_to_tokens_non_sol,sol_to_token_percentage,token_to_sol_percentage,token_to_token_percentage,buy_sell_ratio
0,CapuXNQoDviLvU1PxFiizLgPNQCxrsag1uMeyk6zLVps,332370,153128,108965,70277,550,565,0.4607,0.3278,0.2114,1.4053
1,GGztQqQ6pCPaJQnNpXBgELr5cs3WwDakRbh1iEMzjgSJ,329347,151986,107794,69567,553,550,0.4615,0.3273,0.2112,1.41
2,2MFoS3MPtvyQ4Wh4M9pdfPjz6UhVoNbFbGJAskCPCj3h,328425,151145,107949,69331,550,560,0.4602,0.3287,0.2111,1.4002
3,6U91aKa8pmMxkJwBCfPTmUEfZi6dHe7DcFq2ALvB2tbB,328732,151457,108242,69033,551,558,0.4607,0.3293,0.21,1.3992
4,BQ72nSv9f3PRyRKCBnHLVrerrv37CYTHm5h3s9VSGQDV,328435,151778,107692,68965,551,555,0.4621,0.3279,0.21,1.4094


# Consolidate features and store

In [10]:
def consolidate_trader_features():
      """
      Merge all trader-level features into one comprehensive DataFrame
      """
      print("=== CONSOLIDATING TRADER FEATURES ===")

      # Start with volume features as base (has all traders)
      print("Starting with volume features as base...")
      consolidated_features = volume_features.copy()

      # Merge diversification features
      print("Merging diversification features...")
      consolidated_features = consolidated_features.merge(
          diversification_features[['swapper', 'unique_coins_traded', 'avg_trades_per_coin',
  'trade_concentration_ratio']],
          on='swapper',
          how='left'
      )

      # Merge timing features
      print("Merging timing features...")
      consolidated_features = consolidated_features.merge(
          timing_features[['swapper', 'trading_span_days', 'trades_per_day', 'avg_hours_between_trades',
                          'active_hours', 'active_days', 'trades_per_active_hour']],
          on='swapper',
          how='left'
      )

      # Merge bot features
      print("Merging bot features...")
      consolidated_features = consolidated_features.merge(
          bot_features[['swapper', 'round_number_preference']],
          on='swapper',
          how='left'
      )

      # Merge non-SOL trade features
      print("Merging non-SOL trade features...")
      consolidated_features = consolidated_features.merge(
          non_sol_features[['swapper', 'sol_to_token_trades', 'token_to_sol_trades',
  'token_to_token_trades',
                           'unique_from_tokens_non_sol', 'unique_to_tokens_non_sol',
  'sol_to_token_percentage',
                           'token_to_sol_percentage', 'token_to_token_percentage', 'buy_sell_ratio']],
          on='swapper',
          how='left'
      )

      print(f"\n📊 Consolidated Features Complete!")
      print(f"  Total traders: {format_large_number(len(consolidated_features))}")
      print(f"  Total features: {len(consolidated_features.columns)}")
      print(f"  Memory usage: {consolidated_features.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

      # Show feature summary
      print(f"\n📈 Feature Categories:")
      volume_cols = [col for col in consolidated_features.columns if any(x in col.lower() for x in
  ['sol_spent', 'sol_received', 'trade_size', 'pnl'])]
      timing_cols = [col for col in consolidated_features.columns if any(x in col.lower() for x in
  ['hours', 'days', 'span'])]
      diversity_cols = [col for col in consolidated_features.columns if any(x in col.lower() for x in
  ['coins', 'concentration', 'token'])]

      print(f"  Volume & Scale: {len(volume_cols)} features")
      print(f"  Timing & Behavior: {len(timing_cols)} features")
      print(f"  Diversification: {len(diversity_cols)} features")
      print(f"  Bot Detection: 1 feature")

      return consolidated_features

  # Execute consolidation
consolidated_features = consolidate_trader_features()

=== CONSOLIDATING TRADER FEATURES ===
Starting with volume features as base...
Merging diversification features...
Merging timing features...
Merging bot features...
Merging non-SOL trade features...

📊 Consolidated Features Complete!
  Total traders: 10.1M
  Total features: 30
  Memory usage: 2810.8 MB

📈 Feature Categories:
  Volume & Scale: 8 features
  Timing & Behavior: 4 features
  Diversification: 10 features
  Bot Detection: 1 feature


In [11]:
consolidated_features.head()

Unnamed: 0,swapper,total_trades_count,total_sol_spent,total_sol_received,avg_sol_trade_size,median_sol_trade_size,max_single_sol_trade,min_sol_trade_size,sol_trade_size_std_dev,trade_size_coefficient_variation,net_sol_pnl,unique_coins_traded,avg_trades_per_coin,trade_concentration_ratio,trading_span_days,trades_per_day,avg_hours_between_trades,active_hours,active_days,trades_per_active_hour,round_number_preference,sol_to_token_trades,token_to_sol_trades,token_to_token_trades,unique_from_tokens_non_sol,unique_to_tokens_non_sol,sol_to_token_percentage,token_to_sol_percentage,token_to_token_percentage,buy_sell_ratio
0,arsc4jbDnzaqcCLByyGo7fg7S2SmcFsWUzQuDtLZh2y,151632,3165754.0,3209894.0,41.7557,15.4261,2970.0,0.0001,107.4546,2.5734,44139.79,264.0,574.359985,0.0304,52.29,2899.6,0.0083,921,52,164.639999,0.0,75816,75816,0,0,0,0.5,0.5,0.0,1.0
1,HV1KXxWFaSeriyFvXyx48FqG9BoFbfinB8njCJonqP7K,1279901,1562690.0,24395.16,1.3016,0.1759,2330.7915,0.0,7.563,5.8106,-1538295.0,3321.0,385.399994,0.023,401.49,3187.87,0.0075,5301,326,241.449997,0.00183,1200601,10386,68914,41,345,0.938,0.0081,0.0538,115.598
2,8MqRTAQnjhDYH7TWS1b1DjFog4CLZfySWE5cZeotG2VW,105378,1221408.0,1237385.0,23.2609,9.4538,2024.633,0.0001,54.173,2.3289,15976.95,1349.0,78.120003,0.0087,113.04,932.23,0.0257,2220,114,47.470001,0.0,52509,52869,0,0,0,0.4983,0.5017,0.0,0.9932
3,AD65fgYti96iSSzSPaNazV9Bs29m7JbNomGjG4Cp5WFS,68131,1027367.0,1043267.0,30.1528,10.3493,2970.0,0.0,96.0312,3.1848,15900.14,553.0,123.199997,0.0319,152.79,445.93,0.0538,1662,134,40.990002,0.0,34072,34059,0,0,0,0.5001,0.4999,0.0,1.0004
4,D4zVhwuUsFbcaty7wJhNEZ7VEwPHXQ5d2heXPxM5yWhL,74918,859792.8,857947.2,22.5348,9.9653,1295.2484,0.0,48.0948,2.1342,-1845.559,1270.0,58.990002,0.0242,344.14,217.69,0.1102,2771,229,27.040001,0.034885,38154,36747,17,4,5,0.5093,0.4905,0.0002,1.0383


In [13]:
def create_features_database(consolidated_features, sol_pnl_features):
      """
      Simple function to create a new DuckDB database and insert consolidated features
      """
      import duckdb
      import os

      # Database path
      features_db_path = '/Volumes/Extreme SSD/DuckDB/solana.duckdb'

      # Remove existing database if it exists
      if os.path.exists(features_db_path):
          os.remove(features_db_path)
          print(f"Removed existing database")

      # Create new connection
      con = duckdb.connect(features_db_path)

      # Insert DataFrames directly
      con.execute("CREATE TABLE trader_features AS SELECT * FROM consolidated_features")
      con.execute("CREATE TABLE trader_coin_performance AS SELECT * FROM sol_pnl_features")

      # Add basic indexes
      con.execute("CREATE INDEX idx_trader_swapper ON trader_features(swapper)")
      con.execute("CREATE INDEX idx_coin_swapper_mint ON trader_coin_performance(swapper, mint)")

      # Verify
      trader_count = con.execute("SELECT COUNT(*) FROM trader_features").fetchone()[0]
      coin_count = con.execute("SELECT COUNT(*) FROM trader_coin_performance").fetchone()[0]

      con.close()

      print(f"✅ Database created: {features_db_path}")
      print(f"   Trader features: {format_large_number(trader_count)} records")
      print(f"   Trader-coin performance: {format_large_number(coin_count)} records")

      return features_db_path

def connect_to_features_db():
      """
      Connect to the features database
      """
      import duckdb
      return duckdb.connect('/Volumes/Extreme SSD/DuckDB/solana.duckdb')

  # Execute
analyzer.close()
features_db_path = create_features_database(consolidated_features, sol_pnl_features)

✅ Database created: /Volumes/Extreme SSD/DuckDB/solana.duckdb
   Trader features: 10.1M records
   Trader-coin performance: 43.4M records


In [14]:
import duckdb
def quick_verify():
      """
      Quick verification of key metrics
      """
      con = duckdb.connect('/Volumes/Extreme SSD/DuckDB/solana.duckdb')

      print("=== QUICK VERIFICATION ===")

      # Basic counts
      trader_count = con.execute("SELECT COUNT(*) FROM trader_features").fetchone()[0]
      coin_count = con.execute("SELECT COUNT(*) FROM trader_coin_performance").fetchone()[0]

      # Key stats
      stats = con.execute("""
          SELECT 
              AVG(total_sol_spent) as avg_volume,
              MAX(total_sol_spent) as max_volume,
              AVG(unique_coins_traded) as avg_coins,
              MAX(unique_coins_traded) as max_coins
          FROM trader_features
      """).fetchone()

      print(f"Traders: {format_large_number(trader_count)}")
      print(f"Trader-coin pairs: {format_large_number(coin_count)}")
      print(f"Avg volume: {stats[0]:.2f} SOL")
      print(f"Max volume: {stats[1]:.2f} SOL")
      print(f"Avg coins per trader: {stats[2]:.1f}")
      print(f"Max coins per trader: {stats[3]}")

      con.close()

  # Quick check
quick_verify()

=== QUICK VERIFICATION ===
Traders: 10.1M
Trader-coin pairs: 43.4M
Avg volume: 36.16 SOL
Max volume: 3165753.88 SOL
Avg coins per trader: 4.3
Max coins per trader: 3321.0
