# Trader Clustering Model

Build clustering model using:
1. First day trading data: `trader_features` and `trader_coin_performance` 
2. First two hours trading data: `trader_coin_features`

Goal: Identify trader patterns to predict behavior when new coins launch

For this notebook: check the data quality and prepare the data for clustering

In [1]:
import sys
sys.path.append('/Users/noel/projects/trading_eda/solana')

from solana_eda_utils import SolanaDataAnalyzer
import pandas as pd
import numpy as np
import duckdb

# Connect to database
con = duckdb.connect('/Volumes/Extreme SSD/DuckDB/solana.duckdb', read_only=True)
print("Database connection established")

Database connection established


## 1. Basic Data Check - Available Tables

In [2]:
# Check available tables
tables = con.execute('SHOW TABLES').fetchdf()
print("Available tables:")
print(tables)

# Check table sizes
print("\nTable sizes:")
for table in tables['name']:
    count = con.execute(f'SELECT COUNT(*) as count FROM {table}').fetchdf()
    print(f"{table}: {count.iloc[0]['count']:,} rows")

Available tables:
                                   name
0                  coin_first_two_hours
1                      first_day_trades
2  trader_coin_first_two_hours_features
3               trader_coin_performance
4                       trader_features

Table sizes:
coin_first_two_hours: 133,394,160 rows
first_day_trades: 325,171,663 rows
trader_coin_first_two_hours_features: 17,585,219 rows
trader_coin_performance: 43,441,089 rows
trader_features: 10,060,972 rows


## 2. Examine First Day Trading Data

In [3]:
# Check trader_features table structure
trader_features_info = con.execute('DESCRIBE trader_features').fetchdf()
print("trader_features table structure:")
print(trader_features_info)

# Sample data
trader_features_sample = con.execute('SELECT * FROM trader_features LIMIT 5').fetchdf()
print("\ntrader_features sample:")
print(trader_features_sample)

trader_features table structure:
                         column_name column_type null   key default extra
0                            swapper     VARCHAR  YES  None    None  None
1                 total_trades_count      BIGINT  YES  None    None  None
2                    total_sol_spent      DOUBLE  YES  None    None  None
3                 total_sol_received      DOUBLE  YES  None    None  None
4                 avg_sol_trade_size      DOUBLE  YES  None    None  None
5              median_sol_trade_size      DOUBLE  YES  None    None  None
6               max_single_sol_trade      DOUBLE  YES  None    None  None
7                 min_sol_trade_size      DOUBLE  YES  None    None  None
8             sol_trade_size_std_dev      DOUBLE  YES  None    None  None
9   trade_size_coefficient_variation      DOUBLE  YES  None    None  None
10                       net_sol_pnl      DOUBLE  YES  None    None  None
11               unique_coins_traded      DOUBLE  YES  None    None  None
12   

In [4]:
# Check trader_coin_performance table structure  
performance_info = con.execute('DESCRIBE trader_coin_performance').fetchdf()
print("trader_coin_performance table structure:")
print(performance_info)

# Sample data
performance_sample = con.execute('SELECT * FROM trader_coin_performance LIMIT 5').fetchdf()
print("\ntrader_coin_performance sample:")
print(performance_sample)

trader_coin_performance table structure:
              column_name column_type null   key default extra
0                 swapper     VARCHAR  YES  None    None  None
1                    mint     VARCHAR  YES  None    None  None
2       sol_spent_on_coin      DOUBLE  YES  None    None  None
3  sol_received_from_coin      DOUBLE  YES  None    None  None
4              buy_trades      BIGINT  YES  None    None  None
5             sell_trades      BIGINT  YES  None    None  None
6       total_coin_trades      BIGINT  YES  None    None  None
7    net_sol_pnl_per_coin      DOUBLE  YES  None    None  None
8             roi_on_coin      DOUBLE  YES  None    None  None
9    hours_active_on_coin      DOUBLE  YES  None    None  None

trader_coin_performance sample:
                                        swapper  \
0  4DbAcLDyhCLX7rKPx55xTQA6D8w2poSg3xwW6NzozAAe   
1  4DbAcLDyhCLX7rKPx55xTQA6D8w2poSg3xwW6NzozAAe   
2  4DbAcLDyhCLX7rKPx55xTQA6D8w2poSg3xwW6NzozAAe   
3  HV1KXxWFaSeriyFvXyx48FqG9B

## 3. Examine First Two Hours Trading Data

In [5]:
# Check if trader_coin_features table exists (from previous notebook)
try:
    coin_features_info = con.execute('DESCRIBE trader_coin_first_two_hours_features').fetchdf()
    print("trader_coin_features table structure:")
    print(coin_features_info)
    
    # Sample data
    coin_features_sample = con.execute('SELECT * FROM trader_coin_first_two_hours_features LIMIT 5').fetchdf()
    print("\ntrader_coin_features sample:")
    print(coin_features_sample)
    
except Exception as e:
    print(f"trader_coin_first_two_hours_features table not found: {e}")
    print("Need to create this table from the previous notebook first")

trader_coin_features table structure:
                   column_name column_type null   key default extra
0                    trader_id     VARCHAR  YES  None    None  None
1                      coin_id     VARCHAR  YES  None    None  None
2                  trade_count      BIGINT  YES  None    None  None
3              trades_per_hour      DOUBLE  YES  None    None  None
4            time_span_minutes      DOUBLE  YES  None    None  None
5          total_volume_traded      DOUBLE  YES  None    None  None
6               avg_trade_size      DOUBLE  YES  None    None  None
7                trade_size_cv      DOUBLE  YES  None    None  None
8           largest_trade_size      DOUBLE  YES  None    None  None
9         volume_concentration      DOUBLE  YES  None    None  None
10        unique_trading_pairs      BIGINT  YES  None    None  None
11       sol_involvement_ratio       FLOAT  YES  None    None  None
12                   buy_ratio       FLOAT  YES  None    None  None
13        

In [6]:
# Create trader-level PnL features from coin performance data
pnl_features_query = """
SELECT 
    swapper,
    COUNT(*) as total_positions,
    
    -- Win rate
    ROUND(SUM(CASE WHEN net_sol_pnl_per_coin > 0 THEN 1 ELSE 0 END)::FLOAT / COUNT(*), 4) as win_rate,
    
    -- Average PnL per position
    ROUND(AVG(net_sol_pnl_per_coin), 4) as avg_pnl_per_position,
    
    -- Average ROI (only for positions with actual spending)
    ROUND(AVG(CASE WHEN roi_on_coin IS NOT NULL THEN roi_on_coin END), 4) as avg_roi,

    sum(buy_trades)::FLOAT/sum(sell_trades)::FLOAT as overall_buy_sell_ratio
    
FROM trader_coin_performance 
GROUP BY swapper
ORDER BY avg_pnl_per_position DESC;
"""

print("Creating trader-level PnL features...")
trader_pnl_features = con.execute(pnl_features_query).fetchdf()

print(f"Generated features for {len(trader_pnl_features):,} traders")
print("\nSample data:")
print(trader_pnl_features.head(10))

Creating trader-level PnL features...
Generated features for 10,060,972 traders

Sample data:
                                        swapper  total_positions  win_rate  \
0  DLcw9YVYTsgBbAPBfXKV8JS6KRXo6VgLFDM5o1ydZV7R                1       1.0   
1  AMd5bXpf5wh3wrKkDa4c18jNo147WBFsj55rvJvXzsjT                1       1.0   
2  Gwc7ApjwTaFHC6HB7cWEFjsqaayZoAqdvSsiTwbfZv8m                1       1.0   
3  2mzEJFRa9UXvkLBeJMH16j6GUyDxPLnkDVoUUt1MLeej                1       1.0   
4  6Qcx9qWCb8UPZv6R6i4j8CWzxrEYtsodPTNKrZzG7M2q                1       1.0   
5  GUw1Av3EGLMtGJUUY4RgQpR4SR8N2EMyowtD8vuNs8ab                1       1.0   
6  9Tqaa1V36pBHtNXrZS4xrLru8HbV38pRRyh4YxR6bxUw                1       1.0   
7  CHj3vHyMhF6DF3VkwhzgK833o7uvsN7CrPVyUdmbFo5E                1       1.0   
8  9DtTbUgdzFoKvaM7ALpB8LhYHf4Q7ggjphsBHotREj8z                1       1.0   
9  HiiqhjDF8hgeBbnbZNdkyjx6Wvy2oefvh85GokojbRnN                1       1.0   

   avg_pnl_per_position    avg_roi  overall_buy

In [7]:
# Merge trader_features with trader_pnl_features
print("Merging trader features with PnL features...")

# Load trader_features from database
trader_features = con.execute('SELECT * FROM trader_features').fetchdf()
print(f"Loaded {len(trader_features):,} traders from trader_features")

# Merge the datasets
merged_features = trader_features.merge(
    trader_pnl_features[['swapper', 'total_positions', 'win_rate', 'avg_pnl_per_position', 'avg_roi']],
    on='swapper',
    how='left'
)

print(f"Merged dataset: {len(merged_features):,} traders with {len(merged_features.columns)} features")
print(f"Features added: total_positions, win_rate, avg_pnl_per_position, avg_roi")

# Check for missing values in new features
print("\nMissing values in new PnL features:")
pnl_cols = ['total_positions', 'win_rate', 'avg_pnl_per_position', 'avg_roi']
for col in pnl_cols:
    missing = merged_features[col].isna().sum()
    print(f"  {col}: {missing:,} missing ({missing/len(merged_features)*100:.1f}%)")

print("\nSample of merged data:")
print(merged_features.head())

Merging trader features with PnL features...
Loaded 10,060,972 traders from trader_features
Loaded 10,060,972 traders from trader_features
Merged dataset: 10,060,972 traders with 34 features
Features added: total_positions, win_rate, avg_pnl_per_position, avg_roi

Missing values in new PnL features:
  total_positions: 0 missing (0.0%)
  win_rate: 0 missing (0.0%)
  avg_pnl_per_position: 0 missing (0.0%)
  avg_roi: 944,706 missing (9.4%)

Sample of merged data:
                                        swapper  total_trades_count  \
0   arsc4jbDnzaqcCLByyGo7fg7S2SmcFsWUzQuDtLZh2y              151632   
1  HV1KXxWFaSeriyFvXyx48FqG9BoFbfinB8njCJonqP7K             1279901   
2  8MqRTAQnjhDYH7TWS1b1DjFog4CLZfySWE5cZeotG2VW              105378   
3  AD65fgYti96iSSzSPaNazV9Bs29m7JbNomGjG4Cp5WFS               68131   
4  D4zVhwuUsFbcaty7wJhNEZ7VEwPHXQ5d2heXPxM5yWhL               74918   

   total_sol_spent  total_sol_received  avg_sol_trade_size  \
0     3.165754e+06        3.209894e+06        

## 4. Data Preparation Plan for All 34 Features

Based on the comprehensive feature set from trader_feature_engineering.md, here's a detailed plan for preparing all 34 trader-level features for clustering analysis:

### Feature Categories & Transformation Strategy

#### 1. Volume & Scale Features (11 features)
**High skew, need log transformation**
- `total_trades_count`: Log1p transform (handles 0 values)
- `total_sol_spent`, `total_sol_received`: Log1p transform 
- `avg_sol_trade_size`, `median_sol_trade_size`: Log1p transform
- `max_single_sol_trade`: Log1p transform (captures whale behavior)
- `min_sol_trade_size`: Log1p transform (keeps 0 values as-is, log1p handles them naturally)
- `sol_trade_size_std_dev`: Log1p transform
- `trade_size_coefficient_variation`: Already normalized ratio, StandardScaler
- `net_sol_pnl`: Handle negative values → Signed log transform

#### 2. Diversification Features (3 features) 
**Count-based, use sqrt transformation**
- `unique_coins_traded`: Sqrt transform (reduces right skew)
- `avg_trades_per_coin`: Log1p transform
- `trade_concentration_ratio`: Already 0-1 bounded, StandardScaler

#### 3. Timing & Behavioral Features (6 features)
**Mixed scales and distributions**
- `trading_span_days`: Log1p transform (wide range: 0-1509 days)
- `trades_per_day`: Log1p transform (high variance)
- `avg_hours_between_trades`: Fill NULL with (max_value + 1), then Log1p transform + binary indicator `is_multi_trader`
- `active_hours`, `active_days`: Sqrt transform (count data)
- `trades_per_active_hour`: Log1p transform (0-36027 range)

#### 4. Bot Detection Features (1 feature)
**Already normalized**
- `round_number_preference`: 0-1 bounded, StandardScaler only

#### 5. Non-SOL Trade Features (10 features)
**Mix of counts and percentages**
- `sol_to_token_trades`, `token_to_sol_trades`, `token_to_token_trades`: Log1p transform
- `unique_from_tokens_non_sol`, `unique_to_tokens_non_sol`: Sqrt transform
- `sol_to_token_percentage`, `token_to_sol_percentage`, `token_to_token_percentage`: Already 0-1, StandardScaler
- `buy_sell_ratio`: Log1p transform (can be very large)

#### 6. Aggregated Performance Features (4 features)
**Mixed distributions, some with missing values**
- `total_positions`: Log1p transform
- `win_rate`: Already 0-1 bounded, StandardScaler
- `avg_pnl_per_position`: Signed log transform (handles negatives)
- `avg_roi`: Fill missing with (max_value + 1), then StandardScaler + binary indicator `has_buy_history`

### Missing Value Strategy

#### 1. Insider Trade Detection (`avg_roi`)
- **Missing values (9.4%)**: These are insider trades (sell-only, no buy cost basis)
- **Solution**: 
  - Fill missing with `max(avg_roi) + 1` to clearly separate from normal range
  - Create binary feature: `has_buy_history` (1 = normal trader, 0 = insider/airdrop)
  - Apply StandardScaler to filled values

#### 2. Single-Trade Account Handling (`avg_hours_between_trades`)
- **NULL values**: Single-trade accounts (can't calculate interval)
- **Solution**:
  - Fill NULL with `max(avg_hours_between_trades) + 1` 
  - Create binary feature: `is_multi_trader` (1 = multiple trades, 0 = single trade)
  - Apply Log1p transform then StandardScaler

#### 3. Zero Value Handling (`min_sol_trade_size`)
- **Zero values**: Dust trades, airdrops, MEV residuals - valid behavioral signal
- **Solution**: Keep 0 values, apply Log1p transform (handles 0 naturally)
- **No binary indicator needed**: 0 is meaningful information about trading behavior

### Final Pipeline Steps
- Original 34 features + 2 binary indicators = **36 features total**
- Apply transformations → StandardScaler for final normalization
- All features remain numerical (no categorical encoding complexity)

In [9]:
# Step 3: Apply transformations according to the plan
print("\n3. Applying feature transformations...")
df_prep = merged_features.copy()
# Helper function for signed log transform (handles negative values)
def signed_log_transform(x):
    return np.sign(x) * np.log1p(np.abs(x))

# 1. Volume & Scale Features - Log1p transform (except coefficient of variation)
volume_log_features = [
    'total_trades_count', 'total_sol_spent', 'total_sol_received',
    'avg_sol_trade_size', 'median_sol_trade_size', 'max_single_sol_trade',
    'min_sol_trade_size', 'sol_trade_size_std_dev'
]

for feature in volume_log_features:
    if feature in df_prep.columns:
        df_prep[feature] = np.log1p(df_prep[feature])

# Handle net_sol_pnl with signed log transform
if 'net_sol_pnl' in df_prep.columns:
    df_prep['net_sol_pnl'] = signed_log_transform(df_prep['net_sol_pnl'])

print(f"   Applied log1p transform to {len(volume_log_features)} volume features + signed log to net_sol_pnl")

# 2. Diversification Features - Mixed transforms
if 'unique_coins_traded' in df_prep.columns:
    df_prep['unique_coins_traded'] = np.sqrt(df_prep['unique_coins_traded'])
if 'avg_trades_per_coin' in df_prep.columns:
    df_prep['avg_trades_per_coin'] = np.log1p(df_prep['avg_trades_per_coin'])
# trade_concentration_ratio stays as-is (already 0-1 bounded)

print("   Applied sqrt to unique_coins_traded, log1p to avg_trades_per_coin")

# 3. Timing Features - Mixed transforms  
timing_log_features = [
    'trading_span_days', 'trades_per_day', 'avg_hours_between_trades', 'trades_per_active_hour'
]
timing_sqrt_features = ['active_hours', 'active_days']

for feature in timing_log_features:
    if feature in df_prep.columns:
        df_prep[feature] = np.log1p(df_prep[feature])

for feature in timing_sqrt_features:
    if feature in df_prep.columns:
        df_prep[feature] = np.sqrt(df_prep[feature])

print(f"   Applied log1p to {len(timing_log_features)} timing features, sqrt to {len(timing_sqrt_features)} count features")

# 4. round_number_preference stays as-is (already 0-1 bounded)

# 5. Non-SOL Trade Features - Mixed transforms
trade_log_features = [
    'sol_to_token_trades', 'token_to_sol_trades', 'token_to_token_trades', 'buy_sell_ratio'
]
trade_sqrt_features = ['unique_from_tokens_non_sol', 'unique_to_tokens_non_sol']

for feature in trade_log_features:
    if feature in df_prep.columns:
        df_prep[feature] = np.log1p(df_prep[feature])

for feature in trade_sqrt_features:
    if feature in df_prep.columns:
        df_prep[feature] = np.sqrt(df_prep[feature])

print(f"   Applied log1p to {len(trade_log_features)} trade count features, sqrt to token diversity features")
# sol_to_token_percentage, token_to_sol_percentage, token_to_token_percentage stay as-is (0-1 bounded)

# 6. Performance Features - Mixed transforms
if 'total_positions' in df_prep.columns:
    df_prep['total_positions'] = np.log1p(df_prep['total_positions'])
if 'avg_pnl_per_position' in df_prep.columns:
    df_prep['avg_pnl_per_position'] = signed_log_transform(df_prep['avg_pnl_per_position'])
# win_rate and avg_roi stay as-is for now (will be standardized)

print("   Applied log1p to total_positions, signed log to avg_pnl_per_position")
print("   Transformations completed.")

print(f"\nTransformed dataset shape: {df_prep.shape}")
print(f"Final feature count: {len(df_prep.columns)} (34 original + 2 binary indicators)")


3. Applying feature transformations...
   Applied log1p transform to 8 volume features + signed log to net_sol_pnl
   Applied sqrt to unique_coins_traded, log1p to avg_trades_per_coin
   Applied log1p to 4 timing features, sqrt to 2 count features
   Applied log1p to 4 trade count features, sqrt to token diversity features
   Applied log1p to total_positions, signed log to avg_pnl_per_position
   Transformations completed.

Transformed dataset shape: (10060972, 34)
Final feature count: 34 (34 original + 2 binary indicators)


In [10]:
# Implementation of Data Preparation Plan
import numpy as np
from sklearn.preprocessing import StandardScaler

print("=== IMPLEMENTING DATA PREPARATION PLAN ===")
#print(f"Starting with {len(merged_features)} traders and {len(merged_features.columns)} features")

# Create a copy for processing
#df_prep = merged_features.copy()

# Remove the swapper ID column for processing
#df_prep = df_prep.drop('swapper', axis=1)

print(f"\nProcessing {len(df_prep.columns)} features...")

# Step 1: Handle missing values with max+1 strategy and create binary indicators
print("\n1. Handling missing values...")

# avg_roi: Fill missing with max+1, create binary indicator
avg_roi_max = df_prep['avg_roi'].max()
df_prep['has_buy_history'] = (~df_prep['avg_roi'].isna()).astype(int)
df_prep['avg_roi'] = df_prep['avg_roi'].fillna(avg_roi_max + 1)
print(f"   avg_roi: filled {(~merged_features['avg_roi'].notna()).sum():,} missing values with {avg_roi_max + 1:.2f}")

# avg_hours_between_trades: Fill NULL with max+1, create binary indicator  
hours_max = df_prep['avg_hours_between_trades'].max()
df_prep['is_multi_trader'] = (~df_prep['avg_hours_between_trades'].isna()).astype(int)
df_prep['avg_hours_between_trades'] = df_prep['avg_hours_between_trades'].fillna(hours_max + 1)
print(f"   avg_hours_between_trades: filled {(~merged_features['avg_hours_between_trades'].notna()).sum():,} missing values with {hours_max + 1:.2f}")

# Check for any remaining missing values
remaining_missing = df_prep.isnull().sum().sum()
print(f"   Remaining missing values: {remaining_missing}")

print(f"\nAfter missing value handling: {len(df_prep.columns)} features")
print(f"Added binary indicators: has_buy_history, is_multi_trader")

=== IMPLEMENTING DATA PREPARATION PLAN ===

Processing 34 features...

1. Handling missing values...
   avg_roi: filled 944,706 missing values with 498888926625.00
   avg_hours_between_trades: filled 3,110,879 missing values with 11.49
   Remaining missing values: 20925350

After missing value handling: 36 features
Added binary indicators: has_buy_history, is_multi_trader


## 6. Further data normalization for larger numerical ones

Some of the numerical features may have extreme values, which caused clustering model training warnings, we should normalize them.

In [11]:
df_prep

Unnamed: 0,swapper,total_trades_count,total_sol_spent,total_sol_received,avg_sol_trade_size,median_sol_trade_size,max_single_sol_trade,min_sol_trade_size,sol_trade_size_std_dev,trade_size_coefficient_variation,...,sol_to_token_percentage,token_to_sol_percentage,token_to_token_percentage,buy_sell_ratio,total_positions,win_rate,avg_pnl_per_position,avg_roi,has_buy_history,is_multi_trader
0,arsc4jbDnzaqcCLByyGo7fg7S2SmcFsWUzQuDtLZh2y,11.929218,14.967902,14.981749,3.755503,2.798872,7.996654,0.0001,4.686332,2.5734,...,0.5000,0.5000,0.0000,0.693147,5.579730,1.0000,5.125131,1.520000e-01,1,1
1,HV1KXxWFaSeriyFvXyx48FqG9BoFbfinB8njCJonqP7K,14.062294,14.261920,10.102181,0.833605,0.162034,7.754392,0.0000,2.147451,5.8106,...,0.9380,0.0081,0.0538,4.758732,8.108322,0.0018,-6.140320,-9.922000e-01,1,1
2,8MqRTAQnjhDYH7TWS1b1DjFog4CLZfySWE5cZeotG2VW,11.565319,14.015515,14.028511,3.188866,2.346966,7.613638,0.0001,4.010474,2.3289,...,0.4983,0.5017,0.0000,0.689741,7.207860,0.9978,2.552838,1.188000e-01,1,1
3,AD65fgYti96iSSzSPaNazV9Bs29m7JbNomGjG4Cp5WFS,11.129202,13.842510,13.857868,3.438904,2.429156,7.996654,0.0000,4.575033,3.1848,...,0.5001,0.4999,0.0000,0.693347,6.317165,0.9964,3.392913,2.088910e+01,1,1
4,D4zVhwuUsFbcaty7wJhNEZ7VEwPHXQ5d2heXPxM5yWhL,11.224163,13.664448,13.662299,3.158480,2.394736,7.167230,0.0000,3.893753,2.1342,...,0.5093,0.4905,0.0002,0.712116,7.147559,0.4976,-0.897393,-8.850000e-02,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10060967,8R4bAxT8sPTo9H3du5o44ZjEnZbGaokdEhpXpu33nhWe,2.302585,0.000000,0.887258,,,0.000000,,,,...,0.0000,1.0000,0.0000,0.000000,0.693147,1.0000,0.887274,4.988889e+11,0,1
10060968,6iFf3f46xz8AxUwWobQEdXBkEaf9wMoae8WpBF3fLQxV,2.302585,0.000000,0.900102,,,0.000000,,,,...,0.0000,1.0000,0.0000,0.000000,0.693147,1.0000,0.900121,4.988889e+11,0,1
10060969,6Uh4ADbZk9x4NPzCTQHAAGeLhD2rZQvhFshWjmLigv3j,1.945910,0.000000,0.804676,,,0.000000,,,,...,0.0000,1.0000,0.0000,0.000000,0.693147,1.0000,0.804689,4.988889e+11,0,1
10060970,7UD1qAyK2YE9N5c9vZn8Zg3Pxke6njfrgdjBjTqMrH2g,2.079442,0.000000,0.754282,,,0.000000,,,,...,0.0000,1.0000,0.0000,0.000000,0.693147,1.0000,0.754289,4.988889e+11,0,1


In [None]:
# Save the prepared features to pickle for memory-efficient loading
# Clean up df_prep for clustering - remove ID column and handle remaining missing values
print("=== PREPARING FINAL CLUSTERING DATASET ===")

  # Handle remaining missing values by filling with 0 (these are from insider traders with no buy history)
cluster_features = df_prep.fillna(0)

print(f"Clustering dataset shape: {cluster_features.shape}")
print(f"Remaining missing values: {cluster_features.isnull().sum().sum()}")

  # Select only numeric features for clustering (all should be numeric at this point)
numeric_cols = cluster_features.select_dtypes(include=[np.number]).columns
cluster_features_final = cluster_features[numeric_cols]

print(f"Final feature set: {len(cluster_features_final.columns)} numeric features")
print(f"Features: {list(cluster_features_final.columns)}")

  # Save the processed features to pickle for memory-efficient loading
import pickle

path = '/Volumes/Extreme SSD/trading_data/solana/trader_features/'

  # save merged_features
with open(path + 'merged_features.pkl', 'wb') as f:
      pickle.dump(merged_features, f)

  # Save cluster-ready features 
with open(path + 'cluster_features.pkl', 'wb') as f:
      pickle.dump(cluster_features_final, f)

  # Save trader IDs separately
trader_ids = df_prep['swapper'].copy()
with open(path + 'trader_ids.pkl', 'wb') as f:
      pickle.dump(trader_ids, f)

print(f"\nSaved cluster_features.pkl ({cluster_features_final.shape[0]:,} rows × {cluster_features_final.shape[1]} cols)")
print(f"Saved trader_ids.pkl ({len(trader_ids):,} trader IDs)")

  # Memory usage estimation
memory_gb = (cluster_features_final.memory_usage(deep=True).sum()) / (1024**3)
print(f"Estimated memory usage: {memory_gb:.2f} GB")

=== PREPARING FINAL CLUSTERING DATASET ===
Clustering dataset shape: (10060972, 36)
Remaining missing values: 0
Final feature set: 35 numeric features
Features: ['total_trades_count', 'total_sol_spent', 'total_sol_received', 'avg_sol_trade_size', 'median_sol_trade_size', 'max_single_sol_trade', 'min_sol_trade_size', 'sol_trade_size_std_dev', 'trade_size_coefficient_variation', 'net_sol_pnl', 'unique_coins_traded', 'avg_trades_per_coin', 'trade_concentration_ratio', 'trading_span_days', 'trades_per_day', 'avg_hours_between_trades', 'active_hours', 'active_days', 'trades_per_active_hour', 'round_number_preference', 'sol_to_token_trades', 'token_to_sol_trades', 'token_to_token_trades', 'unique_from_tokens_non_sol', 'unique_to_tokens_non_sol', 'sol_to_token_percentage', 'token_to_sol_percentage', 'token_to_token_percentage', 'buy_sell_ratio', 'total_positions', 'win_rate', 'avg_pnl_per_position', 'avg_roi', 'has_buy_history', 'is_multi_trader']

Saved cluster_features.pkl (10,060,972 rows 

In [14]:
path = '/Volumes/Extreme SSD/trading_data/solana/trader_features/'

In [15]:
# Load the prepared features from pickle files
import pickle
from sklearn.preprocessing import StandardScaler

print("=== LOADING PREPARED DATA ===")

# Load cluster features
with open(path + 'cluster_features.pkl', 'rb') as f:
    cluster_features = pickle.load(f)

# Load trader IDs
with open(path + 'trader_ids.pkl', 'rb') as f:
    trader_ids = pickle.load(f)

print(f"Loaded cluster features: {cluster_features.shape}")
print(f"Loaded trader IDs: {len(trader_ids):,}")

# Apply StandardScaler
print("\n=== APPLYING STANDARDSCALER ===")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(cluster_features)

print(f"Scaled features shape: {X_scaled.shape}")
print(f"Feature means after scaling: {X_scaled.mean(axis=0)[:5]}")  # Show first 5 means (should be ~0)
print(f"Feature std after scaling: {X_scaled.std(axis=0)[:5]}")    # Show first 5 stds (should be ~1)

# Memory usage check
memory_gb = (X_scaled.nbytes) / (1024**3)
print(f"Scaled data memory usage: {memory_gb:.2f} GB")

=== LOADING PREPARED DATA ===
Loaded cluster features: (10060972, 35)
Loaded trader IDs: 10,060,972

=== APPLYING STANDARDSCALER ===
Scaled features shape: (10060972, 35)
Feature means after scaling: [0.00000000e+00 9.25678527e-17 9.25678527e-17 0.00000000e+00
 4.62839264e-17]
Feature std after scaling: [1. 1. 1. 1. 1.]
Scaled data memory usage: 2.62 GB


In [16]:
# DIAGNOSE NUMERICAL ISSUES
print("=== DIAGNOSING NUMERICAL ISSUES ===")

# Check for problematic values in the original features
print("1. Checking for problematic values in cluster_features:")
print(f"   Shape: {cluster_features.shape}")

# Check for infinite values
inf_cols = []
for col in cluster_features.columns:
    inf_count = np.isinf(cluster_features[col]).sum()
    if inf_count > 0:
        inf_cols.append((col, inf_count))

if inf_cols:
    print(f"\n   Infinite values found in {len(inf_cols)} columns:")
    for col, count in inf_cols:
        print(f"     {col}: {count:,} infinite values")
else:
    print("   ✓ No infinite values found")

# Check for NaN values
nan_cols = []
for col in cluster_features.columns:
    nan_count = cluster_features[col].isna().sum()
    if nan_count > 0:
        nan_cols.append((col, nan_count))

if nan_cols:
    print(f"\n   NaN values found in {len(nan_cols)} columns:")
    for col, count in nan_cols:
        print(f"     {col}: {count:,} NaN values")
else:
    print("   ✓ No NaN values found")

# Check for extremely large values
print("\n2. Checking for extremely large values:")
large_value_cols = []
for col in cluster_features.columns:
    max_val = cluster_features[col].max()
    if max_val > 1e10:  # Values larger than 10 billion
        large_value_cols.append((col, max_val))

if large_value_cols:
    print(f"   Large values found in {len(large_value_cols)} columns:")
    for col, max_val in large_value_cols:
        print(f"     {col}: max = {max_val:.2e}")
else:
    print("   ✓ No extremely large values found")

# Check for zero variance columns
print("\n3. Checking for zero/near-zero variance columns:")
zero_var_cols = []
for col in cluster_features.columns:
    var = cluster_features[col].var()
    if var < 1e-10:  # Essentially zero variance
        zero_var_cols.append((col, var))

if zero_var_cols:
    print(f"   Zero variance columns found: {len(zero_var_cols)}")
    for col, var in zero_var_cols:
        print(f"     {col}: variance = {var:.2e}")
else:
    print("   ✓ No zero variance columns found")

# Sample statistics for problematic columns
if large_value_cols:
    print(f"\n4. Detailed statistics for problematic columns:")
    for col, _ in large_value_cols[:3]:  # Show first 3 problematic columns
        col_data = cluster_features[col]
        print(f"\n   {col}:")
        print(f"     Min: {col_data.min():.2e}")
        print(f"     Max: {col_data.max():.2e}")
        print(f"     Mean: {col_data.mean():.2e}")
        print(f"     Std: {col_data.std():.2e}")
        print(f"     >1e9 values: {(col_data > 1e9).sum():,}")
        print(f"     >1e12 values: {(col_data > 1e12).sum():,}")

=== DIAGNOSING NUMERICAL ISSUES ===
1. Checking for problematic values in cluster_features:
   Shape: (10060972, 35)
   ✓ No infinite values found
   ✓ No NaN values found

2. Checking for extremely large values:
   ✓ No extremely large values found

3. Checking for zero/near-zero variance columns:
   ✓ No zero variance columns found


# One time run for normalizing features

In [None]:
# No need to run this cell because we already have a fixed version of cluster_features
# SMART FIX FOR INSIDER ROI PROBLEM
print("=== SMART FIX FOR INSIDER ROI PROBLEM ===")

# Create a properly cleaned version of cluster_features
cluster_features_fixed = cluster_features.copy()

print("1. Fixing the avg_roi insider trader issue...")

# Strategy: Replace the unrealistic 498+ billion ROI with a reasonable marker value
# that still preserves the insider trader signal but doesn't break clustering

# For insider traders (has_buy_history=0), replace extreme ROI with a reasonable marker
insider_mask = cluster_features_fixed['has_buy_history'] == 0
regular_mask = cluster_features_fixed['has_buy_history'] == 1

# Calculate a reasonable marker value based on regular trader ROI distribution
regular_roi_values = cluster_features_fixed.loc[regular_mask, 'avg_roi']
p99_regular_roi = regular_roi_values.quantile(0.99)
p95_regular_roi = regular_roi_values.quantile(0.95)

# Use 10x the 99th percentile as the insider marker (clearly separates but reasonable)
insider_marker_value = max(10 * p99_regular_roi, 1000)  # At least 1000x ROI

print(f"   Regular trader 99th percentile ROI: {p99_regular_roi:.2f}")
print(f"   Using insider marker value: {insider_marker_value:.2f}")
print(f"   Original insider ROI value: {cluster_features_fixed.loc[insider_mask, 'avg_roi'].iloc[0]:.2e}")

# Replace insider ROI values
cluster_features_fixed.loc[insider_mask, 'avg_roi'] = insider_marker_value

print(f"   Fixed {insider_mask.sum():,} insider trader ROI values")

# 2. Cap extreme regular trader ROI values using robust method
print(f"\n2. Capping extreme regular trader ROI values...")

# For regular traders, cap extreme outliers using IQR method
regular_roi_data = cluster_features_fixed.loc[regular_mask, 'avg_roi']
Q1 = regular_roi_data.quantile(0.25)
Q3 = regular_roi_data.quantile(0.75)
IQR = Q3 - Q1

# More conservative bounds for financial data
lower_bound = Q1 - 3 * IQR
upper_bound = Q3 + 3 * IQR

# Apply capping only to regular traders
extreme_regular_mask = regular_mask & (
    (cluster_features_fixed['avg_roi'] < lower_bound) | 
    (cluster_features_fixed['avg_roi'] > upper_bound)
)

if extreme_regular_mask.any():
    print(f"   Capping {extreme_regular_mask.sum():,} extreme regular trader ROI values")
    print(f"   Bounds: [{lower_bound:.2f}, {upper_bound:.2f}]")
    cluster_features_fixed.loc[extreme_regular_mask, 'avg_roi'] = np.clip(
        cluster_features_fixed.loc[extreme_regular_mask, 'avg_roi'],
        lower_bound,
        upper_bound
    )

# 3. Verify the fix
print(f"\n3. Verifying the fix...")
fixed_roi = cluster_features_fixed['avg_roi']
print(f"   New avg_roi statistics:")
print(f"     Min: {fixed_roi.min():.2f}")
print(f"     Max: {fixed_roi.max():.2f}")
print(f"     Mean: {fixed_roi.mean():.2f}")
print(f"     Std: {fixed_roi.std():.2f}")

# Check for numerical stability
max_abs_value = np.abs(fixed_roi).max()
print(f"     Maximum absolute value: {max_abs_value:.2e}")

if max_abs_value < 1e6:
    print("     ✓ Values are now in a reasonable range for clustering")
elif max_abs_value < 1e9:
    print("     ⚠️  Values are better but still large - monitor clustering")
else:
    print("     ❌ Values are still too large - need further preprocessing")

# 4. Apply the fix to all problematic features
print(f"\n4. Applying robust preprocessing to all features...")

for col in cluster_features_fixed.columns:
    if col in ['has_buy_history', 'is_multi_trader']:  # Skip binary indicators
        continue
        
    col_data = cluster_features_fixed[col]
    
    # Skip if already reasonable
    if np.abs(col_data).max() < 1e3:
        continue
    
    # Apply robust capping using IQR method
    Q1 = col_data.quantile(0.25)
    Q3 = col_data.quantile(0.75)
    IQR = Q3 - Q1
    
    if IQR > 0:  # Avoid division by zero
        lower_bound = Q1 - 3 * IQR
        upper_bound = Q3 + 3 * IQR
        
        outlier_count = ((col_data < lower_bound) | (col_data > upper_bound)).sum()
        if outlier_count > 0:
            cluster_features_fixed[col] = col_data.clip(lower_bound, upper_bound)
            print(f"   {col}: capped {outlier_count:,} outliers to [{lower_bound:.2e}, {upper_bound:.2e}]")

print(f"\nFixed dataset shape: {cluster_features_fixed.shape}")

# Update the cluster_features for subsequent use
cluster_features = cluster_features_fixed
print(f"✓ Updated cluster_features with numerically stable data")

import pickle

  # Save cluster-ready features 
with open(path + 'cluster_features.pkl', 'wb') as f:
      pickle.dump(cluster_features, f)

=== SMART FIX FOR INSIDER ROI PROBLEM ===
1. Fixing the avg_roi insider trader issue...
   Regular trader 99th percentile ROI: 3.56
   Using insider marker value: 1000.00
   Original insider ROI value: 4.99e+11
   Fixed 944,706 insider trader ROI values

2. Capping extreme regular trader ROI values...
   Capping 112,801 extreme regular trader ROI values
   Bounds: [-3.98, 2.97]

3. Verifying the fix...
   New avg_roi statistics:
     Min: -1.00
     Max: 1000.00
     Mean: 93.53
     Std: 291.81
     Maximum absolute value: 1.00e+03
     ✓ Values are now in a reasonable range for clustering

4. Applying robust preprocessing to all features...
   avg_roi: capped 944,706 outliers to [-4.09e+00, 3.12e+00]

Fixed dataset shape: (10060972, 35)
✓ Updated cluster_features with numerically stable data
