# 03 - Predictability & Universe Filtering

1. Compute rolling predictability metrics for each ticker
2. Visualize and compare scores across universe and time
3. Select top-N most “learnable” tickers for RL agent
4. Document all decisions, assumptions, and open questions


In [5]:
# SETUP: Imports & Paths ===========================
import jupyter

import os
import pandas as pd


from tqdm import tqdm
from src.data.feature_pipeline import basic_chart_features,load_base_dataframe
from src.predictability.easiness import rolling_sharpe, rolling_r2, rolling_info_ratio, rolling_autocorr
from src.predictability.pipeline import generate_universe_easiness_report
from IPython import display


In [6]:
# LOAD OHLCV ==========================================
ohlcv = load_base_dataframe()
ohlcv.tail()


Unnamed: 0,id,symbol,timestamp,date,open,high,low,close,volume,trade_count,...,vwap_change,trade_count_change,sector_id,industry_id,return_1d,vix,vix_norm,sp500,sp500_norm,market_return_1d
429701,429702,SPY,2025-05-29 04:00:00,2025-05-29,593.06,593.2,586.07,590.05,70073758.0,826143.0,...,-0.00025,0.128714,,,0.003947,0.1918,-0.006732,59.1217,0.004011,0.004011
429702,429703,SPY,2025-05-30 04:00:00,2025-05-30,588.93,591.1299,583.235,589.39,90601242.0,884337.0,...,-0.002564,0.070441,,,-0.001119,0.1857,-0.031804,59.1169,-8.1e-05,-8.1e-05
429703,429704,SPY,2025-06-02 04:00:00,2025-06-02,587.76,592.79,585.06,592.71,61630502.0,728812.0,...,0.003093,-0.175866,,,0.005633,0.1836,-0.011309,59.3594,0.004102,0.004102
429704,429705,SPY,2025-06-03 04:00:00,2025-06-03,592.34,597.08,591.85,596.09,63606204.0,690792.0,...,0.009093,-0.052167,,,0.005703,0.1769,-0.036492,59.7037,0.0058,0.0058
429705,429706,SPY,2025-06-04 04:00:00,2025-06-04,596.96,597.95,595.49,595.93,57314199.0,629200.0,...,0.002856,-0.089161,,,-0.000268,0.1761,-0.004522,59.7081,7.4e-05,7.4e-05


In [7]:
# CROP THE SAMPLE =======================================
tickers = ohlcv['symbol'].unique()

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

# Example: df = your DataFrame with ['symbol', 'date', 'feature1', 'feature2', ...]

def prepare_stock_data(df, feature_cols):
    scaled_data = {}
    scalers = {}
    
    for symbol in df['symbol'].unique():
        df_stock = df[df['symbol'] == symbol].copy()
        scaler = RobustScaler()
        X_scaled = scaler.fit_transform(df_stock[feature_cols])
        scaled_data[symbol] = torch.tensor(X_scaled, dtype=torch.float32)
        scalers[symbol] = scaler
        
    return scaled_data, scalers

In [20]:
import torch
import torch.nn as nn

class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 4),
            nn.ReLU(),
            nn.Linear(4, 2)
        )
        self.decoder = nn.Sequential(
            nn.Linear(2, 4),
            nn.ReLU(),
            nn.Linear(4, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z)


In [21]:
from torch.utils.data import DataLoader, TensorDataset

def train_autoencoder(X, input_dim, epochs=30):
    model = Autoencoder(input_dim).to('cpu')  # or 'cuda'
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = nn.MSELoss()
    loader = DataLoader(TensorDataset(X), batch_size=16, shuffle=True)
    
    model.train()
    for epoch in range(epochs):
        for batch in loader:
            x_batch = batch[0]
            output = model(x_batch)
            loss = criterion(output, x_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
    # Evaluate reconstruction error
    model.eval()
    with torch.no_grad():
        reconstruction = model(X)
        error = criterion(reconstruction, X).item()
    
    return error

In [31]:
# Prepare features and scale
feature_cols = [
          
 "day_of_month",                    
 "day_of_week",                      
 "candle_size",                    
 "order_flow",                     
 "candle_body",                    
 "upper_shadow",                   
 "lower_shadow",                   
 "price_change",                   
 "candle_change",                  
 "order_flow_change",              
 "overnight_price_change",         
 "volume_change",                  
 "vwap_change",                    
 "trade_count_change",             
  
 "return_1d",                   
                       
 "vix_norm",                     
   
 "sp500_norm",      
 "market_return_1d",  
]

start = '2023-07-01'
end = '2024-01-01'
df = ohlcv.copy()
df = df[(df['date'] >= start) & (df['date'] < end)]
X_dict, scalers = prepare_stock_data(df, feature_cols)

# Train AE and get errors
stock_errors = {}
for stock, X in X_dict.items():
    print(stock)
    err = train_autoencoder(X, input_dim=X.shape[1])
    stock_errors[stock] = err
    print(err)
    print('')

# Rank stocks by predictability
predictability = pd.DataFrame(stock_errors.items(), columns=['stock', 'reconstruction_error'])
predictability = predictability.sort_values('reconstruction_error')
predictability

MMM
0.7433761358261108

AOS
0.5622619986534119

ABT
0.565942108631134

ABBV
20.699604034423828

ACN
0.5735301971435547

ADBE
0.5855732560157776

AMD
0.49298295378685

AES
0.4879734516143799

AFL
0.46707579493522644

A
0.4845927655696869

APD
0.8816319704055786

ABNB
0.8625376224517822

AKAM
0.3469184339046478

ALB
0.6205811500549316

ARE
1.595885157585144

ALGN
0.5549079179763794

ALLE
0.4947268068790436

LNT
0.6374973654747009

ALL
0.5568215847015381

GOOGL
0.591935932636261

GOOG
1.478196144104004

MO
0.8329681158065796

AMZN
0.47136956453323364

AMCR
0.39527198672294617

AEE
0.6500735878944397

AEP
0.5844867825508118

AXP
0.39808905124664307

AIG
0.5312936902046204

AMT
4.808271884918213

AWK
0.4881037473678589

AMP
0.5649097561836243

AME
0.4262239336967468

AMGN
0.511316180229187

APH
1.0598173141479492

ADI
0.37724801898002625

ANSS
1.529961347579956

AON
1.677653193473816

APA
0.453366219997406

APO
0.6276485919952393

AAPL
0.9055154323577881

AMAT
0.5675666928291321

APTV
0.363

0.5941503643989563

NSC
0.46206843852996826

NTRS
0.528636634349823

NOC
0.5531510710716248

NCLH
0.7017045021057129

NRG
0.49030235409736633

NUE
0.38803789019584656

NVDA
0.4891095757484436

NVR
2.3873045444488525

NXPI
0.6554198265075684

ORLY
0.6817823648452759

OXY
0.4406067132949829

ODFL
0.6014801859855652

OMC
0.7405030131340027

ON
1.3526239395141602

OKE
0.4238237142562866

ORCL
1.8872160911560059

OTIS
0.3701027035713196

PCAR
0.6098496317863464

PKG
0.6223524808883667

PLTR
0.43391454219818115

PANW
0.8559545874595642

PARA
1.4229453802108765

PH
0.6356992125511169

PAYX
1.1177889108657837

PAYC
1.2409025430679321

PYPL
0.44586580991744995

PNR
0.5304880142211914

PEP
0.5434902906417847

PFE
0.536925733089447

PCG
0.5733351707458496

PM
0.5847632884979248

PSX
1.413641333580017

PNW
0.5826900601387024

PNC
0.42966383695602417

POOL
0.5690280795097351

PPG
0.6094688773155212

PPL
0.4593377709388733

PFG
0.4486621618270874

PG
0.5160180330276489

PGR
0.7848565578460693

PLD
0

Unnamed: 0,stock,reconstruction_error
314,MCHP,0.339849
323,MPWR,0.346276
12,AKAM,0.346918
335,NWS,0.356191
390,RJF,0.361449
...,...,...
28,AMT,4.808272
476,GWW,4.832154
302,MAS,8.796748
3,ABBV,20.699604


In [32]:

start = '2024-01-01'
end = '2024-04-01'
df = ohlcv.copy()
df = df[(df['date'] >= start) & (df['date'] < end)]
X_dict, scalers = prepare_stock_data(df, feature_cols)

# Train AE and get errors
stock_errors = {}
for stock, X in X_dict.items():
    print(stock)
    err = train_autoencoder(X, input_dim=X.shape[1])
    stock_errors[stock] = err
    print(err)
    print('')

# Rank stocks by predictability
predictability_b = pd.DataFrame(stock_errors.items(), columns=['stock', 'reconstruction_error'])
predictability_b = predictability_b.sort_values('reconstruction_error')
predictability_b

MMM
1.0826826095581055

AOS
0.4112558662891388

ABT
0.5153186917304993

ABBV
0.39559072256088257

ACN
0.5473902225494385

ADBE
1.1068252325057983

AMD
0.6360206604003906

AES
0.3989236056804657

AFL
0.5487538576126099

A
0.7996038198471069

APD
1.038559913635254

ABNB
0.364623099565506

AKAM
0.6570127010345459

ALB
0.3958744704723358

ARE
0.5540370345115662

ALGN
0.5068752765655518

ALLE
11.94179630279541

LNT
0.8335315585136414

ALL
0.7164478898048401

GOOGL
0.6790471076965332

GOOG
0.7200762033462524

MO
1.1222000122070312

AMZN
1.1865650415420532

AMCR
0.430975079536438

AEE
0.5928172469139099

AEP
0.5691614151000977

AXP
0.8062426447868347

AIG
0.5794419646263123

AMT
0.5544354319572449

AWK
0.4298541247844696

AMP
0.559273898601532

AME
1.765637755393982

AMGN
1.340773344039917

APH
1.4345403909683228

ADI
0.5199127197265625

ANSS
0.7157787680625916

AON
0.45977354049682617

APA
0.6142815351486206

APO
0.598181426525116

AAPL
0.5808325409889221

AMAT
0.49129804968833923

APTV
0.93

0.8826690316200256

NOC
0.5667794942855835

NCLH
0.7542892098426819

NRG
0.6983435750007629

NUE
0.5632669925689697

NVDA
0.7387361526489258

NVR
0.4298762381076813

NXPI
0.5009683966636658

ORLY
0.7401902675628662

OXY
0.5153417587280273

ODFL
3.586805582046509

OMC
0.8095412850379944

ON
0.4567686915397644

OKE
0.6784542202949524

ORCL
1.5595325231552124

OTIS
0.4335445761680603

PCAR
1.0003896951675415

PKG
0.8781482577323914

PLTR
0.5276867151260376

PANW
0.5967246294021606

PARA
0.7080112099647522

PH
0.7870080471038818

PAYX
0.45913252234458923

PAYC
0.8861031532287598

PYPL
1.1623352766036987

PNR
0.7233849763870239

PEP
0.6187525391578674

PFE
2.5926074981689453

PCG
0.4398113787174225

PM
1.1157002449035645

PSX
0.5281376242637634

PNW
0.5963023900985718

PNC
0.71524977684021

POOL
0.7240209579467773

PPG
0.5720359086990356

PPL
0.5264781713485718

PFG
1.2925167083740234

PG
0.5739848613739014

PGR
4.363656044006348

PLD
0.5575632452964783

PRU
0.4703946113586426

PEG
0.477742

Unnamed: 0,stock,reconstruction_error
75,BLDR,0.320072
143,DVN,0.349662
274,KMI,0.362813
11,ABNB,0.364623
437,TER,0.369164
...,...,...
164,EA,15.652533
452,TSN,22.973188
472,V,34.179413
59,BDX,50.364491


In [59]:

start = '2024-04-01'
end = '2024-07-01'
df = ohlcv.copy()
df = df[(df['date'] >= start) & (df['date'] < end)]
X_dict, scalers = prepare_stock_data(df, feature_cols)

# Train AE and get errors
stock_errors = {}
for stock, X in X_dict.items():
    print(stock)
    err = train_autoencoder(X, input_dim=X.shape[1])
    stock_errors[stock] = err
    print(err)
    print('')

# Rank stocks by predictability
predictability_c = pd.DataFrame(stock_errors.items(), columns=['stock', 'reconstruction_error'])
predictability_c = predictability_c.sort_values('reconstruction_error')
predictability_c

MMM
1.2866848707199097

AOS
0.833149254322052

ABT
0.5545132756233215

ABBV
0.6340898871421814

ACN
0.671251654624939

ADBE
0.7574910521507263

AMD
0.44394147396087646

AES
0.5526210069656372

AFL
2.803173780441284

A
0.5543503165245056

APD
0.5818966031074524

ABNB
0.592524528503418

AKAM
1.3867228031158447

ALB
0.4316585063934326

ARE
0.46295344829559326

ALGN
0.5469094514846802

ALLE
0.4108613431453705

LNT
0.7427416443824768

ALL
0.5064908266067505

GOOGL
1.3164409399032593

GOOG
0.41778764128685

MO
0.5652956962585449

AMZN
0.42624589800834656

AMCR
0.6599790453910828

AEE
0.8445772528648376

AEP
0.4998611807823181

AXP
0.6055187582969666

AIG
0.6447604298591614

AMT
0.3422805070877075

AWK
0.7493982315063477

AMP
1.2871077060699463

AME
0.5522438883781433

AMGN
0.5051630139350891

APH
1.6493608951568604

ADI
0.6152040362358093

ANSS
7.963461399078369

AON
0.68248450756073

APA
0.5118212699890137

APO
0.6061991453170776

AAPL
0.5700438022613525

AMAT
0.575707733631134

APTV
0.4698

1.3191678524017334

NSC
0.5075177550315857

NTRS
0.5625267624855042

NOC
0.4981915056705475

NCLH
0.6451396942138672

NRG
0.7948269844055176

NUE
1.1668435335159302

NVDA
0.9591772556304932

NVR
6.837928295135498

NXPI
0.6643688678741455

ORLY
4.638367652893066

OXY
0.5656629800796509

ODFL
2.0686397552490234

OMC
0.5257876515388489

ON
0.6163526773452759

OKE
0.6440336108207703

ORCL
1.335919976234436

OTIS
0.5181556940078735

PCAR
1.5017679929733276

PKG
0.7873134016990662

PLTR
0.7190514206886292

PANW
0.5227710604667664

PARA
0.7532634735107422

PH
0.5628746747970581

PAYX
1.1968674659729004

PAYC
0.6347705125808716

PYPL
1.008309245109558

PNR
39.76082229614258

PEP
0.5361716747283936

PFE
0.5608570575714111

PCG
0.5779955983161926

PM
0.7220839262008667

PSX
0.5623170137405396

PNW
0.6216961741447449

PNC
0.5656431317329407

POOL
0.7177969813346863

PPG
2.879971981048584

PPL
0.6864267587661743

PFG
0.6256890892982483

PG
0.4867713451385498

PGR
0.5034582018852234

PLD
0.81518882

Unnamed: 0,stock,reconstruction_error
28,AMT,0.342281
503,SPY,0.370743
193,FIS,0.379232
474,V,0.385092
267,JNPR,0.390231
...,...,...
442,TPL,8.602320
145,FANG,19.034708
426,SYK,22.909960
298,MPC,36.623161


In [33]:
merged = pd.merge(predictability, predictability_b, on='stock', suffixes=('_a', '_b'))
correlation = merged[['reconstruction_error_a', 'reconstruction_error_b']].corr().iloc[0, 1]
print(f"Correlation between time windows: {correlation:.3f}")

Correlation between time windows: -0.010


In [61]:
merged_2 = pd.merge(predictability_b, predictability_c, on='stock', suffixes=('_b', '_c'))
correlation_2 = merged_2[['reconstruction_error_b', 'reconstruction_error_c']].corr().iloc[0, 1]
print(f"Correlation between time windows: {correlation_2:.3f}")

Correlation between time windows: 0.003


In [34]:
pd.merge(predictability,predictability_b,on="stock")[['reconstruction_error_x','reconstruction_error_y']].corr()

Unnamed: 0,reconstruction_error_x,reconstruction_error_y
reconstruction_error_x,1.0,-0.009967
reconstruction_error_y,-0.009967,1.0


In [None]:
pd.merge(predictability,predictability_c,on="stock")[['reconstruction_error_x','reconstruction_error_y']].corr()

In [35]:
predictability_b[predictability_b['stock']=="MMM"]

Unnamed: 0,stock,reconstruction_error
0,MMM,1.082683


In [51]:
selected =predictability.sort_values(by="reconstruction_error",ascending=True).iloc[:20]['stock'].values

In [58]:
pd.merge(predictability[predictability['stock'].isin(selected)],predictability_b[predictability_b['stock'].isin(selected)],on="stock")[['reconstruction_error_x','reconstruction_error_y']].corr()

Unnamed: 0,reconstruction_error_x,reconstruction_error_y
reconstruction_error_x,1.0,0.992126
reconstruction_error_y,0.992126,1.0


In [63]:
pd.merge(predictability_c[predictability_c['stock'].isin(selected)],predictability_b[predictability_b['stock'].isin(selected)],on="stock")[['reconstruction_error_x','reconstruction_error_y']].corr()

Unnamed: 0,reconstruction_error_x,reconstruction_error_y
reconstruction_error_x,1.0,0.291177
reconstruction_error_y,0.291177,1.0


In [57]:
merged['diff']=abs(merged['reconstruction_error_a']-merged['reconstruction_error_b'])
merged.sort_values(by="diff").iloc[:20]
selected = merged.sort_values(by="diff").iloc[:20]['stock'].values

In [62]:
merged_2['diff']=abs(merged_2['reconstruction_error_b']-merged_2['reconstruction_error_c'])
merged_2.sort_values(by="diff").iloc[:20]
selected = merged.sort_values(by="diff").iloc[:20]['stock'].values

Unnamed: 0,stock,reconstruction_error_b,reconstruction_error_c,diff
91,DRI,0.486482,0.483985,0.002497
392,BIIB,0.918159,0.914961,0.003198
229,T,0.584628,0.588845,0.004218
136,DOC,0.521751,0.516773,0.004978
327,POOL,0.724021,0.717797,0.006224
288,HSIC,0.651061,0.658142,0.007081
239,APO,0.598181,0.606199,0.008018
167,CAT,0.534351,0.543,0.008649
138,XOM,0.522118,0.513443,0.008675
83,EG,0.480822,0.49025,0.009428
