# 03 - Predictability & Universe Filtering

1. Compute rolling predictability metrics for each ticker
2. Visualize and compare scores across universe and time
3. Select top-N most “learnable” tickers for RL agent
4. Document all decisions, assumptions, and open questions


In [7]:
# SETUP: Imports & Paths ===========================
import jupyter

import os
import pandas as pd


from tqdm import tqdm
from src.data.feature_pipeline import basic_chart_features,load_base_dataframe
from src.predictability.easiness import rolling_sharpe, rolling_r2, rolling_info_ratio, rolling_autocorr
from src.predictability.pipeline import generate_universe_easiness_report
from IPython import display
from src.utils.system import boot,notify
from src.experiments.experiment_tracker import ExperimentTracker

In [47]:
features = [
        'store_nbr', 'family', 'dayofweek', 'month', 'day', 'week', 'is_holiday',
        'transactions', 'lag_7', 'lag_14', 'trans_lag_7', 'trans_lag_14',
        'rolling_mean_7', 'rolling_std_7', 'rolling_mean_14', 'rolling_std_14',
        'trans_roll_mean_7', 'trans_roll_mean_14',
        'onpromo_lag_7', 'onpromo_mean_14'
]

run_settings={
    "n_estimators":1000, 
    "learning_rate":0.05, 
    "max_depth":6, 
    "random_state":42,
    "regime": "rolling_mean",
    "features": features.copy()
}
run_settings["features"].sort()
run_settings

{'n_estimators': 1000,
 'learning_rate': 0.05,
 'max_depth': 6,
 'random_state': 42,
 'regime': 'rolling_mean',
 'features': ['day',
  'dayofweek',
  'family',
  'is_holiday',
  'lag_14',
  'lag_7',
  'month',
  'onpromo_lag_7',
  'onpromo_mean_14',
  'rolling_mean_14',
  'rolling_mean_7',
  'rolling_std_14',
  'rolling_std_7',
  'store_nbr',
  'trans_lag_14',
  'trans_lag_7',
  'trans_roll_mean_14',
  'trans_roll_mean_7',
  'transactions',
  'week']}

In [48]:
boot()
notify('03 - Predictability & Universe Filtering',title="Train complete", level="info")

In [127]:
# LOAD OHLCV ==========================================
ohlcv_df = load_base_dataframe()
ohlcv_df.tail()
_ohlcv=ohlcv_df.copy()

In [128]:
# CROP THE SAMPLE =======================================
tickers = ohlcv_df['symbol'].unique()

## How to intrepret
| Metric                             | Good Sign                  | Bad Sign                            |
| ---------------------------------- | -------------------------- | ----------------------------------- |
| R² > 0.3                           | Model captures real signal | R² ≈ 0: model is guessing           |
| MAE low (e.g., < 0.05)             | Close predictions          | MAE > 0.1 is noisy                  |
| Scatter points cluster on diagonal | High correlation           | Wide dispersion = model uncertainty |


In [None]:
notify('teste')

In [45]:
result_df.describe()

Unnamed: 0,true_r2,predicted_r2
count,18.0,18.0
mean,0.023697,0.04858
std,0.027648,0.024395
min,0.000226,0.015993
25%,0.003473,0.028728
50%,0.00995,0.044306
75%,0.045028,0.065263
max,0.098219,0.096468


In [124]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import skew, kurtosis, entropy
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm import tqdm
from IPython.display import display

warnings.filterwarnings("ignore")

# === Simulate a realistic dataset ===
np.random.seed(42)
#dates = pd.date_range(start="2022-01-01", end="2024-12-31", freq="D")
dates = pd.date_range(start="2022-01-01", end="2022-06-01", freq="D")
symbols = ['AAPL', 'GOOG', 'MSFT' ]
data = []

for symbol in tqdm(symbols):
    print('Round 1',symbol)
    #trend = np.linspace(100, 200, len(dates)) + np.random.normal(0, 5, len(dates))
    for i, date in enumerate(dates):
        ret = np.random.randn() * 0.01

        data.append({
            'date': date,
            'symbol': symbol,
            'close': ohlcv_df[ohlcv_df['symbol']==symbol]['close'].values,#np.random.rand() * 100 + 100,
            'return_1d': ohlcv_df[ohlcv_df['symbol']==symbol]['return_1d'].values,
            'volume': ohlcv_df[ohlcv_df['symbol']==symbol]['volume'].values#np.random.randint(1e5, 1e6)
        })

df = pd.DataFrame(data)
df['month'] = df['date'].dt.to_period('M')
df['return_1d'] = df['return_1d'].fillna(0)

# === Extract enriched features and labels ===
feature_rows = []
label_rows = []
def flatten_cell(x):
    if isinstance(x, (list, np.ndarray)):
        return float(x[0]) if len(x) > 0 else np.nan
    return float(x)



for symbol in tqdm(symbols):
    print('Round 2',symbol)
    symbol_data = df[df['symbol'] == symbol].sort_values('date')
    grouped = symbol_data.groupby('month')

    months = list(grouped.groups.keys())
    for i in range(len(months) - 1):
        m_t = months[i]
        m_t1 = months[i + 1]

        df_t = grouped.get_group(m_t)
        df_t1 = grouped.get_group(m_t1)

        if len(df_t1) < 10:
            continue
        def flatten_cell(x):
            if isinstance(x, (list, np.ndarray)):
                return float(x[0]) if len(x) > 0 else np.nan
            return float(x)

        r1d = df_t['return_1d'].apply(flatten_cell)
        v = df_t['volume'].apply(flatten_cell)

        # === Features from T ===
        # Ensure return_1d is flat and numeric
        returns = df_t['return_1d'].apply(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x).astype(np.float64).values

        features = {
            'symbol': symbol,
            'month': str(m_t),
            'mean_return': returns.mean(),
            'std_return': returns.std(),
            'skew_return': skew(returns),
            'kurtosis_return': kurtosis(returns),
            'entropy_return': entropy(np.histogram(returns, bins=10, density=True)[0] + 1e-8),
            'volume_mean': v.mean(),#df_t['volume'].mean(),
            'volume_std': v.std() #df_t['volume'].std()
        }


        # === Label from T+1 ===
        y = df_t1['return_1d'].values
        X_dummy = np.arange(len(y)).reshape(-1, 1)
        base_model = LinearRegression().fit(X_dummy, y)
        r2 = base_model.score(X_dummy, y)

        label = int(r2 > 0.1)  # Predictable if R² > 0.1

        feature_rows.append(features)
        label_rows.append(label)

# === Build dataset ===
X_df = pd.DataFrame(feature_rows)
y_df = pd.Series(label_rows, name='is_predictable')

metadata = X_df[['symbol', 'month']]
X = X_df.drop(['symbol', 'month'], axis=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# === Train classifier ===
X_train, X_test, y_train, y_test, meta_train, meta_test = train_test_split(
    X_scaled, y_df, metadata, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(n_estimators=300, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# === Report and confusion matrix ===
report = classification_report(y_test, y_pred, output_dict=True)
conf_mat = confusion_matrix(y_test, y_pred)

# === Create full report DataFrame ===
report_df = pd.DataFrame(report).transpose()
report_df['support'] = report_df['support'].astype(int)

# === Identify top-k easiest stocks (predicted = 1) ===
predictions_df = meta_test.copy()
predictions_df['predicted_label'] = y_pred
predictions_df['true_label'] = y_test.values
top_predictable = predictions_df[predictions_df['predicted_label'] == 1]

print("Top Predictable Stock-Months:")
display(top_predictable.head(10))

# === Plot confusion matrix ===
plt.figure(figsize=(6, 5))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Predictable', 'Predictable'], yticklabels=['Not Predictable', 'Predictable'])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()

# Display classification report
print("\nClassification Report:")
display(report_df.round(3))


  0%|          | 0/3 [00:00<?, ?it/s]

Round 1 AAPL


 33%|███▎      | 1/3 [00:16<00:32, 16.40s/it]

Round 1 GOOG


 67%|██████▋   | 2/3 [00:32<00:16, 16.05s/it]

Round 1 MSFT


100%|██████████| 3/3 [00:48<00:00, 16.07s/it]
  0%|          | 0/3 [00:00<?, ?it/s]


Round 2 AAPL


AttributeError: 'bool' object has no attribute 'any'

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import skew, kurtosis, entropy
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm import tqdm
from IPython.display import display

warnings.filterwarnings("ignore")

# === Simulate a realistic dataset ===
np.random.seed(42)
dates = ohlcv_df[ohlcv_df['symbol']=="AAPL"]
#dates =  dates[dates['date']>="2022-01-01"&dates['date']<"2022-06-01"]
#dates = dates['date']
dates=dates[(dates['date']>="2022-01-01")&(dates['date']<"2024-01-01")]['date'].values
#pd.date_range(start="2022-01-01", end="2022-06-01", freq="D")
symbols = ohlcv_df['symbol'].unique() #['AAPL', 'GOOG']
data = []

# Replace with your actual ohlcv_df if not simulating
#ohlcv_df = pd.DataFrame()

for symbol in tqdm(symbols):
    print('Round 1', symbol)
    _ohlcv_df= ohlcv_df[ohlcv_df['symbol']==symbol]
    for date in dates:
        #print("date",date)
        d = _ohlcv_df[_ohlcv_df['date']==date]#.iloc[0]
        #print(d)
        close = d['close']#np.random.rand() * 100 + 100
        return_1d = d['return_1d']#np.random.randn() * 0.01
        volume = d['volume']#np.random.randint(1e5, 1e6)
        data.append({
            'date': date,
            'symbol': symbol,
            'close': close,
            'return_1d': return_1d,
            'volume': volume
        })
        # data.append({
        #   'date': date,
        #    'symbol': symbol,
        #    'close': ['close'].values,#np.random.rand() * 100 + 100,
        #    'return_1d': ohlcv_df[ohlcv_df['symbol']==symbol]['return_1d'].values,
        #    'volume': ohlcv_df[ohlcv_df['symbol']==symbol]['volume'].values#np.random.randint(1e5, 1e6)
        #})

df = pd.DataFrame(data)
df['month'] = df['date'].dt.to_period('M')
df['return_1d'] = df['return_1d'].fillna(0)

# === Extract enriched features and labels ===
feature_rows = []
label_rows = []

def flatten_cell(x):
    if isinstance(x, (list, np.ndarray)):
        return float(x[0]) if len(x) > 0 else np.nan
    return float(x)

for symbol in tqdm(symbols):
    print('Round 2', symbol)
    symbol_data = df[df['symbol'] == symbol].sort_values('date')
    grouped = symbol_data.groupby('month')

    months = list(grouped.groups.keys())
    for i in range(len(months) - 1):
        m_t = months[i]
        m_t1 = months[i + 1]

        df_t = grouped.get_group(m_t)
        df_t1 = grouped.get_group(m_t1)

        if len(df_t1) < 10:
            continue

        r1d = df_t['return_1d'].apply(flatten_cell)
        v = df_t['volume'].apply(flatten_cell)

        # Features from T
        returns = r1d.astype(np.float64).values

        features = {
            'symbol': symbol,
            'month': str(m_t),
            'mean_return': returns.mean(),
            'std_return': returns.std(),
            'skew_return': skew(returns),
            'kurtosis_return': kurtosis(returns),
            'entropy_return': entropy(np.histogram(returns, bins=10, density=True)[0] + 1e-8),
            'volume_mean': v.mean(),
            'volume_std': v.std()
        }

        # Label from T+1
        y = df_t1['return_1d'].astype(np.float64).values
        X_dummy = np.arange(len(y)).reshape(-1, 1)
        base_model = LinearRegression().fit(X_dummy, y)
        r2 = base_model.score(X_dummy, y)
        label = int(r2 > 0.1)

        feature_rows.append(features)
        label_rows.append(label)

# === Build dataset ===
X_df = pd.DataFrame(feature_rows)
y_df = pd.Series(label_rows, name='is_predictable')

metadata = X_df[['symbol', 'month']]
X = X_df.drop(['symbol', 'month'], axis=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# === Train classifier ===
X_train, X_test, y_train, y_test, meta_train, meta_test = train_test_split(
    X_scaled, y_df, metadata, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(n_estimators=300, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# === Report and confusion matrix ===
report = classification_report(y_test, y_pred, output_dict=True)
conf_mat = confusion_matrix(y_test, y_pred)

# === Create full report DataFrame ===
report_df = pd.DataFrame(report).transpose()
report_df['support'] = report_df['support'].astype(int)

# === Identify top-k easiest stocks (predicted = 1) ===
predictions_df = meta_test.copy()
predictions_df['predicted_label'] = y_pred
predictions_df['true_label'] = y_test.values
top_predictable = predictions_df[predictions_df['predicted_label'] == 1]

print("Top Predictable Stock-Months:")
display(top_predictable.head(10))

# === Plot confusion matrix ===
plt.figure(figsize=(6, 5))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Predictable', 'Predictable'], yticklabels=['Not Predictable', 'Predictable'])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()

# Display classification report
print("\nClassification Report:")
display(report_df.round(3))


  0%|          | 0/504 [00:00<?, ?it/s]

Round 1 MMM


  0%|          | 1/504 [00:00<03:06,  2.70it/s]

Round 1 AOS


  0%|          | 2/504 [00:00<03:19,  2.52it/s]

Round 1 ABT


  1%|          | 3/504 [00:01<03:23,  2.47it/s]

In [138]:
_ohlcv_df['']

Unnamed: 0,id,symbol,timestamp,date,open,high,low,close,volume,trade_count,...,vwap_change,trade_count_change,sector_id,industry_id,return_1d,vix,vix_norm,sp500,sp500_norm,market_return_1d
17140,17141,GOOG,2022-01-04 05:00:00,2022-01-04,2911.010,2932.2000,2876.3225,2888.33,1305838.0,78071.0,...,-0.034145,0.988062,,,-0.046830,0.1691,0.018675,47.9354,-0.000630,-0.000630
17141,17142,GOOG,2022-01-05 05:00:00,2022-01-05,2883.620,2885.9600,2750.4700,2753.07,2493515.0,155210.0,...,-0.034145,0.988062,,,-0.046830,0.1973,0.166765,47.0058,-0.019393,-0.019393
17142,17143,GOOG,2022-01-06 05:00:00,2022-01-06,2749.950,2793.7200,2735.2700,2751.02,1621973.0,94830.0,...,-0.012839,-0.389021,,,-0.000745,0.1961,-0.006082,46.9605,-0.000964,-0.000964
17143,17144,GOOG,2022-01-07 05:00:00,2022-01-07,2758.100,2765.0949,2715.7800,2740.09,1064522.0,73778.0,...,-0.006637,-0.221997,,,-0.003973,0.1876,-0.043345,46.7703,-0.004050,-0.004050
17144,17145,GOOG,2022-01-10 05:00:00,2022-01-10,2701.980,2772.8000,2662.8100,2771.48,1868448.0,126154.0,...,-0.002569,0.709914,,,0.011456,0.1940,0.034115,46.7029,-0.001441,-0.001441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17992,17993,GOOG,2025-05-29 04:00:00,2025-05-29,175.000,175.4000,171.7800,172.96,21233590.0,361880.0,...,-0.006075,0.020047,,,-0.002422,0.1918,-0.006732,59.1217,0.004011,0.004011
17993,17994,GOOG,2025-05-30 04:00:00,2025-05-30,172.410,173.4400,168.5250,172.85,36258254.0,439038.0,...,-0.007872,0.213214,,,-0.000636,0.1857,-0.031804,59.1169,-0.000081,-0.000081
17994,17995,GOOG,2025-06-02 04:00:00,2025-06-02,169.010,171.0624,168.6500,170.37,24742877.0,364732.0,...,-0.009509,-0.169247,,,-0.014348,0.1836,-0.011309,59.3594,0.004102,0.004102
17995,17996,GOOG,2025-06-03 04:00:00,2025-06-03,168.865,169.8000,166.6800,167.71,25386713.0,452653.0,...,-0.011214,0.241056,,,-0.015613,0.1769,-0.036492,59.7037,0.005800,0.005800


In [None]:
y_train[y_train == 1]

In [94]:
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# Ensure return_1d is flat and numeric
# Ensure return_1d and volume contain only floats, not arrays/lists
def flatten_cell(x):
    if isinstance(x, (list, np.ndarray)):
        return float(x[0]) if len(x) > 0 else np.nan
    return float(x)

df['return_1d'] = df['return_1d'].apply(flatten_cell)
df['volume'] = df['volume'].apply(flatten_cell)
returns = df_t['return_1d'].astype(np.float64).values
volume_mean = df_t['volume'].mean()
volume_std = df_t['volume'].std()

returns = df_t['return_1d'].apply(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x).astype(np.float64).values

features = {
    'symbol': symbol,
    'month': str(m_t),
    'mean_return': returns.mean(),
    'std_return': returns.std(),
    'skew_return': skew(returns),
    'kurtosis_return': kurtosis(returns),
    'entropy_return': entropy(np.histogram(returns, bins=10, density=True)[0] + 1e-8),
    'volume_mean': df_t['volume'].mean(),
    'volume_std': df_t['volume'].std()
}
features

ValueError: setting an array element with a sequence.

In [95]:
df['return_1d']

0     -0.026600
1     -0.026600
2     -0.026600
3     -0.026600
4     -0.026600
         ...   
755   -0.018893
756   -0.018893
757   -0.018893
758   -0.018893
759   -0.018893
Name: return_1d, Length: 760, dtype: float64

In [90]:
returns = np.concatenate(df_t['return_1d'].values).astype(np.float64)
returns

array([-0.02659989, -0.02659989, -0.01669335, ...,  0.00423201,
        0.00778384, -0.0022138 ])

# OLD CODE BELOW
---

---

---

In [None]:
xxxxxxxxxxxxxx

In [51]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

# Simulate a basic dataframe for demonstration
np.random.seed(42)
dates = pd.date_range(start="2022-01-01", end="2024-12-31", freq="D")
symbols = ohlcv_df['symbol'].unique()
data = []

for symbol in tqdm(symbols):
    for date in dates:
        data.append({
            'date': date,
            'symbol': symbol,
            'close': ohlcv_df[ohlcv_df['symbol']==symbol]['close'].values,#np.random.rand() * 100 + 100,
            'return_1d': ohlcv_df[ohlcv_df['symbol']==symbol]['return_1d'].values,
            'volume': ohlcv_df[ohlcv_df['symbol']==symbol]['volume'].values#np.random.randint(1e5, 1e6)
        })

df = pd.DataFrame(data)

# Convert date to month
df['month'] = df['date'].dt.to_period('M')
df['return_1d'] = df['return_1d'].fillna(0)
df.sort_values(by="date",inplace=True)
df_train = df[df['date']<'2024-06-01']#.sort_values(by="date")
df_test =df[df['date']>='2024-06-01']#.sort_values(by="date")

def generate_datasets(_df):
    # Compute monthly features per stock
    monthly_features = _df.groupby(['symbol', 'month']).agg({
        'return_1d': ['mean', 'std', 'skew'],
        'volume': ['mean', 'std']
    })

    monthly_features.columns = ['_'.join(col) for col in monthly_features.columns]
    monthly_features = monthly_features.reset_index()

    # Prepare sliding window (T, T+1)
    s_rows = []
    m_rows = []
    X_rows = []
    y_values = []

    for symbol in monthly_features['symbol'].unique():
        symbol_data = monthly_features[monthly_features['symbol'] == symbol].sort_values('month')
        for i in range(len(symbol_data) - 1):
            X_t = symbol_data.iloc[i]
            T_plus_1 = symbol_data.iloc[i + 1]

            # Train a simple model on T+1 return_1d using only lagged return as a proxy
            y = _df[(_df['symbol'] == symbol) & (_df['month'] == T_plus_1['month'])]['return_1d'].values
            if len(y) < 10:
                continue
            X_simple = np.arange(len(y)).reshape(-1, 1)
            model = LinearRegression().fit(X_simple, y)
            r2 = r2_score(y, model.predict(X_simple))

            X_rows.append(X_t.drop(['symbol', 'month']).values)
            y_values.append(r2)
            s_rows.append(X_t['symbol'])
            m_rows.append(X_t['month'])

    # Final dataset
    X = np.array(X_rows)
    y = np.array(y_values)
    tickers = s_rows
    months = m_rows
    return X,y,tickers,months
    # Train-test split and regress
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    

X_train, y_train,_,_= generate_datasets(df_train)
X_test, y_test,s_test,m_test= generate_datasets(df_test)
regressor = RandomForestRegressor(n_estimators=200, random_state=42)
regressor.fit(X_train, y_train)
preds = regressor.predict(X_test)

# Display results
result_df = pd.DataFrame({
    'symbol': s_test,
    'month': m_test,
    'true_r2': y_test,
    'predicted_r2': preds
})


from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)

print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")


import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.scatter(y_test, preds, alpha=0.6)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--', color='red', label='Perfect Prediction')
plt.xlabel("True R²")
plt.ylabel("Predicted R²")
plt.title("True vs Predicted R² (Predictability Score)")
plt.legend()
plt.grid(True)
plt.show()

KeyboardInterrupt: 