# Random Forest

In [2]:
%reload_ext autoreload
%autoreload 2
import sys, os
sys.path.append(os.path.abspath(".."))

from src.data_eng.pipeline import run_pipeline

In [3]:
from src.config import Config

conf = Config(
    #features=[], all features
    add_int_features=True,
    target={'horizon': 5, 'threshold': 0.01},
   # ticker_list=['AAPL','META'], #all tickers
    validate_cutoff='2022-01-01',      # FINAL TEST START
    fold_len='365D',
    fold_mode='expanding',             # or 'sliding'
    sliding_train_years=None,          # set e.g. 5 if using sliding
    #embargo_days=None                  # defaults to horizon=5
)

In [4]:
# skipping for now since we already have data, uncomment if you havent pulled data yet
# run_pipeline(conf)

In [4]:
import pandas as pd

X = pd.read_csv(conf.processed_data_path / f'{conf.ticker_list[0]}.csv')

In [5]:
from src.modeling.eval import make_global_rf_pipeline
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import randint, uniform
from src.modeling.helpers import num_cols_fn

param_distributions = {
    "clf__n_estimators": randint(300, 900),           # more trees = smoother, slower
    "clf__max_depth": randint(3, 20),                 # smaller = more bias, larger = more variance
    "clf__min_samples_leaf": randint(1, 50),          # 5–20 often stabilizes noisy labels
    "clf__min_samples_split": randint(2, 50),
    "clf__max_features": ["sqrt", "log2", 0.2, 0.4, 0.6],
    "clf__bootstrap": [True, False],
    "clf__max_samples": [None, 0.5, 0.7, 0.9],        # only used if bootstrap=True
    "clf__criterion": ["gini", "entropy"]
}
# inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# search = RandomizedSearchCV(
#     estimator=make_global_rf_pipeline(num_cols_fn(X)),
#     param_distributions=param_distributions,
#     n_iter=40,                
#     scoring="roc_auc",        
#     n_jobs=-1,
#     cv=inner_cv,
#     verbose=1,
#     random_state=42
# )





In [6]:
from src.modeling.global_pairs import build_global_fold_pairs
from src.data_eng.folds import load_multi_ticker_collection
from src.modeling.eval import make_global_rf_pipeline
from sklearn.metrics import roc_auc_score, accuracy_score

# 1) get data
collection = load_multi_ticker_collection(conf)

# 2) build global fold pairs
pairs = build_global_fold_pairs(collection)



cv_scores = []

for k, (Xtr, ytr, Xva, yva) in enumerate(pairs):
    # get numeric cols and align NaN masks like you already do
    num_cols = [c for c in Xtr.columns if c not in ("__ticker__", "Date")]
    mask_tr = Xtr[num_cols].isna().any(axis=1)
    Xtr_ = Xtr.loc[~mask_tr]
    ytr_ = ytr.loc[~mask_tr]
    mask_va = Xva[num_cols].isna().any(axis=1)
    Xva_ = Xva.loc[~mask_va]
    yva_ = yva.loc[~mask_va]

    # inner search on training fold
    search = RandomizedSearchCV(
        estimator=make_global_rf_pipeline(num_cols),
        param_distributions=param_distributions,
        n_iter=40,
        scoring="roc_auc",
        n_jobs=-1,
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        verbose=0,
        random_state=42
    )
    search.fit(Xtr_, ytr_)

    # evaluate best on this outer validation fold
    best_pipe = search.best_estimator_
    y_proba = best_pipe.predict_proba(Xva_)[:, 1]
    y_pred = (y_proba >= 0.5).astype(int)

    cv_scores.append({
        "fold": k,
        "roc_auc": roc_auc_score(yva_, y_proba),
        "accuracy": accuracy_score(yva_, y_pred),
        "n_val": len(Xva_),
        "best_params": search.best_params_
    })

48 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
48 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mike/Git-projects/StockPrediction/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mike/Git-projects/StockPrediction/.venv/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/mike/Git-projects/StockPrediction/.venv/lib/python3.13/site-packages/sklearn/pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, *

In [19]:
import numpy as np
arr = np.zeros(len(cv_scores), dtype=float)

for i, result in enumerate(cv_scores):
   # print(result)
   # print(result['roc_auc'])
    arr[i] = float(result['roc_auc'])

# fold with best roc_auc
best_fold_id = pd.Series(arr).idxmax()

best_fold = cv_scores[best_fold_id]

best_fold['best_params']




{'clf__bootstrap': True,
 'clf__criterion': 'gini',
 'clf__max_depth': 15,
 'clf__max_features': 'sqrt',
 'clf__max_samples': None,
 'clf__min_samples_leaf': 15,
 'clf__min_samples_split': 46,
 'clf__n_estimators': 364}

Best Params:
```raw
{'clf__bootstrap': True,
 'clf__criterion': 'gini',
 'clf__max_depth': 15,
 'clf__max_features': 'sqrt',
 'clf__max_samples': None,
 'clf__min_samples_leaf': 15,
 'clf__min_samples_split': 46,
 'clf__n_estimators': 364}
 ```

## Final Test