In [2]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically before each cell runs.
%reload_ext autoreload
%autoreload 2
import sys, os
# Add the parent directory (as an absolute path) to the import search path;
# the `src.*` imports in this notebook are resolved from there.
sys.path.append(os.path.abspath(".."))

from src.data_eng.pipeline import run_pipeline

Create a single feature (`r_1d`, the 1-day return) and add the default target (a +1% return over a 5-day horizon) for every ticker in the default ticker list.

In [7]:
# first set data config

from src.config import Config

conf = Config(
    # --- what to build: feature set and target definition ---
    features=["r_1d"],
    target=dict(horizon=5, threshold=0.01),
    # ticker_list=["AAPL", "META"],    # disabled; uncomment to restrict the tickers

    # --- time boundaries ---
    train_cutoff="2010-01-01",         # ignored by folds, but keep for provenance
    validate_cutoff="2022-01-01",      # FINAL TEST START

    # --- fold layout ---
    fold_mode="expanding",             # or 'sliding'
    fold_len="365D",
    sliding_train_years=None,          # set e.g. 5 if using sliding
    embargo_days=None,                 # defaults to horizon=5
)




In [None]:



# Run the data pipeline for `conf`. Per the captured log, it fetches each ticker
# from yfinance, saves one CSV per ticker under ../data/raw, cleans the data,
# and then builds the features ticker by ticker.
run_pipeline(conf)



begin fetching data from yfinance...
['AAPL', 'MSFT', 'NVDA', 'GOOGL', 'AMZN', 'META', 'TSLA', 'AVGO', 'TSM', 'ORCL', 'WMT', 'JPM', 'INTC', 'UNH', 'HD']
saved: ../data/raw/AAPL.csv
saved: ../data/raw/MSFT.csv
saved: ../data/raw/NVDA.csv
saved: ../data/raw/GOOGL.csv
saved: ../data/raw/AMZN.csv
saved: ../data/raw/META.csv
saved: ../data/raw/TSLA.csv
saved: ../data/raw/AVGO.csv
saved: ../data/raw/TSM.csv
saved: ../data/raw/ORCL.csv
saved: ../data/raw/WMT.csv
saved: ../data/raw/JPM.csv
saved: ../data/raw/INTC.csv
saved: ../data/raw/UNH.csv
saved: ../data/raw/HD.csv
done fetching data
being data cleaning...
done cleaning data
begin feature engineering
make features for AAPL
make features for MSFT
make features for NVDA
make features for GOOGL
make features for AMZN
make features for META
make features for TSLA
make features for AVGO
make features for TSM
make features for ORCL
make features for WMT
make features for JPM
make features for INTC
make features for UNH
make features for HD
Done 

Read the processed data back in as a data bundle. This pre-splits our data into validation folds (plus a final held-out test period).

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

from src.data_eng.folds import load_multi_ticker_collection
from src.modeling.global_pairs import build_global_fold_pairs, build_global_insample_and_test

# 1) get data
# Presumably loads the processed per-ticker data described by `conf` into a
# single collection object -- TODO confirm against src.data_eng.folds.
collection = load_multi_ticker_collection(conf)

# 2) build global fold pairs
# Each element of `pairs` is unpacked by the CV loop as
# (X_train, y_train, X_val, y_val).
pairs = build_global_fold_pairs(collection)

# 3) simple CV loop for a global model
def make_global_pipeline(numeric_cols, C=1.0):
    """Build an unfitted preprocessing + logistic-regression pipeline.

    The same pipeline is used for every ticker ("global" model): the ticker
    identity enters the model only through a one-hot encoding.

    Parameters
    ----------
    numeric_cols : list of str
        Names of the columns to standardise with ``StandardScaler``.
    C : float, default=1.0
        Inverse regularisation strength passed to ``LogisticRegression``.
        Exposed so that regularisation sweeps can reuse this builder; the
        default reproduces the previously hard-coded value.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ``"pre"`` (a ``ColumnTransformer``) and ``"clf"``
        (a ``LogisticRegression``).

    Notes
    -----
    The frames passed to ``fit`` / ``predict_proba`` must contain a
    ``__ticker__`` column. Columns that are neither in ``numeric_cols`` nor
    ``__ticker__`` are dropped, because ``ColumnTransformer`` defaults to
    ``remainder="drop"``.
    """
    pre = ColumnTransformer([
        ("num", StandardScaler(), numeric_cols),
        # handle_unknown="ignore": a ticker not seen during fit encodes as all
        # zeros instead of raising at prediction time.
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ["__ticker__"]),
    ])
    return Pipeline([
        ("pre", pre),
        ("clf", LogisticRegression(solver="liblinear", C=C, max_iter=3000, random_state=42))
    ])

# Score the global model fold by fold; one metrics record per validation fold.
cv_scores = []
for k, (Xtr, ytr, Xva, yva) in enumerate(pairs):
    # Every column except `__ticker__` and `Date` is treated as numeric.
    num_cols = [c for c in Xtr.columns if c not in ("__ticker__", "Date")]

    # A fresh pipeline per fold, so no fitted state carries over between folds.
    pipe = make_global_pipeline(num_cols).fit(Xtr, ytr)

    # Positive-class probability, then hard labels at a 0.5 cut-off.
    proba = pipe.predict_proba(Xva)[:, 1]
    pred = (proba >= 0.5).astype(int)

    fold_metrics = dict(
        fold=k,
        roc_auc=roc_auc_score(yva, proba),
        accuracy=accuracy_score(yva, pred),
        n_val=len(Xva),
    )
    cv_scores.append(fold_metrics)




# 4) final test, trained once on all pre-test data
X_ins, y_ins, X_test, y_test = build_global_insample_and_test(collection)

# Sanity-check BOTH splits before fitting, with messages that say what failed.
# Each split must have aligned X/y lengths, carry the identifying columns, and
# hold at most one row per (ticker, date).
KEY_COLS = ["__ticker__", "Date"]
for split_name, X_split, y_split in (("in-sample", X_ins, y_ins), ("test", X_test, y_test)):
    assert len(X_split) == len(y_split), (
        f"{split_name}: X has {len(X_split)} rows but y has {len(y_split)}"
    )
    missing_cols = set(KEY_COLS) - set(X_split.columns)
    assert not missing_cols, f"{split_name}: missing columns {sorted(missing_cols)}"
    n_dupes = int(X_split.duplicated(subset=KEY_COLS).sum())
    assert n_dupes == 0, f"{split_name}: {n_dupes} duplicated (ticker, date) rows"

# Every column except the identifying ones is treated as a numeric feature.
num_cols = [c for c in X_ins.columns if c not in KEY_COLS]
final_pipe = make_global_pipeline(num_cols)
final_pipe.fit(X_ins, y_ins)

# ROC AUC on the held-out test split, from the positive-class probability.
final_auc = roc_auc_score(y_test, final_pipe.predict_proba(X_test)[:, 1])
final_auc

0.5039659597311426

[(0.03, 0.49580310839170344, 0.013359730001381975),
 (0.1, 0.49580310839170344, 0.013359730001381975),
 (0.3, 0.49580310839170344, 0.013359730001381975),
 (1, 0.49580310839170344, 0.013359730001381975),
 (3, 0.4958031083917035, 0.013359730001381995),
 (10, 0.4958031083917035, 0.013359730001381995)]