In [2]:
%reload_ext autoreload
%autoreload 2
import sys, os
sys.path.append(os.path.abspath(".."))

from src.data_eng.pipeline import run_pipeline


Create Config

Enable the additional interaction features by setting `add_int_features=True`.

In [3]:
from src.config import Config

# Experiment configuration; passed to run_pipeline() and to the fold loaders further down.
conf = Config(
    # features=[...],                  # left commented out -> all features
    add_int_features=True,             # also build the additional interaction features
    target={'horizon': 5, 'threshold': 0.01},  # presumably a 5-step-ahead label with a 1% threshold — TODO confirm
    # ticker_list=['AAPL','META'],     # left commented out -> all tickers
    validate_cutoff='2022-01-01',      # FINAL TEST START
    fold_len='365D',
    fold_mode='expanding',             # or 'sliding'
    sliding_train_years=None,          # set e.g. 5 if using sliding
    embargo_days=None                  # defaults to horizon=5
)

In [4]:
# Run the data pipeline with the config above. Per the log printed below, it
# fetches data from yfinance, saves one raw CSV per ticker, cleans the data,
# and builds features for each ticker.
run_pipeline(conf)

begin fetching data from yfinance...
['AAPL', 'MSFT', 'NVDA', 'GOOGL', 'AMZN', 'META', 'TSLA', 'AVGO', 'TSM', 'ORCL', 'WMT', 'JPM', 'INTC', 'UNH', 'HD']
saved: ../data/raw/AAPL.csv
saved: ../data/raw/MSFT.csv
saved: ../data/raw/NVDA.csv
saved: ../data/raw/GOOGL.csv
saved: ../data/raw/AMZN.csv
saved: ../data/raw/META.csv
saved: ../data/raw/TSLA.csv
saved: ../data/raw/AVGO.csv
saved: ../data/raw/TSM.csv
saved: ../data/raw/ORCL.csv
saved: ../data/raw/WMT.csv
saved: ../data/raw/JPM.csv
saved: ../data/raw/INTC.csv
saved: ../data/raw/UNH.csv
saved: ../data/raw/HD.csv
done fetching data
being data cleaning...
done cleaning data
begin feature engineering
make features for AAPL
make features for MSFT
make features for NVDA
make features for GOOGL
make features for AMZN
make features for META
make features for TSLA
make features for AVGO
make features for TSM
make features for ORCL
make features for WMT
make features for JPM
make features for INTC
make features for UNH
make features for HD
Done 

In [5]:
import numpy as np
import pandas as pd


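## Hyperparameter tuning

The randomized-search cell below is currently commented out; the output shown beneath it was recorded from an earlier run.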

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import loguniform
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.svm import LinearSVC
# from src.data_eng.get_data import get_train_test_data

# import warnings
# from sklearn.exceptions import ConvergenceWarning

# warnings.filterwarnings("ignore", category=ConvergenceWarning)


# X_test, y_test, X_train, y_train = get_train_test_data(conf)

# def numeric_only(df: pd.DataFrame) -> pd.DataFrame:
#     return df.select_dtypes(include=[np.number]).copy()

# X_train = numeric_only(X_train).replace([np.inf, -np.inf], np.nan).fillna(0.0)
# X_test  = numeric_only(X_test).replace([np.inf, -np.inf], np.nan).fillna(0.0)



# pipe = make_pipeline(
#     StandardScaler(with_mean=False),
#     LinearSVC(
#         max_iter=10000,
#         loss="hinge",          
#         dual=True,             
#         class_weight="balanced"  
#     )
# )
# # params
# param_dist = {
#     "linearsvc__C": loguniform(1e-3, 1) # reduced search area to low C to speed up
# }

# search = RandomizedSearchCV(
#     estimator=pipe,
#     param_distributions=param_dist,
#     n_iter=50,                   
#     scoring="roc_auc",           
#     cv=5,
#     random_state=42,
#     verbose=1,
#     n_jobs=-1
# )

# search.fit(X_train, y_train)

# print("Best parameters:", search.best_params_)
# print("Best score:", search.best_score_)

# best_C = search.best_params_['linearsvc__C'] # 0.00115279871282324

# best_model = search.best_estimator_


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters: {'linearsvc__C': np.float64(0.00115279871282324)}
Best score: 0.5307823198394807


### Linear SVC

In [7]:
from sklearn.svm import LinearSVC
# 1) build preprocessor with imputation to kill NaNs from lags/rolls
from src.modeling.eval import make_global_pipeline

def get_new_lvc(C):
    """Return a fresh, unfitted LinearSVC with regularisation parameter ``C``.

    Everything except ``C`` is fixed: L2 penalty, hinge loss solved in the
    dual, balanced class weights, at most 20000 iterations, and seed 42.
    """
    svc_settings = dict(
        penalty="l2",
        loss="hinge",
        dual=True,
        class_weight="balanced",
        C=C,
        max_iter=20000,
        random_state=42,
    )
    return LinearSVC(**svc_settings)

In [None]:


from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from src.data_eng.folds import load_multi_ticker_collection
from src.modeling.global_pairs import build_global_fold_pairs, build_global_insample_and_test

# Nested cross-validation for the LinearSVC regularisation parameter C:
#   outer loop -> one (train, validation) pair per global fold
#   inner loop -> 3-fold stratified CV on the outer-train split to choose C

# 1) get data
collection = load_multi_ticker_collection(conf)

# 2) build global fold pairs
pairs = build_global_fold_pairs(collection)

C_grid = np.logspace(-4, 0, 15)  # 15 log-spaced candidates from 1e-4 to 1

outer_results = []
for k, (Xtr, ytr, Xva, yva) in enumerate(pairs):
    num_cols = [c for c in Xtr.columns if c not in ("__ticker__", "Date")]

    # Inner CV on the outer-train split.
    # NOTE(review): unshuffled StratifiedKFold is not a forward-chaining split;
    # some inner folds are fitted on rows positioned after the rows they are
    # validated on. Confirm this is acceptable for the inner model selection.
    inner = StratifiedKFold(n_splits=3, shuffle=False)

    # Fall back to the first grid value instead of 0.0: C must be positive for
    # LinearSVC, so 0.0 would make the refit below fail if no candidate ever
    # beat -inf (e.g. every mean AUC came out as NaN).
    best_auc, best_c = -np.inf, C_grid[0]
    for C in C_grid:
        aucs = []
        for itrain, ival in inner.split(Xtr, ytr):
            X_itr, X_iva = Xtr.iloc[itrain], Xtr.iloc[ival]
            y_itr, y_iva = ytr.iloc[itrain], ytr.iloc[ival]
            pipe = make_global_pipeline(num_cols, get_new_lvc(C))
            pipe.fit(X_itr, y_itr)
            s = pipe.decision_function(X_iva)
            aucs.append(roc_auc_score(y_iva, s))
        mean_auc = np.mean(aucs)
        if mean_auc > best_auc:
            best_auc, best_c = mean_auc, C

    # train with best_c on full outer-train, evaluate on outer-val
    final_fold_pipe = make_global_pipeline(num_cols, get_new_lvc(best_c))
    final_fold_pipe.fit(Xtr, ytr)
    s_val = final_fold_pipe.decision_function(Xva)
    p_val = (s_val >= 0).astype(int)  # hinge margin: predict class 1 when score >= 0
    outer_results.append({
        "fold": k,
        "C": best_c,
        "roc_auc": roc_auc_score(yva, s_val),
        # accuracy_score for consistency with the other evaluation cells
        "accuracy": accuracy_score(yva, p_val),
        "n_val": len(Xva),
    })

# pick a final C (e.g., median of best Cs)
final_C = float(np.median([r["C"] for r in outer_results]))

In [15]:
# Report the C chosen as the median of the per-fold best values.
print(f'best C from fold testing: {final_C}')

best C from fold testing: 0.00019306977288832496


In [16]:
# 4) final test: refit on the in-sample data with the chosen C, then score the test split
X_ins, y_ins, X_test, y_test = build_global_insample_and_test(collection)
num_cols = [c for c in X_ins.columns if c not in ("__ticker__", "Date")]

# Drop rows that have a NaN in any model column. Each mask is built from X and
# applied to both X and y so features and labels stay aligned.
mask_ins  = X_ins[num_cols].isna().any(axis=1)
X_ins     = X_ins.loc[~mask_ins]
y_ins     = y_ins.loc[~mask_ins]

mask_test = X_test[num_cols].isna().any(axis=1)
X_test    = X_test.loc[~mask_test]
y_test    = y_test.loc[~mask_test]

final_pipe = make_global_pipeline(num_cols, get_new_lvc(final_C))
final_pipe.fit(X_ins, y_ins)

# Score the test split once and reuse the margins for both metrics.
test_scores = final_pipe.decision_function(X_test)
final_auc = roc_auc_score(y_test, test_scores)
final_acc = accuracy_score(y_test, (test_scores >= 0).astype(int))  # class 1 when margin >= 0
print({"final_auc": final_auc, "final_acc": final_acc})

{'final_auc': 0.5376971114539777, 'final_acc': 0.46143329975536046}


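Output from an earlier run (the numbers differ slightly from the cell output above):
{'final_auc': 0.5352621760548336, 'final_acc': 0.4633724699272492}

These results are weak, as expected: the AUC is only slightly above 0.5 and the accuracy is below 50%.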

### Stochastic Gradient Descent Classifier

In [10]:
from sklearn.linear_model import SGDClassifier

def get_new_sgdclf(a):
    """Return a fresh, unfitted SGDClassifier with regularisation strength ``a``.

    ``a`` is passed through as ``alpha``. The remaining settings are fixed:
    L2 penalty, hinge loss, balanced class weights, at most 2000 iterations,
    and seed 42.
    """
    sgd_settings = {
        "penalty": 'l2',
        "loss": 'hinge',
        "alpha": a,
        "class_weight": 'balanced',
        "max_iter": 2000,
        "random_state": 42,
    }
    return SGDClassifier(**sgd_settings)

In [11]:
# Fold-by-fold evaluation of the SGD (hinge loss) classifier at a fixed alpha.
coll_sgd = load_multi_ticker_collection(conf)
pairs_sgd = build_global_fold_pairs(coll_sgd)


cv_scores = []
# Iterate over the pairs built in this cell. The loop previously read `pairs`,
# a variable left over from the LinearSVC cell, so `pairs_sgd` was never used
# and this cell failed on a fresh kernel unless that other cell had been run.
for k, (Xtr, ytr, Xva, yva) in enumerate(pairs_sgd):
    num_cols = [c for c in Xtr.columns if c not in ("__ticker__", "Date")]

    # align drops: build a mask from *X* then apply to X and y
    mask_tr = Xtr[num_cols].isna().any(axis=1)
    Xtr = Xtr.loc[~mask_tr]
    ytr = ytr.loc[~mask_tr]

    mask_va = Xva[num_cols].isna().any(axis=1)
    Xva = Xva.loc[~mask_va]
    yva = yva.loc[~mask_va]

    pipe = make_global_pipeline(num_cols, model=get_new_sgdclf(a=.001))
    pipe.fit(Xtr, ytr)

    scores = pipe.decision_function(Xva)
    preds  = (scores >= 0).astype(int)                # 0 threshold for hinge

    cv_scores.append({
        "fold": k,
        "roc_auc": roc_auc_score(yva, scores),
        "accuracy": accuracy_score(yva, preds),
        "n_val": len(Xva)
    })


In [12]:
# Display the per-fold validation metrics collected by the SGD loop.
cv_scores

[{'fold': 0,
  'roc_auc': 0.5487777533004987,
  'accuracy': 0.5480302212628171,
  'n_val': 3706},
 {'fold': 1,
  'roc_auc': 0.5439859440044083,
  'accuracy': 0.541014570966001,
  'n_val': 3706},
 {'fold': 2,
  'roc_auc': 0.5368376021437871,
  'accuracy': 0.5041857953011072,
  'n_val': 3703},
 {'fold': 3,
  'roc_auc': 0.49939322736210434,
  'accuracy': 0.510275824770146,
  'n_val': 3698},
 {'fold': 4,
  'roc_auc': 0.5288436579319654,
  'accuracy': 0.5223961144090664,
  'n_val': 3706},
 {'fold': 5,
  'roc_auc': 0.5346197973809476,
  'accuracy': 0.5213052858683926,
  'n_val': 3708},
 {'fold': 6,
  'roc_auc': 0.4926487373263685,
  'accuracy': 0.49232012934519,
  'n_val': 3711},
 {'fold': 7,
  'roc_auc': 0.5289953222134888,
  'accuracy': 0.47653721682847894,
  'n_val': 3708},
 {'fold': 8,
  'roc_auc': 0.5486712706592282,
  'accuracy': 0.507940930621343,
  'n_val': 3589}]