In [2]:
%reload_ext autoreload
%autoreload 2
import sys, os
# Put the parent of the current working directory on sys.path so the `src` package
# can be imported. The membership check keeps this cell idempotent: an unconditional
# append adds another duplicate entry every time the cell is re-run.
_repo_root = os.path.abspath("..")
if _repo_root not in sys.path:
    sys.path.append(_repo_root)

from src.data_eng.pipeline import run_pipeline


In [9]:
from src.config import Config

# Experiment configuration consumed by the pipeline and modelling cells.
# `features` is deliberately not passed here, so Config's own default applies.
conf = Config(
    # --- universe and feature switches ---
    ticker_list=["AAPL", "META"],
    add_int_features=True,
    # --- label definition ---
    target=dict(horizon=5, threshold=0.01),
    # --- validation scheme ---
    validate_cutoff="2022-01-01",  # start of the final test period
    fold_len="365D",
    fold_mode="expanding",         # the alternative is "sliding"
    sliding_train_years=None,      # set to e.g. 5 when fold_mode is "sliding"
    embargo_days=None,             # per the original note: falls back to the horizon (5)
)

In [14]:
# Run the end-to-end data pipeline for `conf`. According to the log it prints, it
# fetches raw prices from yfinance, cleans them, engineers features, and writes one
# raw and one processed CSV per ticker.
run_pipeline(conf)

begin fetching data from yfinance...
['AAPL', 'META']
saved: ../data/raw/AAPL.csv
saved: ../data/raw/META.csv
done fetching data
being data cleaning...
done cleaning data
begin feature engineering
make features for AAPL
make features for META
Done egineering features
writing csvs...
saved: /Users/mike/Documents/GitHub/StockPrediction/data/processed/AAPL.csv
saved: /Users/mike/Documents/GitHub/StockPrediction/data/processed/META.csv


In [11]:
import numpy as np
import pandas as pd


In [20]:

from sklearn.metrics import roc_auc_score, accuracy_score

from src.data_eng.folds import load_multi_ticker_collection
from src.modeling.global_pairs import build_global_fold_pairs, build_global_insample_and_test

# 1) Load the data for every ticker described by `conf` into one collection object.
#    NOTE(review): presumably this reads the processed CSVs written by run_pipeline — confirm.
collection = load_multi_ticker_collection(conf)

# 2) Derive the global fold pairs from that collection.
#    NOTE(review): presumably train/validation splits pooled across tickers, following
#    the fold settings in `conf` — confirm in src.modeling.global_pairs.
pairs = build_global_fold_pairs(collection)

## Hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from src.data_eng.get_data import get_train_test_data

import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress scikit-learn's ConvergenceWarning for the rest of this kernel session.
# NOTE(review): this also hides fits that genuinely fail to converge within max_iter;
# consider scoping it with warnings.catch_warnings() around the search instead.
warnings.filterwarnings("ignore", category=ConvergenceWarning)


# Fetch the feature/label splits for `conf`.
# NOTE(review): the unpack order is test-first (X_test, y_test, X_train, y_train);
# confirm it matches the return order of get_train_test_data, because a mismatch
# would silently swap the train and test sets.
X_test, y_test, X_train, y_train = get_train_test_data(conf)

def numeric_only(df: pd.DataFrame) -> pd.DataFrame:
    """Return an independent copy of ``df`` restricted to its numeric-dtype columns.

    Columns whose dtype is not a NumPy numeric type (e.g. strings/objects) are left
    out. The input frame itself is not modified.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    return numeric_part.copy()

# Apply the same clean-up to both splits: keep numeric columns only, turn +/-inf
# into NaN, then replace every NaN with 0.0.
X_train, X_test = (
    numeric_only(split).replace([np.inf, -np.inf], np.nan).fillna(0.0)
    for split in (X_train, X_test)
)



# Candidate model: rescale the features, then fit a linear SVM.
pipe = make_pipeline(
    # with_mean=False: scale to unit variance without centring the features.
    StandardScaler(with_mean=False),
    LinearSVC(
        max_iter=20000,
        loss="hinge",             # held fixed (not part of the search)
        dual=True,                # hinge loss is only supported by the dual solver
        class_weight="balanced",  # held fixed (not part of the search)
        # The dual coordinate-descent solver shuffles the data with a pseudo-random
        # generator; seeding it makes repeated runs of the search reproducible and
        # matches the seed already used for the search itself.
        random_state=42,
    )
)
# Search space: only the regularisation strength C, sampled log-uniformly
# between 1e-3 and 1e3.
param_dist = {
    "linearsvc__C": loguniform(1e-3, 1e3)
}

# Randomised hyperparameter search over `param_dist`: 50 sampled candidates, each
# scored by cross-validated ROC AUC on 5 folds, using every available core.
# NOTE(review): an integer `cv` yields un-shuffled (stratified) k-fold splits that are
# not time-aware; with forward-looking targets this may leak information between
# folds — consider a chronological splitter with an embargo.
search = RandomizedSearchCV(
    pipe,
    param_dist,
    scoring="roc_auc",
    n_iter=50,
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
search.fit(X_train, y_train)

# Keep the pipeline refit on all of X_train with the winning parameters.
best_model = search.best_estimator_

print("Best parameters:", search.best_params_)
print("Best score:", search.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters: {'linearsvc__C': np.float64(0.0013289448722869186)}
Best score: 0.5420395985788524


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier








def make_global_pipeline(numeric_cols, *, C=1.0, max_iter=3000, random_state=42,
                         ticker_col="__ticker__"):
    """Build the preprocessing + linear-SVM pipeline for the pooled, all-ticker model.

    Parameters
    ----------
    numeric_cols : list of str
        Columns to standardise with ``StandardScaler``.
    C : float, default 1.0
        Regularisation strength passed to ``LinearSVC``. Exposed so a tuned value
        can be passed in instead of editing this function.
    max_iter : int, default 3000
        Iteration cap passed to ``LinearSVC``.
    random_state : int or None, default 42
        Seed passed to ``LinearSVC``.
    ticker_col : str, default "__ticker__"
        Column holding the ticker identifier; it is one-hot encoded.

    Returns
    -------
    sklearn.pipeline.Pipeline
        An unfitted pipeline with steps ``"pre"`` (a ColumnTransformer) and ``"clf"``
        (a LinearSVC). All new parameters default to the previously hard-coded values.
    """
    pre = ColumnTransformer([
        ("num", StandardScaler(), numeric_cols),
        # handle_unknown="ignore": a ticker unseen during fit encodes as all zeros
        # instead of raising at transform time.
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), [ticker_col]),
    ])
    return Pipeline([
        ("pre", pre),
        ("clf", LinearSVC(penalty='l2', C=C, max_iter=max_iter, random_state=random_state))
    ])