In [9]:
import sys, pathlib

# Resolve the parent of the current working directory and put it at the front
# of sys.path (once) so the `src.*` imports that follow can be resolved.
proj_root = pathlib.Path("..").resolve()
root_entry = str(proj_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

from src.data_loading import load_eurusd
from src.external_data import load_vix
from src.features import add_features
from src.modeling import (
    train_test_split_time,
    build_logreg_pipeline,
    build_decision_tree,
    build_random_forest,
    build_gradient_boosting,
)
from src.evaluation import evaluate_classifier
from src.config import FEATURE_COLS, TARGET_COL

import pandas as pd

# 1) Load data (two project loaders; no arguments are passed)
df = load_eurusd()
vix = load_vix()

# 2) Merge VIX: left join on "date" keeps every row of df, then the "vix"
#    column is forward-filled to cover dates the join left empty
df = df.merge(vix, on="date", how="left")
df = df.assign(vix=df["vix"].ffill())

# 3) Features
df_feat = add_features(df)

# 4) Time split (test_size=0.2 is forwarded to the project helper)
split = train_test_split_time(df_feat, test_size=0.2)
X_train, X_test, y_train, y_test = split

# Quick summary: frame size, which configured features are actually present,
# and the class proportions of the target column
features_present = [col for col in FEATURE_COLS if col in df_feat.columns]
target_balance = df_feat[TARGET_COL].value_counts(normalize=True).round(3)

print("df_feat shape:", df_feat.shape)
print("Features used:", features_present)
print("Target balance:", target_balance)


df_feat shape: (9267, 20)
Features used: ['ret_lag1', 'ret_lag2', 'ret_lag3', 'ret_lag5', 'ret_rollmean_5', 'ret_rollstd_5', 'ret_rollstd_10', 'ret_rollstd_20', 'abs_ret_lag1', 'range_pct', 'vix_lag1']
Target balance: y_up
1    0.502
0    0.498
Name: proportion, dtype: float64


In [10]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Baseline 1: "most_frequent" strategy, scored by accuracy on the test split.
dummy = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
pred = dummy.predict(X_test)
dummy_accuracy = accuracy_score(y_test, pred)

print("Dummy accuracy:", round(dummy_accuracy, 3))

# Baseline 2: "stratified" strategy with a fixed seed; the second
# predict_proba column is used as the score for ROC-AUC.
dummy_s = DummyClassifier(strategy="stratified", random_state=0).fit(X_train, y_train)
proba = dummy_s.predict_proba(X_test)[:, 1]
dummy_auc = roc_auc_score(y_test, proba)
print("Dummy ROC-AUC:", round(dummy_auc, 3))


Dummy accuracy: 0.502
Dummy ROC-AUC: 0.507


In [11]:
# Candidate models, keyed by the label passed to evaluate_classifier as `name`
# and stored in the results table.
models = dict(
    LogReg=build_logreg_pipeline(),
    DecisionTree=build_decision_tree(),
    RandomForest=build_random_forest(),
    GradientBoosting=build_gradient_boosting(),
)

# One (name, accuracy, roc_auc) tuple per model, in dict order.
results = []

for model_name, estimator in models.items():
    estimator.fit(X_train, y_train)
    scores = evaluate_classifier(
        estimator, X_train, y_train, X_test, y_test, name=model_name
    )
    acc, auc = scores  # the helper is expected to return exactly two values
    results.append((model_name, acc, auc))



=== LogReg ===
Accuracy: 0.504
ROC-AUC : 0.496
Confusion matrix:
 [[334 589]
 [331 600]]

Classification report:
               precision    recall  f1-score   support

           0      0.502     0.362     0.421       923
           1      0.505     0.644     0.566       931

    accuracy                          0.504      1854
   macro avg      0.503     0.503     0.493      1854
weighted avg      0.503     0.504     0.494      1854

=== DecisionTree ===
Accuracy: 0.517
ROC-AUC : 0.522
Confusion matrix:
 [[291 632]
 [263 668]]

Classification report:
               precision    recall  f1-score   support

           0      0.525     0.315     0.394       923
           1      0.514     0.718     0.599       931

    accuracy                          0.517      1854
   macro avg      0.520     0.516     0.496      1854
weighted avg      0.520     0.517     0.497      1854

=== RandomForest ===
Accuracy: 0.497
ROC-AUC : 0.499
Confusion matrix:
 [[313 610]
 [322 609]]

Classification r

In [14]:
from src.modeling import grid_search_timeseries
from sklearn.ensemble import RandomForestClassifier

# Untuned estimator handed to the search; seed fixed, all cores used.
rf_base = RandomForestClassifier(random_state=0, n_jobs=-1)

# 2 x 3 x 2 = 12 hyper-parameter combinations.
param_grid_rf = dict(
    n_estimators=[200, 400],
    max_depth=[4, 6, 8],
    min_samples_leaf=[20, 50],
)

# Project helper run on the training split only, with n_splits=5.
rf_search = grid_search_timeseries(
    rf_base, param_grid_rf, X_train, y_train, n_splits=5
)
rf_best, rf_params, rf_cvscore = rf_search

print("Best RF params:", rf_params)
print("CV ROC-AUC:", round(rf_cvscore, 3))

# Score the selected estimator with the same helper used for the other models.
rf_scores = evaluate_classifier(
    rf_best, X_train, y_train, X_test, y_test, name="RandomForest tuned"
)
acc_rf_best, auc_rf_best = rf_scores


Best RF params: {'max_depth': 8, 'min_samples_leaf': 20, 'n_estimators': 200}
CV ROC-AUC: 0.497
=== RandomForest tuned ===
Accuracy: 0.497
ROC-AUC : 0.505
Confusion matrix:
 [[325 598]
 [335 596]]

Classification report:
               precision    recall  f1-score   support

           0      0.492     0.352     0.411       923
           1      0.499     0.640     0.561       931

    accuracy                          0.497      1854
   macro avg      0.496     0.496     0.486      1854
weighted avg      0.496     0.497     0.486      1854



In [13]:
# Record the tuned random forest alongside the other models.
# Any existing "RandomForest tuned" row is dropped first, so re-running this
# cell refreshes the entry instead of appending a duplicate row each time.
results = [row for row in results if row[0] != "RandomForest tuned"]
results.append(("RandomForest tuned", acc_rf_best, auc_rf_best))

# Leaderboard of all evaluated models, highest ROC-AUC first
# (shown as the cell output).
pd.DataFrame(results, columns=["model", "accuracy", "roc_auc"]).sort_values("roc_auc", ascending=False)


Unnamed: 0,model,accuracy,roc_auc
1,DecisionTree,0.51726,0.522168
3,GradientBoosting,0.511327,0.519898
4,RandomForest tuned,0.496764,0.50477
2,RandomForest,0.497303,0.4991
0,LogReg,0.503776,0.495801
