# EURUSD 1H — ML Clasificación (LONG / SHORT / NO TRADE) con umbrales por validación temporal

Plantilla para:
- descargar datos (EURUSD 1H)
- limpiar y chequear
- explorar rápido (EDA ligera)
- crear features sin fuga (solo velas cerradas)
- target 3-clases: LONG(1) / NO TRADE(0) / SHORT(-1) usando zona muerta (deadzone) en pips
- split temporal train/val/test
- entrenar clasificador (Gradient Boosting)
- elegir th_up y th_down en validation (sin mirar test)
- producir señal final LONG/SHORT/NO TRADE a partir de probabilidades

> Nota: educativo. No es consejo financiero.


## 0) Instalación (una vez)

En una terminal:

```bash
pip install yfinance pandas numpy scikit-learn matplotlib
```


In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf


## 1) Descargar datos EURUSD 1H (hasta ~2 años: Yahoo solo sirve velas 1H de los últimos 730 días)

In [None]:
# Download hourly EURUSD bars and normalise them into a flat, lower-case,
# chronologically sorted frame named `df`.
symbol = "EURUSD=X"

# Yahoo Finance only serves 1h bars for roughly the last 730 days, so a
# 3-year request fails or comes back empty. Ask for the longest window that
# stays safely inside that limit.
df = yf.download(symbol, period="729d", interval="1h", auto_adjust=False, progress=False)

# yfinance tends to report failures by returning an empty frame instead of
# raising; stop here rather than letting later cells fail obscurely.
if df is None or df.empty:
    raise RuntimeError(
        f"No data returned for {symbol}; check the network connection "
        "and the requested period/interval."
    )

# Recent yfinance releases return (field, ticker) MultiIndex columns even for
# a single ticker. Keep only the field level so df["close"] is a Series.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)

df = df.rename(columns=str.lower)
df = df.dropna().copy()
df.index = pd.to_datetime(df.index)
df = df.sort_index()

df.head(), df.tail(), df.shape


## 2) Limpieza básica + checks

In [None]:
# Keep only bars whose high is not below their low and whose close is
# positive, then collapse repeated timestamps keeping the last row of each.
bar_is_sane = df["high"].ge(df["low"]) & df["close"].gt(0)
df = df.loc[bar_is_sane]
df = df.loc[~df.index.duplicated(keep="last")].copy()

# Quick report: remaining NaNs, covered date range and row count.
print("NaNs por columna:")
display(df.isna().sum())
print("Rango fechas:", df.index.min(), "->", df.index.max())
print("Filas:", len(df))


## 3) Exploración rápida (EDA ligera)

In [None]:
# Close-to-close hourly returns; the first value is NaN because it has no
# previous bar to compare against.
ret_1h = df["close"].pct_change()

# Headline moments of the hourly return distribution.
print("Ret 1H mean:", ret_1h.mean(), "std:", ret_1h.std())
print("Skew:", ret_1h.skew(), "Kurt:", ret_1h.kurt())

# Histogram of hourly returns (NaNs removed before plotting).
plt.figure()
ret_1h.dropna().hist(bins=100)
plt.title("Distribución retornos 1H (close pct_change)")
plt.show()

# Close price over the whole sample, drawn on a fresh figure.
plt.figure()
(df["close"]).plot()
plt.title("EURUSD Close 1H")
plt.show()


## 4) Features (solo información pasada, sin fuga)

In [None]:
def add_features(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *data* extended with return, trend, volatility and candle-shape columns.

    Expects ``open``, ``high``, ``low`` and ``close`` columns. Every feature
    on a row is built from that row and earlier rows only (no negative
    shifts are used). Rows whose look-back window is incomplete hold NaN in
    the affected columns; the input frame itself is not modified.
    """
    out = data.copy()
    close = out["close"]

    # Momentum: simple returns over several look-back horizons (in bars).
    for lag in (1, 3, 6, 12, 24):
        out[f"ret_{lag}"] = close.pct_change(lag)

    # Trend: fast and slow moving averages plus their relative gap.
    for window in (10, 30):
        out[f"sma_{window}"] = close.rolling(window).mean()
    out["sma_ratio"] = out["sma_10"] / out["sma_30"] - 1

    # Volatility: rolling standard deviation of one-bar returns.
    out["vol_20"] = out["ret_1"].rolling(20).std()

    # Candle geometry, all scaled by the close of the same bar.
    out["range"] = (out["high"] - out["low"]) / close

    body_top = out[["close", "open"]].max(axis=1)
    body_bottom = out[["close", "open"]].min(axis=1)
    out["body"] = (close - out["open"]) / close
    out["upper_wick"] = (out["high"] - body_top) / close
    out["lower_wick"] = (body_bottom - out["low"]) / close

    return out

# Build the feature frame, then discard every row that still contains a NaN
# (the warm-up rows whose look-back windows are incomplete).
df_feat = add_features(df)
df_feat = df_feat.dropna().copy()
df_feat.head()


## 5) Target 3-clases (LONG / NO TRADE / SHORT) con zona muerta en pips

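- 1 = LONG si el cierre de la próxima vela sube más que la zona muerta (+deadzone) respecto al cierre actual
- 0 = NO TRADE si el movimiento queda dentro de la zona muerta (±deadzone)
- -1 = SHORT si el cierre de la próxima vela cae más que la zona muerta (-deadzone) respecto al cierre actual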


In [None]:
# Label every bar by the move of the NEXT bar's close relative to its own
# close, with a dead zone expressed in pips:
#   1 = LONG, 0 = NO TRADE, -1 = SHORT.
pip = 0.0001
deadzone_pips = 1.5
dead = deadzone_pips * pip  # dead zone as a price distance

# Relative next-bar return, kept for inspection.
fwd_ret = df_feat["close"].shift(-1) / df_feat["close"] - 1

# `dead` is a price distance, so it must be compared with a price move.
# Comparing it with the relative return (as before) makes the effective dead
# zone drift with the quote level instead of staying at `deadzone_pips`.
fwd_move = df_feat["close"].shift(-1) - df_feat["close"]

df_feat["target"] = 0
df_feat.loc[fwd_move > dead, "target"] = 1
df_feat.loc[fwd_move < -dead, "target"] = -1

# The final bar has no next close: comparisons against NaN are False, so it
# stays labelled 0 and is NOT removed here. It is kept on purpose because the
# live-signal cell predicts on the last row of df_feat.
df_feat = df_feat.dropna().copy()

feature_cols = [
    "ret_1", "ret_3", "ret_6", "ret_12", "ret_24",
    "sma_ratio", "vol_20", "range",
    "body", "upper_wick", "lower_wick",
]

X = df_feat[feature_cols]
y = df_feat["target"]

print("Distribución target:")
display(y.value_counts(normalize=True).sort_index())
X.shape


## 6) Split temporal (train / validation / test)

In [None]:
# Chronological split by row position: first 70% train, next 10% validation,
# remaining 20% test. No shuffling, so later rows never leak into earlier sets.
n = len(df_feat)
train_end = int(n * 0.70)
val_end = int(n * 0.80)

train_rows = slice(None, train_end)
val_rows = slice(train_end, val_end)
test_rows = slice(val_end, None)

X_train, X_val, X_test = (X.iloc[rows] for rows in (train_rows, val_rows, test_rows))
y_train, y_val, y_test = (y.iloc[rows] for rows in (train_rows, val_rows, test_rows))

# Full-width copies of each segment (prices included) for the backtests.
df_train, df_val, df_test = (
    df_feat.iloc[rows].copy() for rows in (train_rows, val_rows, test_rows)
)

X_train.shape, X_val.shape, X_test.shape


## 7) Entrenar clasificador (multiclase) y probabilidades

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score, classification_report

# Fit the multiclass model on the training segment only; fit() returns the
# estimator itself, so construction and fitting can be chained.
gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
classes = gb.classes_  # column order of every predict_proba output below

# Class probabilities for the validation and test segments.
proba_val, proba_test = (gb.predict_proba(part) for part in (X_val, X_test))

# Hard predictions on validation, used only for the diagnostic report.
pred_val = gb.predict(X_val)
val_bal_acc = balanced_accuracy_score(y_val, pred_val)

print("Classes:", classes)
print("VAL Balanced Accuracy:", val_bal_acc)
print(classification_report(y_val, pred_val))


## 8) Probabilidades → señal (LONG/SHORT/NO TRADE)

In [None]:
def proba_to_signal_multiclass(proba: np.ndarray, classes: np.ndarray, th_up: float, th_down: float) -> np.ndarray:
    """Turn per-class probabilities into a LONG (1) / NO TRADE (0) / SHORT (-1) signal.

    ``proba`` is a 2-D array with one column per entry of ``classes``; the
    columns for classes ``1`` and ``-1`` are located through ``classes``.
    A row goes long when P(1) >= ``th_up`` and short when P(-1) >= ``th_down``.
    When both thresholds are met the more probable side wins, and an exact
    tie resolves to short. Rows meeting neither threshold stay at 0.
    """
    col_short = np.where(classes == -1)[0][0]
    col_long = np.where(classes == 1)[0][0]

    p_long = proba[:, col_long]
    p_short = proba[:, col_short]
    long_ok = p_long >= th_up
    short_ok = p_short >= th_down

    # Long needs its own threshold and, if short also fires, a strictly
    # higher probability; every other short-eligible row goes short.
    go_long = long_ok & (~short_ok | (p_long > p_short))
    go_short = short_ok & ~go_long

    signal = np.zeros(len(proba), dtype=int)
    signal[go_long] = 1
    signal[go_short] = -1
    return signal


## 9) Backtest con costos (para elegir umbrales en VALIDATION)

In [None]:
def backtest_from_signal(
    prices: pd.Series,
    signal: pd.Series,
    spread_pips: float = 1.0,
    slippage_pips: float = 0.2,
    commission_round_turn: float = 0.0,
    pip_value: float = 0.0001,
):
    """Simulate holding ``signal`` bar by bar and return the per-bar ledger.

    The position taken on a bar earns the close-to-close return of the
    following bar. Costs are charged on the bar where the position changes,
    in proportion to the exposure traded (``|pos - pos_prev|``), so a
    reversal from +1 to -1 pays twice what a plain entry or exit pays.

    Args:
        prices: Close prices.
        signal: Target position per bar (1, 0 or -1), aligned with ``prices``
            by index.
        spread_pips: Spread charged per unit of turnover, in pips.
        slippage_pips: Slippage charged per unit of turnover, in pips.
        commission_round_turn: Commission as a return fraction, charged per
            unit of turnover. NOTE(review): entry plus exit therefore pays it
            twice; confirm that matches the intended "round turn" meaning.
        pip_value: Price size of one pip (0.0001 by default).

    Returns:
        DataFrame with columns ``close``, ``pos``, ``fwd_ret``, ``pos_prev``,
        ``trade``, ``turnover``, ``cost_ret``, ``commission_ret``,
        ``strategy_ret`` and ``equity``. Rows containing NaN are dropped,
        which removes the last bar because it has no following return.
    """
    df_bt = pd.DataFrame({"close": prices, "pos": signal}).copy()

    # Return realised over the NEXT bar, aligned with the bar that decides.
    df_bt["fwd_ret"] = df_bt["close"].pct_change().shift(-1)
    df_bt["pos_prev"] = df_bt["pos"].shift(1).fillna(0)
    df_bt["trade"] = (df_bt["pos"] != df_bt["pos_prev"]).astype(int)

    # Units of exposure traded: 1 for an entry or exit, 2 for a reversal.
    df_bt["turnover"] = (df_bt["pos"] - df_bt["pos_prev"]).abs()

    # Convert the pip cost into a return by dividing by the bar's price.
    total_pips = spread_pips + slippage_pips
    df_bt["cost_ret"] = df_bt["turnover"] * (total_pips * pip_value) / df_bt["close"]
    df_bt["commission_ret"] = df_bt["turnover"] * commission_round_turn

    df_bt["strategy_ret"] = (
        df_bt["pos"] * df_bt["fwd_ret"] - df_bt["cost_ret"] - df_bt["commission_ret"]
    )

    df_bt = df_bt.dropna().copy()
    df_bt["equity"] = (1 + df_bt["strategy_ret"]).cumprod()
    return df_bt

def max_drawdown(equity: pd.Series) -> float:
    """Return the largest fractional drop of *equity* below its running maximum.

    The result is the minimum of ``equity / running_peak - 1``; it is 0.0
    for a curve that never falls below an earlier peak.
    """
    running_peak = equity.cummax()
    drawdown = equity.div(running_peak).sub(1)
    return float(drawdown.min())

def sharpe_approx(returns: pd.Series, periods_per_year: int = 252*24) -> float:
    """Return mean/std of *returns* scaled by ``sqrt(periods_per_year)``.

    No risk-free rate is subtracted. Yields NaN when the standard deviation
    is zero or undefined.
    """
    volatility = returns.std()
    if np.isnan(volatility) or volatility == 0:
        return np.nan
    ratio_per_period = returns.mean() / volatility
    return float(ratio_per_period * np.sqrt(periods_per_year))


## 10) Elegir th_up y th_down en VALIDATION (grid search temporal)

In [None]:
# Cost assumptions shared by the validation search and the final test run.
spread_pips = 1.0
slippage_pips = 0.2
commission_round_turn = 0.0

# Threshold pairs with fewer position changes than this are ignored: their
# Sharpe is too noisy to rank on.
min_trades = 30

grid_up = np.round(np.arange(0.35, 0.66, 0.02), 2)
grid_dn = np.round(np.arange(0.35, 0.66, 0.02), 2)

rows = []
for th_up in grid_up:
    for th_down in grid_dn:
        sig_val = proba_to_signal_multiclass(proba_val, classes, th_up, th_down)
        sig_val_s = pd.Series(sig_val, index=df_val.index)

        bt_val = backtest_from_signal(
            prices=df_val["close"],
            signal=sig_val_s,
            spread_pips=spread_pips,
            slippage_pips=slippage_pips,
            commission_round_turn=commission_round_turn
        )

        # Filter first: this skips needless metric work and also protects
        # the .iloc[-1] below from an empty backtest frame.
        trades = int((bt_val["pos"] != bt_val["pos"].shift(1).fillna(0)).sum())
        if trades < min_trades:
            continue

        eq_final = float(bt_val["equity"].iloc[-1])
        mdd = max_drawdown(bt_val["equity"])
        sh = sharpe_approx(bt_val["strategy_ret"])

        rows.append((th_up, th_down, eq_final, mdd, sh, trades))

# Without this guard an empty result surfaces as a bare IndexError on
# res.iloc[0], which says nothing about the actual cause.
if not rows:
    raise ValueError(
        f"No threshold pair produced at least {min_trades} trades on validation; "
        "widen the grids, lower min_trades or use a longer validation window."
    )

res = pd.DataFrame(rows, columns=["th_up","th_down","eq_final","mdd","sharpe","trades"])
# Rank by Sharpe, breaking ties with final equity.
res = res.sort_values(["sharpe","eq_final"], ascending=[False, False]).reset_index(drop=True)

display(res.head(10))

best = res.iloc[0]
best_th_up = float(best["th_up"])
best_th_down = float(best["th_down"])

print("Best thresholds from VALIDATION:", best_th_up, best_th_down)
print("VAL Sharpe:", float(best["sharpe"]), "VAL equity:", float(best["eq_final"]), "VAL mdd:", float(best["mdd"]), "VAL trades:", int(best["trades"]))


## 11) Reentrenar en TRAIN+VAL y evaluar una sola vez en TEST

In [None]:
# Refit on everything before the test segment (train + validation rows).
X_trainval = X.iloc[:val_end]
y_trainval = y.iloc[:val_end]

gb_final = GradientBoostingClassifier(random_state=42)
gb_final.fit(X_trainval, y_trainval)

# Probabilities on the held-out test rows; classes_final gives the column order.
proba_test_final = gb_final.predict_proba(X_test)
classes_final = gb_final.classes_

# Apply the thresholds selected on validation; they are not re-tuned here.
sig_test = proba_to_signal_multiclass(proba_test_final, classes_final, best_th_up, best_th_down)
sig_test_s = pd.Series(sig_test, index=df_test.index)

# Same cost assumptions as the validation search.
bt_test = backtest_from_signal(
    prices=df_test["close"],
    signal=sig_test_s,
    spread_pips=spread_pips,
    slippage_pips=slippage_pips,
    commission_round_turn=commission_round_turn
)

# Summary metrics; these names are reused from the validation loop and
# overwrite its last values.
eq_final = float(bt_test["equity"].iloc[-1])
mdd = max_drawdown(bt_test["equity"])
sh = sharpe_approx(bt_test["strategy_ret"])
# Number of position changes, counting the first move away from flat.
trades = int((bt_test["pos"] != bt_test["pos"].shift(1).fillna(0)).sum())

print("TEST results (thresholds chosen on VAL):")
print("Final equity:", eq_final)
print("Max Drawdown:", mdd)
print("Sharpe approx:", sh)
print("Trades:", trades)

# Equity curve over the test segment.
plt.figure()
plt.plot(bt_test.index, bt_test["equity"], label="Strategy (test)")
plt.legend()
plt.title("EURUSD 1H — Equity curve (TEST)")
plt.show()

# Share of test bars spent short (-1), flat (0) and long (1).
print("Señal en TEST (proporciones):")
display(sig_test_s.value_counts(normalize=True).sort_index())


## 12) Señal "en vivo" (última vela cerrada)

In [None]:
# Score the most recent row of the feature frame with the final model and
# translate it into a signal using the validation-selected thresholds.
last_X = df_feat[feature_cols].iloc[[-1]]
last_classes = classes_final
last_proba = gb_final.predict_proba(last_X)[0]

# Locate the probability columns of the long (1) and short (-1) classes.
idx_up = np.where(last_classes == 1)[0][0]
idx_down = np.where(last_classes == -1)[0][0]
p_up, p_down = float(last_proba[idx_up]), float(last_proba[idx_down])

# The helper expects a 2-D array, so pass the single row as shape (1, n_classes).
last_signal = proba_to_signal_multiclass(
    proba=last_proba.reshape(1, -1),
    classes=last_classes,
    th_up=best_th_up,
    th_down=best_th_down,
)[0]

map_sig = {-1: "SHORT", 0: "NO TRADE", 1: "LONG"}

print("P(UP)  :", p_up)
print("P(DOWN):", p_down)
print("Signal :", map_sig[int(last_signal)])
