## Data Preparation

In [60]:
import numpy as np
import polars as pl
import talib
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix, precision_recall_curve, make_scorer, f1_score
)
from sklearn.utils import resample
from xgboost import XGBClassifier
import joblib
import matplotlib.pyplot as plt
import optuna
from numba import njit

In [61]:
# Carica dati CSV con Polars
def load_data(csv_path):
    df = pl.read_csv(csv_path)
    df = df.with_columns(
        (pl.col('Date') + ' ' + pl.col('Time')).alias('datetime')
    ).drop(['Date', 'Time'])
    df = df.with_columns(pl.col('datetime').str.to_datetime('%Y.%m.%d %H:%M'))
    df = df.set_sorted('datetime')
    return df

In [62]:
import numpy as np
import polars as pl
import talib

# Carica dati CSV con Polars
def load_data(csv_path):
    df = pl.read_csv(csv_path)
    df = df.with_columns(
        (pl.col('Date') + ' ' + pl.col('Time')).alias('datetime')
    ).drop(['Date', 'Time'])
    df = df.with_columns(pl.col('datetime').str.to_datetime('%Y.%m.%d %H:%M'))
    df = df.set_sorted('datetime')
    return df

@njit(fastmath=True)
def simulate_close_numba(current_price, sigma, n_sim=10000, dt=1.0, mu=0.0):
    Z = np.random.randn(n_sim)
    log_ret = (mu - 0.5 * sigma ** 2) * dt + sigma * np.sqrt(dt) * Z
    return current_price * np.exp(log_ret)

# Calcolo feature tecniche + variabili storiche
def calculate_features(df, history_bars=5):
    close = df['Close'].to_numpy()
    high = df['High'].to_numpy()
    low = df['Low'].to_numpy()
    open_ = df['Open'].to_numpy()

    features = {
        'ATR': talib.ATR(high, low, close, timeperiod=14),
        'RSI': talib.RSI(close, timeperiod=14),
    }


    log_ret = np.log(close / np.roll(close, 1))
    log_ret[0] = 0
    sigma = pd.Series(log_ret).rolling(window=100).std().to_numpy()
    features["sigma"] = sigma

    
    mc_mean = []
    mc_std = []
    mc_max = []
    mc_min = []
    mc_prob_up = []
    
    for i in range(len(close)):
        if np.isnan(close[i]) or np.isnan(sigma[i]):
          mc_mean.append(np.nan)
          mc_std.append(np.nan)
          mc_max.append(np.nan)
          mc_min.append(np.nan)
          mc_prob_up.append(np.nan)
        continue

    sims = simulate_close_numba(close[i], sigma[i], n_sim=1000)

    mc_mean.append(sims.mean())
    mc_std.append(sims.std())
    mc_max.append(sims.max())
    mc_min.append(sims.min())
    mc_prob_up.append(np.mean(sims > close[i]))

    features.update({
        "MC_mean": mc_mean,
        "MC_std": mc_std,
        "MC_max": mc_max,
        "MC_min": mc_min,
       "MC_prob_up": mc_prob_up
    })
    
    upper, mid, lower = talib.BBANDS(close, timeperiod=14, nbdevup=2, nbdevdn=2)
    features.update({
        'BB_upper': upper,
        'BB_mid': mid,
        'BB_lower': lower,
        'SMA_10': talib.SMA(close, 10),
        'SMA_20': talib.SMA(close, 20),
        'EMA_10': talib.EMA(close, 10),
        'EMA_20': talib.EMA(close, 20),
    })

    macd, macdsignal, macdhist = talib.MACD(close, 12, 26, 9)
    features.update({
        'MACD': macd,
        'MACD_signal': macdsignal,
        'MACD_hist': macdhist,
    })

    # Supertrend personalizzato
    def supertrend(high, low, close, period=10, multiplier=3):
        atr = talib.ATR(high, low, close, period)
        hl2 = (high + low) / 2
        upperband = hl2 + (multiplier * atr)
        lowerband = hl2 - (multiplier * atr)

        trend = np.zeros(len(close))
        direction = 1

        for i in range(1, len(close)):
            if close[i] > upperband[i - 1]:
                direction = 1
            elif close[i] < lowerband[i - 1]:
                direction = -1
            trend[i] = upperband[i] if direction == -1 else lowerband[i]
        return trend

    features['Supertrend'] = supertrend(high, low, close)

    # Pattern candlestick TA-Lib
    patterns = talib.get_function_groups()["Pattern Recognition"]
    for pattern in patterns:
        func = getattr(talib, pattern)
        features[pattern] = np.sign(func(open_, high, low, close))

    features_df = pl.DataFrame(features)

    # Shift delle feature e dei prezzi grezzi
    all_shifts = []
    for i in range(1, history_bars + 1):
        shifted_features = features_df.select([pl.col(c).shift(i) for c in features_df.columns])
        shifted_features = shifted_features.rename({col: f"{col}_t{i}" for col in shifted_features.columns})
        all_shifts.append(shifted_features)

    # Shift anche di Close, High, Low
    for price_col in ['Close', 'High', 'Low']:
        for i in range(1, history_bars + 1):
            df = df.with_columns(pl.col(price_col).shift(i).alias(f"{price_col}_t{i}"))

    all_shifted = pl.concat(all_shifts, how="horizontal")
    return df.hstack(features_df).hstack(all_shifted)

# Calcola livelli teorici TP e SL
def calculate_tp_sl(df, features, tp_mult=8, sl_mult=1):
    close = df['Close'].to_numpy()
    atr = features['ATR'].to_numpy()

    tp = close + tp_mult * atr
    sl = close - sl_mult * atr

    features = features.with_columns([
        pl.Series('TP_theoretical', tp),
        pl.Series('SL_theoretical', sl)
    ])
    return features


def create_feature_dataset(csv_path, max_rows=1000):
    df = load_data(csv_path)

    # ✅ Limita a max_rows righe dopo ordinamento
    df = df.head(max_rows)

    df_with_features = calculate_features(df)
    df_with_tp_sl = calculate_tp_sl(df_with_features, df_with_features)

    final_df = df_with_tp_sl.drop_nulls()
    return final_df

In [63]:
# Esecuzione
if __name__ == "__main__":
    csv_path = '../close_pred/data/EURUSD_M1_2013_2024.csv'
    feature_dataset = create_feature_dataset(csv_path, max_rows=1000)
    feature_dataset.write_parquet('feature_dataset.parquet')
    print("✅ Dataset con feature tecniche, storiche e livelli TP/SL (max 1000 righe) salvato correttamente.")


ShapeError: could not create a new DataFrame: height of column 'MC_mean' (100) does not match height of column 'ATR' (1000)

In [None]:
import polars as pl

def explode_polars(
    input_parquet: str,
    output_parquet: str,
    n_bars: int = 10
) -> None:
    """
    Per ogni riga (barra) del dataset di input:
    - Replica le successive n_bars righe come "barre future"
    - Mantieni i livelli TP_theoretical e SL_theoretical della riga base
    - Calcola target:
        +1 se close_{t+offset+1} >= TP_theoretical
        -1 se close_{t+offset+1} <= SL_theoretical
         0 altrimenti
    """
    df = pl.read_parquet(input_parquet)
    required = {"TP_theoretical", "SL_theoretical", "Close"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Mancano colonne: {missing}")

    df = df.with_row_index("base_idx")

    df_base = df
    df_future = df.rename({"base_idx": "future_idx", "Close": "future_close"})

    offsets = pl.DataFrame({"offset": list(range(1, n_bars + 1))})

    expl = df_base.join(offsets, how="cross")
    expl = expl.with_columns((pl.col("base_idx") + pl.col("offset")).alias("future_idx"))

    # ✅ shift di +1 sul future_idx per ottenere close_{t+offset+1}
    expl = expl.with_columns((pl.col("future_idx") + 1).alias("target_idx"))

    # join con close della riga successiva
    expl = expl.join(df.select(["base_idx", "Close"]).rename({
        "base_idx": "target_idx",
        "Close": "target_close"
    }), on="target_idx", how="left")

    # ✅ target calcolata su close_{t+offset+1}
    expl = expl.with_columns(
        pl.when(pl.col("target_close") >= pl.col("TP_theoretical")).then(1)
          .when(pl.col("target_close") <= pl.col("SL_theoretical")).then(-1)
          .otherwise(0).alias("target")
    )

    expl = expl.drop_nulls()
    expl.write_parquet(output_parquet)
    print(f"✅ Dataset esploso salvato in: {output_parquet}")

if __name__ == '__main__':
    explode_polars('feature_dataset.parquet','exploded_dataset.parquet',n_bars=10)

✅ Dataset esploso salvato in: exploded_dataset.parquet


In [None]:
# 1. Carica e pulisci dataset
df = pd.read_parquet('../close_pred/exploded_dataset.parquet')
df = df.dropna()
print(f"Righe totali: {len(df)}")
df.columns.tolist()

Righe totali: 9555


['base_idx',
 'Open',
 'High',
 'Low',
 'Close',
 'Volume',
 'datetime',
 'Close_t1',
 'Close_t2',
 'Close_t3',
 'Close_t4',
 'Close_t5',
 'High_t1',
 'High_t2',
 'High_t3',
 'High_t4',
 'High_t5',
 'Low_t1',
 'Low_t2',
 'Low_t3',
 'Low_t4',
 'Low_t5',
 'ATR',
 'RSI',
 'BB_upper',
 'BB_mid',
 'BB_lower',
 'SMA_10',
 'SMA_20',
 'EMA_10',
 'EMA_20',
 'MACD',
 'MACD_signal',
 'MACD_hist',
 'Supertrend',
 'CDL2CROWS',
 'CDL3BLACKCROWS',
 'CDL3INSIDE',
 'CDL3LINESTRIKE',
 'CDL3OUTSIDE',
 'CDL3STARSINSOUTH',
 'CDL3WHITESOLDIERS',
 'CDLABANDONEDBABY',
 'CDLADVANCEBLOCK',
 'CDLBELTHOLD',
 'CDLBREAKAWAY',
 'CDLCLOSINGMARUBOZU',
 'CDLCONCEALBABYSWALL',
 'CDLCOUNTERATTACK',
 'CDLDARKCLOUDCOVER',
 'CDLDOJI',
 'CDLDOJISTAR',
 'CDLDRAGONFLYDOJI',
 'CDLENGULFING',
 'CDLEVENINGDOJISTAR',
 'CDLEVENINGSTAR',
 'CDLGAPSIDESIDEWHITE',
 'CDLGRAVESTONEDOJI',
 'CDLHAMMER',
 'CDLHANGINGMAN',
 'CDLHARAMI',
 'CDLHARAMICROSS',
 'CDLHIGHWAVE',
 'CDLHIKKAKE',
 'CDLHIKKAKEMOD',
 'CDLHOMINGPIGEON',
 'CDLIDENTICAL3CRO

In [None]:
# 2. Encoding target
label_map = {-1: 0, 0: 1, 1: 2}
df['y'] = df['target'].map(label_map)
df

Unnamed: 0,base_idx,Open,High,Low,Close,Volume,datetime,Close_t1,Close_t2,Close_t3,...,CDLUPSIDEGAP2CROWS_t5,CDLXSIDEGAP3METHODS_t5,TP_theoretical,SL_theoretical,offset,future_idx,target_idx,target_close,target,y
330,33,1.32174,1.32191,1.32172,1.32177,0,2013-01-01 17:38:00,1.32175,1.32128,1.32091,...,0,0,1.322244,1.321533,1,34,35,1.32175,0,1
331,33,1.32174,1.32191,1.32172,1.32177,0,2013-01-01 17:38:00,1.32175,1.32128,1.32091,...,0,0,1.322244,1.321533,2,35,36,1.32177,0,1
332,33,1.32174,1.32191,1.32172,1.32177,0,2013-01-01 17:38:00,1.32175,1.32128,1.32091,...,0,0,1.322244,1.321533,3,36,37,1.32164,0,1
333,33,1.32174,1.32191,1.32172,1.32177,0,2013-01-01 17:38:00,1.32175,1.32128,1.32091,...,0,0,1.322244,1.321533,4,37,38,1.32167,0,1
334,33,1.32174,1.32191,1.32172,1.32177,0,2013-01-01 17:38:00,1.32175,1.32128,1.32091,...,0,0,1.322244,1.321533,5,38,39,1.32160,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9880,990,1.32431,1.32467,1.32427,1.32465,0,2013-01-02 09:35:00,1.32430,1.32439,1.32444,...,0,0,1.325225,1.324362,2,992,993,1.32468,0,1
9881,990,1.32431,1.32467,1.32427,1.32465,0,2013-01-02 09:35:00,1.32430,1.32439,1.32444,...,0,0,1.325225,1.324362,3,993,994,1.32520,0,1
9882,991,1.32466,1.32485,1.32466,1.32484,0,2013-01-02 09:36:00,1.32465,1.32430,1.32439,...,0,0,1.325403,1.324559,1,992,993,1.32468,0,1
9883,991,1.32466,1.32485,1.32466,1.32484,0,2013-01-02 09:36:00,1.32465,1.32430,1.32439,...,0,0,1.325403,1.324559,2,993,994,1.32520,0,1


In [None]:
# 3. Bilanciamento con undersampling
df_0 = df[df['y'] == 0]
df_1 = df[df['y'] == 1]
df_2 = df[df['y'] == 2]
n_min = min(len(df_0), len(df_2))
df_1_under = resample(df_1, replace=False, n_samples=n_min, random_state=42)
df_balanced = pd.concat([df_0, df_1_under, df_2], ignore_index=True)
print(f"Bilanciato: {df_balanced['y'].value_counts().to_dict()}")

Bilanciato: {0: 2519, 1: 1469, 2: 1469}


In [None]:
# 4. Feature selection iniziale
exclude_cols = [col for col in ['datetime', 'row_nr', 'offset', 'target', 'y'] if col in df_balanced.columns]
X_full = df_balanced.drop(columns=exclude_cols)
y = df_balanced['y']
X_full = X_full.astype(np.float32)

In [None]:
# 5. Primo training per importance
X_sample, _, y_sample, _ = train_test_split(X_full, y, train_size=2000, stratify=y, random_state=42)
model_temp = XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    eval_metric='mlogloss',
    tree_method='hist',
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.7,
    colsample_bytree=0.7,
    random_state=42,
    n_jobs=-1
)
model_temp.fit(X_sample, y_sample)

AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=100,
              n_jobs=-1, num_class=3, num_parallel_tree=None, ...)

In [None]:
importances = model_temp.feature_importances_
important_features = X_full.columns[importances > 0.0025]
print(f"Feature selezionate: {len(important_features)} su {X_full.shape[1]}")
pd.Series(important_features).to_csv("selected_features.csv", index=False)

Feature selezionate: 135 su 470


In [None]:
X = X_full[important_features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# 6. Optuna tuning
def objective(trial):
    params = {
        'objective': 'multi:softprob',
        'num_class': 3,
        'eval_metric': 'mlogloss',
        'tree_method': 'hist',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'subsample': trial.suggest_float('subsample', 0.6, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.95),
        'random_state': 42,
        'n_jobs': -1
    }
    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return f1_score(y_test, y_pred, average='macro')

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25)
print("Best trial:", study.best_trial.params)

# 7. Train finale con best params
final_model = XGBClassifier(
    **study.best_trial.params,
    objective='multi:softprob',
    num_class=3,
    eval_metric='mlogloss',
    tree_method='hist',
    random_state=42,
    n_jobs=-1
)
final_model.fit(X_train, y_train)

# 8. Probabilità e threshold
probs = final_model.predict_proba(X_test)

def get_best_f1_threshold(y_true, y_prob):
    precision, recall, thresholds = precision_recall_curve(y_true.astype(int), y_prob)
    f1 = 2 * precision * recall / (precision + recall + 1e-9)
    best_idx = np.argmax(f1[:-1])
    return thresholds[best_idx], precision[best_idx], recall[best_idx], f1[best_idx]

best_thresh_0, *_ = get_best_f1_threshold(y_test == 0, probs[:, 0])
best_thresh_2, *_ = get_best_f1_threshold(y_test == 2, probs[:, 2])

y_pred_opt = np.where(probs[:, 0] >= best_thresh_0, 0,
                      np.where(probs[:, 2] >= best_thresh_2, 2, 1))
inv_map = {0: -1, 1: 0, 2: 1}
y_test_labels = y_test.map(inv_map)
y_pred_labels = pd.Series(y_pred_opt).map(inv_map)

print("\n=== REPORT THRESHOLD OTTIMALI ===")
print(confusion_matrix(y_test_labels, y_pred_labels, labels=[-1, 0, 1]))
print(classification_report(y_test_labels, y_pred_labels, target_names=["SL_hit", "No_hit", "TP_hit"]))


In [None]:
# 9. Salva modello finale
joblib.dump({
    'model': final_model,
    'thresh_sl_opt': best_thresh_0,
    'thresh_tp_opt': best_thresh_2,
    'selected_features': list(important_features),
    'best_params': study.best_trial.params
}, 'xgb_tp_sl_model_thresh.joblib')

print("Modello, threshold e feature salvati in xgb_tp_sl_model_thresh.joblib e selected_features.csv")