In [1]:
from api.BinanceData import BinanceData
from utils.CleanData import CleanData
from variables.Transformations import Transformations
from variables.Target import Target
from variables.TradingIndicators import TradingIndicators
import pandas as pd

In [2]:
# Load the raw klines for BTCUSDT over the last 365 days.
# The client instance gets its own name: rebinding `BinanceData` to an instance
# shadows the imported class, so running this cell a second time would call the
# instance instead of constructing a new client.
binance_client = BinanceData()
data = binance_client.load_data("BTCUSDT", days=365)
print(data.head(2))
print(len(data))

       open_time            open            high             low  \
0  1727300100000  63515.17000000  63536.00000000  63494.58000000   
1  1727300400000  63509.79000000  63509.79000000  63487.72000000   

            close       volume     close_time quote_asset_volume  \
0  63509.79000000  32.17707000  1727300399999   2043736.51593170   
1  63504.00000000  15.42867000  1727300699999    979685.51311770   

   number_of_trades taker_buy_base   taker_buy_quote ignore  
0              3231    17.34842000  1101802.95114780      0  
1              2641    10.32801000   655809.68779130      0  
105120


In [3]:
# Clean the raw klines loaded above.
# NOTE(review): judging by the printed output, the cleaned frame is indexed by
# `open_time` (as datetimes) and no longer carries the `open`, `close_time`,
# `taker_buy_base` and `ignore` columns — confirm in CleanData.clean_klines_data.
cleaner = CleanData()
cleaned_data = cleaner.clean_klines_data(data)
print(cleaned_data.index.name)
print(cleaned_data.tail(3))

open_time
                          high        low      close    volume  \
open_time                                                        
2025-09-25 21:20:00  109589.00  109478.35  109494.05  36.39810   
2025-09-25 21:25:00  109524.00  109312.25  109318.83  21.51965   
2025-09-25 21:30:00  109318.83  109289.74  109289.75   3.68846   

                     quote_asset_volume  number_of_trades  taker_buy_quote  
open_time                                                                   
2025-09-25 21:20:00        3.987154e+06              9437     2.441100e+06  
2025-09-25 21:25:00        2.354127e+06             10186     7.704136e+05  
2025-09-25 21:30:00        4.031948e+05              1553     1.967461e+05  


In [4]:
# Export the cleaned data to an Excel file so that later cells can work from
# disk instead of querying the Binance API on every run.
print(len(cleaned_data))
cleaned_data.to_excel("../data/clean_dataset.xlsx", index=True)

105120


In [6]:
# Reload the cleaned dataset from the Excel export; the first column becomes the index.
data = pd.read_excel("../data/clean_dataset.xlsx", index_col=0)

# Return features and the target label (the target is derived from the return column).
data["return"] = Transformations.return_(data["close"])
data["return_10"] = Transformations.return_10(data["close"])
data["target"] = Target.compute(data["return"])

# Technical indicators, applied one after another in a single chain.
# `frame.pipe(f, **kw)` calls `f(frame, **kw)`, so each step receives the
# frame returned by the previous one.
data = (
    data
    .pipe(TradingIndicators.add_ema, price_col="close", window=12, new_col="EMA_12")
    .pipe(TradingIndicators.add_macd, price_col="close")
    .pipe(TradingIndicators.add_bollinger_bands, price_col="close")
    .pipe(TradingIndicators.add_rsi, price_col="close")
    .pipe(
        TradingIndicators.add_atr,
        high_col="high",
        low_col="low",
        close_col="close",
        window=14,
        new_col="ATR_14",
    )
    .pipe(TradingIndicators.add_high_low_range, high_col="high", low_col="low")
    .pipe(
        TradingIndicators.add_buy_pressure,
        high_col="high",
        low_col="low",
        close_col="close",
    )
    .pipe(
        TradingIndicators.add_volume_pressure,
        taker_buy_col="taker_buy_quote",
        total_volume_col="quote_asset_volume",
    )
    .pipe(TradingIndicators.add_realized_volatility, returns_col="return", window=14)
)

# Quick sanity checks: first rows, class balance of the target, final column list.
print(data.head(2))
print(data["target"].value_counts())
print(data.columns)

                         high       low     close    volume  \
open_time                                                     
2024-09-25 21:35:00  63536.00  63494.58  63509.79  32.17707   
2024-09-25 21:40:00  63509.79  63487.72  63504.00  15.42867   

                     quote_asset_volume  number_of_trades  taker_buy_quote  \
open_time                                                                    
2024-09-25 21:35:00        2.043737e+06              3231     1.101803e+06   
2024-09-25 21:40:00        9.796855e+05              2641     6.558097e+05   

                       return  return_10  target  ...     MACD  MACD_Signal  \
open_time                                         ...                         
2024-09-25 21:35:00       NaN        NaN       0  ...  0.00000     0.000000   
2024-09-25 21:40:00 -0.000091        NaN       0  ... -0.46188    -0.092376   

                     Bollinger_Upper  Bollinger_Lower  RSI_14  ATR_14  \
open_time                                   

In [7]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance

def feature_selection(df, target_col="target", lag=1, top_k=None):
    """
    Rank the features of a time-ordered frame and return the ones to keep.

    The target is shifted by `lag` rows so that the features of a row are
    paired with a later target value. Rows containing any NaN are dropped,
    the data is split chronologically 80/20 (no shuffling), and an XGBoost
    classifier is fitted on the first part.

    Two rankings are printed: XGBoost's built-in `feature_importances_` and
    the permutation importance measured on the held-out part. The returned
    list is ordered by the built-in XGBoost importance only; the permutation
    ranking is informational.

    Parameters
    ----------
    df : pandas.DataFrame
        Full frame containing the feature columns and the target column.
        It is not modified.
    target_col : str
        Name of the target column.
    lag : int
        Number of rows by which the target is shifted (`shift(-lag)`).
    top_k : int or None
        If truthy, keep only the `top_k` best features; otherwise keep all.

    Returns
    -------
    list of str
        Feature names sorted by decreasing XGBoost importance.
    """

    # Build the shifted target on a new frame. Assigning into `df` directly
    # would alter the caller's data and shift the target once more on every call.
    shifted = df.assign(**{target_col: df[target_col].shift(-lag)}).dropna()

    X = shifted.drop(columns=[target_col])
    y = shifted[target_col]

    # Chronological split first (e.g. 80/20), so nothing computed below can see
    # the held-out period.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Scaling: statistics are learned on the training part only and then
    # applied to the held-out part, to avoid leaking test-period information.
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

    # Fit XGBoost. `use_label_encoder` is intentionally not passed: the library
    # reports it as an unused parameter.
    model = XGBClassifier(eval_metric="logloss")
    model.fit(X_train, y_train)

    # Built-in XGBoost importance
    feat_importance = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)

    print("Importance des variables (XGBoost):")
    print(feat_importance)

    # Permutation importance on the held-out part (printed for comparison only)
    perm = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
    perm_importance = pd.Series(perm.importances_mean, index=X.columns).sort_values(ascending=False)

    print("\nImportance par permutation:")
    print(perm_importance)

    # Keep only the top_k features when requested
    if top_k:
        selected = feat_importance.head(top_k).index.tolist()
    else:
        selected = feat_importance.index.tolist()

    return selected

# Hand the function its own copy so the notebook-level `data` frame cannot be
# altered by the selection step and this cell stays idempotent on re-run.
selected_features = feature_selection(data.copy(), target_col="target", lag=1, top_k=10)
selected_features

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Importance des variables (XGBoost):
close                  0.054318
low                    0.053604
Bollinger_Lower        0.053576
return_10              0.052452
Bollinger_Upper        0.051329
number_of_trades       0.050565
quote_asset_volume     0.050413
Buy_Pressure           0.050241
MACD                   0.049950
MACD_Signal            0.049650
return                 0.049446
taker_buy_quote        0.049219
volume                 0.049193
EMA_12                 0.049190
Volume_Pressure        0.048792
ATR_14                 0.048734
High_Low_Range         0.048304
RSI_14                 0.048066
Realized_Volatility    0.047382
high                   0.045577
dtype: float32

Importance par permutation:
return_10              0.002821
High_Low_Range         0.002117
RSI_14                 0.001618
ATR_14                 0.001494
high                   0.001194
taker_buy_quote        0.000928
return                 0.000828
close                  0.000528
Volume_Pressure        0

['close',
 'low',
 'Bollinger_Lower',
 'return_10',
 'Bollinger_Upper',
 'number_of_trades',
 'quote_asset_volume',
 'Buy_Pressure',
 'MACD',
 'MACD_Signal']

In [3]:
from models.Models import CryptoModel


# NOTE(review): this cell's recorded execution count (3) is lower than that of
# the cells above it, so the output below may come from a different kernel
# state than a top-to-bottom run would produce — re-run after a kernel restart.
crypto_model = CryptoModel(data)


# XGBoost
# xgb_model = crypto_model.xgboost_classification(test_size=0.2)
# Logistic regression
# NOTE(review): the recorded metrics (accuracy 0.9933, recall 1.0000) are
# unusually high. `target` is computed from `return`, which is also a column
# of `data`; check whether CryptoModel excludes it or shifts the target —
# otherwise this looks like target leakage. TODO confirm in models.Models.
logit_model = crypto_model.logistic_regression(test_size=0.2)

print(logit_model)

=== Logistic Regression Metrics ===
Accuracy : 0.9933
Precision: 0.9387
Recall   : 1.0000
F1-score : 0.9684
Confusion Matrix:
[[18720   141]
 [    0  2160]]
LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
