In [1]:
import requests
import zipfile
import io
import pandas as pd
from datetime import datetime

In [2]:
# URL de base des archives Binance
BASE_URL = "https://data.binance.vision/data/spot/monthly/klines/BTCUSDT/1h/"

In [3]:
def download_month(year: int, month: int):
    """Télécharge les données de trading BTCUSDT pour un mois donné.
    
    Args:
        year (int): Année (ex: 2023)
        month (int): Mois (1-12)
    
    Returns:
        pd.DataFrame: DataFrame contenant les données de trading, ou None si le fichier n'existe pas.
    """
    fname = f"BTCUSDT-1h-{year}-{month:02d}.zip"
    url = BASE_URL + fname
    r = requests.get(url)
    if r.status_code == 200:
        z = zipfile.ZipFile(io.BytesIO(r.content))
        csv_file = z.namelist()[0]
        df = pd.read_csv(
            z.open(csv_file), 
            header=None,
            names=[
                "open_time", "open", "high", "low", "close", "volume",
                "close_time", "quote_asset_volume", "number_of_trades",
                "taker_buy_base", "taker_buy_quote", "ignore"
            ]
        )
        return df
    else:
        print(f"Pas trouvé : {url}")
        return None

In [4]:
def build_dataset(start_year=2017, start_month=8, end_year=None, end_month=None, out_csv="BTCUSDT_1h.csv"):
    """Télécharge et concatène toutes les données disponibles en un CSV unique."""
    if end_year is None or end_month is None:
        now = datetime.utcnow()
        end_year, end_month = now.year, now.month

    dfs = []
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            if (year == start_year and month < start_month) or (year == end_year and month > end_month):
                continue
            print(f"Téléchargement {year}-{month:02d} ...")
            df = download_month(year, month)
            if df is not None:
                dfs.append(df)

    if dfs:
        all_data = pd.concat(dfs, ignore_index=True)
        # Conversion timestamp en datetime lisible
        all_data["open_time"] = pd.to_datetime(all_data["open_time"], unit="ms", errors="coerce")
        all_data["close_time"] = pd.to_datetime(all_data["close_time"], unit="ms", errors="coerce")
        
        # Delete corrupted rows
        # all_data.dropna(inplace=True, subset=["open_time", "close_time"])
        # Sauvegarde CSV
        all_data.to_csv(out_csv, index=False)
        print(f"\n✅ Dataset sauvegardé dans {out_csv} ({len(all_data)} lignes)")
        return all_data
    else:
        print("❌ Aucun fichier téléchargé.")
        return None

In [5]:
# Exemple d’utilisation : télécharger toutes les données depuis août 2017
df = build_dataset(start_year=2017, start_month=8, out_csv="BTCUSDT_1h_full.csv")

  now = datetime.utcnow()


Téléchargement 2017-08 ...
Téléchargement 2017-09 ...
Téléchargement 2017-10 ...
Téléchargement 2017-11 ...
Téléchargement 2017-12 ...
Téléchargement 2018-01 ...
Téléchargement 2018-02 ...
Téléchargement 2018-03 ...
Téléchargement 2018-04 ...
Téléchargement 2018-05 ...
Téléchargement 2018-06 ...
Téléchargement 2018-07 ...
Téléchargement 2018-08 ...
Téléchargement 2018-09 ...
Téléchargement 2018-10 ...
Téléchargement 2018-11 ...
Téléchargement 2018-12 ...
Téléchargement 2019-01 ...
Téléchargement 2019-02 ...
Téléchargement 2019-03 ...
Téléchargement 2019-04 ...
Téléchargement 2019-05 ...
Téléchargement 2019-06 ...
Téléchargement 2019-07 ...
Téléchargement 2019-08 ...
Téléchargement 2019-09 ...
Téléchargement 2019-10 ...
Téléchargement 2019-11 ...
Téléchargement 2019-12 ...
Téléchargement 2020-01 ...
Téléchargement 2020-02 ...
Téléchargement 2020-03 ...
Téléchargement 2020-04 ...
Téléchargement 2020-05 ...
Téléchargement 2020-06 ...
Téléchargement 2020-07 ...
Téléchargement 2020-08 ...
T

In [7]:
duplicates = df["open_time"].duplicated().sum()
print(f"Nombre de doublons dans open_time : {duplicates}")

df = df.drop_duplicates(subset=["open_time"], keep="first")

duplicates = df["open_time"].duplicated().sum()
print(f"Nombre de doublons dans open_time : {duplicates}")

df.info()

Nombre de doublons dans open_time : 5831
Nombre de doublons dans open_time : 0
<class 'pandas.core.frame.DataFrame'>
Index: 64526 entries, 0 to 64525
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   open_time           64525 non-null  datetime64[ns]
 1   open                64526 non-null  float64       
 2   high                64526 non-null  float64       
 3   low                 64526 non-null  float64       
 4   close               64526 non-null  float64       
 5   volume              64526 non-null  float64       
 6   close_time          64525 non-null  datetime64[ns]
 7   quote_asset_volume  64526 non-null  float64       
 8   number_of_trades    64526 non-null  int64         
 9   taker_buy_base      64526 non-null  float64       
 10  taker_buy_quote     64526 non-null  float64       
 11  ignore              64526 non-null  float64       
dtypes: datetime64[ns](2), float6

In [8]:
df = df.set_index("open_time").sort_index()

full_range = pd.date_range(df.index.min(), df.index.max(), freq="H")
df = df.reindex(full_range)

df.index.name = "open_time"

df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 64652 entries, 2017-08-17 04:00:00 to 2024-12-31 23:00:00
Freq: h
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   open                64482 non-null  float64       
 1   high                64482 non-null  float64       
 2   low                 64482 non-null  float64       
 3   close               64482 non-null  float64       
 4   volume              64482 non-null  float64       
 5   close_time          64482 non-null  datetime64[ns]
 6   quote_asset_volume  64482 non-null  float64       
 7   number_of_trades    64482 non-null  float64       
 8   taker_buy_base      64482 non-null  float64       
 9   taker_buy_quote     64482 non-null  float64       
 10  ignore              64482 non-null  float64       
dtypes: datetime64[ns](1), float64(10)
memory usage: 5.9 MB


  full_range = pd.date_range(df.index.min(), df.index.max(), freq="H")


## Add and transform for usefull features

In [9]:
def compute_rsi(series, window=14):
    delta = series.diff()
    gain = (delta.where(delta > 0, 0)).fillna(0)
    loss = (-delta.where(delta < 0, 0)).fillna(0)

    avg_gain = gain.rolling(window=window).mean()
    avg_loss = loss.rolling(window=window).mean()

    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_features(df):
    df = df.copy()
    
    # SMA
    df["sma_7d"] = df["close"].rolling(window=7*24).mean()
    df["sma_30d"] = df["close"].rolling(window=30*24).mean()
    df["sma_50d"] = df["close"].rolling(window=50*24).mean()
    df["sma_100d"] = df["close"].rolling(window=100*24).mean()
    
    # Volatibility
    df["return"] = df["close"].pct_change()

    # Volatility on 20 hours
    df["volatility_20"] = df["return"].rolling(window=20).std()
    df["volatility_50"] = df["return"].rolling(window=50).std()
    df["volatility_100"] = df["return"].rolling(window=100).std()
    df["volatility_14d"] = df["return"].rolling(window=14*24).std()
    
    # RSI 14 and 14 days
    df["rsi_14"] = compute_rsi(df["close"], window=14)
    df["rsi_14d"] = compute_rsi(df["close"], window=14*24)
    
    # MACD
    df = compute_MACD(df)
    
    # Relative volume 20
    df["volume_sma20"] = df["volume"].rolling(window=20).mean()
    df["volume_sma20d"] = df["volume"].rolling(window=20*24).mean()
    df["volume_rel20"] = df["volume"] / df["volume_sma20"]
    df["volume_rel20d"] = df["volume"] / df["volume_sma20d"]
    
    return df

def compute_MACD(df):
    df = df.copy()
    
    # EMA 12 et EMA 26
    df["ema_12d"] = df["close"].ewm(span=12*24, adjust=False).mean()
    df["ema_26d"] = df["close"].ewm(span=26*24, adjust=False).mean()
    
    # MACD line
    df["MACD"] = df["ema_12d"] - df["ema_26d"]

    # Signal line (EMA 9 du MACD)
    df["Signal"] = df["MACD"].ewm(span=9*24, adjust=False).mean()

    # Histogramme
    df["MACD_Hist"] = df["MACD"] - df["Signal"]

    return df

In [10]:
df = compute_features(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 64652 entries, 2017-08-17 04:00:00 to 2024-12-31 23:00:00
Freq: h
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   open                64482 non-null  float64       
 1   high                64482 non-null  float64       
 2   low                 64482 non-null  float64       
 3   close               64482 non-null  float64       
 4   volume              64482 non-null  float64       
 5   close_time          64482 non-null  datetime64[ns]
 6   quote_asset_volume  64482 non-null  float64       
 7   number_of_trades    64482 non-null  float64       
 8   taker_buy_base      64482 non-null  float64       
 9   taker_buy_quote     64482 non-null  float64       
 10  ignore              64482 non-null  float64       
 11  sma_7d              59926 non-null  float64       
 12  sma_30d             48125 non-null  float64       
 13  sma

  df["return"] = df["close"].pct_change()


In [11]:
def clean_dataset(df):
    df = df.copy()
    
    # Colonnes inutiles
    drop_cols = [
        "close_time", "ignore", 
        "ema_12d", "ema_26d", 
        "volume_sma20", "volume_sma20d"
    ]
    
    df = df.drop(columns=[c for c in drop_cols if c in df.columns])
    
    # Suppression des lignes avec NaN restants
    df = df.dropna()
    
    print(f"✅ Dataset nettoyé : {df.shape[0]} lignes, {df.shape[1]} colonnes")
    return df


In [12]:
cleaned_df = clean_dataset(df)

✅ Dataset nettoyé : 27203 lignes, 25 colonnes


In [13]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 27203 entries, 2017-12-15 22:00:00 to 2024-12-31 23:00:00
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   open                27203 non-null  float64
 1   high                27203 non-null  float64
 2   low                 27203 non-null  float64
 3   close               27203 non-null  float64
 4   volume              27203 non-null  float64
 5   quote_asset_volume  27203 non-null  float64
 6   number_of_trades    27203 non-null  float64
 7   taker_buy_base      27203 non-null  float64
 8   taker_buy_quote     27203 non-null  float64
 9   sma_7d              27203 non-null  float64
 10  sma_30d             27203 non-null  float64
 11  sma_50d             27203 non-null  float64
 12  sma_100d            27203 non-null  float64
 13  return              27203 non-null  float64
 14  volatility_20       27203 non-null  float64
 15  volatility_50     

In [14]:
cleaned_df.to_parquet("BTC_features_clean.parquet")

In [39]:
import numpy as np

def make_windows(df, input_len=168, output_horizons=[1, 6, 12, 24, 168], target_col="close"):
    """
    Génère les fenêtres (X, Y) pour l'entraînement d'un modèle séquentiel.
    
    df : DataFrame Pandas (index = datetime, colonnes = features)
    input_len : longueur de la fenêtre d'entrée (en heures)
    output_horizons : horizons de prédiction (en heures)
    target_col : colonne de référence pour la target (ex. 'close')
    
    Retourne :
        X : np.array (n_samples, input_len, n_features)
        Y : np.array (n_samples, len(output_horizons))
    """
    data = df.values
    target = df[target_col].values
    n_features = data.shape[1]

    X, Y = [], []
    max_h = max(output_horizons)

    for t in range(len(df) - input_len - max_h):
        # Fenêtre d'entrée
        x_window = data[t : t + input_len]

        # Targets en rendements relatifs
        y_window = []
        current_price = target[t + input_len - 1]
        for h in output_horizons:
            future_price = target[t + input_len + h - 1]
            y_window.append(future_price / current_price - 1)

        X.append(x_window)
        Y.append(y_window)

    X = np.array(X, dtype=np.float32)
    Y = np.array(Y, dtype=np.float32)

    print(f"✅ make_windows: X={X.shape}, Y={Y.shape}")
    return X, Y


In [40]:
X, Y, = make_windows(cleaned_df)

✅ make_windows: X=(26867, 168, 25), Y=(26867, 5)
