In [1]:
import requests
import json
import pandas as pd
import numpy as np
from datetime import datetime
import time
import numpy as np

In [2]:
def format_data(df, ticker):
    renamed_columns = {
    "Fecha": 'Date', 
    'Último': f'{ticker}-Close', 
    "Apertura": f'{ticker}-Open', 
    "Máximo": f"{ticker}-High", 
    "Mínimo": f"{ticker}-Low", 
    "Vol.": f"{ticker}-Volume", 
    "% var.": f"{ticker}-% var"
    }
    df = df.rename(columns=renamed_columns)
    df["Date"] = pd.to_datetime(df["Date"], format="%d.%m.%Y")
    return df
    

# OHLC (Open - High - Low - Close) del bitcoin
**Open** -> Precio de apertura

**High** -> Precio máximo

**Low** -> Precio mínimo

**Close** -> Precio de cierre

**Volume** -> Cantidad de BTC que se movieron

In [3]:
startTime = 1502928000000
limit = time.time() * 1000

OHLC_dataframes = []

while (startTime < limit):
    url = 'https://api.binance.com/api/v3/klines?symbol=BTCUSDT&interval=1d&limit=1000&startTime={startTime}'.format(startTime=startTime)
    res = requests.get(url)
    data = json.loads(res.text)
    df = pd.DataFrame(data, columns=[
        "Open time", "Open", "High", "Low", "Close", "Volume",
        "Close time", "Quote asset volume", "Number of trades",
        "Taker buy base asset volume", "Taker buy quote asset volume", "Ignore"
    ])
    df["Date"] = pd.to_datetime(df["Open time"], unit='ms') - pd.Timedelta(days=1)
    startTime = df['Close time'].iloc[-1]
    df = df[["Date", "Open", "High", "Low", "Close", "Volume"]]
    df[["Open", "High", "Low", "Close", "Volume"]] = df[["Open", "High", "Low", "Close", "Volume"]].astype(float)
    OHLC_dataframes.append(df)
    
OHLC_data = pd.concat(df for df in OHLC_dataframes)
OHLC_data

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2017-08-16,4261.48,4485.39,4200.74,4285.08,795.150377
1,2017-08-17,4285.08,4371.52,3938.77,4108.37,1199.888264
2,2017-08-18,4108.37,4184.69,3850.00,4139.98,381.309763
3,2017-08-19,4120.98,4211.08,4032.62,4086.29,467.083022
4,2017-08-20,4069.13,4119.62,3911.79,4016.00,691.743060
...,...,...,...,...,...,...
954,2025-09-17,116447.60,117900.00,116092.76,117073.53,11657.233650
955,2025-09-18,117073.53,117459.99,115100.00,115632.38,8992.090650
956,2025-09-19,115632.39,116121.81,115408.47,115685.63,4674.927520
957,2025-09-20,115685.63,115819.06,115188.00,115232.29,4511.522190


# OHLC del petróleo (WTI) desde la fecha más vieja en la que pudimos obtener datos del BTC
https://es.investing.com/commodities/crude-oil-historical-data

In [21]:
from pathlib import Path

csv_path = Path.cwd().parent / "CSVs" / "WTI.csv"
print("Leyendo:", csv_path)

OHLC_WTI = pd.read_csv(csv_path)
OHLC_WTI = format_data(OHLC_WTI, "WTI")

for feature in OHLC_WTI.columns.values[1:-2]:
    OHLC_WTI[feature] = OHLC_WTI[feature].str.replace(",",".")
OHLC_WTI[["WTI-Close", "WTI-Open", "WTI-High", "WTI-Low"]] = OHLC_WTI[["WTI-Close", "WTI-Open", "WTI-High", "WTI-Low"]].astype(float)

hay_sin_k = OHLC_WTI["WTI-Volume"].str.endswith("K", na=False).all()
if hay_sin_k:
    print("✅ Todos los valores terminan en K")
else:
    print("⚠️ Hay valores que NO terminan en K")

OHLC_WTI.head(25)

Leyendo: c:\ML_Bitcoin\EDA\OHLCs\CSVs\WTI.csv
⚠️ Hay valores que NO terminan en K


Unnamed: 0,Date,WTI-Close,WTI-Open,WTI-High,WTI-Low,WTI-Volume,WTI-% var
0,2025-10-05,61.43,61.46,61.79,61.36,"6,92K","0,90%"
1,2025-10-03,60.88,60.7,61.38,60.55,"209,16K","0,66%"
2,2025-10-02,60.48,61.78,62.54,60.4,"290,51K","-2,10%"
3,2025-10-01,61.78,62.46,62.89,61.4,"274,34K","-0,95%"
4,2025-09-30,62.37,63.14,63.26,62.03,"271,65K","-1,70%"
5,2025-09-29,63.45,65.07,65.4,62.98,"294,29K","-2,59%"
6,2025-09-28,65.14,65.0,65.18,64.88,"8,23K","-0,88%"
7,2025-09-26,65.72,65.2,66.42,64.66,"284,99K","1,14%"
8,2025-09-25,64.98,64.8,65.34,64.06,"258,35K","-0,02%"
9,2025-09-24,64.99,63.64,65.05,63.25,"282,72K","2,49%"


In [22]:
print("NaN en la columna WTI-Volume:")
print(OHLC_WTI["WTI-Volume"].isna().sum())
print(OHLC_WTI[OHLC_WTI["WTI-Volume"].isna()])
print("Valores en la columna WTI-Volume sin una K")
filtro = OHLC_WTI["WTI-Volume"].notna() & ~OHLC_WTI["WTI-Volume"].str.endswith("K", na=False)
print(filtro.sum())
print(OHLC_WTI[filtro])

NaN en la columna WTI-Volume:
88
           Date  WTI-Close  WTI-Open  WTI-High  WTI-Low WTI-Volume WTI-% var
27   2025-08-31      63.96     63.98     64.01    63.92        NaN    -0,08%
98   2025-05-25      61.93     61.73     62.15    61.58        NaN     0,65%
282  2024-09-02      73.78     73.00     74.39    72.89        NaN     1,10%
283  2024-09-01      72.98     73.33     73.42    72.97        NaN     0,45%
325  2024-07-04      83.94     83.61     84.20    83.03        NaN     1,11%
...         ...        ...       ...       ...      ...        ...       ...
2041 2018-01-01      60.24     60.26     60.28    60.15        NaN    -0,33%
2046 2017-12-25      58.59     58.41     58.62    58.38        NaN     0,09%
2068 2017-11-23      58.38     57.97     58.58    57.76        NaN     0,62%
2126 2017-09-04      47.41     47.31     47.66    47.16        NaN     0,19%
2127 2017-09-03      47.32     47.31     47.42    47.30        NaN     0,06%

[88 rows x 7 columns]
Valores en la column

In [23]:
def parse_value(x):
    if pd.isna(x):  # mantenemos los NaN por ahora
        return np.nan
    x = str(x).strip()
    
    factor = 1
    if x.endswith("K"):
        factor = 1000
        x = x[:-1]
    elif x.endswith("M"):
        factor = 1000000
        x = x[:-1]
    
    # reemplazamos coma decimal por punto
    x = x.replace(",", ".")
    
    try:
        return float(x) * factor
    except ValueError:
        return np.nan  # en caso de algún valor raro

OHLC_WTI["WTI-Volume"] = OHLC_WTI["WTI-Volume"].apply(parse_value)
OHLC_WTI[OHLC_WTI["WTI-Volume"] >= 1000000]

Unnamed: 0,Date,WTI-Close,WTI-Open,WTI-High,WTI-Low,WTI-Volume,WTI-% var
1433,2020-04-21,11.57,21.32,22.58,6.5,2290000.0,"-43,37%"
1434,2020-04-20,20.43,24.76,24.92,20.19,1320000.0,"-18,38%"
1454,2020-03-20,22.63,25.59,28.49,22.39,1130000.0,"-12,66%"
1455,2020-03-19,25.91,22.82,28.28,21.77,1190000.0,"24,39%"
1456,2020-03-18,20.83,27.3,27.6,20.52,1000000.0,"-23,78%"
2125,2017-09-05,48.66,47.28,48.98,47.15,1030000.0,"2,64%"


In [24]:
OHLC_WTI.head(25)

Unnamed: 0,Date,WTI-Close,WTI-Open,WTI-High,WTI-Low,WTI-Volume,WTI-% var
0,2025-10-05,61.43,61.46,61.79,61.36,6920.0,"0,90%"
1,2025-10-03,60.88,60.7,61.38,60.55,209160.0,"0,66%"
2,2025-10-02,60.48,61.78,62.54,60.4,290510.0,"-2,10%"
3,2025-10-01,61.78,62.46,62.89,61.4,274340.0,"-0,95%"
4,2025-09-30,62.37,63.14,63.26,62.03,271650.0,"-1,70%"
5,2025-09-29,63.45,65.07,65.4,62.98,294290.0,"-2,59%"
6,2025-09-28,65.14,65.0,65.18,64.88,8230.0,"-0,88%"
7,2025-09-26,65.72,65.2,66.42,64.66,284990.0,"1,14%"
8,2025-09-25,64.98,64.8,65.34,64.06,258350.0,"-0,02%"
9,2025-09-24,64.99,63.64,65.05,63.25,282720.0,"2,49%"


# OHLC del petróleo (Brent) desde la fecha más vieja en la que pudimos obtener datos del BTC
https://es.investing.com/commodities/brent-oil-historical-data

In [26]:
from pathlib import Path

csv_path = Path.cwd().parent / "CSVs" / "BRENT.csv"
print("Leyendo:", csv_path)

OHLC_BRENT = pd.read_csv(csv_path)
OHLC_BRENT = format_data(OHLC_BRENT, "BRENT")

for feature in OHLC_BRENT.columns.values[1:-2]:
    OHLC_BRENT[feature] = OHLC_BRENT[feature].str.replace(",",".")
OHLC_BRENT[["BRENT-Close", "BRENT-Open", "BRENT-High", "BRENT-Low"]] = OHLC_BRENT[["BRENT-Close", "BRENT-Open", "BRENT-High", "BRENT-Low"]].astype(float)

hay_sin_k = OHLC_BRENT["BRENT-Volume"].str.endswith("K", na=False).all()
if hay_sin_k:
    print("✅ Todos los valores terminan en K")
else:
    print("⚠️ Hay valores que NO terminan en K")

OHLC_BRENT.head(25)

Leyendo: c:\ML_Bitcoin\EDA\OHLCs\CSVs\BRENT.csv
⚠️ Hay valores que NO terminan en K


Unnamed: 0,Date,BRENT-Close,BRENT-Open,BRENT-High,BRENT-Low,BRENT-Volume,BRENT-% var
0,2025-10-05,65.1,64.86,65.51,64.84,"5,72K","0,88%"
1,2025-10-03,64.53,64.38,65.02,64.2,"311,95K","0,66%"
2,2025-10-02,64.11,65.45,66.15,64.0,"480,22K","-1,90%"
3,2025-10-01,65.35,66.18,66.57,65.05,"420,60K","-2,49%"
4,2025-09-30,67.02,67.5,67.71,66.84,"19,23K","-1,40%"
5,2025-09-29,67.97,69.5,69.91,67.52,"107,59K","-3,08%"
6,2025-09-26,70.13,69.54,70.76,69.11,"110,38K","1,02%"
7,2025-09-25,69.42,69.04,69.68,68.42,"210,51K","0,16%"
8,2025-09-24,69.31,67.88,69.37,67.51,"222,73K","3,49%"
9,2025-09-23,66.97,65.99,67.42,65.49,"344,39K","1,52%"


In [27]:
print("NaN en la columna BRENT-Volume:")
print(OHLC_BRENT["BRENT-Volume"].isna().sum())
print("Valores en la columna BRENT-Volume sin una K")
filtro = ~OHLC_BRENT["BRENT-Volume"].str.endswith("K", na=False)
print(filtro.sum())
print(OHLC_BRENT[filtro])

NaN en la columna BRENT-Volume:
0
Valores en la columna BRENT-Volume sin una K
1
         Date  BRENT-Close  BRENT-Open  BRENT-High  BRENT-Low BRENT-Volume  \
81 2025-06-13        74.23        70.5        78.5      70.41        1,24M   

   BRENT-% var  
81       7,02%  


In [28]:
OHLC_BRENT["BRENT-Volume"] = OHLC_BRENT["BRENT-Volume"].apply(parse_value)
OHLC_BRENT[OHLC_BRENT["BRENT-Volume"] >= 1000000]

Unnamed: 0,Date,BRENT-Close,BRENT-Open,BRENT-High,BRENT-Low,BRENT-Volume,BRENT-% var
81,2025-06-13,74.23,70.5,78.5,70.41,1240000.0,"7,02%"


In [29]:
OHLC_BRENT.head(25)

Unnamed: 0,Date,BRENT-Close,BRENT-Open,BRENT-High,BRENT-Low,BRENT-Volume,BRENT-% var
0,2025-10-05,65.1,64.86,65.51,64.84,5720.0,"0,88%"
1,2025-10-03,64.53,64.38,65.02,64.2,311950.0,"0,66%"
2,2025-10-02,64.11,65.45,66.15,64.0,480220.0,"-1,90%"
3,2025-10-01,65.35,66.18,66.57,65.05,420600.0,"-2,49%"
4,2025-09-30,67.02,67.5,67.71,66.84,19230.0,"-1,40%"
5,2025-09-29,67.97,69.5,69.91,67.52,107590.0,"-3,08%"
6,2025-09-26,70.13,69.54,70.76,69.11,110380.0,"1,02%"
7,2025-09-25,69.42,69.04,69.68,68.42,210510.0,"0,16%"
8,2025-09-24,69.31,67.88,69.37,67.51,222730.0,"3,49%"
9,2025-09-23,66.97,65.99,67.42,65.49,344390.0,"1,52%"


# OHLC del SPX (ticker del S&P 500) desde la fecha más vieja en la que pudimos obtener datos del BTC
https://es.investing.com/indices/us-spx-500-historical-data

In [31]:
from pathlib import Path

csv_path = Path.cwd().parent / "CSVs" / "SPX.csv"
print("Leyendo:", csv_path)

OHLC_SPX = pd.read_csv(csv_path)
OHLC_SPX = format_data(OHLC_SPX, "SPX")

OHLC_SPX = OHLC_SPX[['Date', 'SPX-Close', 'SPX-Open', 'SPX-High', 'SPX-Low', 'SPX-% var']]
for feature in OHLC_SPX.columns.values[1:-1]:
    OHLC_SPX[feature] = OHLC_SPX[feature].str.replace(".","")
    OHLC_SPX[feature] = OHLC_SPX[feature].str.replace(",",".")
OHLC_SPX

Leyendo: c:\ML_Bitcoin\EDA\OHLCs\CSVs\SPX.csv


Unnamed: 0,Date,SPX-Close,SPX-Open,SPX-High,SPX-Low,SPX-% var
0,2025-10-03,6715.79,6722.14,6750.87,6705.67,"0,01%"
1,2025-10-02,6715.35,6731.31,6731.94,6693.23,"0,06%"
2,2025-10-01,6711.20,6664.92,6718.48,6656.20,"0,34%"
3,2025-09-30,6688.46,6656.19,6691.25,6641.00,"0,41%"
4,2025-09-29,6661.21,6661.58,6677.31,6644.49,"0,26%"
...,...,...,...,...,...,...
2040,2017-08-22,2452.50,2433.80,2454.80,2433.70,"0,99%"
2041,2017-08-21,2428.40,2425.50,2430.60,2417.30,"0,12%"
2042,2017-08-18,2425.60,2427.60,2440.30,2420.70,"-0,18%"
2043,2017-08-17,2430.00,2462.90,2465.00,2430.00,"-1,54%"


In [32]:
OHLC_SPX[["SPX-Close", "SPX-Open", "SPX-High", "SPX-Low"]] = OHLC_SPX[["SPX-Close", "SPX-Open", "SPX-High", "SPX-Low"]].astype(float)
OHLC_SPX.dtypes

Date         datetime64[ns]
SPX-Close           float64
SPX-Open            float64
SPX-High            float64
SPX-Low             float64
SPX-% var            object
dtype: object

In [33]:
OHLC_SPX

Unnamed: 0,Date,SPX-Close,SPX-Open,SPX-High,SPX-Low,SPX-% var
0,2025-10-03,6715.79,6722.14,6750.87,6705.67,"0,01%"
1,2025-10-02,6715.35,6731.31,6731.94,6693.23,"0,06%"
2,2025-10-01,6711.20,6664.92,6718.48,6656.20,"0,34%"
3,2025-09-30,6688.46,6656.19,6691.25,6641.00,"0,41%"
4,2025-09-29,6661.21,6661.58,6677.31,6644.49,"0,26%"
...,...,...,...,...,...,...
2040,2017-08-22,2452.50,2433.80,2454.80,2433.70,"0,99%"
2041,2017-08-21,2428.40,2425.50,2430.60,2417.30,"0,12%"
2042,2017-08-18,2425.60,2427.60,2440.30,2420.70,"-0,18%"
2043,2017-08-17,2430.00,2462.90,2465.00,2430.00,"-1,54%"


# OHLC del XAU/USD (precio del oro en USD) desde la fecha más vieja en la que pudimos obtener datos del BTC
https://es.investing.com/currencies/xau-usd-historical-data

In [35]:
from pathlib import Path

csv_path = Path.cwd().parent / "CSVs" / "XAU-USD.csv"
print("Leyendo:", csv_path)

OHLC_XAUUSD = pd.read_csv(csv_path)
OHLC_XAUUSD = format_data(OHLC_XAUUSD, "XAUUSD")

OHLC_XAUUSD = OHLC_XAUUSD[['Date', 'XAUUSD-Close', 'XAUUSD-Open', 'XAUUSD-High', 'XAUUSD-Low', 'XAUUSD-% var']]
for feature in OHLC_XAUUSD.columns.values[1:-1]:
    OHLC_XAUUSD[feature] = OHLC_XAUUSD[feature].str.replace(".","")
    OHLC_XAUUSD[feature] = OHLC_XAUUSD[feature].str.replace(",",".")
OHLC_XAUUSD

Leyendo: c:\ML_Bitcoin\EDA\OHLCs\CSVs\XAU-USD.csv


Unnamed: 0,Date,XAUUSD-Close,XAUUSD-Open,XAUUSD-High,XAUUSD-Low,XAUUSD-% var
0,2025-10-05,3918.14,3889.33,3920.32,3884.49,"0,81%"
1,2025-10-03,3886.83,3857.92,3891.85,3838.05,"0,79%"
2,2025-10-02,3856.53,3866.66,3897.20,3819.51,"-0,24%"
3,2025-10-01,3865.80,3858.84,3895.45,3853.44,"0,19%"
4,2025-09-30,3858.51,3833.84,3871.87,3793.18,"0,64%"
...,...,...,...,...,...,...
2112,2017-08-22,1284.72,1291.18,1292.69,1282.11,"-0,50%"
2113,2017-08-21,1291.22,1283.60,1293.58,1280.60,"0,52%"
2114,2017-08-18,1284.50,1288.20,1301.20,1283.64,"-0,27%"
2115,2017-08-17,1288.01,1283.61,1290.47,1282.20,"0,38%"


In [36]:
OHLC_XAUUSD[["XAUUSD-Close", "XAUUSD-Open", "XAUUSD-High", "XAUUSD-Low"]] = OHLC_XAUUSD[["XAUUSD-Close", "XAUUSD-Open", "XAUUSD-High", "XAUUSD-Low"]].astype(float)
OHLC_XAUUSD.dtypes

Date            datetime64[ns]
XAUUSD-Close           float64
XAUUSD-Open            float64
XAUUSD-High            float64
XAUUSD-Low             float64
XAUUSD-% var            object
dtype: object

In [37]:
OHLC_XAUUSD

Unnamed: 0,Date,XAUUSD-Close,XAUUSD-Open,XAUUSD-High,XAUUSD-Low,XAUUSD-% var
0,2025-10-05,3918.14,3889.33,3920.32,3884.49,"0,81%"
1,2025-10-03,3886.83,3857.92,3891.85,3838.05,"0,79%"
2,2025-10-02,3856.53,3866.66,3897.20,3819.51,"-0,24%"
3,2025-10-01,3865.80,3858.84,3895.45,3853.44,"0,19%"
4,2025-09-30,3858.51,3833.84,3871.87,3793.18,"0,64%"
...,...,...,...,...,...,...
2112,2017-08-22,1284.72,1291.18,1292.69,1282.11,"-0,50%"
2113,2017-08-21,1291.22,1283.60,1293.58,1280.60,"0,52%"
2114,2017-08-18,1284.50,1288.20,1301.20,1283.64,"-0,27%"
2115,2017-08-17,1288.01,1283.61,1290.47,1282.20,"0,38%"


# Consumer Price Index for All Urban Consumers: All Items in U.S. City Average (CPIAUCSL) 
Buscamos el CPI para consumidores urbanos en lo que respecta a todos los artículos (de Estados Unidos), que es el índice de precios de una canasta de bienes y servicios pagados por consumidores urbanos, cuyos cambios porcentuales miden la tasa de inflación entre cualquier intervalo de tiempo.
Este índice tiene relación con el bitcoin porque si los precios de los bienes y servicios suben, el dinero pierde valor, entonces, si el IPC es muy alto, más personas pueden querer comprar BTC para asegurar su dinero y así su precio sube (esto no está garantizado).
Este índice incluye aproximadamente al 88% de la población estadounidense, y se basa en precios de comida, ropa, alojamiento, combustibles, tarifas de transporte, tarifas de servicios (agua, alcantarillado, impuestos de ventas).

https://fred.stlouisfed.org/series/CPIAUCSL

In [39]:
from pathlib import Path

csv_path = Path.cwd().parent / "CSVs" / "CPIAUCSL to 8.2025.csv"
print("Leyendo:", csv_path)

CPIAUCSL = pd.read_csv(csv_path)
CPIAUCSL.dtypes

Leyendo: c:\ML_Bitcoin\EDA\OHLCs\CSVs\CPIAUCSL to 8.2025.csv


observation_date     object
CPIAUCSL            float64
dtype: object

In [40]:
CPIAUCSL["observation_date"] = pd.to_datetime(CPIAUCSL["observation_date"], format="%Y-%m-%d")
CPIAUCSL.dtypes

observation_date    datetime64[ns]
CPIAUCSL                   float64
dtype: object

In [41]:
CPIAUCSL = CPIAUCSL.rename(columns={'observation_date':'Date'})
CPIAUCSL

Unnamed: 0,Date,CPIAUCSL
0,2017-08-01,245.183
1,2017-09-01,246.435
2,2017-10-01,246.626
3,2017-11-01,247.284
4,2017-12-01,247.805
...,...,...
92,2025-04-01,320.321
93,2025-05-01,320.580
94,2025-06-01,321.500
95,2025-07-01,322.132


# CBOE Volatility Index (VIX)
El índice de volatilidad CBOE (VIX) mide la expectativa del mercado sobre la futura volatilidad del índice S&P 500. Cuando el VIX sube, por lo general indica que los inversores esperan una mayor turbulencia en el mercado, lo cual puede estar asociado a caídas significativas o incertidumbre sobre el mismo, mientras que si baja, se suele asociar a un período de estabilidad.
Decidimos incluirlo porque en principio el VIX y el valor del BTC se creía que eran totalmente independientes uno del otro, pero luego de un tiempo se han observado patrones en los que se parece encontrar una cierta correlación negativa entre ambos.

https://es.investing.com/indices/volatility-s-p-500

In [44]:
from pathlib import Path

csv_path = Path.cwd().parent / "CSVs" / "VIX.csv"
print("Leyendo:", csv_path)

VIX = pd.read_csv(csv_path)
VIX.dtypes

Leyendo: c:\ML_Bitcoin\EDA\OHLCs\CSVs\VIX.csv


Fecha        object
Último       object
Apertura     object
Máximo       object
Mínimo       object
Vol.        float64
% var.       object
dtype: object

In [45]:
VIX = format_data(VIX, "VIX")
VIX = VIX[["Date","VIX-Close","VIX-Open","VIX-High","VIX-Low","VIX-% var"]]
VIX

Unnamed: 0,Date,VIX-Close,VIX-Open,VIX-High,VIX-Low,VIX-% var
0,2025-10-03,1665,1635,1706,1620,"0,12%"
1,2025-10-02,1663,1612,1692,1593,"2,09%"
2,2025-10-01,1629,1728,1728,1598,"0,06%"
3,2025-09-30,1628,1649,1670,1602,"0,99%"
4,2025-09-29,1612,1584,1629,1574,"5,43%"
...,...,...,...,...,...,...
2078,2017-08-22,1135,1260,1294,1135,"-13,95%"
2079,2017-08-21,1319,1459,1474,1307,"-7,50%"
2080,2017-08-18,1426,1538,1604,1332,"-8,30%"
2081,2017-08-17,1555,1181,1577,1154,"32,45%"


In [46]:
for feature in VIX.columns.values[1:-1]:
    VIX[feature] = VIX[feature].str.replace(",",".")
VIX[["VIX-Close","VIX-Open","VIX-High","VIX-Low"]] = VIX[["VIX-Close","VIX-Open","VIX-High","VIX-Low"]].astype(float)
VIX.dtypes

Date         datetime64[ns]
VIX-Close           float64
VIX-Open            float64
VIX-High            float64
VIX-Low             float64
VIX-% var            object
dtype: object

In [47]:
VIX

Unnamed: 0,Date,VIX-Close,VIX-Open,VIX-High,VIX-Low,VIX-% var
0,2025-10-03,16.65,16.35,17.06,16.20,"0,12%"
1,2025-10-02,16.63,16.12,16.92,15.93,"2,09%"
2,2025-10-01,16.29,17.28,17.28,15.98,"0,06%"
3,2025-09-30,16.28,16.49,16.70,16.02,"0,99%"
4,2025-09-29,16.12,15.84,16.29,15.74,"5,43%"
...,...,...,...,...,...,...
2078,2017-08-22,11.35,12.60,12.94,11.35,"-13,95%"
2079,2017-08-21,13.19,14.59,14.74,13.07,"-7,50%"
2080,2017-08-18,14.26,15.38,16.04,13.32,"-8,30%"
2081,2017-08-17,15.55,11.81,15.77,11.54,"32,45%"


In [48]:
dataset=pd.merge(OHLC_WTI, OHLC_BRENT, on='Date', how='inner').merge(OHLC_SPX, on='Date', how='inner').merge(OHLC_XAUUSD, on='Date', how='inner').merge(VIX, on='Date', how='inner').merge(CPIAUCSL, on='Date', how='inner')

In [43]:
dataset.dtypes

Date            datetime64[ns]
WTI-Close              float64
WTI-Open               float64
WTI-High               float64
WTI-Low                float64
WTI-Volume             float64
WTI-% var               object
BRENT-Close            float64
BRENT-Open             float64
BRENT-High             float64
BRENT-Low              float64
BRENT-Volume           float64
BRENT-% var             object
SPX-Close              float64
SPX-Open               float64
SPX-High               float64
SPX-Low                float64
SPX-% var               object
XAUUSD-Close           float64
XAUUSD-Open            float64
XAUUSD-High            float64
XAUUSD-Low             float64
XAUUSD-% var            object
VIX-Close              float64
VIX-Open               float64
VIX-High               float64
VIX-Low                float64
VIX-% var               object
CPIAUCSL               float64
dtype: object

In [49]:
from sklearn.preprocessing import QuantileTransformer
tr = QuantileTransformer(n_quantiles=50, output_distribution='uniform')
numeric_dataset = dataset.select_dtypes(include=['number'])
transformed = tr.fit_transform(numeric_dataset)
transformed_df = pd.DataFrame(transformed, 
                              columns=numeric_dataset.columns, 
                              index=dataset.index)
transformed_df


Unnamed: 0,WTI-Close,WTI-Open,WTI-High,WTI-Low,WTI-Volume,BRENT-Close,BRENT-Open,BRENT-High,BRENT-Low,BRENT-Volume,...,SPX-Low,XAUUSD-Close,XAUUSD-Open,XAUUSD-High,XAUUSD-Low,VIX-Close,VIX-Open,VIX-High,VIX-Low,CPIAUCSL
0,0.484084,0.548903,0.512382,0.530937,0.963132,0.389745,0.453957,0.405137,0.412399,0.856630,...,1.000000,1.000000,0.980568,1.000000,0.990072,0.631109,0.531068,0.658117,0.558907,1.000000
1,0.402343,0.387666,0.388760,0.409586,0.711120,0.339423,0.329162,0.331357,0.340331,0.260193,...,0.995439,0.990285,1.000000,0.995213,1.000000,0.445780,0.472618,0.407118,0.503167,0.984704
2,0.266558,0.250421,0.256549,0.240072,0.979711,0.208396,0.204867,0.193637,0.163093,0.918688,...,0.933211,0.969176,0.979209,0.971511,0.968964,0.791329,0.744104,0.746340,0.797441,0.961379
3,0.576352,0.580789,0.574754,0.582007,0.876597,0.555775,0.550170,0.545267,0.569947,0.902589,...,0.920595,0.955713,0.955027,0.955555,0.956332,0.693814,0.693750,0.701732,0.726305,0.958409
4,0.533657,0.560294,0.538413,0.561836,0.162399,0.507653,0.510804,0.489397,0.539129,0.572236,...,0.960458,0.929140,0.930694,0.929872,0.930657,0.710714,0.722880,0.685950,0.710395,0.929628
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,0.306335,0.326389,0.301587,0.321687,0.734909,0.266728,0.276927,0.262753,0.285829,0.624844,...,0.080489,0.163011,0.161605,0.162144,0.155963,0.743494,0.598014,0.763002,0.628014,0.061386
59,0.415421,0.374317,0.387734,0.387632,0.803780,0.387730,0.376314,0.369542,0.387265,0.571401,...,0.176265,0.208300,0.207311,0.207223,0.207128,0.185423,0.130952,0.183673,0.123228,0.060304
60,0.243635,0.220659,0.241967,0.257641,0.540781,0.253491,0.228181,0.229819,0.252999,0.513172,...,0.050522,0.079603,0.062597,0.079327,0.066270,0.035699,0.032615,0.217939,0.028805,0.026429
61,0.173900,0.186431,0.161896,0.202698,0.494313,0.163253,0.184803,0.163124,0.203381,0.596124,...,0.037155,0.059514,0.058984,0.057371,0.059089,0.003605,0.000000,0.001141,0.000000,0.017551


In [50]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
poly_array = poly.fit_transform(numeric_dataset)
poly_features = poly.get_feature_names_out(numeric_dataset.columns)

poly_df = pd.DataFrame(poly_array, columns=poly_features, index=numeric_dataset.index)
poly_df

Unnamed: 0,1,WTI-Close,WTI-Open,WTI-High,WTI-Low,WTI-Volume,BRENT-Close,BRENT-Open,BRENT-High,BRENT-Low,...,VIX-Open^2,VIX-Open VIX-High,VIX-Open VIX-Low,VIX-Open CPIAUCSL,VIX-High^2,VIX-High VIX-Low,VIX-High CPIAUCSL,VIX-Low^2,VIX-Low CPIAUCSL,CPIAUCSL^2
0,1.0,67.33,69.35,69.58,67.05,355820.0,69.67,71.83,72.00,69.40,...,302.7600,381.0600,302.5860,5626.53360,479.6100,380.8410,7081.67160,302.4121,5623.29996,104564.276496
1,1.0,65.45,64.96,65.98,64.67,193160.0,67.11,66.58,67.50,66.34,...,288.3204,296.8104,280.3398,5469.80136,305.5504,288.5948,5630.86736,272.5801,5318.39932,103769.025424
2,1.0,59.24,58.16,59.50,56.39,364220.0,62.13,61.23,62.40,59.30,...,573.1236,602.8092,557.8020,7674.68520,634.0324,586.6940,8072.20440,542.8900,7469.51400,102771.536400
3,1.0,71.20,71.39,72.10,71.03,272830.0,74.49,74.67,75.29,74.33,...,486.6436,518.8512,476.0548,7066.28126,553.1904,507.5616,7533.94992,465.6964,6912.52718,102605.543041
4,1.0,69.08,69.96,70.95,68.90,110380.0,73.10,74.11,74.94,72.89,...,527.1616,530.1464,485.8336,7265.66904,533.1481,488.5844,7306.80741,447.7456,6696.06084,100139.969601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,1.0,60.80,61.39,61.64,60.02,200030.0,63.83,64.57,65.04,63.19,...,398.4016,504.9880,390.6172,4981.55692,640.0900,495.1210,6314.29810,382.9849,4884.22189,62288.678929
59,1.0,65.55,64.53,65.96,64.43,226700.0,69.65,69.00,69.97,68.81,...,170.3025,186.6150,163.1250,3256.35345,204.4900,178.7500,3568.26470,156.2500,3119.11250,62264.721841
60,1.0,58.38,57.38,58.90,57.35,160630.0,63.73,62.74,64.32,62.59,...,125.2161,163.1502,117.9426,2772.93795,212.5764,153.6732,3612.99690,111.0916,2611.86470,61407.318025
61,1.0,54.51,54.86,55.42,54.11,154430.0,60.49,61.07,61.70,60.00,...,95.8441,102.6971,95.3546,2420.91036,110.0401,102.1726,2594.00916,94.8676,2408.54616,61149.376656
