In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import plotly.express as px
import requests
from pathlib import Path
import yfinance as yf
import pandas_datareader.data as web 

In [2]:
import sys, subprocess, pkgutil
print("Kernel:", sys.executable)  

subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "pandas_datareader"])

Kernel: c:\Users\Usuario\BTC-ML\.venv\Scripts\python.exe


0

In [3]:
startTime = 1502928000000      
endTime   = 4102444800000      
limit     = 1000

chunks = []
while startTime < endTime:
    url = ("https://api.binance.com/api/v3/klines"
           f"?symbol=BTCUSDT&interval=1d&limit={limit}&startTime={startTime}")
    data = requests.get(url, timeout=30).json()
    if not data:
        break

    df = pd.DataFrame(data, columns=[
        "Open time","Open","High","Low","Close","Volume",
        "Close time","Quote asset volume","Number of trades",
        "Taker buy base asset volume","Taker buy quote asset volume","Ignore"
    ])

    df["Date"] = pd.to_datetime(df["Open time"], unit="ms")

    df = df[["Date","Open","High","Low","Close","Volume"]]

    num_cols = ["Open","High","Low","Close","Volume"]
    df[num_cols] = df[num_cols].astype(float)

    chunks.append(df)

    # esto es lo que mejora lo que habia hecho el mija
    startTime = data[-1][6] + 1


btc = (pd.concat(chunks, ignore_index=True)
         .drop_duplicates(subset="Date")
         .sort_values("Date")
         .set_index("Date"))

btc.to_csv("btc_usdt_binance_daily.csv")
btc.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-10-04,122232.21,122800.0,121510.0,122391.0,8208.16678
2025-10-05,122390.99,125708.42,122136.0,123482.31,22043.097553
2025-10-06,123482.32,126199.63,123084.0,124658.54,19494.628793
2025-10-07,124658.54,125126.0,120574.94,121332.95,21633.99385
2025-10-08,121332.96,123350.0,121066.14,122775.15,10519.05602


In [4]:
btc.sample(n=100, random_state=42)


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-03-16,69499.84,70043.00,64780.00,65300.63,55926.953360
2025-06-21,103297.98,103982.64,100837.90,102120.01,11154.213320
2023-07-16,30289.52,30441.46,30064.29,30231.99,15760.128100
2019-01-06,3771.12,4027.71,3740.00,3987.60,36553.806709
2024-05-14,62940.09,63118.36,61142.77,61577.49,29088.720410
...,...,...,...,...,...
2022-06-23,19988.00,21233.00,19890.07,21110.13,83127.087160
2019-08-31,9582.76,9684.51,9420.75,9587.47,17130.290074
2018-03-08,9910.00,10099.00,9060.00,9271.64,41109.473226
2024-12-21,97805.44,99540.61,96398.39,97291.99,23483.541430


In [5]:
btc = pd.read_csv("btc_usdt_binance_daily.csv", parse_dates=["Date"], index_col="Date")
btc = btc[~btc.index.duplicated(keep="first")].asfreq("D").ffill()

display(btc.tail(), btc.describe())

fig = px.line(btc, x=btc.index, y="Close", title="BTC Close")
fig.show()


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-10-04,122232.21,122800.0,121510.0,122391.0,8208.16678
2025-10-05,122390.99,125708.42,122136.0,123482.31,22043.097553
2025-10-06,123482.32,126199.63,123084.0,124658.54,19494.628793
2025-10-07,124658.54,125126.0,120574.94,121332.95,21633.99385
2025-10-08,121332.96,123350.0,121066.14,122775.15,10519.05602


Unnamed: 0,Open,High,Low,Close,Volume
count,2975.0,2975.0,2975.0,2975.0,2975.0
mean,34146.26722,34890.365281,33358.582434,34186.152561,63642.387827
std,30707.296782,31217.171821,30196.401549,30745.363962,77606.074553
min,3188.01,3276.5,2817.0,3189.02,228.108068
25%,8987.395,9225.575,8737.5,9007.1,25185.909259
50%,23940.2,24599.59,23502.25,23954.05,40467.400638
75%,50050.87,51515.77,48600.0,50201.635,67436.597601
max,124658.54,126199.63,123084.0,124658.54,760705.362783


In [7]:
btc.shape



(2975, 5)

In [8]:
n_raw = btc.shape[0]

n_clean = btc.shape[0]

print(f"Registros originales: {n_raw}")
print(f"Registros después de limpieza: {n_clean}")
print(f"Diferencia: {n_raw - n_clean}")

if n_raw-n_clean == 0:
    print("Todos los registros del dataframe son distintos")


Registros originales: 2975
Registros después de limpieza: 2975
Diferencia: 0
Todos los registros del dataframe son distintos


In [9]:
# Volumen
fig = px.line(btc, x=btc.index, y="Volume", title="Volumen BTC (trading)")
fig.show()


In [10]:
btc["Return"] = btc["Close"].pct_change()
fig = px.histogram(
    btc.dropna(),      
    x="Return", 
    nbins=100, 
    opacity=0.7, 
    title="Distribución de retornos diarios BTC"
)
fig.show()


In [11]:

btc["Volatility30"] = btc["Return"].rolling(window=30).std()

fig = px.line(
    btc, 
    x=btc.index, 
    y="Volatility30", 
    title="Volatilidad histórica BTC (rolling 30 días)"
)
fig.show()


In [12]:
btc["MA7"]  = btc["Close"].rolling(window=7).mean()   
btc["MA30"] = btc["Close"].rolling(window=30).mean()  

In [13]:
import plotly.graph_objects as go

fig = go.Figure()
for col in ["Close", "MA7", "MA30"]:
    fig.add_trace(go.Scatter(x=btc.index, y=btc[col], mode="lines", name=col))
fig.update_layout(title="Precio BTC con medias móviles", xaxis_title="Fecha", yaxis_title="Precio")
fig.show()


In [14]:
fig = px.imshow(
    btc.corr(),
    text_auto=True,
    aspect="auto",
    title="Matriz de correlaciones"
)
fig.show()

In [15]:
min_close = btc["Close"].min()
max_close = btc["Close"].max()
mean_close = btc["Close"].mean()
closest_to_mean = btc.loc[(btc["Close"] - mean_close).abs().idxmin(), "Close"]

print(f"Valor más bajo (Close): {min_close}")
print(f"Valor más alto (Close): {max_close}")
print(f"Valor más cercano a la media (Close): {closest_to_mean}")

Valor más bajo (Close): 3189.02
Valor más alto (Close): 124658.54
Valor más cercano a la media (Close): 34220.01


In [16]:


startTime = 1502928000000      
endTime   = 4102444800000      
limit     = 1000

csv_dir = Path.cwd() / "CSVs-2"
chunks = []
while startTime < endTime:
    url = ("https://api.binance.com/api/v3/klines"
           f"?symbol=ETHUSDT&interval=1d&limit={limit}&startTime={startTime}")
    data = requests.get(url, timeout=30).json()
    if not data:
        break

    df = pd.DataFrame(data, columns=[
        "Open time","Open","High","Low","Close","Volume",
        "Close time","Quote asset volume","Number of trades",
        "Taker buy base asset volume","Taker buy quote asset volume","Ignore"
    ])

    df["Date"] = pd.to_datetime(df["Open time"], unit="ms")
    df = df[["Date","Open","High","Low","Close","Volume"]]

    # Convertir columnas numéricas
    num_cols = ["Open","High","Low","Close","Volume"]
    df[num_cols] = df[num_cols].astype(float)

    chunks.append(df)

 
    startTime = data[-1][6] + 1  

eth = (pd.concat(chunks, ignore_index=True)
         .drop_duplicates(subset="Date")
         .sort_values("Date")
         .set_index("Date"))


eth.to_csv(csv_dir / "eth_usdt_binance_daily.csv")

eth.head(), eth.tail()


(              Open    High     Low   Close      Volume
 Date                                                  
 2017-08-17  301.13  312.18  298.00  302.00  7030.71034
 2017-08-18  302.00  311.79  283.94  293.96  9537.84646
 2017-08-19  293.31  299.90  278.00  290.91  2146.19773
 2017-08-20  289.41  300.53  282.85  299.10  2510.13871
 2017-08-21  299.10  346.52  294.60  323.29  5219.44542,
                Open     High      Low    Close       Volume
 Date                                                       
 2025-10-04  4512.88  4517.93  4440.00  4487.15  199732.7088
 2025-10-05  4487.16  4618.17  4467.05  4514.32  506587.9614
 2025-10-06  4514.32  4735.93  4488.25  4684.01  456200.6528
 2025-10-07  4684.02  4755.00  4430.43  4447.70  640685.9319
 2025-10-08  4447.70  4522.60  4410.08  4495.25  241697.0023)

In [17]:
min_close = eth["Close"].min()
max_close = eth["Close"].max()
mean_close = eth["Close"].mean()
closest_to_mean = eth.loc[(eth["Close"] - mean_close).abs().idxmin(), "Close"]

print(f"Valor más bajo (Close): {min_close}")
print(f"Valor más alto (Close): {max_close}")
print(f"Valor más cercano a la media (Close): {closest_to_mean}")

Valor más bajo (Close): 83.76
Valor más alto (Close): 4832.07
Valor más cercano a la media (Close): 1597.76


In [18]:
csv_dir = Path.cwd() / "CSVs-2"
btc_path = csv_dir / "btc_usdt_binance_daily.csv"
eth_path = csv_dir / "eth_usdt_binance_daily.csv"

print("Leyendo BTC desde:", btc_path)
print("Leyendo ETH desde:", eth_path)

btc = pd.read_csv(btc_path, parse_dates=[0], index_col=0)
eth = pd.read_csv(eth_path, parse_dates=[0], index_col=0)

pair = pd.DataFrame({
    "BTC_Close": btc["Close"],
    "ETH_Close": eth["Close"],
}).dropna()

fig = px.line(
    pair.reset_index(),
    x="Date",
    y=["BTC_Close", "ETH_Close"],
    title="BTC vs ETH — Precios diarios"
)
fig.update_layout(xaxis_title="Fecha", yaxis_title="Precio (USDT)")
fig.show()

rets = pair.pct_change().dropna()
corr = rets.corr().loc["BTC_Close", "ETH_Close"]
print(f"Correlación de retornos diarios BTC–ETH: {corr:.3f}")

fig2 = px.scatter(
    rets, x="BTC_Close", y="ETH_Close",
    trendline="ols",
    title=f"BTC vs ETH — Retornos diarios (corr={corr:.3f})"
)
fig2.update_layout(xaxis_title="Retorno BTC", yaxis_title="Retorno ETH")
fig2.show()

# === 7) Guardar dataset combinado ===
pair.to_csv(csv_dir / "btc_eth_close_aligned.csv")


Leyendo BTC desde: c:\ML_Bitcoin\EDA-2\CSVs-2\btc_usdt_binance_daily.csv
Leyendo ETH desde: c:\ML_Bitcoin\EDA-2\CSVs-2\eth_usdt_binance_daily.csv


Correlación de retornos diarios BTC–ETH: 0.777


In [20]:

# Definir carpeta donde están los CSVs de BTC y ETH, y cargar sus datos
csv_dir = Path.cwd() / "CSVs-2"
btc = pd.read_csv(csv_dir / "btc_usdt_binance_daily.csv", parse_dates=[0], index_col="Date")
eth = pd.read_csv(csv_dir / "eth_usdt_binance_daily.csv", parse_dates=[0], index_col="Date")

# Tomamos el rango de fechas común entre BTC y ETH para usarlo con los bonos
start, end = btc.index.min(), btc.index.max()

# Descargar los rendimientos de los Bonos del Tesoro de EE.UU. desde FRED
# (DGS10 = 10 años, DGS3MO = 3 meses, DGS30 = 30 años)
# Si FRED falla, usa ETFs equivalentes desde Yahoo Finance como proxies.
treasuries = None
try:
    t10 = web.DataReader("DGS10", "fred", start, end)   # 10-Year
    t3m = web.DataReader("DGS3MO", "fred", start, end)  # 3-Month
    t30 = web.DataReader("DGS30", "fred", start, end)   # 30-Year
    treasuries = pd.concat([t10, t3m, t30], axis=1).dropna()
    treasuries.columns = ["US10Y", "US3M", "US30Y"]
    print("Bonos descargados desde FRED.")
except Exception as e:
    print("No se pudo usar FRED:", e)
    try:
        data = yf.download(["TLT","IEF","SHY"], start=start.strftime("%Y-%m-%d"),
                           end=end.strftime("%Y-%m-%d"), progress=False, threads=False, auto_adjust=False)
        if isinstance(data.columns, pd.MultiIndex):
            data = data["Close"]
        treasuries = data.rename(columns={"TLT":"US30Y_proxy","IEF":"US10Y_proxy","SHY":"US3M_proxy"}).dropna()
        print("Bonos proxy descargados desde Yahoo (ETFs TLT/IEF/SHY).")
    except Exception as e2:
        print("También falló Yahoo:", e2)
        treasuries = pd.DataFrame()
# Unir en un mismo DataFrame:
# - Precio de cierre de BTC
# - Precio de cierre de ETH
# - Rendimientos de los bonos (FRED o proxies de ETFs)
assets = pd.DataFrame({
    "BTC": btc["Close"],
    "ETH": eth["Close"],
}).join(treasuries, how="inner")

# Guardar dataset combinado en CSV
out_path = csv_dir / "btc_eth_treasuries.csv"
assets.to_csv(out_path)
print("Dataset guardado en:", out_path)

# Normalizar todos los valores a base 100 para compararlos en la misma escala
# (ejemplo: todos arrancan en 100 el primer día)
normalized = assets / assets.iloc[0] * 100
normalized_plot = normalized.reset_index().rename(columns={"index": "Date"})

# Graficar la evolución relativa de BTC, ETH y los bonos (todos base=100)
fig = px.line(
    normalized_plot,
    x="Date",
    y=normalized.columns,
    title="BTC, ETH y Bonos del Tesoro de EE.UU. — Evolución normalizada (base=100)"
)
fig.update_layout(xaxis_title="Fecha", yaxis_title="Índice (base=100)")
fig.show()




Bonos descargados desde FRED.
Dataset guardado en: c:\ML_Bitcoin\EDA-2\CSVs-2\btc_eth_treasuries.csv


In [None]:
# FRED significa Federal Reserve Economic Data, y es una enorme base de datos económica mantenida por el Banco de la Reserva Federal de St. Louis (Estados Unidos).
# Nos hemos inclinado por usar FRED porque es una fuente confiable y ampliamente utilizada para datos económicos, incluyendo tasas de interés de bonos del gobierno, indicadores macroeconómicos, y otros datos financieros relevantes.
# Además de que yahoo finance no contenia la información o no la servia.

In [21]:
corr = rets.corr().round(3)  # redondeo a 3 decimales
fig = px.imshow(
    corr,
    text_auto=True,
    color_continuous_scale="RdBu_r",
    origin="lower",
    title="Matriz de correlaciones (BTC, ETH y Bonos del Tesoro)"
)
fig.show()

In [None]:
#BTC y ETH muestran alta correlación entre sí (~0.76), pero casi nula con los bonos del Tesoro, lo que confirma que las criptos se comportan como un mercado independiente. 
# En cambio, los bonos de largo plazo (10Y y 30Y) están fuertemente correlacionados (~0.93), reflejando su dependencia común de la política monetaria.

In [22]:
csv_dir = Path.cwd() / "CSVs-2"
btc = pd.read_csv(csv_dir / "btc_usdt_binance_daily.csv", parse_dates=["Date"], index_col="Date")
eth = pd.read_csv(csv_dir / "eth_usdt_binance_daily.csv", parse_dates=["Date"], index_col="Date")

start, end = btc.index.min(), btc.index.max()

try:
    dxy = web.DataReader("DTWEXBGS", "fred", start, end)
    dxy = dxy.rename(columns={"DTWEXBGS": "DXY"})
    print("DXY descargado correctamente desde FRED.")
except Exception as e:
    print("No se pudo descargar DXY desde FRED:", e)
    dxy = pd.DataFrame()

# === Combinar BTC, ETH y DXY ===
assets_dxy = pd.DataFrame({
    "BTC": btc["Close"],
    "ETH": eth["Close"],
}).join(dxy, how="inner")

out_path = csv_dir / "btc_eth_dxy.csv"
assets_dxy.to_csv(out_path)
print("Dataset guardado en:", out_path)

# === Normalización base=100 ===
if not assets_dxy.empty:
    normalized_dxy = assets_dxy / assets_dxy.iloc[0] * 100

    # Aseguramos que el índice se llame "Date" al resetear
    normalized_dxy = normalized_dxy.reset_index().rename(columns={"index": "Date"})

    # === Gráfico evolución ===
    fig = px.line(
        normalized_dxy,
        x="Date",
        y=["BTC", "ETH", "DXY"],
        title="BTC, ETH y DXY (FRED) — Evolución normalizada (base=100)"
    )
    fig.update_layout(xaxis_title="Fecha", yaxis_title="Índice (base=100)")
    fig.show()

    # === Matriz de correlaciones ===
    rets_dxy = assets_dxy.pct_change().dropna()
    corr_dxy = rets_dxy.corr().round(3)

    fig_corr = px.imshow(
        corr_dxy,
        text_auto=True,
        color_continuous_scale="RdBu_r",
        origin="lower",
        title="Matriz de correlaciones (BTC, ETH y DXY desde FRED)"
    )
    fig_corr.show()
else:
    print("No se pudo obtener DXY, dataset vacío.")


DXY descargado correctamente desde FRED.
Dataset guardado en: c:\ML_Bitcoin\EDA-2\CSVs-2\btc_eth_dxy.csv



The default fill_method='pad' in DataFrame.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.



In [23]:

# calculamos retornos diarios
rets_dxy = assets_dxy.pct_change().dropna()

# correlación rodante (90 días)
rolling_corr_btc = rets_dxy["BTC"].rolling(window=90).corr(rets_dxy["DXY"])
rolling_corr_eth = rets_dxy["ETH"].rolling(window=90).corr(rets_dxy["DXY"])

# unimos en un dataframe
rolling_corr = pd.DataFrame({
    "BTC vs DXY": rolling_corr_btc,
    "ETH vs DXY": rolling_corr_eth
})


fig = px.line(
    rolling_corr.reset_index(),
    x="index",
    y=rolling_corr.columns,
    title="Correlación rodante (90 días) — BTC/ETH vs DXY"
)
fig.update_layout(xaxis_title="Fecha", yaxis_title="Correlación")
fig.show()


The default fill_method='pad' in DataFrame.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.



In [None]:
# DXY es el índice del dólar estadounidense, que mide el valor del dólar en relación con una canasta de monedas extranjeras. 
# Nos interesa porque el valor del dólar puede influir en los precios de activos globales, incluyendo criptomonedas como BTC. 
# Por lo que si se refuerza el dólar, podría impactar negativamente en los precios de BTC, y viceversa.


In [24]:
import pandas as pd
from pathlib import Path


csv_dir = Path.cwd() / "CSVs-2"

# Cargar datasets principales
btc = pd.read_csv(csv_dir / "btc_usdt_binance_daily.csv", parse_dates=["Date"], index_col="Date")[["Close"]]
btc.rename(columns={"Close": "BTC_Close"}, inplace=True)

# Función para cargar y renombrar columna de cierre
def load_close(file_path, colname):
    df = pd.read_csv(file_path)

    # 1️⃣ Detectar columna de fecha (por nombre o posición)
    date_cols = [c for c in df.columns if str(c).lower() in ["date", "fecha", "time"]]
    if date_cols:
        df[date_cols[0]] = pd.to_datetime(df[date_cols[0]], errors="coerce", dayfirst=True)
        df = df.set_index(date_cols[0])
    else:
        # Si no hay columna con nombre, usar la primera si parece fecha
        first_col = df.columns[0]
        try:
            df[first_col] = pd.to_datetime(df[first_col], errors="coerce")
            if df[first_col].notna().any():
                df = df.set_index(first_col)
            else:
                print(f"⚠️ No hay fechas válidas en {file_path.name}, se usa índice numérico.")
                df.index = pd.date_range(end=pd.Timestamp.today(), periods=len(df), freq="D")
        except:
            print(f"⚠️ {file_path.name}: sin columna de fecha, se genera índice diario.")
            df.index = pd.date_range(end=pd.Timestamp.today(), periods=len(df), freq="D")

    # 2️⃣ Buscar columna de cierre o equivalente
    close_candidates = [c for c in df.columns if "close" in c.lower() or "cierre" in c.lower()]
    if not close_candidates:
        close_candidates = [df.columns[-1]]  # usar última columna si no hay 'close'

    df = df[[close_candidates[0]]].rename(columns={close_candidates[0]: colname})

    # 3️⃣ Asegurar que el índice sea único y ordenado
    df = df[~df.index.duplicated()].sort_index()

    return df

drivers = pd.concat([
    btc,
    load_close(csv_dir / "eth_usdt_binance_daily.csv", "ETH_Close"),
    load_close(csv_dir / "SPX to 16.9.2025.csv", "SPX"),
    load_close(csv_dir / "WTI to 15.9.2025.csv", "WTI"),
    load_close(csv_dir / "Brent to 15.9.2025.csv", "BRENT"),
    load_close(csv_dir / "XAU-USD to 16.9.2025.csv", "GOLD"),
    load_close(csv_dir / "btc_eth_dxy.csv", "DXY"),
    load_close(csv_dir / "btc_eth_treasuries.csv", "US10Y")
], axis=1)

drivers = drivers.asfreq("D").ffill().dropna(how="all")
drivers.tail()





Unnamed: 0,BTC_Close,ETH_Close,SPX,WTI,BRENT,GOLD,DXY,US10Y
2025-10-04,115786.17,4487.15,"0,47%","0,57%","0,91%","0,48%",119.6063,4.66
2025-10-05,115786.17,4514.32,"0,47%","0,57%","0,91%","0,48%",119.6063,4.66
2025-10-06,115786.17,4684.01,"0,47%","0,57%","0,91%","0,48%",119.6063,4.66
2025-10-07,115786.17,4447.7,"0,47%","0,57%","0,91%","0,48%",119.6063,4.66
2025-10-08,115786.17,4495.25,"0,47%","0,57%","0,91%","0,48%",119.6063,4.66


In [25]:
# === FEATURE ENGINEERING ===
import numpy as np
import pandas as pd

df = drivers.copy()

# 1️⃣ Limpieza: convertir todas las columnas a numéricas
for col in df.columns:
    df[col] = (
        df[col]
        .astype(str)
        .str.replace("%", "", regex=False)
        .str.replace(",", ".", regex=False)
    )
    df[col] = pd.to_numeric(df[col], errors="coerce")

# 2️⃣ Calcular log-retornos
df["BTC_ret"] = np.log(df["BTC_Close"]).diff()

for col in df.columns:
    if col != "BTC_Close":
        df[f"{col}_ret"] = np.log(df[col]).diff()

# 3️⃣ Lags y medias móviles
lags = [1, 2, 3, 5, 7, 14, 21]
for L in lags:
    df[f"BTC_lag{L}"] = df["BTC_Close"].shift(L)

df["MA7"] = df["BTC_Close"].rolling(7).mean()
df["MA30"] = df["BTC_Close"].rolling(30).mean()
df["Vol7"] = df["BTC_ret"].rolling(7).std()

# 4️⃣ Targets futuros (t+1 … t+7)
y = pd.DataFrame(index=df.index)
for h in range(1, 8):
    y[f"y_t+{h}"] = df["BTC_Close"].shift(-h)

# 5️⃣ Combinar features + targets
data = df.join(y).dropna()

X = data[[c for c in data.columns if not c.startswith("y_t+")]]
Y = data[[c for c in data.columns if c.startswith("y_t+")]]

print("✅ X shape:", X.shape)
print("✅ Y shape:", Y.shape)

data.head()


✅ X shape: (73, 27)
✅ Y shape: (73, 7)



divide by zero encountered in log


invalid value encountered in log


divide by zero encountered in log


invalid value encountered in log


divide by zero encountered in log


invalid value encountered in log


divide by zero encountered in log


invalid value encountered in log


divide by zero encountered in log


invalid value encountered in log



Unnamed: 0,BTC_Close,ETH_Close,SPX,WTI,BRENT,GOLD,DXY,US10Y,BTC_ret,ETH_Close_ret,...,MA7,MA30,Vol7,y_t+1,y_t+2,y_t+3,y_t+4,y_t+5,y_t+6,y_t+7
2017-10-14,5869.99,342.0,0.09,1.57,1.64,0.87,110.2669,2.81,0.038201,0.011883,...,5140.201429,4289.086,0.038843,5709.99,5760.02,5595.0,5512.06,5683.9,6010.01,6024.97
2018-01-13,14210.0,1388.02,0.67,0.86,0.88,1.21,108.9011,2.85,0.033634,0.101644,...,14506.921429,15231.356667,0.06285,13474.99,13539.93,10900.0,10988.79,10961.97,11474.98,12799.94
2018-01-28,11879.95,1251.96,1.19,0.92,0.14,0.16,106.6909,2.91,0.033288,0.112284,...,11220.634286,13132.806667,0.040443,11251.0,10237.51,10285.1,9224.52,8873.03,9199.96,8184.81
2018-03-03,11464.48,856.02,0.5,0.48,0.85,0.43,108.5357,3.14,0.037819,-0.000304,...,10604.754286,9555.071,0.034869,11515.0,11454.0,10716.48,9910.0,9271.64,9227.0,8770.22
2018-03-04,11515.0,866.66,0.5,0.48,0.85,0.43,108.5357,3.14,0.004397,0.012353,...,10879.754286,9643.136667,0.032746,11454.0,10716.48,9910.0,9271.64,9227.0,8770.22,9533.57


In [26]:
# === Entrenar KNN y generar predicciones (y_t+1) ===
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import pandas as pd

# Usamos el target a horizonte t+1
target = 'y_t+1'
# Aseguramos que X y Y existan en el notebook (se crean en celdas previas)
# X: features, Y: dataframe con columnas y_t+1..y_t+7
assert 'X' in globals(), 'Variable X no encontrada. Ejecuta la celda de feature engineering primero.'
assert 'Y' in globals(), 'Variable Y no encontrada. Ejecuta la celda de feature engineering primero.'

# Copiamos para evitar modificar objetos originales
X_local = X.copy()
y_local = Y[target].copy()

# Split manteniendo el orden temporal (shuffle=False)
X_train, X_test, y_train, y_test = train_test_split(X_local, y_local, test_size=0.2, shuffle=False)

print('Formas: X_train', X_train.shape, 'X_test', X_test.shape, 'y_train', y_train.shape, 'y_test', y_test.shape)

# Pipeline: escalado + KNN (k=5 por defecto)
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=5))
])

knn_pipeline.fit(X_train, y_train)

# Predicción sobre el conjunto de test
y_pred = knn_pipeline.predict(X_test)
# Convertir a Series con el mismo índice temporal para el gráfico
y_pred = pd.Series(y_pred, index=X_test.index)
# Asegurar que y_test también sea Series con el índice correcto
y_test = pd.Series(y_test.values, index=X_test.index)

# Métricas simples
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'KNN — MAE: {mae:.4f}, R2: {r2:.4f}')

# Dejamos y_pred e y_test en el entorno para la celda de plotting siguiente


Formas: X_train (58, 27) X_test (15, 27) y_train (58,) y_test (15,)
KNN — MAE: 18072.1580, R2: 0.2476


In [27]:
import plotly.express as px
import pandas as pd

# Armamos el DataFrame de resultados
df_pred = pd.DataFrame({
    "Fecha": y_test.index,
    "BTC Real": y_test.values,
    "BTC Predicho (KNN)": y_pred
})

# Gráfico interactivo
fig = px.line(
    df_pred,
    x="Fecha",
    y=["BTC Real", "BTC Predicho (KNN)"],
    title="📈 Predicción BTC — Modelo KNN (y_t+1)",
    labels={"value": "Precio BTC (USDT)", "Fecha": "Fecha"},
    template="plotly_dark"
)

# Personalización
fig.update_traces(mode="lines+markers")
fig.update_layout(
    legend_title_text="Serie",
    hovermode="x unified",
    title_x=0.5,
    font=dict(size=14)
)

fig.show()
