En este notebook voy a implementar un modelo de regresión logística aummentando los predictores, además de los rendimientos rezagados, con los indicadores RSI, SMA, MACD y BB.  
También se agregan algunas estrategias que se irán explicando en los comentarios

In [2]:
# Se realizan las importaciones necesarias
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
plt.style.use("seaborn-v0_8-whitegrid")

In [9]:
# Función para calcular indicadores técnicos
def calculate_technical_indicators(df, price_col="price"):
    # SMA (Simple Moving Average) - Media móvil simple (ventana de 20 periodos)
    df["SMA"] = df[price_col].rolling(window=20).mean()

    # MACD (Moving Average Convergence Divergence)
    exp1 = df[price_col].ewm(span=12, adjust=False).mean()  # EMA rápida
    exp2 = df[price_col].ewm(span=26, adjust=False).mean()  # EMA lenta
    df["MACD"] = exp1 - exp2
    df["MACD_Signal"] = df["MACD"].ewm(span=9, adjust=False).mean()  # Línea de señal

    # RSI (Relative Strength Index)
    delta = df[price_col].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df["RSI"] = 100 - (100 / (1 + rs))

    # Bandas de Bollinger (ventana de 20 periodos, 2 desviaciones estándar)
    df["BB_Middle"] = df[price_col].rolling(window=20).mean()
    df["BB_Std"] = df[price_col].rolling(window=20).std()
    df["BB_Upper"] = df["BB_Middle"] + 2 * df["BB_Std"]
    df["BB_Lower"] = df["BB_Middle"] - 2 * df["BB_Std"]

    return df


In [10]:
# función para cargar los datos
def carga_archivo(archivo, tipo):
    data = pd.read_csv(archivo)
    if tipo == "b":
        data = data.rename(columns = {"Close time": "time", "Close": "price"})
    data["time"] = pd.to_datetime(data["time"])
    data.set_index("time", inplace=True)
    return data


In [11]:
# seleccionar el archivo de datos
archivo = "five_minute.csv"
tipo = "t"

In [12]:
"""Cargar el archivo de datos
Voy a realizar cálculos con diferentes archivos de datos que están en el directorio actual y corresponden a
datos del tutorial y a datos de Binance.
Los datos del tutorial están en el archivo "five_minute.csv" y los datos de Binance están en  "EURUSDT_2020_5MIN.csv" """

df = carga_archivo(archivo, tipo)


In [13]:
df

Unnamed: 0_level_0,price
time,Unnamed: 1_level_1
2019-01-01 22:00:00+00:00,1.146580
2019-01-01 22:05:00+00:00,1.146350
2019-01-01 22:10:00+00:00,1.146320
2019-01-01 22:15:00+00:00,1.146320
2019-01-01 22:20:00+00:00,1.146530
...,...
2019-12-30 23:35:00+00:00,1.120180
2019-12-30 23:40:00+00:00,1.120210
2019-12-30 23:45:00+00:00,1.120295
2019-12-30 23:50:00+00:00,1.120275


In [16]:
# Calcular los rendimientos logarítmicos
df["returns"] = np.log(df["price"] / df["price"].shift(1))


In [17]:
# Calcular la dirección del mercado (tres clases: +1, 0, -1)
df["direction"] = np.sign(df["returns"])

In [18]:
# Verificar la distribución de clases
print("\nDistribución de clases en 'direction':")
print(df["direction"].value_counts())


Distribución de clases en 'direction':
direction
 1.0    36058
-1.0    35702
 0.0     1959
Name: count, dtype: int64


In [19]:
# Calcular indicadores técnicos
df = calculate_technical_indicators(df)


In [21]:
# Crear cinco predictores de rendimientos rezagados
lags = 5
cols = []
for lag in range(1, lags + 1):
    col = f"lag{lag}"
    df[col] = df["returns"].shift(lag)
    cols.append(col)


In [22]:
# Agregar indicadores técnicos como predictores
technical_indicators = ["SMA", "MACD", "MACD_Signal", "RSI", "BB_Upper", "BB_Lower"]
cols.extend(technical_indicators)

In [23]:
# Eliminar filas con valores NaN
df.dropna(inplace=True)

In [24]:
df

Unnamed: 0_level_0,price,SMA,MACD,MACD_Signal,RSI,BB_Middle,BB_Std,BB_Upper,BB_Lower,returns,direction,lag1,lag2,lag3,lag4,lag5
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2019-01-01 23:35:00+00:00,1.146630,1.146497,0.000027,0.000004,58.378378,1.146497,0.000122,1.146740,1.146253,0.000065,1.0,-0.000083,-0.000039,0.000022,0.000065,0.000057
2019-01-01 23:40:00+00:00,1.146500,1.146493,0.000019,0.000007,52.173913,1.146493,0.000120,1.146732,1.146253,-0.000113,-1.0,0.000065,-0.000083,-0.000039,0.000022,0.000065
2019-01-01 23:45:00+00:00,1.146210,1.146486,-0.000010,0.000004,40.754717,1.146486,0.000132,1.146750,1.146221,-0.000253,-1.0,-0.000113,0.000065,-0.000083,-0.000039,0.000022
2019-01-01 23:50:00+00:00,1.146180,1.146478,-0.000035,-0.000004,42.519685,1.146478,0.000145,1.146768,1.146189,-0.000026,-1.0,-0.000253,-0.000113,0.000065,-0.000083,-0.000039
2019-01-01 23:55:00+00:00,1.146145,1.146470,-0.000057,-0.000015,43.200000,1.146470,0.000159,1.146788,1.146151,-0.000031,-1.0,-0.000026,-0.000253,-0.000113,0.000065,-0.000083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-30 23:35:00+00:00,1.120180,1.120160,0.000032,0.000017,61.000000,1.120160,0.000158,1.120475,1.119845,-0.000004,-1.0,-0.000112,-0.000018,0.000022,-0.000004,0.000089
2019-12-30 23:40:00+00:00,1.120210,1.120167,0.000030,0.000020,59.375000,1.120167,0.000156,1.120479,1.119856,0.000027,1.0,-0.000004,-0.000112,-0.000018,0.000022,-0.000004
2019-12-30 23:45:00+00:00,1.120295,1.120173,0.000035,0.000023,62.679426,1.120173,0.000159,1.120491,1.119856,0.000076,1.0,0.000027,-0.000004,-0.000112,-0.000018,0.000022
2019-12-30 23:50:00+00:00,1.120275,1.120166,0.000038,0.000026,60.576923,1.120166,0.000148,1.120462,1.119869,-0.000018,-1.0,0.000076,0.000027,-0.000004,-0.000112,-0.000018


In [25]:
cols

['lag1',
 'lag2',
 'lag3',
 'lag4',
 'lag5',
 'SMA',
 'MACD',
 'MACD_Signal',
 'RSI',
 'BB_Upper',
 'BB_Lower']

In [26]:
# Separar los datos en predictores (X) y variable objetivo (y)
X = df[cols]
y = df["direction"]


In [27]:
# Escalar los predictores
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [28]:
# Dividir los datos en entrenamiento y prueba (80% entrenamiento, 20% prueba)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

In [29]:
# Entrenar el modelo de regresión logística con pesos balanceados
lm = LogisticRegression(
    multi_class="multinomial", solver="lbfgs", max_iter=10000, class_weight="balanced"
)

In [30]:
# Ajustar el modelo
lm.fit(X_train, y_train)



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,10000


In [31]:
# Predecir en el conjunto de prueba
y_pred = lm.predict(X_test)

In [32]:
# Evaluar el modelo
print("\nReporte de clasificación (Conjunto de prueba):")
print(classification_report(y_test, y_pred, zero_division=0))
print("\nMatriz de confusión (Conjunto de prueba):")
print(confusion_matrix(y_test, y_pred))


Reporte de clasificación (Conjunto de prueba):
              precision    recall  f1-score   support

        -1.0       0.62      0.42      0.50      7139
         0.0       0.04      0.59      0.08       391
         1.0       0.60      0.38      0.47      7211

    accuracy                           0.40     14741
   macro avg       0.42      0.46      0.35     14741
weighted avg       0.60      0.40      0.47     14741


Matriz de confusión (Conjunto de prueba):
[[2964 2452 1723]
 [  85  229   77]
 [1745 2710 2756]]
