In [1]:
import pandas as pd
import ta 
import optuna 
import time
import numpy as np
from multiprocessing import Pool
from itertools import combinations, chain 
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [None]:
data_1d_train = pd.read_csv("../data/aapl_1d_train.csv")
data_1d_train = data_1d_train.dropna()

In [None]:
data_1m_train = pd.read_csv("../data/aapl_1m_train.csv")
data_1m_train = data_1m_train.dropna()

In [None]:
data_1h_train = pd.read_csv("../data/aapl_1h_train.csv")
data_1h_train = data_1h_train.dropna()

In [None]:
data_5m_train = pd.read_csv("../data/aapl_5m_train.csv")
data_5m_train = data_5m_train.dropna()

In [None]:
def powerset(s):
    return chain.from_iterable(combinations(s,r) for r in range(1,len(s)+1))

In [None]:
def file_features(data, ds_type: str):
    data1=pd.DataFrame()
    #Calcular indicadores tecnicos
    cmf_data = ta.volume.ChaikinMoneyFlowIndicator(data.High, data.Low, data.Close, data.Volume, window = 14)
    rsi_data = ta.momentum.RSIIndicator(data.Close, window=14)
    
    data1["CMF"] = cmf_data.chaikin_money_flow()
    data1["RSI"] = rsi_data.rsi()
    # Calcular la volatilidad
    data1['Volatility'] = data['High'] - data['Low']
    data1['Close_Lag0'] = data['Close']
    # Calcular las tendencias
    for i in range(1, 5 + 1):
        data1[f'Close_Lag{i}'] = data['Close'].shift(i)
    #Variable ded respuesta
    if ds_type == "buy":
        data1['Response'] = (data['Close'] < data['Close'].shift(-10))
    else:
        data1['Response'] = (data['Close'] > data['Close'].shift(-10))
    
    data1 = data1.drop(data1.index[:30])
    data1 = data1.drop(data1.index[-30:])
    data1.reset_index(drop=True, inplace=True)
    
    return data1

In [None]:
dataresult_long_1d_train = file_features(data_1d_train, ds_type="buy")
dataresult_short_1d_train = file_features(data_1d_train, ds_type="sell")
dataresult_long_1m_train = file_features(data_1m_train, ds_type="buy")
dataresult_short_1m_train = file_features(data_1m_train, ds_type="sell")
dataresult_long_1h_train = file_features(data_1h_train, ds_type="buy")
dataresult_short_1h_train = file_features(data_1h_train, ds_type="sell")
dataresult_long_5m_train = file_features(data_5m_train, ds_type="buy")
dataresult_short_5m_train = file_features(data_5m_train, ds_type="sell")

In [None]:
def objective_log_regresor(trial, data):
    # Dividir los datos en conjuntos de entrenamiento y prueba
    X = data.iloc[:, :-1]
    # Selecciona la variable objetivo
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    
    # Definir los parámetros a optimizar
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    C = trial.suggest_loguniform('C', 0.001, 1000)
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    # Crear el modelo de regresión logística con los parámetros sugeridos
    model = LogisticRegression(penalty=penalty, C=C, solver=solver, max_iter=10_000, random_state=123)
    # Entrenar el modelo
    model.fit(X_train, y_train)
    # Calcular la precisión en el conjunto de prueba
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy
##%%
def objective_svm(trial, data):
    # Dividir los datos en conjuntos de entrenamiento y prueba
    X = data.iloc[:, :-1]
    # Selecciona la variable objetivo
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Definir los parámetros a optimizar
    C = trial.suggest_loguniform('C', 0.001, 1000)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    if kernel == 'poly':
        degree = trial.suggest_int('degree', 2, 5)
    else:
        degree = 3  # Valor predeterminado si el kernel no es 'poly'
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto']) if kernel in ['rbf', 'poly', 'sigmoid'] else 'scale'
    # Crear el modelo SVM con los parámetros sugeridos
    model = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, max_iter=100_000, random_state=123)
    # Entrenar el modelo
    model.fit(X_train, y_train)
    # Calcular la precisión en el conjunto de prueba
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy
##%%
def objective_xgboost(trial, data):
    data = data.copy()
    # Dividir los datos en conjuntos de entrenamiento y prueba
    X = data.iloc[:, :-1]
    # Selecciona la variable objetivo
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    # Definir los parámetros a optimizar
    n_estimators = trial.suggest_int('n_estimators', 100, 1000, step=100)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.5)
    subsample = trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1)
    colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1)
    # Crear el modelo XGBoost con los parámetros sugeridos
    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=123
    )
    # Entrenar el modelo
    model.fit(X_train, y_train)
    # Calcular la precisión en el conjunto de prueba
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy
##%%
def optimize_params_log_regresor(data):
    # Crear un estudio Optuna para la optimización
    study = optuna.create_study(direction='maximize')
    
    # Función objetivo con el dataset como parámetro fijo
    objective_fn = lambda trial: objective_log_regresor(trial, data)
    
    # Ejecutar la optimización
    study.optimize(objective_fn, n_trials=2)

    # Obtener los mejores parámetros
    best_params = study.best_params
    best_accuracy = study.best_value

    return best_params, best_accuracy
##%%
def optimize_params_svm(data):
    # Crear un estudio Optuna para la optimización
    study = optuna.create_study(direction='maximize')
    
    # Función objetivo con el dataset como parámetro fijo
    objective_fn = lambda trial: objective_svm(trial, data)
    
    # Ejecutar la optimización
    study.optimize(objective_fn, n_trials=2)

    # Obtener los mejores parámetros
    best_params = study.best_params
    best_accuracy = study.best_value

    return best_params, best_accuracy
##%%
def optimize_params_xgboost(data):
    # Crear un estudio Optuna para la optimización
    study = optuna.create_study(direction='maximize')
    
    # Función objetivo con el dataset como parámetro fijo
    objective_fn = lambda trial: objective_xgboost(trial, data)
    
    # Ejecutar la optimización
    study.optimize(objective_fn, n_trials=2)

    # Obtener los mejores parámetros
    best_params = study.best_params
    best_accuracy = study.best_value

    return best_params, best_accuracy
##%%
def optimize_params(data):
    # Optimización de regresión logística
    best_params_lr, best_accuracy_lr = optimize_params_log_regresor(data)
    print("Mejores parámetros de regresión logística:", best_params_lr)
    print("Precisión del modelo de regresión logística:", best_accuracy_lr)
    # Optimización de SVM
    best_params_svm, best_accuracy_svm = optimize_params_svm(data)
    print("Mejores parámetros de SVM:", best_params_svm)
    print("Precisión del modelo de SVM:", best_accuracy_svm)
    # Optimización de XGBoost
    best_params_xgb, best_accuracy_xgb = optimize_params_xgboost(data)
    print("Mejores parámetros de XGBoost:", best_params_xgb)
    print("Precisión del modelo de XGBoost:", best_accuracy_xgb)

In [None]:
params_1d_long = optimize_params(dataresult_long_1d_train)
params_1d_short = optimize_params(dataresult_short_1d_train)
params_1m_long = optimize_params(dataresult_long_1m_train)
params_1m_short = optimize_params(dataresult_short_1m_train)
params_1h_long = optimize_params(dataresult_long_1h_train)
params_1h_short = optimize_params(dataresult_short_1h_train)
params_5m_long = optimize_params(dataresult_long_5m_train)
params_5m_short = optimize_params(dataresult_short_5m_train)

[I 2024-03-09 15:13:14,709] A new study created in memory with name: no-name-a0ef3fc5-4553-4b49-9fdf-d2cd01c84eaf
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-09 15:13:14,729] Trial 0 finished with value: 0.6555023923444976 and parameters: {'penalty': 'l2', 'C': 0.02054362589578733, 'solver': 'liblinear'}. Best is trial 0 with value: 0.6555023923444976.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-09 15:13:15,194] Trial 1 finished with value: 0.6631578947368421 and parameters: {'penalty': 'l1', 'C': 0.008798139830459064, 'solver': 'saga'}. Best is trial 1 with value: 0.6631578947368421.
[I 2024-03-09 15:13:15,194] A new study created in memory with name: no-name-ca166ff5-a6c9-45ab-8658-b2344d48553b
  C = trial.suggest_loguniform('C', 0.001, 1000)


Mejores parámetros de regresión logística: {'penalty': 'l1', 'C': 0.008798139830459064, 'solver': 'saga'}
Precisión del modelo de regresión logística: 0.6631578947368421


[I 2024-03-09 15:13:15,806] Trial 0 finished with value: 0.6593301435406699 and parameters: {'C': 37.29423057590281, 'kernel': 'sigmoid', 'gamma': 'auto'}. Best is trial 0 with value: 0.6593301435406699.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-09 15:13:17,077] Trial 1 finished with value: 0.6593301435406699 and parameters: {'C': 0.0019177197723385766, 'kernel': 'linear'}. Best is trial 0 with value: 0.6593301435406699.
[I 2024-03-09 15:13:17,077] A new study created in memory with name: no-name-4216f604-c1d5-4c41-b0c4-81d62ff2df36
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.5)
  subsample = trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1)
  colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1)


Mejores parámetros de SVM: {'C': 37.29423057590281, 'kernel': 'sigmoid', 'gamma': 'auto'}
Precisión del modelo de SVM: 0.6593301435406699


[I 2024-03-09 15:13:18,035] Trial 0 finished with value: 0.5291866028708134 and parameters: {'n_estimators': 1000, 'max_depth': 8, 'learning_rate': 0.4621973561135585, 'subsample': 1.0, 'colsample_bytree': 0.8}. Best is trial 0 with value: 0.5291866028708134.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.5)
  subsample = trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1)
  colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1)
[I 2024-03-09 15:13:18,193] Trial 1 finished with value: 0.6181818181818182 and parameters: {'n_estimators': 100, 'max_depth': 8, 'learning_rate': 0.02353186669766063, 'subsample': 0.8, 'colsample_bytree': 1.0}. Best is trial 1 with value: 0.6181818181818182.
[I 2024-03-09 15:13:18,193] A new study created in memory with name: no-name-68759755-0447-425c-a824-88a93d66dca2
  C = trial.suggest_loguniform('C', 0.001, 1000)


Mejores parámetros de XGBoost: {'n_estimators': 100, 'max_depth': 8, 'learning_rate': 0.02353186669766063, 'subsample': 0.8, 'colsample_bytree': 1.0}
Precisión del modelo de XGBoost: 0.6181818181818182


[I 2024-03-09 15:13:19,380] Trial 0 finished with value: 0.6602870813397129 and parameters: {'penalty': 'l1', 'C': 391.6055943510264, 'solver': 'liblinear'}. Best is trial 0 with value: 0.6602870813397129.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-09 15:13:25,034] Trial 1 finished with value: 0.6593301435406699 and parameters: {'penalty': 'l1', 'C': 40.27133752652738, 'solver': 'saga'}. Best is trial 0 with value: 0.6602870813397129.
[I 2024-03-09 15:13:25,034] A new study created in memory with name: no-name-55f014a6-2fa6-4e9a-b577-d86423366854
  C = trial.suggest_loguniform('C', 0.001, 1000)


Mejores parámetros de regresión logística: {'penalty': 'l1', 'C': 391.6055943510264, 'solver': 'liblinear'}
Precisión del modelo de regresión logística: 0.6602870813397129


[I 2024-03-09 15:13:26,391] Trial 0 finished with value: 0.661244019138756 and parameters: {'C': 34.154290341054214, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.661244019138756.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-09 15:13:28,003] Trial 1 finished with value: 0.5550239234449761 and parameters: {'C': 0.13634823458733597, 'kernel': 'linear'}. Best is trial 0 with value: 0.661244019138756.
[I 2024-03-09 15:13:28,003] A new study created in memory with name: no-name-2ccf73b5-62c2-4ebc-909b-d1b3e04fe30d
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.5)
  subsample = trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1)
  colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1)


Mejores parámetros de SVM: {'C': 34.154290341054214, 'kernel': 'rbf', 'gamma': 'scale'}
Precisión del modelo de SVM: 0.661244019138756


[I 2024-03-09 15:13:28,346] Trial 0 finished with value: 0.5732057416267943 and parameters: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.01845611375279615, 'subsample': 0.7, 'colsample_bytree': 0.9}. Best is trial 0 with value: 0.5732057416267943.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.5)
  subsample = trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1)
  colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1)
[I 2024-03-09 15:13:29,306] Trial 1 finished with value: 0.5607655502392345 and parameters: {'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.06786781783358023, 'subsample': 0.6, 'colsample_bytree': 1.0}. Best is trial 0 with value: 0.5732057416267943.
[I 2024-03-09 15:13:29,307] A new study created in memory with name: no-name-4aa093f2-eb91-4323-a32a-c31c05b28131
  C = trial.suggest_loguniform('C', 0.001, 1000)


Mejores parámetros de XGBoost: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.01845611375279615, 'subsample': 0.7, 'colsample_bytree': 0.9}
Precisión del modelo de XGBoost: 0.5732057416267943


Escogemos los mejores parametros, en este caso el porcentaje de accuracy no representa gran diferencia entre todos los data sets, por lo que escogemos el dataset de 1d, para trabajar de manera mas eficiente.

In [None]:
def buy_signals(data):
    buy_signals = pd.DataFrame()
    # Selecciona las características
    X = data.iloc[:, :-1]
    # Selecciona la variable objetivo
    y = data.iloc[:, -1]

    # Crear modelos con los mejores parámetros encontrados para cada algoritmo
    best_logistic_model = LogisticRegression(penalty= 'l1', C= 0.008798139830459064, solver= 'saga')
    best_svm_model = SVC(C= 37.29423057590281, kernel= 'sigmoid', gamma= 'auto')
    best_xgboost_model = XGBClassifier(n_estimators= 100, max_depth= 8, learning_rate= 0.02353186669766063, subsample= 0.8, colsample_bytree= 1.0)

    # Entrenar los modelos con todo el conjunto de datos original
    best_logistic_model.fit(X, y)
    best_svm_model.fit(X, y)
    best_xgboost_model.fit(X, y)

    # Realizar predicciones en el conjunto de datos original
    predictions_lr = best_logistic_model.predict(X)
    predictions_svm = best_svm_model.predict(X)
    predictions_xgboost = best_xgboost_model.predict(X)
    predictions_xgboost_bool = predictions_xgboost.astype(bool)


    # Agregar las predicciones como nuevas columnas al conjunto de datos original
    buy_signals['predicciones_lr'] = predictions_lr
    buy_signals['predicciones_svm'] = predictions_svm
    buy_signals['predicciones_xgboost'] = predictions_xgboost_bool

    return buy_signals

In [None]:
def sell_signals(data):
    sell_signals = pd.DataFrame()
    # Selecciona las características
    X = data.iloc[:, :-1]
    # Selecciona la variable objetivo
    y = data.iloc[:, -1]

    # Crear modelos con los mejores parámetros encontrados para cada algoritmo
    best_logistic_model = LogisticRegression(penalty='l1', C=391.6055943510264, solver='liblinear')
    best_svm_model = SVC(C=34.154290341054214, kernel='rbf', gamma='scale')
    best_xgboost_model = XGBClassifier(n_estimators=300, max_depth=7, learning_rate=0.01845611375279615, subsample=0.7,
                                       colsample_bytree=0.9)

    # Entrenar los modelos con todo el conjunto de datos original
    best_logistic_model.fit(X, y)
    best_svm_model.fit(X, y)
    best_xgboost_model.fit(X, y)

    # Realizar predicciones en el conjunto de datos original
    predictions_lr = best_logistic_model.predict(X)
    predictions_svm = best_svm_model.predict(X)
    predictions_xgboost = best_xgboost_model.predict(X)
    predictions_xgboost_bool = predictions_xgboost.astype(bool)

    # Agregar las predicciones como nuevas columnas al conjunto de datos original
    sell_signals['predicciones_lr'] = predictions_lr
    sell_signals['predicciones_svm'] = predictions_svm
    sell_signals['predicciones_xgboost'] = predictions_xgboost_bool

    return sell_signals

In [None]:
global_buy_signals = buy_signals(dataresult_long_1d_train)
global_sell_signals = sell_signals(dataresult_short_1d_train)

In [None]:
def backtest(data, buy_signals, sell_signals, stop_loss, take_profit, n_shares):
            history = []
            active_operations = []
            cash = 1_000_000
            com = 1.25 / 100

            for i, row in data.iterrows():
                # close active operation
                active_op_temp = []
                for operation in active_operations:
                    if operation["stop_loss"] > row.Close:
                        cash += (row.Close * operation["n_shares"]) * (1 - com)
                    elif operation["take_profit"] < row.Close:
                        cash += (row.Close * operation["n_shares"]) * (1 - com)
                    else:
                        active_op_temp.append(operation)
                active_operations = active_op_temp

                # check if we have enough cash
                if cash < (row.Close * (1 + com)):
                    asset_vals = sum([operation["n_shares"] * row.Close for operation in active_operations])
                    portfolio_value = cash + asset_vals
                    continue

                # Apply buy signals
                if buy_signals.loc[i].any():
                    active_operations.append({
                        "bought": row.Close,
                        "n_shares": n_shares,
                        "stop_loss": row.Close * stop_loss,
                        "take_profit": row.Close * take_profit
                    })

                    cash -= row.Close * (1 + com) * n_shares

                # Apply sell signals
                if sell_signals.loc[i].any():
                    active_op_temp = []
                    for operation in active_operations:
                        if operation["take_profit"] < row.Close or operation["stop_loss"] > row.Close:
                            cash += (row.Close * operation["n_shares"]) * (1 - com)
                        else:
                            active_op_temp.append(operation)
                    active_operations = active_op_temp

                asset_vals = sum([operation["n_shares"] * row.Close for operation in active_operations])
                portfolio_value = cash + asset_vals

            return portfolio_value

In [None]:
def optimize(trial, strategy, data):
    portfolio_value = 0

    stop_loss = trial.suggest_float("stop_loss", 0.80, 0.90)
    take_profit = trial.suggest_float("take_profit", 1.01, 1.10)
    n_shares = trial.suggest_int("n_shares", 20, 50)

    strat_params = {}

    buy_signals = pd.DataFrame()
    sell_signals = pd.DataFrame()

    if "logistic" in strategy:
        buy_signals["logistic"] = global_buy_signals["predicciones_lr"]
        sell_signals["logistic"] = global_sell_signals["predicciones_lr"]
        
    if "svm" in strategy:
        buy_signals["svm"] = global_buy_signals["predicciones_svm"]
        sell_signals["svm"] = global_sell_signals["predicciones_svm"]
        
    if "xg" in strategy:
        buy_signals["xg"] = global_buy_signals["predicciones_xgboost"]
        sell_signals["xg"] = global_sell_signals["predicciones_xgboost"]
    
    return backtest(data, buy_signals, sell_signals, stop_loss, take_profit, n_shares)

In [None]:
def optimize_file(data):
    data = data.drop(data.index[:30])
    data = data.drop(data.index[-30:])
    data.reset_index(drop=True, inplace=True)
    strategies = list(powerset(["logistic", "svm", "xg"]))
    best_strat = None
    best_val = -1
    best_params = None

    for strat in strategies:
        study = optuna.create_study(direction="maximize")
        study.optimize(lambda x: optimize(x, strat, data), n_trials=15)
        value = study.best_value
        if value > best_val:
            best_val = value
            best_strat = strat
            best_params = study.best_params
    print(study.best_value)
    print(best_strat)
    print(best_params)

    return {"file": data,
            "strat": best_strat,
            "value": best_val,
            "params": best_params}

In [None]:
file_1d_test = optimize_file(data_1d_train)