In [1]:
import pandas as pd
import ta 
import optuna 
import time
import numpy as np
from multiprocessing import Pool
from itertools import combinations, chain 
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score



In [2]:
def powerset(s):
    return chain.from_iterable(combinations(s,r) for r in range(1,len(s)+1))

In [3]:
def file_features(data, ds_type: str):
    data1=pd.DataFrame()
    #Calcular indicadores tecnicos
    cmf_data = ta.volume.ChaikinMoneyFlowIndicator(data.High, data.Low, data.Close, data.Volume, window = 14)
    rsi_data = ta.momentum.RSIIndicator(data.Close, window=14)
    
    data1["CMF"] = cmf_data.chaikin_money_flow()
    data1["RSI"] = rsi_data.rsi()
    # Calcular la volatilidad
    data1['Volatility'] = data['High'] - data['Low']
    data1['Close_Lag0'] = data['Close']
    # Calcular las tendencias
    for i in range(1, 5 + 1):
        data1[f'Close_Lag{i}'] = data['Close'].shift(i)
    #Variable ded respuesta
    if ds_type == "buy":
        data1['Response'] = (data['Close'] < data['Close'].shift(-10))
    else:
        data1['Response'] = (data['Close'] > data['Close'].shift(-10))
    
    data1 = data1.drop(data1.index[:30])
    data1 = data1.drop(data1.index[-30:])
    data1.reset_index(drop=True, inplace=True)
    
    return data1

In [4]:
def objective_log_regresor(trial, data):
    # Dividir los datos en conjuntos de entrenamiento y prueba
    X = data.iloc[:, :-1]
    # Selecciona la variable objetivo
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    
    # Definir los parámetros a optimizar
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    C = trial.suggest_loguniform('C', 0.001, 1000)
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    # Crear el modelo de regresión logística con los parámetros sugeridos
    model = LogisticRegression(penalty=penalty, C=C, solver=solver, max_iter=10_000, random_state=123)
    # Entrenar el modelo
    model.fit(X_train, y_train)
    # Calcular la precisión en el conjunto de prueba
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [5]:
def objective_svm(trial, data):
    # Dividir los datos en conjuntos de entrenamiento y prueba
    X = data.iloc[:, :-1]
    # Selecciona la variable objetivo
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Definir los parámetros a optimizar
    C = trial.suggest_loguniform('C', 0.001, 1000)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    if kernel == 'poly':
        degree = trial.suggest_int('degree', 2, 5)
    else:
        degree = 3  # Valor predeterminado si el kernel no es 'poly'
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto']) if kernel in ['rbf', 'poly', 'sigmoid'] else 'scale'
    # Crear el modelo SVM con los parámetros sugeridos
    model = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, max_iter=100_000, random_state=123)
    # Entrenar el modelo
    model.fit(X_train, y_train)
    # Calcular la precisión en el conjunto de prueba
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [6]:
def objective_xgboost(trial, data):
    data = data.copy()
    # Dividir los datos en conjuntos de entrenamiento y prueba
    X = data.iloc[:, :-1]
    # Selecciona la variable objetivo
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    # Definir los parámetros a optimizar
    n_estimators = trial.suggest_int('n_estimators', 100, 1000, step=100)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.5)
    subsample = trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1)
    colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1)
    # Crear el modelo XGBoost con los parámetros sugeridos
    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=123
    )
    # Entrenar el modelo
    model.fit(X_train, y_train)
    # Calcular la precisión en el conjunto de prueba
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [7]:
def optimize_params_log_regresor(data):
    # Crear un estudio Optuna para la optimización
    study = optuna.create_study(direction='maximize')
    
    # Función objetivo con el dataset como parámetro fijo
    objective_fn = lambda trial: objective_log_regresor(trial, data)
    
    # Ejecutar la optimización
    study.optimize(objective_fn, n_trials=25)

    # Obtener los mejores parámetros
    best_params = study.best_params
    best_accuracy = study.best_value

    return best_params, best_accuracy

In [8]:
def optimize_params_svm(data):
    # Crear un estudio Optuna para la optimización
    study = optuna.create_study(direction='maximize')
    
    # Función objetivo con el dataset como parámetro fijo
    objective_fn = lambda trial: objective_svm(trial, data)
    
    # Ejecutar la optimización
    study.optimize(objective_fn, n_trials=25)

    # Obtener los mejores parámetros
    best_params = study.best_params
    best_accuracy = study.best_value

    return best_params, best_accuracy

In [9]:
def optimize_params_xgboost(data):
    # Crear un estudio Optuna para la optimización
    study = optuna.create_study(direction='maximize')
    
    # Función objetivo con el dataset como parámetro fijo
    objective_fn = lambda trial: objective_xgboost(trial, data)
    
    # Ejecutar la optimización
    study.optimize(objective_fn, n_trials=25)

    # Obtener los mejores parámetros
    best_params = study.best_params
    best_accuracy = study.best_value

    return best_params, best_accuracy

In [10]:
def optimize_params(data):
    # Optimización de regresión logística
    best_params_lr, best_accuracy_lr = optimize_params_log_regresor(data)
    print("Mejores parámetros de regresión logística:", best_params_lr)
    print("Precisión del modelo de regresión logística:", best_accuracy_lr)
    # Optimización de SVM
    best_params_svm, best_accuracy_svm = optimize_params_svm(data)
    print("Mejores parámetros de SVM:", best_params_svm)
    print("Precisión del modelo de SVM:", best_accuracy_svm)
    # Optimización de XGBoost
    best_params_xgb, best_accuracy_xgb = optimize_params_xgboost(data)
    print("Mejores parámetros de XGBoost:", best_params_xgb)
    print("Precisión del modelo de XGBoost:", best_accuracy_xgb)

In [11]:
data_1d_train = pd.read_csv("data/aapl_1d_train.csv")
data_1d_train = data_1d_train.dropna()
dataresult_long_1d_train = file_features(data_1d_train, ds_type="buy")
dataresult_short_1d_train = file_features(data_1d_train, ds_type="sell")
params_1d_long = optimize_params(dataresult_long_1d_train)
params_1d_short = optimize_params(dataresult_short_1d_train)

[I 2024-03-25 18:37:03,395] A new study created in memory with name: no-name-1dc52719-1e91-4bf1-b296-5dff24d46387
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-25 18:37:06,608] Trial 0 finished with value: 0.6564593301435406 and parameters: {'penalty': 'l2', 'C': 0.10332382197428353, 'solver': 'saga'}. Best is trial 0 with value: 0.6564593301435406.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-25 18:37:07,920] Trial 1 finished with value: 0.6583732057416268 and parameters: {'penalty': 'l1', 'C': 994.9539555852161, 'solver': 'liblinear'}. Best is trial 1 with value: 0.6583732057416268.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-25 18:37:10,454] Trial 2 finished with value: 0.6555023923444976 and parameters: {'penalty': 'l2', 'C': 0.008409497311641048, 'solver': 'saga'}. Best is trial 1 with value: 0.6583732057416268.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-25 18:37:11,849] Trial 3 finished with value: 0.6593301435406699 a

Mejores parámetros de regresión logística: {'penalty': 'l1', 'C': 0.004230720669978713, 'solver': 'liblinear'}
Precisión del modelo de regresión logística: 0.6631578947368421


[I 2024-03-25 18:37:30,905] Trial 0 finished with value: 0.6593301435406699 and parameters: {'C': 0.19971754866060473, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.6593301435406699.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-25 18:37:31,206] Trial 1 finished with value: 0.5578947368421052 and parameters: {'C': 19.90973059795461, 'kernel': 'sigmoid', 'gamma': 'scale'}. Best is trial 0 with value: 0.6593301435406699.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-25 18:37:31,543] Trial 2 finished with value: 0.5578947368421052 and parameters: {'C': 0.20286289357931717, 'kernel': 'sigmoid', 'gamma': 'scale'}. Best is trial 0 with value: 0.6593301435406699.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-25 18:37:32,534] Trial 3 finished with value: 0.661244019138756 and parameters: {'C': 0.010285159938351896, 'kernel': 'linear'}. Best is trial 3 with value: 0.661244019138756.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I

Mejores parámetros de SVM: {'C': 0.7487133302446832, 'kernel': 'rbf', 'gamma': 'scale'}
Precisión del modelo de SVM: 0.662200956937799


[I 2024-03-25 18:37:53,516] Trial 0 finished with value: 0.5253588516746411 and parameters: {'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.3660656803694218, 'subsample': 0.9, 'colsample_bytree': 1.0}. Best is trial 0 with value: 0.5253588516746411.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.5)
  subsample = trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1)
  colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1)
[I 2024-03-25 18:37:53,746] Trial 1 finished with value: 0.5550239234449761 and parameters: {'n_estimators': 800, 'max_depth': 3, 'learning_rate': 0.3631125584011118, 'subsample': 1.0, 'colsample_bytree': 0.7}. Best is trial 1 with value: 0.5550239234449761.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.5)
  subsample = trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1)
  colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1)
[I 2024-03-25 18:37:

Mejores parámetros de XGBoost: {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.015078398794586863, 'subsample': 0.5, 'colsample_bytree': 0.5}
Precisión del modelo de XGBoost: 0.6631578947368421


[I 2024-03-25 18:38:00,300] Trial 0 finished with value: 0.6602870813397129 and parameters: {'penalty': 'l1', 'C': 12.960982243816039, 'solver': 'liblinear'}. Best is trial 0 with value: 0.6602870813397129.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-25 18:38:03,620] Trial 1 finished with value: 0.6593301435406699 and parameters: {'penalty': 'l2', 'C': 0.8740394655297478, 'solver': 'saga'}. Best is trial 0 with value: 0.6602870813397129.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-25 18:38:07,481] Trial 2 finished with value: 0.6593301435406699 and parameters: {'penalty': 'l1', 'C': 509.23735210056685, 'solver': 'saga'}. Best is trial 0 with value: 0.6602870813397129.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-25 18:38:10,803] Trial 3 finished with value: 0.6593301435406699 and parameters: {'penalty': 'l2', 'C': 3.5007709511461553, 'solver': 'saga'}. Best is trial 0 with value: 0.6602870813397129.
  C = trial.suggest_loguniform('C', 0.00

Mejores parámetros de regresión logística: {'penalty': 'l2', 'C': 0.9987046043280675, 'solver': 'liblinear'}
Precisión del modelo de regresión logística: 0.662200956937799


[I 2024-03-25 18:38:29,657] Trial 0 finished with value: 0.33588516746411484 and parameters: {'C': 7.409514222815256, 'kernel': 'poly', 'degree': 5, 'gamma': 'scale'}. Best is trial 0 with value: 0.33588516746411484.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-25 18:38:31,098] Trial 1 finished with value: 0.43444976076555025 and parameters: {'C': 32.729559226035626, 'kernel': 'linear'}. Best is trial 1 with value: 0.43444976076555025.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-25 18:38:31,487] Trial 2 finished with value: 0.661244019138756 and parameters: {'C': 0.15665233479881704, 'kernel': 'sigmoid', 'gamma': 'auto'}. Best is trial 2 with value: 0.661244019138756.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-25 18:38:32,534] Trial 3 finished with value: 0.661244019138756 and parameters: {'C': 20.293433725006686, 'kernel': 'poly', 'degree': 2, 'gamma': 'scale'}. Best is trial 2 with value: 0.661244019138756.
  C = trial.suggest_logunifor

Mejores parámetros de SVM: {'C': 2.8942273574098545, 'kernel': 'rbf', 'gamma': 'scale'}
Precisión del modelo de SVM: 0.6641148325358852


[I 2024-03-25 18:38:50,339] Trial 0 finished with value: 0.5588516746411484 and parameters: {'n_estimators': 800, 'max_depth': 9, 'learning_rate': 0.022135897572382556, 'subsample': 0.9, 'colsample_bytree': 0.9}. Best is trial 0 with value: 0.5588516746411484.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.5)
  subsample = trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1)
  colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1)
[I 2024-03-25 18:38:50,625] Trial 1 finished with value: 0.5942583732057416 and parameters: {'n_estimators': 800, 'max_depth': 4, 'learning_rate': 0.019592740496330367, 'subsample': 0.6, 'colsample_bytree': 0.5}. Best is trial 1 with value: 0.5942583732057416.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.5)
  subsample = trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1)
  colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1)
[I 2024-03-25 18

Mejores parámetros de XGBoost: {'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.016097786471921612, 'subsample': 0.6, 'colsample_bytree': 0.9}
Precisión del modelo de XGBoost: 0.6602870813397129


In [12]:
#Una vez actualizados los parametros optimos .....

In [13]:
def buy_signals(data):
    buy_signals = pd.DataFrame()
    # Selecciona las características
    X = data.iloc[:, :-1]
    # Selecciona la variable objetivo
    y = data.iloc[:, -1]

    # Crear modelos con los mejores parámetros encontrados para cada algoritmo
    best_logistic_model = LogisticRegression(penalty= 'l2', C= 0.40296273057700516, solver= 'liblinear')
    best_svm_model = SVC(C= 0.9142482404219855, kernel= 'rbf', gamma= 'scale')
    best_xgboost_model = XGBClassifier(n_estimators= 100, max_depth= 7, learning_rate= 0.015749776840105326, subsample= 0.9, colsample_bytree= 0.5)

    # Entrenar los modelos con todo el conjunto de datos original
    best_logistic_model.fit(X, y)
    best_svm_model.fit(X, y)
    best_xgboost_model.fit(X, y)

    # Realizar predicciones en el conjunto de datos original
    predictions_lr = best_logistic_model.predict(X)
    predictions_svm = best_svm_model.predict(X)
    predictions_xgboost = best_xgboost_model.predict(X)
    predictions_xgboost_bool = predictions_xgboost.astype(bool)


    # Agregar las predicciones como nuevas columnas al conjunto de datos original
    buy_signals['predicciones_lr'] = predictions_lr
    buy_signals['predicciones_svm'] = predictions_svm
    buy_signals['predicciones_xgboost'] = predictions_xgboost_bool

    return buy_signals

In [14]:
def sell_signals(data):
    sell_signals = pd.DataFrame()
    # Selecciona las características
    X = data.iloc[:, :-1]
    # Selecciona la variable objetivo
    y = data.iloc[:, -1]

    # Crear modelos con los mejores parámetros encontrados para cada algoritmo
    best_logistic_model = LogisticRegression(penalty= 'l2', C= 4.03756570068321, solver= 'liblinear')
    best_svm_model = SVC(C= 19.773384392663225, kernel= 'poly', degree= 5, gamma= 'auto')
    best_xgboost_model = XGBClassifier(n_estimators= 300, max_depth= 3, learning_rate= 0.014290645381624112, subsample= 0.6, colsample_bytree= 0.7)

    # Entrenar los modelos con todo el conjunto de datos original
    best_logistic_model.fit(X, y)
    best_svm_model.fit(X, y)
    best_xgboost_model.fit(X, y)

    # Realizar predicciones en el conjunto de datos original
    predictions_lr = best_logistic_model.predict(X)
    predictions_svm = best_svm_model.predict(X)
    predictions_xgboost = best_xgboost_model.predict(X)
    predictions_xgboost_bool = predictions_xgboost.astype(bool)

    # Agregar las predicciones como nuevas columnas al conjunto de datos original
    sell_signals['predicciones_lr'] = predictions_lr
    sell_signals['predicciones_svm'] = predictions_svm
    sell_signals['predicciones_xgboost'] = predictions_xgboost_bool

    return sell_signals

In [15]:
def backtest(data, buy_signals, sell_signals, stop_loss, take_profit, n_shares):
            history = []
            active_operations = []
            cash = 1_000_000
            com = 1.25 / 100

            for i, row in data.iterrows():
                # close active operation
                active_op_temp = []
                for operation in active_operations:
                    if operation["stop_loss"] > row.Close:
                        cash += (row.Close * operation["n_shares"]) * (1 - com)
                    elif operation["take_profit"] < row.Close:
                        cash += (row.Close * operation["n_shares"]) * (1 - com)
                    else:
                        active_op_temp.append(operation)
                active_operations = active_op_temp

                # check if we have enough cash
                if cash < (row.Close * (1 + com)):
                    asset_vals = sum([operation["n_shares"] * row.Close for operation in active_operations])
                    portfolio_value = cash + asset_vals
                    continue

                # Apply buy signals
                if buy_signals.loc[i].any():
                    active_operations.append({
                        "bought": row.Close,
                        "n_shares": n_shares,
                        "stop_loss": row.Close * stop_loss,
                        "take_profit": row.Close * take_profit
                    })

                    cash -= row.Close * (1 + com) * n_shares

                # Apply sell signals
                if sell_signals.loc[i].any():
                    active_op_temp = []
                    for operation in active_operations:
                        if operation["take_profit"] < row.Close or operation["stop_loss"] > row.Close:
                            cash += (row.Close * operation["n_shares"]) * (1 - com)
                        else:
                            active_op_temp.append(operation)
                    active_operations = active_op_temp

                asset_vals = sum([operation["n_shares"] * row.Close for operation in active_operations])
                portfolio_value = cash + asset_vals

            return portfolio_value

In [16]:
def optimize(trial, strategy, data):
    portfolio_value = 0

    stop_loss = trial.suggest_float("stop_loss", 0.80, 0.90)
    take_profit = trial.suggest_float("take_profit", 1.01, 1.10)
    n_shares = trial.suggest_int("n_shares", 20, 50)

    strat_params = {}

    buy_signals = pd.DataFrame()
    sell_signals = pd.DataFrame()

    if "logistic" in strategy:
        buy_signals["logistic"] = global_buy_signals_1d["predicciones_lr"]
        sell_signals["logistic"] = global_sell_signals_1d["predicciones_lr"]
        
    if "svm" in strategy:
        buy_signals["svm"] = global_buy_signals_1d["predicciones_svm"]
        sell_signals["svm"] = global_sell_signals_1d["predicciones_svm"]
        
    if "xg" in strategy:
        buy_signals["xg"] = global_buy_signals_1d["predicciones_xgboost"]
        sell_signals["xg"] = global_sell_signals_1d["predicciones_xgboost"]
    
    return backtest(data, buy_signals, sell_signals, stop_loss, take_profit, n_shares)

In [17]:
def optimize_file(data):
    data = data.drop(data.index[:30])
    data = data.drop(data.index[-30:])
    data.reset_index(drop=True, inplace=True)
    strategies = list(powerset(["logistic", "svm", "xg"]))
    best_strat = None
    best_val = -1
    best_params = None

    for strat in strategies:
        study = optuna.create_study(direction="maximize")
        study.optimize(lambda x: optimize(x, strat, data), n_trials=25)
        value = study.best_value
        if value > best_val:
            best_val = value
            best_strat = strat
            best_params = study.best_params
    print(study.best_value)
    print(best_strat)
    print(best_params)

    return {"file": data,
            "strat": best_strat,
            "value": best_val,
            "params": best_params}

In [18]:
#Ahora con los datos de prueba

In [None]:
data_1d_test = pd.read_csv("data/aapl_1d_test.csv")
data_1d_test = data_1d_test.dropna()
dataresult_long_1d_test = file_features(data_1d_test, ds_type="buy")
dataresult_short_1d_test = file_features(data_1d_test, ds_type="sell")
global_buy_signals_1d = buy_signals(dataresult_long_1d_test)
global_sell_signals_1d = sell_signals(dataresult_short_1d_test)
file_1d_test = optimize_file(data_1d_test)