In [1]:
import pandas as pd
import ta 
import optuna 
import time
import numpy as np
from multiprocessing import Pool
from itertools import combinations, chain 
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score



In [2]:
data_1d = pd.read_csv("../data/aapl_1d_train.csv")
data_1d = data_1d.dropna()

In [3]:
data_1h = pd.read_csv("../data/aapl_1h_train.csv")
data_1h= data_1h.dropna()

In [4]:
data_5m = pd.read_csv("../data/aapl_5m_train.csv")
data_5m = data_5m.dropna()

In [5]:
data_1m = pd.read_csv("../data/aapl_1m_train.csv")
data_1m = data_1m.dropna()

In [6]:
def powerset(s):
    return chain.from_iterable(combinations(s,r) for r in range(1,len(s)+1))

In [7]:
def file_features(data, ds_type: str):
    data1=pd.DataFrame()
    #Calcular indicadores tecnicos
    cmf_data = ta.volume.ChaikinMoneyFlowIndicator(data.High, data.Low, data.Close, data.Volume, window = 14)
    rsi_data = ta.momentum.RSIIndicator(data.Close, window=14)
    
    data1["CMF"] = cmf_data.chaikin_money_flow()
    data1["RSI"] = rsi_data.rsi()
    # Calcular la volatilidad
    data1['Volatility'] = data['High'] - data['Low']
    data1['Close_Lag0'] = data['Close']
    # Calcular las tendencias
    for i in range(1, 5 + 1):
        data1[f'Close_Lag{i}'] = data['Close'].shift(i)
    #Variable ded respuesta
    if ds_type == "buy":
        data1['Response'] = (data['Close'] < data['Close'].shift(-10))
    else:
        data1['Response'] = (data['Close'] > data['Close'].shift(-10))
    
    data1 = data1.drop(data1.index[:30])
    data1 = data1.drop(data1.index[-30:])
    data1.reset_index(drop=True, inplace=True)
    
    return data1

In [8]:
dataresult_long_1d = file_features(data_1m, ds_type="buy")
dataresult_short_1d = file_features(data_1m, ds_type="sell")
dataresult_short_1d

Unnamed: 0,CMF,RSI,Volatility,Close_Lag0,Close_Lag1,Close_Lag2,Close_Lag3,Close_Lag4,Close_Lag5,Response
0,0.037057,42.198356,0.650001,119.830001,119.889999,121.150001,120.699996,120.955001,120.608398,True
1,0.110396,44.020547,1.010003,120.059997,119.830001,119.889999,121.150001,120.699996,120.955001,True
2,0.079499,44.875763,0.485000,120.165100,120.059997,119.830001,119.889999,121.150001,120.699996,True
3,0.007440,43.137629,0.790001,119.907699,120.165100,120.059997,119.830001,119.889999,121.150001,True
4,0.016513,37.447391,1.139999,118.970001,119.907699,120.165100,120.059997,119.830001,119.889999,True
...,...,...,...,...,...,...,...,...,...,...
315,0.230007,60.275954,0.309998,131.610000,131.755905,132.080001,132.068298,131.389999,131.733795,False
316,0.129291,55.880925,0.960006,130.960006,131.610000,131.755905,132.080001,132.068298,131.389999,False
317,0.148648,65.063275,2.080490,133.135604,130.960006,131.610000,131.755905,132.080001,132.068298,False
318,0.132043,59.049706,1.350006,132.147094,133.135604,130.960006,131.610000,131.755905,132.080001,False


In [9]:
def objective_log_regresor(trial, data):
    # Dividir los datos en conjuntos de entrenamiento y prueba
    X = data.iloc[:, :-1]
    # Selecciona la variable objetivo
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    
    # Definir los parámetros a optimizar
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    C = trial.suggest_loguniform('C', 0.001, 1000)
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    # Crear el modelo de regresión logística con los parámetros sugeridos
    model = LogisticRegression(penalty=penalty, C=C, solver=solver, max_iter=10_000, random_state=123)
    # Entrenar el modelo
    model.fit(X_train, y_train)
    # Calcular la precisión en el conjunto de prueba
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [10]:
def objective_svm(trial, data):
    # Dividir los datos en conjuntos de entrenamiento y prueba
    X = data.iloc[:, :-1]
    # Selecciona la variable objetivo
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Definir los parámetros a optimizar
    C = trial.suggest_loguniform('C', 0.001, 1000)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    if kernel == 'poly':
        degree = trial.suggest_int('degree', 2, 5)
    else:
        degree = 3  # Valor predeterminado si el kernel no es 'poly'
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto']) if kernel in ['rbf', 'poly', 'sigmoid'] else 'scale'
    # Crear el modelo SVM con los parámetros sugeridos
    model = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, max_iter=100_000, random_state=123)
    # Entrenar el modelo
    model.fit(X_train, y_train)
    # Calcular la precisión en el conjunto de prueba
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [11]:
def objective_xgboost(trial, data):
    data = data.copy()
    # Dividir los datos en conjuntos de entrenamiento y prueba
    X = data.iloc[:, :-1]
    # Selecciona la variable objetivo
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    # Definir los parámetros a optimizar
    n_estimators = trial.suggest_int('n_estimators', 100, 1000, step=100)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.5)
    subsample = trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1)
    colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1)
    # Crear el modelo XGBoost con los parámetros sugeridos
    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=123
    )
    # Entrenar el modelo
    model.fit(X_train, y_train)
    # Calcular la precisión en el conjunto de prueba
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [12]:
def optimize_params_log_regresor(data):
    # Crear un estudio Optuna para la optimización
    study = optuna.create_study(direction='maximize')
    
    # Función objetivo con el dataset como parámetro fijo
    objective_fn = lambda trial: objective_log_regresor(trial, data)
    
    # Ejecutar la optimización
    study.optimize(objective_fn, n_trials=2)

    # Obtener los mejores parámetros
    best_params = study.best_params
    best_accuracy = study.best_value

    return best_params, best_accuracy

In [13]:
def optimize_params_svm(data):
    # Crear un estudio Optuna para la optimización
    study = optuna.create_study(direction='maximize')
    
    # Función objetivo con el dataset como parámetro fijo
    objective_fn = lambda trial: objective_svm(trial, data)
    
    # Ejecutar la optimización
    study.optimize(objective_fn, n_trials=2)

    # Obtener los mejores parámetros
    best_params = study.best_params
    best_accuracy = study.best_value

    return best_params, best_accuracy

In [14]:
def optimize_params_xgboost(data):
    # Crear un estudio Optuna para la optimización
    study = optuna.create_study(direction='maximize')
    
    # Función objetivo con el dataset como parámetro fijo
    objective_fn = lambda trial: objective_xgboost(trial, data)
    
    # Ejecutar la optimización
    study.optimize(objective_fn, n_trials=2)

    # Obtener los mejores parámetros
    best_params = study.best_params
    best_accuracy = study.best_value

    return best_params, best_accuracy

In [15]:
def optimize_params(data):
    # Optimización de regresión logística
    best_params_lr, best_accuracy_lr = optimize_params_log_regresor(data)
    print("Mejores parámetros de regresión logística:", best_params_lr)
    print("Precisión del modelo de regresión logística:", best_accuracy_lr)
    # Optimización de SVM
    best_params_svm, best_accuracy_svm = optimize_params_svm(data)
    print("Mejores parámetros de SVM:", best_params_svm)
    print("Precisión del modelo de SVM:", best_accuracy_svm)
    # Optimización de XGBoost
    best_params_xgb, best_accuracy_xgb = optimize_params_xgboost(data)
    print("Mejores parámetros de XGBoost:", best_params_xgb)
    print("Precisión del modelo de XGBoost:", best_accuracy_xgb)

In [16]:
params_1d_long = optimize_params(dataresult_long_1d)
params_1d_short = optimize_params(dataresult_short_1d)

[I 2024-03-08 09:52:30,738] A new study created in memory with name: no-name-1a4bbb8a-7055-47d3-8111-bcc02c154b2c
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-08 09:52:30,751] Trial 0 finished with value: 0.765625 and parameters: {'penalty': 'l1', 'C': 0.0023247647259174516, 'solver': 'saga'}. Best is trial 0 with value: 0.765625.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-08 09:52:30,967] Trial 1 finished with value: 0.3125 and parameters: {'penalty': 'l2', 'C': 304.09600299777014, 'solver': 'saga'}. Best is trial 0 with value: 0.765625.
[I 2024-03-08 09:52:30,968] A new study created in memory with name: no-name-96c898f2-f3ed-46c3-b525-c9becfb66070
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-08 09:52:30,980] Trial 0 finished with value: 0.234375 and parameters: {'C': 237.30645354385865, 'kernel': 'sigmoid', 'gamma': 'auto'}. Best is trial 0 with value: 0.234375.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-08 09:52:30,988

Mejores parámetros de regresión logística: {'penalty': 'l1', 'C': 0.0023247647259174516, 'solver': 'saga'}
Precisión del modelo de regresión logística: 0.765625
Mejores parámetros de SVM: {'C': 237.30645354385865, 'kernel': 'sigmoid', 'gamma': 'auto'}
Precisión del modelo de SVM: 0.234375


[I 2024-03-08 09:52:31,345] Trial 0 finished with value: 0.4375 and parameters: {'n_estimators': 900, 'max_depth': 6, 'learning_rate': 0.49198096684074955, 'subsample': 0.6, 'colsample_bytree': 0.8}. Best is trial 0 with value: 0.4375.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.5)
  subsample = trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1)
  colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1)
[I 2024-03-08 09:52:31,400] Trial 1 finished with value: 0.4375 and parameters: {'n_estimators': 100, 'max_depth': 8, 'learning_rate': 0.25941791373402096, 'subsample': 0.8, 'colsample_bytree': 0.5}. Best is trial 0 with value: 0.4375.
[I 2024-03-08 09:52:31,402] A new study created in memory with name: no-name-4cc5782b-e6f5-4488-bcc3-59224030be56
  C = trial.suggest_loguniform('C', 0.001, 1000)


Mejores parámetros de XGBoost: {'n_estimators': 900, 'max_depth': 6, 'learning_rate': 0.49198096684074955, 'subsample': 0.6, 'colsample_bytree': 0.8}
Precisión del modelo de XGBoost: 0.4375


[I 2024-03-08 09:52:32,192] Trial 0 finished with value: 0.203125 and parameters: {'penalty': 'l1', 'C': 14.91745433837624, 'solver': 'liblinear'}. Best is trial 0 with value: 0.203125.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-08 09:52:32,204] Trial 1 finished with value: 0.265625 and parameters: {'penalty': 'l2', 'C': 125.59746539838365, 'solver': 'liblinear'}. Best is trial 1 with value: 0.265625.
[I 2024-03-08 09:52:32,206] A new study created in memory with name: no-name-880ba328-0996-4134-86ab-cdc4a374a566
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-08 09:52:32,252] Trial 0 finished with value: 0.3125 and parameters: {'C': 19.902738975695588, 'kernel': 'linear'}. Best is trial 0 with value: 0.3125.
  C = trial.suggest_loguniform('C', 0.001, 1000)
[I 2024-03-08 09:52:32,261] Trial 1 finished with value: 0.21875 and parameters: {'C': 0.015778637525842715, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.3125.
[I 2024-03-08 09:52:32

Mejores parámetros de regresión logística: {'penalty': 'l2', 'C': 125.59746539838365, 'solver': 'liblinear'}
Precisión del modelo de regresión logística: 0.265625
Mejores parámetros de SVM: {'C': 19.902738975695588, 'kernel': 'linear'}
Precisión del modelo de SVM: 0.3125


[I 2024-03-08 09:52:32,523] Trial 0 finished with value: 0.421875 and parameters: {'n_estimators': 800, 'max_depth': 10, 'learning_rate': 0.24767342500671413, 'subsample': 0.8, 'colsample_bytree': 0.5}. Best is trial 0 with value: 0.421875.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.5)
  subsample = trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1)
  colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1)
[I 2024-03-08 09:52:32,608] Trial 1 finished with value: 0.34375 and parameters: {'n_estimators': 300, 'max_depth': 10, 'learning_rate': 0.46954020376337907, 'subsample': 1.0, 'colsample_bytree': 0.6}. Best is trial 0 with value: 0.421875.


Mejores parámetros de XGBoost: {'n_estimators': 800, 'max_depth': 10, 'learning_rate': 0.24767342500671413, 'subsample': 0.8, 'colsample_bytree': 0.5}
Precisión del modelo de XGBoost: 0.421875


In [17]:
def buy_signals(data):
    buy_signals = pd.DataFrame()
    # Selecciona las características
    X = data.iloc[:, :-1]
    # Selecciona la variable objetivo
    y = data.iloc[:, -1]

    # Crear modelos con los mejores parámetros encontrados para cada algoritmo
    best_logistic_model = LogisticRegression(penalty='l1', C=142.00912335775166, solver='liblinear')
    best_svm_model = SVC(C=1.0, kernel='rbf', gamma='scale')
    best_xgboost_model = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8)

    # Entrenar los modelos con todo el conjunto de datos original
    best_logistic_model.fit(X, y)
    best_svm_model.fit(X, y)
    best_xgboost_model.fit(X, y)

    # Realizar predicciones en el conjunto de datos original
    predictions_lr = best_logistic_model.predict(X)
    predictions_svm = best_svm_model.predict(X)
    predictions_xgboost = best_xgboost_model.predict(X)

    # Agregar las predicciones como nuevas columnas al conjunto de datos original
    buy_signals['predicciones_lr'] = predictions_lr
    buy_signals['predicciones_svm'] = predictions_svm
    buy_signals['predicciones_xgboost'] = predictions_xgboost
    
    return buy_signals

In [18]:
def sell_signals(data):
    sell_signals = pd.DataFrame()
    # Selecciona las características
    X = data.iloc[:, :-1]
    # Selecciona la variable objetivo
    y = data.iloc[:, -1]

    # Crear modelos con los mejores parámetros encontrados para cada algoritmo
    best_logistic_model = LogisticRegression(penalty='l1', C=142.00912335775166, solver='liblinear')
    best_svm_model = SVC(C=1.0, kernel='rbf', gamma='scale')
    best_xgboost_model = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8)

    # Entrenar los modelos con todo el conjunto de datos original
    best_logistic_model.fit(X, y)
    best_svm_model.fit(X, y)
    best_xgboost_model.fit(X, y)

    # Realizar predicciones en el conjunto de datos original
    predictions_lr = best_logistic_model.predict(X)
    predictions_svm = best_svm_model.predict(X)
    predictions_xgboost = best_xgboost_model.predict(X)
    predictions_xgboost_bool = predictions_xgboost.astype(bool)

    # Agregar las predicciones como nuevas columnas al conjunto de datos original
    sell_signals['predicciones_lr'] = predictions_lr
    sell_signals['predicciones_svm'] = predictions_svm
    sell_signals['predicciones_xgboost'] = predictions_xgboost_bool
    
    return sell_signals

In [19]:
global_buy_signals = buy_signals(dataresult_long_1d)
global_sell_signals = sell_signals(dataresult_short_1d)



In [20]:
def backtest(data, buy_signals, sell_signals, stop_loss, take_profit, n_shares):
    history = []
    active_operations = []
    cash = 1_000_000
    com = 1.25 / 100
    
    for i, row in data.iterrows():
        # close active operation
        active_op_temp = []
        for operation in active_operations:
            if operation["stop_loss"] > row.Close:
                cash += (row.Close * operation["n_shares"]) * (1 - com)
            elif operation["take_profit"] < row.Close:
                cash += (row.Close * operation["n_shares"]) * (1 - com)
            else:
                active_op_temp.append(operation)
        active_operations = active_op_temp

        # check if we have enough cash
        if cash < (row.Close * (1 + com)):
            asset_vals = sum([operation["n_shares"] * row.Close for operation in active_operations])
            portfolio_value = cash + asset_vals
            continue

        # Apply buy signals
        if buy_signals.loc[i].any():
            active_operations.append({
                "bought": row.Close,
                "n_shares": n_shares,
                "stop_loss": row.Close * stop_loss,
                "take_profit": row.Close * take_profit
            })

            cash -= row.Close * (1 + com) * n_shares

        # Apply sell signals
        if sell_signals.loc[i].any():
            active_op_temp = []
            for operation in active_operations:
                if operation["take_profit"] < row.Close or operation["stop_loss"] > row.Close:
                    cash += (row.Close * operation["n_shares"]) * (1 - com)
                else:
                    active_op_temp.append(operation)
            active_operations = active_op_temp

        asset_vals = sum([operation["n_shares"] * row.Close for operation in active_operations])
        portfolio_value = cash + asset_vals

    return portfolio_value

In [21]:
def optimize(trial, strategy, data):
    portfolio_value = 0

    stop_loss = trial.suggest_float("stop_loss", 0.00250, 0.05)
    take_profit = trial.suggest_float("take_profit", 0.00250, 0.05)
    n_shares = trial.suggest_int("n_shares", 5, 200)

    strat_params = {}

    buy_signals = pd.DataFrame()
    sell_signals = pd.DataFrame()

    if "logistic" in strategy:
        buy_signals["logistic"] = global_buy_signals["predicciones_lr"]
        sell_signals["logistic"] = global_sell_signals["predicciones_lr"]
        
    if "svm" in strategy:
        buy_signals["svm"] = global_buy_signals["predicciones_svm"]
        sell_signals["svm"] = global_sell_signals["predicciones_svm"]
        
    if "xg" in strategy:
        buy_signals["xg"] = global_buy_signals["predicciones_xgboost"]
        sell_signals["xg"] = global_sell_signals["predicciones_xgboost"]
    
    return backtest(data, buy_signals, sell_signals, stop_loss, take_profit, n_shares)

In [22]:
def optimize_file(data):
    data = data.drop(data.index[:30])
    data = data.drop(data.index[-30:])
    data.reset_index(drop=True, inplace=True)
    strategies = list(powerset(["logistic", "svm", "xg"]))
    best_strat = None
    best_val = -1
    best_params = None

    for strat in strategies:
        study = optuna.create_study(direction="maximize")
        study.optimize(lambda x: optimize(x, strat, data), n_trials=30)
        value = study.best_value
        if value > best_val:
            best_val = value
            best_strat = strat
            best_params = study.best_params
    print(study.best_value)
    print(best_strat)
    print(best_params)

    return {"file": data,
            "strat": best_strat,
            "value": best_val,
            "params": best_params}

In [23]:
file_1d = optimize_file(data_1d)
file_1m = optimize_file(data_1m)
file_5m = optimize_file(data_5m)
file_1h = optimize_file(data_1h)


[I 2024-03-08 09:52:32,940] A new study created in memory with name: no-name-1f2e64e8-fb27-4941-a416-b85cc289cad8
[I 2024-03-08 09:52:32,966] Trial 0 finished with value: 917884.8775802505 and parameters: {'stop_loss': 0.04244308488074175, 'take_profit': 0.03604590935976131, 'n_shares': 140}. Best is trial 0 with value: 917884.8775802505.
[I 2024-03-08 09:52:32,992] Trial 1 finished with value: 968327.0242095251 and parameters: {'stop_loss': 0.03820298681559703, 'take_profit': 0.036406831258872153, 'n_shares': 54}. Best is trial 1 with value: 968327.0242095251.
[I 2024-03-08 09:52:33,016] Trial 2 finished with value: 886798.4383784883 and parameters: {'stop_loss': 0.046405644448177005, 'take_profit': 0.03242889647667265, 'n_shares': 193}. Best is trial 1 with value: 968327.0242095251.
[I 2024-03-08 09:52:33,041] Trial 3 finished with value: 940759.804540038 and parameters: {'stop_loss': 0.02702005855694973, 'take_profit': 0.02772089429493936, 'n_shares': 101}. Best is trial 1 with valu

995299.1535478123
('xg',)
{'stop_loss': 0.034588981520722574, 'take_profit': 0.023717941532341004, 'n_shares': 5}
