In [1]:
import os
import numpy as np
import pandas as pd
import xgboost

import optuna
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, root_mean_squared_error 
import joblib

logical_cores = os.cpu_count()
print(f"Number of logical CPU cores: {logical_cores}")

num_workers = max(1, logical_cores // 2)
print(f"Number of workers set to: {num_workers}")

def is_gpu_available():
    try:
        import torch
        return torch.cuda.is_available()
    except ImportError:
        return False

gpu_available = is_gpu_available()
print(f"GPU available: {gpu_available}")

print(xgboost.build_info())

Number of logical CPU cores: 16
Number of workers set to: 8
GPU available: True
{'BUILTIN_PREFETCH_PRESENT': False, 'CUDA_VERSION': [12, 4], 'DEBUG': False, 'MM_PREFETCH_PRESENT': True, 'THRUST_VERSION': [2, 3, 2], 'USE_CUDA': True, 'USE_DLOPEN_NCCL': False, 'USE_FEDERATED': False, 'USE_NCCL': False, 'USE_OPENMP': True, 'USE_RMM': False, 'libxgboost': 'C:\\Users\\ng_mi\\Anaconda\\envs\\pytorch-gpu\\Lib\\site-packages\\xgboost\\lib\\xgboost.dll'}


In [2]:
os.makedirs('../feature-importances/xbclassifier', exist_ok=True)
os.makedirs('../feature-importances/xbregressor', exist_ok=True)
os.makedirs('../models/xgboost/xbclassifier', exist_ok=True)
os.makedirs('../models/xgboost/xbregressor', exist_ok=True)

path = '../data'
ticker_list = []

if os.path.exists(path):
    ticker_list = [os.path.splitext(f)[0] for f in os.listdir(path) if f.endswith('.csv')]


In [3]:
def preprocess_data(df):
    if df.isna().sum().sum() > 0 or df.isin([float('inf'), float('-inf')]).sum().sum() > 0:
        df = df.replace([float('inf'), float('-inf')], float('nan')).dropna()

    df = df.dropna()

    columns_to_drop = [
        'NEXT_DAY_CLOSEPRICE', 'DAILY_CLOSEPRICE_CHANGE', 'CLOSEPRICE_DIRECTION',
        'DAILY_MIDPRICE', 'NEXT_DAY_MIDPRICE', 'DAILY_MIDPRICE_CHANGE', 'MIDPRICE_DIRECTION', 'Date'
    ]
    X = df.drop(columns=columns_to_drop)
    y_classifier = (df['DAILY_CLOSEPRICE_CHANGE'] > 0).astype(int)
    y_regressor = df['DAILY_CLOSEPRICE_CHANGE']

    return X, y_classifier, y_regressor

In [4]:
def process_xbclassifier(X, y, gpu_available, ticker):
    def objective(trial):
        param = {
            'verbosity': 0,
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'tree_method': 'hist',
            'device': 'cuda' if gpu_available else 'cpu',  
            'use_label_encoder': False,
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
            'early_stopping_rounds': 50 
        }

        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
        model = XGBClassifier(**param)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
        preds = model.predict(X_valid)
        accuracy = accuracy_score(y_valid, preds)
        return accuracy

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)

    best_model = XGBClassifier(**study.best_params)
    best_model.fit(X, y, eval_set=[(X, y)], verbose=False)
    joblib.dump(best_model, f'../models/xgboost/xbclassifier/{ticker}_best_model.pkl')
    print(f"Best model for {ticker} saved with accuracy: {study.best_value}")

    # Save feature importances
    feature_importances = pd.DataFrame(best_model.feature_importances_, index=X.columns, columns=['importance'])
    feature_importances = feature_importances.sort_values(by='importance', ascending=False)
    feature_importances.to_csv(f'../feature-importances/xbclassifier/{ticker}_feature_importances.csv')
    print(f"Feature importances for {ticker} saved.")

def process_xbregressor(X, y, gpu_available, ticker):
    def objective(trial):
        param = {
            'verbosity': 0,
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'tree_method': 'hist',
            'device': 'cuda' if gpu_available else 'cpu',  
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
            'early_stopping_rounds': 50 
        }

        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
        model = XGBRegressor(**param)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
        preds = model.predict(X_valid)
        rmse = root_mean_squared_error (y_valid, preds)
        return rmse

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=100)

    best_model = XGBRegressor(**study.best_params)
    best_model.fit(X, y, eval_set=[(X, y)], verbose=False)
    joblib.dump(best_model, f'../models/xgboost/xbregressor/{ticker}_best_model.pkl')
    print(f"Best model for {ticker} saved with RMSE: {study.best_value}")

    # Save feature importances
    feature_importances = pd.DataFrame(best_model.feature_importances_, index=X.columns, columns=['importance'])
    feature_importances = feature_importances.sort_values(by='importance', ascending=False)
    feature_importances.to_csv(f'../feature-importances/xbregressor/{ticker}_feature_importances.csv')
    print(f"Feature importances for {ticker} saved.")

In [5]:
for ticker in ticker_list:
    dataframe = pd.read_csv(f"../data/{ticker}.csv")
    X, y_classifier, y_regressor = preprocess_data(dataframe)
    process_xbclassifier(X, y_classifier, gpu_available, ticker)
    process_xbregressor(X, y_regressor, gpu_available, ticker)
    

[I 2024-09-03 21:39:58,596] A new study created in memory with name: no-name-30ec56ff-1f58-4e50-a848-f6e7da8e7623
[I 2024-09-03 21:39:59,018] Trial 0 finished with value: 0.6363636363636364 and parameters: {'n_estimators': 833, 'max_depth': 9, 'learning_rate': 0.02901036333811243, 'subsample': 0.6771652922392146, 'colsample_bytree': 0.8767575450818463, 'gamma': 1.1476099819609897, 'lambda': 0.028355432556007275, 'alpha': 0.0013620098310947548}. Best is trial 0 with value: 0.6363636363636364.
[I 2024-09-03 21:39:59,221] Trial 1 finished with value: 0.6363636363636364 and parameters: {'n_estimators': 291, 'max_depth': 7, 'learning_rate': 0.18625430748736596, 'subsample': 0.6941228714039314, 'colsample_bytree': 0.9342330106599632, 'gamma': 1.4489522914799786, 'lambda': 0.3881742295212244, 'alpha': 0.0022345422928945146}. Best is trial 0 with value: 0.6363636363636364.
[I 2024-09-03 21:39:59,370] Trial 2 finished with value: 0.45454545454545453 and parameters: {'n_estimators': 100, 'max_de

Best model for CL=F saved with accuracy: 0.7575757575757576
Feature importances for CL=F saved.


[I 2024-09-03 21:40:23,856] Trial 1 finished with value: 1.336258722970387 and parameters: {'n_estimators': 306, 'max_depth': 3, 'learning_rate': 0.03143592640913408, 'subsample': 0.9731195334526734, 'colsample_bytree': 0.5875826645528253, 'gamma': 0.3787656005003409, 'lambda': 0.7064818192849284, 'alpha': 9.52604547343284e-08}. Best is trial 0 with value: 1.316238322087783.
[I 2024-09-03 21:40:24,091] Trial 2 finished with value: 1.327103349671084 and parameters: {'n_estimators': 140, 'max_depth': 7, 'learning_rate': 0.057111897162144216, 'subsample': 0.7266551792980884, 'colsample_bytree': 0.8688446450005691, 'gamma': 1.1568090317720392, 'lambda': 0.0017604618635624148, 'alpha': 0.00012294749730046279}. Best is trial 0 with value: 1.316238322087783.
[I 2024-09-03 21:40:24,269] Trial 3 finished with value: 1.3239260144769143 and parameters: {'n_estimators': 747, 'max_depth': 5, 'learning_rate': 0.2469432523159899, 'subsample': 0.8741061635710441, 'colsample_bytree': 0.5029383885908754

Best model for CL=F saved with RMSE: 1.2487237781763556
Feature importances for CL=F saved.


[I 2024-09-03 21:40:48,224] Trial 0 finished with value: 0.5757575757575758 and parameters: {'n_estimators': 304, 'max_depth': 5, 'learning_rate': 0.0855964896141462, 'subsample': 0.835444280660714, 'colsample_bytree': 0.6780315498583558, 'gamma': 1.5287037025291483, 'lambda': 0.00031693570977109537, 'alpha': 0.0036581103685437684}. Best is trial 0 with value: 0.5757575757575758.
[I 2024-09-03 21:40:48,475] Trial 1 finished with value: 0.5151515151515151 and parameters: {'n_estimators': 257, 'max_depth': 6, 'learning_rate': 0.2697438124027815, 'subsample': 0.5925557848580483, 'colsample_bytree': 0.9334053776355753, 'gamma': 0.8585990348268108, 'lambda': 0.009747108766119016, 'alpha': 0.3061968996861288}. Best is trial 0 with value: 0.5757575757575758.
[I 2024-09-03 21:40:48,688] Trial 2 finished with value: 0.5151515151515151 and parameters: {'n_estimators': 298, 'max_depth': 7, 'learning_rate': 0.1621630270089254, 'subsample': 0.8344489239602738, 'colsample_bytree': 0.5694190438343698

Best model for NVDA saved with accuracy: 0.6666666666666666
Feature importances for NVDA saved.


[I 2024-09-03 21:41:17,154] Trial 0 finished with value: 3.3643842494277227 and parameters: {'n_estimators': 390, 'max_depth': 9, 'learning_rate': 0.03824844425422512, 'subsample': 0.6898422914249795, 'colsample_bytree': 0.8891717223318953, 'gamma': 4.372085208396243, 'lambda': 0.16261492499194002, 'alpha': 6.691332088008663e-07}. Best is trial 0 with value: 3.3643842494277227.
[I 2024-09-03 21:41:17,477] Trial 1 finished with value: 3.260327419741223 and parameters: {'n_estimators': 425, 'max_depth': 7, 'learning_rate': 0.10697628343683713, 'subsample': 0.623077122954905, 'colsample_bytree': 0.9785951218275943, 'gamma': 1.2626451457350651, 'lambda': 0.04875082323405018, 'alpha': 0.04955312194124662}. Best is trial 1 with value: 3.260327419741223.
[I 2024-09-03 21:41:17,683] Trial 2 finished with value: 3.7861374109655146 and parameters: {'n_estimators': 875, 'max_depth': 8, 'learning_rate': 0.24430572698667508, 'subsample': 0.7992055860405278, 'colsample_bytree': 0.7239938203085803, '

Best model for NVDA saved with RMSE: 3.1885219770503648
Feature importances for NVDA saved.


[I 2024-09-03 21:41:50,148] Trial 0 finished with value: 0.6857142857142857 and parameters: {'n_estimators': 299, 'max_depth': 7, 'learning_rate': 0.0543133689676546, 'subsample': 0.7760709552370642, 'colsample_bytree': 0.8875843910489936, 'gamma': 0.27527080528526204, 'lambda': 9.51896696057814e-08, 'alpha': 1.0162857150670247e-06}. Best is trial 0 with value: 0.6857142857142857.
[I 2024-09-03 21:41:50,448] Trial 1 finished with value: 0.7142857142857143 and parameters: {'n_estimators': 840, 'max_depth': 3, 'learning_rate': 0.05391102804051353, 'subsample': 0.5616693192206409, 'colsample_bytree': 0.6446966623018129, 'gamma': 2.957476086693145, 'lambda': 0.509672651941767, 'alpha': 4.188441289971723e-05}. Best is trial 1 with value: 0.7142857142857143.
[I 2024-09-03 21:41:50,707] Trial 2 finished with value: 0.5142857142857142 and parameters: {'n_estimators': 833, 'max_depth': 9, 'learning_rate': 0.1829461740993358, 'subsample': 0.6073972149343942, 'colsample_bytree': 0.761756037621321

Best model for SGDUSD=X saved with accuracy: 0.7714285714285715
Feature importances for SGDUSD=X saved.


[I 2024-09-03 21:42:17,484] Trial 1 finished with value: 0.0014156801122100055 and parameters: {'n_estimators': 569, 'max_depth': 9, 'learning_rate': 0.08679686583454274, 'subsample': 0.9573085186704666, 'colsample_bytree': 0.5057876984762509, 'gamma': 4.490661131429718, 'lambda': 0.009862283556628599, 'alpha': 0.290558823250246}. Best is trial 0 with value: 0.0014150814058400622.
[I 2024-09-03 21:42:17,669] Trial 2 finished with value: 0.0014156801122100055 and parameters: {'n_estimators': 469, 'max_depth': 3, 'learning_rate': 0.021915948314185155, 'subsample': 0.5145274099016863, 'colsample_bytree': 0.6674986748658409, 'gamma': 4.539863010121371, 'lambda': 0.008875975383777164, 'alpha': 0.12779924422190694}. Best is trial 0 with value: 0.0014150814058400622.
[I 2024-09-03 21:42:17,893] Trial 3 finished with value: 0.0014073170606808752 and parameters: {'n_estimators': 213, 'max_depth': 5, 'learning_rate': 0.25516504872497414, 'subsample': 0.6501184347740301, 'colsample_bytree': 0.861

Best model for SGDUSD=X saved with RMSE: 0.0013993269250579831
Feature importances for SGDUSD=X saved.


[I 2024-09-03 21:42:42,364] Trial 0 finished with value: 0.6857142857142857 and parameters: {'n_estimators': 785, 'max_depth': 7, 'learning_rate': 0.037287837673124075, 'subsample': 0.7619639094246278, 'colsample_bytree': 0.9162414328703818, 'gamma': 3.6912456976937937, 'lambda': 1.983676802078895e-07, 'alpha': 1.165752902504391e-06}. Best is trial 0 with value: 0.6857142857142857.
[I 2024-09-03 21:42:42,590] Trial 1 finished with value: 0.7714285714285715 and parameters: {'n_estimators': 345, 'max_depth': 3, 'learning_rate': 0.07259026250653362, 'subsample': 0.8659297224117408, 'colsample_bytree': 0.7392324514727707, 'gamma': 2.113530029278552, 'lambda': 1.5524736923527545e-06, 'alpha': 2.652818960729626e-05}. Best is trial 1 with value: 0.7714285714285715.
[I 2024-09-03 21:42:42,769] Trial 2 finished with value: 0.7142857142857143 and parameters: {'n_estimators': 847, 'max_depth': 3, 'learning_rate': 0.291194813169599, 'subsample': 0.8866056610858669, 'colsample_bytree': 0.5505123740

Best model for USDSGD=X saved with accuracy: 0.7714285714285715
Feature importances for USDSGD=X saved.


[I 2024-09-03 21:43:11,434] Trial 1 finished with value: 0.0025521400248092094 and parameters: {'n_estimators': 459, 'max_depth': 10, 'learning_rate': 0.023860865188488525, 'subsample': 0.7606152827709505, 'colsample_bytree': 0.8498681787881441, 'gamma': 0.9202558365399138, 'lambda': 1.3790311864463325e-07, 'alpha': 0.0009531238520627677}. Best is trial 0 with value: 0.002545567883816371.
[I 2024-09-03 21:43:11,692] Trial 2 finished with value: 0.002548680179045196 and parameters: {'n_estimators': 901, 'max_depth': 8, 'learning_rate': 0.11911703498522784, 'subsample': 0.9190969136850483, 'colsample_bytree': 0.9661251334840348, 'gamma': 1.0113611502673492, 'lambda': 0.08136928215418568, 'alpha': 8.840005984242586e-05}. Best is trial 0 with value: 0.002545567883816371.
[I 2024-09-03 21:43:11,886] Trial 3 finished with value: 0.0025522179016584444 and parameters: {'n_estimators': 398, 'max_depth': 4, 'learning_rate': 0.06997980608013128, 'subsample': 0.8459562286245093, 'colsample_bytree'

Best model for USDSGD=X saved with RMSE: 0.0025270044593394985
Feature importances for USDSGD=X saved.
