In [1]:
import os
import numpy as np
import pandas as pd
import xgboost

import optuna
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, root_mean_squared_error 
import joblib

# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to a single core so the integer division below never fails.
logical_cores = os.cpu_count() or 1
print(f"Number of logical CPU cores: {logical_cores}")

# Use half of the logical cores, but never fewer than one worker.
num_workers = max(1, logical_cores // 2)
print(f"Number of workers set to: {num_workers}")

def is_gpu_available():
    """Report whether PyTorch can see a CUDA device.

    Returns:
        The result of ``torch.cuda.is_available()``, or ``False`` when
        importing/querying torch raises ``ImportError``.
    """
    cuda_ready = False
    try:
        import torch
        cuda_ready = torch.cuda.is_available()
    except ImportError:
        # torch is not installed: treat the machine as CPU-only.
        pass
    return cuda_ready

# torch only tells us that a CUDA device is visible. Training with
# device='cuda' additionally requires an XGBoost build compiled with CUDA
# support, so both conditions must hold before the GPU is requested.
xgb_build_info = xgboost.build_info()
gpu_available = is_gpu_available() and bool(xgb_build_info.get('USE_CUDA', False))
print(f"GPU available: {gpu_available}")

print(xgb_build_info)

Number of logical CPU cores: 16
Number of workers set to: 8
GPU available: True
{'BUILTIN_PREFETCH_PRESENT': False, 'CUDA_VERSION': [12, 4], 'DEBUG': False, 'MM_PREFETCH_PRESENT': True, 'THRUST_VERSION': [2, 3, 2], 'USE_CUDA': True, 'USE_DLOPEN_NCCL': False, 'USE_FEDERATED': False, 'USE_NCCL': False, 'USE_OPENMP': True, 'USE_RMM': False, 'libxgboost': 'C:\\Users\\ng_mi\\Anaconda\\envs\\pytorch-gpu\\Lib\\site-packages\\xgboost\\lib\\xgboost.dll'}


In [2]:
# Output directories for feature importances and serialized models.
for output_dir in (
    '../feature-importances/xbclassifier',
    '../feature-importances/xbregressor',
    '../models/xgboost/xbclassifier',
    '../models/xgboost/xbregressor',
):
    os.makedirs(output_dir, exist_ok=True)

# Directory holding one "<ticker>.csv" file per instrument.
path = '../data'

# Registry of the best score / model path recorded per ticker and model family.
csv_file_path = "../ticker-best-model.csv"

# Column schema of the registry. It is applied both when creating a new
# registry and when loading an existing one.
column_types = {
    "Ticker": str,
    "Best_XGBClassifier_Score": float,
    "Best_XGBClassifier_Path": str,
    "Best_XGBRegressor_Score": float,
    "Best_XGBRegressor_Path": str,
    "Best_Cov1D_Score": float,
    "Best_Cov1D_Path": str,
    "Best_LSTM_Score": float,
    "Best_LSTM_Path": str,
    "Best_Cov1D-LSTM_Score": float,
    "Best_Cov1D-LSTM_Path": str
}

if os.path.isfile(csv_file_path):
    # Load the existing registry with the declared schema. Without it, a
    # *_Path column that is still entirely empty is inferred as float64 (so a
    # later string assignment conflicts with the column dtype) and purely
    # numeric ticker symbols would be parsed as numbers instead of strings.
    ticker_df = pd.read_csv(csv_file_path, dtype=column_types)
else:
    # Start an empty registry that already carries the expected dtypes.
    ticker_df = pd.DataFrame(columns=column_types.keys()).astype(column_types)

ticker_list = []

# isdir (rather than exists) so that a stray file named like the data
# directory does not make os.listdir raise.
if os.path.isdir(path):
    # Sorted because os.listdir returns entries in arbitrary order; this keeps
    # the processing order stable between runs and machines.
    ticker_list = sorted(
        os.path.splitext(f)[0] for f in os.listdir(path) if f.endswith('.csv')
    )


In [3]:
def preprocess_data(df):
    """Clean a per-ticker frame and split it into features and targets.

    Rows containing NaN or +/-infinity in any column are removed. The target
    and bookkeeping columns listed below are then dropped to form the feature
    matrix.

    Args:
        df: DataFrame containing every column named in ``non_feature_columns``
            plus the feature columns.

    Returns:
        Tuple ``(X, y_classifier, y_regressor)`` where ``X`` is the feature
        frame, ``y_classifier`` is 1 where ``DAILY_CLOSEPRICE_CHANGE`` is
        strictly positive and 0 otherwise, and ``y_regressor`` is the
        ``DAILY_CLOSEPRICE_CHANGE`` column itself.
    """
    infinities = [float('inf'), float('-inf')]

    # Only rewrite the frame when there is something to clean.
    if df.isna().any().any() or df.isin(infinities).any().any():
        df = df.replace(infinities, float('nan'))
    cleaned = df.dropna()

    non_feature_columns = [
        'NEXT_DAY_CLOSEPRICE',
        'DAILY_CLOSEPRICE_CHANGE',
        'CLOSEPRICE_DIRECTION',
        'DAILY_MIDPRICE',
        'NEXT_DAY_MIDPRICE',
        'DAILY_MIDPRICE_CHANGE',
        'MIDPRICE_DIRECTION',
        'Date',
    ]
    features = cleaned.drop(columns=non_feature_columns)

    price_change = cleaned['DAILY_CLOSEPRICE_CHANGE']
    direction = (price_change > 0).astype(int)

    return features, direction, price_change

In [4]:
def process_xbclassifier(X, y, gpu_available, ticker, ticker_df):
    """Tune, train and persist an XGBoost classifier for one ticker.

    Runs a 100-trial Optuna search that maximises accuracy on a fixed 80/20
    hold-out split. If the best hold-out accuracy beats the score already
    recorded for ``ticker`` (or no score is recorded), the best
    hyper-parameters are refitted on all of ``X``/``y`` and the model, its
    feature importances and the updated registry CSV (module-level
    ``csv_file_path``) are written to disk. Otherwise nothing is written, so
    the files on disk always describe the model referenced by the registry.

    Args:
        X: Feature DataFrame.
        y: Binary target aligned with ``X``.
        gpu_available: When truthy, trial models train with ``device='cuda'``.
        ticker: Ticker symbol; used in file names and as the registry key.
        ticker_df: Registry DataFrame with at least the columns ``Ticker``,
            ``Best_XGBClassifier_Score`` and ``Best_XGBClassifier_Path``.

    Returns:
        The registry DataFrame. An existing row is updated in place; a new
        ticker is appended, which yields a new DataFrame object.
    """
    score_col = 'Best_XGBClassifier_Score'
    path_col = 'Best_XGBClassifier_Path'

    # The split is seeded, hence identical for every trial: compute it once
    # instead of once per trial.
    # NOTE(review): the input appears to be dated price data; a shuffled split
    # can leak future information into training. Consider shuffle=False.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    # Settings shared by every trial. The deprecated 'use_label_encoder' flag
    # was dropped: XGBoost releases that accept 'device' no longer use it.
    fixed_params = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'tree_method': 'hist',
        'device': 'cuda' if gpu_available else 'cpu',
        'early_stopping_rounds': 50,
    }

    def objective(trial):
        """Train one candidate and return its hold-out accuracy."""
        param = {
            **fixed_params,
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        }
        model = XGBClassifier(**param)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
        return accuracy_score(y_valid, model.predict(X_valid))

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)
    best_score = study.best_value

    model_path = f'../models/xgboost/xbclassifier/{ticker}_best_model.pkl'

    # Decide first whether this run improves on the recorded model, so that a
    # rejected candidate neither costs a refit nor overwrites any file.
    known = ticker_df['Ticker'] == ticker
    if known.any():
        current_score = ticker_df.loc[known, score_col].values[0]
        if not (pd.isnull(current_score) or best_score > current_score):
            print(f"Previous model accuracy: {current_score} is better for {ticker} than accuracy: {best_score}")
            return ticker_df

    # Refit on all available data with the winning hyper-parameters. No
    # eval_set is passed: without early stopping, evaluating on the training
    # data itself does not influence the fitted model.
    # NOTE(review): trials used early stopping while this refit trains the
    # full n_estimators, so the recorded score is the trial's hold-out score,
    # not a score of this exact model.
    best_model = XGBClassifier(**study.best_params)
    best_model.fit(X, y)
    joblib.dump(best_model, model_path)

    # Update the registry only after the model is safely on disk.
    if known.any():
        ticker_df.loc[known, score_col] = best_score
        ticker_df.loc[known, path_col] = model_path
    else:
        new_row = pd.DataFrame({'Ticker': [ticker], score_col: [best_score], path_col: [model_path]})
        ticker_df = pd.concat([ticker_df, new_row], ignore_index=True)
    ticker_df.to_csv(csv_file_path, index=False)
    print(f"Best model for {ticker} saved with accuracy: {best_score}")

    # Save feature importances of the model that was just persisted.
    feature_importances = pd.DataFrame(best_model.feature_importances_, index=X.columns, columns=['importance'])
    feature_importances = feature_importances.sort_values(by='importance', ascending=False)
    feature_importances.to_csv(f'../feature-importances/xbclassifier/{ticker}_feature_importances.csv')
    print(f"Feature importances for {ticker} saved.")
    return ticker_df

def process_xbregressor(X, y, gpu_available, ticker, ticker_df):
    """Tune, train and persist an XGBoost regressor for one ticker.

    Runs a 100-trial Optuna search that minimises RMSE on a fixed 80/20
    hold-out split. If the best hold-out RMSE is lower than the score already
    recorded for ``ticker`` (or no score is recorded), the best
    hyper-parameters are refitted on all of ``X``/``y`` and the model, its
    feature importances and the updated registry CSV (module-level
    ``csv_file_path``) are written to disk. Otherwise nothing is written, so
    the files on disk always describe the model referenced by the registry.

    Args:
        X: Feature DataFrame.
        y: Continuous target aligned with ``X``.
        gpu_available: When truthy, trial models train with ``device='cuda'``.
        ticker: Ticker symbol; used in file names and as the registry key.
        ticker_df: Registry DataFrame with at least the columns ``Ticker``,
            ``Best_XGBRegressor_Score`` and ``Best_XGBRegressor_Path``.

    Returns:
        The registry DataFrame. An existing row is updated in place; a new
        ticker is appended, which yields a new DataFrame object.
    """
    score_col = 'Best_XGBRegressor_Score'
    path_col = 'Best_XGBRegressor_Path'

    # The split is seeded, hence identical for every trial: compute it once
    # instead of once per trial.
    # NOTE(review): the input appears to be dated price data; a shuffled split
    # can leak future information into training. Consider shuffle=False.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    # Settings shared by every trial.
    fixed_params = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'hist',
        'device': 'cuda' if gpu_available else 'cpu',
        'early_stopping_rounds': 50,
    }

    def objective(trial):
        """Train one candidate and return its hold-out RMSE."""
        param = {
            **fixed_params,
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        }
        model = XGBRegressor(**param)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
        return root_mean_squared_error(y_valid, model.predict(X_valid))

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=100)
    best_score = study.best_value

    # The regressor gets its own directory. Writing to the xbclassifier
    # directory under the same file name overwrote the classifier model saved
    # for this ticker. Registry paths recorded before this fix may still point
    # at the old location.
    model_path = f'../models/xgboost/xbregressor/{ticker}_best_model.pkl'

    # Decide first whether this run improves on the recorded model, so that a
    # rejected candidate neither costs a refit nor overwrites any file.
    known = ticker_df['Ticker'] == ticker
    if known.any():
        current_score = ticker_df.loc[known, score_col].values[0]
        if not (pd.isnull(current_score) or best_score < current_score):
            # The tracked metric is RMSE, so the message says RMSE (was "MSE").
            print(f"Previous model RMSE: {current_score} is better for {ticker} than RMSE: {best_score}")
            return ticker_df

    # Refit on all available data with the winning hyper-parameters. No
    # eval_set is passed: without early stopping, evaluating on the training
    # data itself does not influence the fitted model.
    # NOTE(review): trials used early stopping while this refit trains the
    # full n_estimators, so the recorded score is the trial's hold-out score,
    # not a score of this exact model.
    best_model = XGBRegressor(**study.best_params)
    best_model.fit(X, y)
    joblib.dump(best_model, model_path)

    # Update the registry only after the model is safely on disk.
    if known.any():
        ticker_df.loc[known, score_col] = best_score
        ticker_df.loc[known, path_col] = model_path
    else:
        new_row = pd.DataFrame({'Ticker': [ticker], score_col: [best_score], path_col: [model_path]})
        ticker_df = pd.concat([ticker_df, new_row], ignore_index=True)
    ticker_df.to_csv(csv_file_path, index=False)
    print(f"Best model for {ticker} saved with RMSE: {best_score}")

    # Save feature importances of the model that was just persisted.
    feature_importances = pd.DataFrame(best_model.feature_importances_, index=X.columns, columns=['importance'])
    feature_importances = feature_importances.sort_values(by='importance', ascending=False)
    feature_importances.to_csv(f'../feature-importances/xbregressor/{ticker}_feature_importances.csv')
    print(f"Feature importances for {ticker} saved.")

    return ticker_df

In [5]:
# Tune and persist a classifier and a regressor for every discovered ticker.
for ticker in ticker_list:
    # Read from the same directory that ticker_list was built from instead of
    # repeating the literal path.
    dataframe = pd.read_csv(os.path.join(path, f"{ticker}.csv"))
    X, y_classifier, y_regressor = preprocess_data(dataframe)

    # The 80/20 split used during tuning needs at least one training and one
    # validation row; skip this ticker instead of aborting the whole run.
    if len(X) < 2:
        print(f"Skipping {ticker}: fewer than 2 usable rows after preprocessing.")
        continue

    ticker_df = process_xbclassifier(X, y_classifier, gpu_available, ticker, ticker_df)
    ticker_df = process_xbregressor(X, y_regressor, gpu_available, ticker, ticker_df)


[I 2024-09-04 17:21:13,943] A new study created in memory with name: no-name-13258e9f-740a-448c-916a-9d8b70bfccdc
[I 2024-09-04 17:21:14,172] Trial 0 finished with value: 0.5757575757575758 and parameters: {'n_estimators': 606, 'max_depth': 6, 'learning_rate': 0.16956331361496488, 'subsample': 0.9203009077203326, 'colsample_bytree': 0.6101555051320551, 'gamma': 3.779047062879121, 'lambda': 2.760017590739315e-08, 'alpha': 0.023667174453073078}. Best is trial 0 with value: 0.5757575757575758.
[I 2024-09-04 17:21:14,337] Trial 1 finished with value: 0.45454545454545453 and parameters: {'n_estimators': 585, 'max_depth': 6, 'learning_rate': 0.1999863167140352, 'subsample': 0.5448079412664129, 'colsample_bytree': 0.7472671332448432, 'gamma': 3.668041890193729, 'lambda': 0.015388760988040678, 'alpha': 7.274431064359336e-07}. Best is trial 0 with value: 0.5757575757575758.
[I 2024-09-04 17:21:14,638] Trial 2 finished with value: 0.5757575757575758 and parameters: {'n_estimators': 966, 'max_dep

Best model for CL=F saved with accuracy: 0.7272727272727273
Feature importances for CL=F saved.


[I 2024-09-04 17:21:41,654] Trial 0 finished with value: 1.3964517820871156 and parameters: {'n_estimators': 582, 'max_depth': 8, 'learning_rate': 0.07373345564109023, 'subsample': 0.7082875858251951, 'colsample_bytree': 0.7766891422750345, 'gamma': 2.869085972892667, 'lambda': 0.00010458309038377258, 'alpha': 7.968844830716281e-05}. Best is trial 0 with value: 1.3964517820871156.
[I 2024-09-04 17:21:41,899] Trial 1 finished with value: 1.405363031754357 and parameters: {'n_estimators': 946, 'max_depth': 8, 'learning_rate': 0.16771276246656766, 'subsample': 0.8442963842118825, 'colsample_bytree': 0.7299348975002897, 'gamma': 2.0727039788174655, 'lambda': 0.007571902916875221, 'alpha': 8.675624874861393e-05}. Best is trial 0 with value: 1.3964517820871156.
[I 2024-09-04 17:21:42,091] Trial 2 finished with value: 1.383526042283853 and parameters: {'n_estimators': 752, 'max_depth': 3, 'learning_rate': 0.05674584114785739, 'subsample': 0.6703082277885326, 'colsample_bytree': 0.661874134384

Best model for CL=F saved with RMSE: 1.2384211499733446
Feature importances for CL=F saved.


[I 2024-09-04 17:22:10,205] Trial 0 finished with value: 0.5454545454545454 and parameters: {'n_estimators': 883, 'max_depth': 10, 'learning_rate': 0.1874868327221471, 'subsample': 0.7510918320842515, 'colsample_bytree': 0.8909289902423769, 'gamma': 3.0984428784034286, 'lambda': 0.005398612852136964, 'alpha': 0.07501776588091376}. Best is trial 0 with value: 0.5454545454545454.
[I 2024-09-04 17:22:10,458] Trial 1 finished with value: 0.5454545454545454 and parameters: {'n_estimators': 740, 'max_depth': 7, 'learning_rate': 0.050554474803945454, 'subsample': 0.6003644439853417, 'colsample_bytree': 0.7883872467571096, 'gamma': 3.8367242546440696, 'lambda': 1.2810916268420763e-06, 'alpha': 1.1631944291430427e-05}. Best is trial 0 with value: 0.5454545454545454.
[I 2024-09-04 17:22:10,676] Trial 2 finished with value: 0.48484848484848486 and parameters: {'n_estimators': 117, 'max_depth': 4, 'learning_rate': 0.24948029783361023, 'subsample': 0.8441745158753311, 'colsample_bytree': 0.74319064

Best model for NVDA saved with accuracy: 0.6666666666666666
Feature importances for NVDA saved.


[I 2024-09-04 17:22:39,334] Trial 0 finished with value: 4.190183554812025 and parameters: {'n_estimators': 976, 'max_depth': 6, 'learning_rate': 0.16694284627538222, 'subsample': 0.7978043940134718, 'colsample_bytree': 0.6558793157919459, 'gamma': 1.842669874819482, 'lambda': 2.6417142338505636e-08, 'alpha': 0.33711783593851175}. Best is trial 0 with value: 4.190183554812025.
[I 2024-09-04 17:22:39,702] Trial 1 finished with value: 4.394428999570039 and parameters: {'n_estimators': 841, 'max_depth': 9, 'learning_rate': 0.24698151072149463, 'subsample': 0.6576877666393659, 'colsample_bytree': 0.8885400732546669, 'gamma': 3.6242921833148274, 'lambda': 0.03896650284291031, 'alpha': 0.08781743638690474}. Best is trial 0 with value: 4.190183554812025.
[I 2024-09-04 17:22:40,005] Trial 2 finished with value: 4.170434683214046 and parameters: {'n_estimators': 237, 'max_depth': 9, 'learning_rate': 0.0628804647123689, 'subsample': 0.7751020011844258, 'colsample_bytree': 0.9886435932937855, 'ga

Best model for NVDA saved with RMSE: 3.882965776152511
Feature importances for NVDA saved.


[I 2024-09-04 17:23:13,864] Trial 0 finished with value: 0.4857142857142857 and parameters: {'n_estimators': 134, 'max_depth': 5, 'learning_rate': 0.14423280199726782, 'subsample': 0.759232577404015, 'colsample_bytree': 0.6328668838147248, 'gamma': 2.1539672282819278, 'lambda': 8.153733063827005e-07, 'alpha': 0.006774286012055272}. Best is trial 0 with value: 0.4857142857142857.
[I 2024-09-04 17:23:14,199] Trial 1 finished with value: 0.5714285714285714 and parameters: {'n_estimators': 744, 'max_depth': 7, 'learning_rate': 0.057435162860560995, 'subsample': 0.7471918073427022, 'colsample_bytree': 0.5805498860622179, 'gamma': 1.4551026733030787, 'lambda': 0.0007432056627284928, 'alpha': 0.0031071725627101325}. Best is trial 1 with value: 0.5714285714285714.
[I 2024-09-04 17:23:14,408] Trial 2 finished with value: 0.5714285714285714 and parameters: {'n_estimators': 529, 'max_depth': 4, 'learning_rate': 0.16246981355903112, 'subsample': 0.8102370818994652, 'colsample_bytree': 0.8269716017

Best model for SGDUSD=X saved with accuracy: 0.7142857142857143
Feature importances for SGDUSD=X saved.


[I 2024-09-04 17:23:46,859] Trial 0 finished with value: 0.00159216216667633 and parameters: {'n_estimators': 394, 'max_depth': 9, 'learning_rate': 0.15852924235460716, 'subsample': 0.8149083425546071, 'colsample_bytree': 0.7581825399721767, 'gamma': 1.9110547229747255, 'lambda': 0.0037744449773767035, 'alpha': 9.964644934324446e-08}. Best is trial 0 with value: 0.00159216216667633.
[I 2024-09-04 17:23:47,149] Trial 1 finished with value: 0.0015936041556869956 and parameters: {'n_estimators': 953, 'max_depth': 10, 'learning_rate': 0.058425692476956184, 'subsample': 0.8403223478213173, 'colsample_bytree': 0.6702461761329888, 'gamma': 4.062740927000903, 'lambda': 0.021342990455426085, 'alpha': 2.6551788946394406e-07}. Best is trial 0 with value: 0.00159216216667633.
[I 2024-09-04 17:23:47,349] Trial 2 finished with value: 0.0015891296093400171 and parameters: {'n_estimators': 914, 'max_depth': 10, 'learning_rate': 0.20492445336490814, 'subsample': 0.5479239690715003, 'colsample_bytree': 

Best model for SGDUSD=X saved with RMSE: 0.0015891294385095805
Feature importances for SGDUSD=X saved.


[I 2024-09-04 17:24:16,424] Trial 0 finished with value: 0.5714285714285714 and parameters: {'n_estimators': 972, 'max_depth': 7, 'learning_rate': 0.027865174579078493, 'subsample': 0.6999385470383755, 'colsample_bytree': 0.5618672653440497, 'gamma': 2.934410917954826, 'lambda': 0.00024618394867087186, 'alpha': 8.494645902053341e-05}. Best is trial 0 with value: 0.5714285714285714.
[I 2024-09-04 17:24:16,709] Trial 1 finished with value: 0.6285714285714286 and parameters: {'n_estimators': 270, 'max_depth': 5, 'learning_rate': 0.23551782818838138, 'subsample': 0.8095689800655763, 'colsample_bytree': 0.5743763993795208, 'gamma': 3.2168248236891293, 'lambda': 0.32917672876509024, 'alpha': 0.5662872678999222}. Best is trial 1 with value: 0.6285714285714286.
[I 2024-09-04 17:24:16,921] Trial 2 finished with value: 0.5714285714285714 and parameters: {'n_estimators': 933, 'max_depth': 3, 'learning_rate': 0.24993289047355305, 'subsample': 0.8664881992347553, 'colsample_bytree': 0.8264751520389

Best model for USDSGD=X saved with accuracy: 0.7142857142857143
Feature importances for USDSGD=X saved.


[I 2024-09-04 17:24:48,726] Trial 0 finished with value: 0.0028358642979819924 and parameters: {'n_estimators': 984, 'max_depth': 7, 'learning_rate': 0.17943402224141092, 'subsample': 0.9308315127444975, 'colsample_bytree': 0.9138007930054622, 'gamma': 1.715987122966887, 'lambda': 0.007693102122874307, 'alpha': 7.867399175637331e-05}. Best is trial 0 with value: 0.0028358642979819924.
[I 2024-09-04 17:24:48,912] Trial 1 finished with value: 0.002837450811333283 and parameters: {'n_estimators': 646, 'max_depth': 3, 'learning_rate': 0.2982946217773021, 'subsample': 0.7794521954612621, 'colsample_bytree': 0.6199442369098542, 'gamma': 3.5952381896911794, 'lambda': 9.07940721372232e-06, 'alpha': 0.024289899162864902}. Best is trial 0 with value: 0.0028358642979819924.
[I 2024-09-04 17:24:49,152] Trial 2 finished with value: 0.0028328796391671454 and parameters: {'n_estimators': 202, 'max_depth': 5, 'learning_rate': 0.23801650108575426, 'subsample': 0.8854714865333948, 'colsample_bytree': 0.

Best model for USDSGD=X saved with RMSE: 0.0028284929941148032
Feature importances for USDSGD=X saved.
