In [1]:
import os
import numpy as np
import pandas as pd
import xgboost

import optuna
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, root_mean_squared_error 
import joblib

# os.cpu_count() returns None when the number of CPUs cannot be determined.
# Fall back to a single core in that case so the integer division below cannot
# fail with `TypeError: unsupported operand type(s) for //: 'NoneType' and 'int'`.
logical_cores = os.cpu_count() or 1
print(f"Number of logical CPU cores: {logical_cores}")

# Use half of the logical cores, but never fewer than one worker.
num_workers = max(1, logical_cores // 2)
print(f"Number of workers set to: {num_workers}")

def is_gpu_available():
    """
    Report whether PyTorch can see a CUDA device.

    Returns:
    bool: The result of `torch.cuda.is_available()`, or False when an
    ImportError is raised (for example because PyTorch is not installed).
    """
    cuda_visible = False
    try:
        # Imported lazily so the notebook still runs without PyTorch installed.
        import torch
        cuda_visible = torch.cuda.is_available()
    except ImportError:
        # No PyTorch: keep the default answer of "no GPU".
        pass
    return cuda_visible

# Probe for a GPU once; the hyperparameter-search functions receive this flag
# and use it to choose between the 'cuda' and 'cpu' XGBoost device.
gpu_available = is_gpu_available()
print(f"GPU available: {gpu_available}")

# Print the build configuration of the installed XGBoost library (the output
# includes flags such as USE_CUDA and the CUDA version it was built against).
print(xgboost.build_info())

Number of logical CPU cores: 16
Number of workers set to: 8
GPU available: True
{'BUILTIN_PREFETCH_PRESENT': False, 'CUDA_VERSION': [12, 4], 'DEBUG': False, 'MM_PREFETCH_PRESENT': True, 'THRUST_VERSION': [2, 3, 2], 'USE_CUDA': True, 'USE_DLOPEN_NCCL': False, 'USE_FEDERATED': False, 'USE_NCCL': False, 'USE_OPENMP': True, 'USE_RMM': False, 'libxgboost': 'C:\\Users\\ng_mi\\Anaconda\\envs\\pytorch-gpu\\Lib\\site-packages\\xgboost\\lib\\xgboost.dll'}


In [2]:
def load_or_create_ticker_df(csv_file_path):
    """
    Load the ticker best-model table from a CSV file, or create an empty one.

    If the file exists it is read, and any column from the predefined schema
    that is missing is added (filled with missing values). If the file does not
    exist, an empty DataFrame with the predefined columns and dtypes is created.
    In both cases the returned columns are 'Ticker_Symbol' first, followed by
    all remaining columns in alphabetical order. Columns present in the file
    but not in the schema are kept.

    Args:
    csv_file_path (str): The path to the CSV file.

    Returns:
    pd.DataFrame: The loaded or newly created DataFrame.
    """
    # Define the column types
    column_types = {
        "Ticker_Symbol": str,
        "Best_Cov1D_Classification_Accuracy": float,
        "Best_Cov1D_Classification_Path": str,
        "Best_Cov1D_Regression_RMSE": float,
        "Best_Cov1D_Regression_Path": str,
        "Best_LSTM_Classification_Accuracy": float,
        "Best_LSTM_Classification_Path": str,
        "Best_LSTM_Regression_RMSE": float,
        "Best_LSTM_Regression_Path": str,
        "Best_Transformer_Classification_Accuracy": float,
        # NOTE(review): this key lacks the underscore between "Transformer" and
        # "Classification" that its sibling columns have. It is left unchanged
        # because other code may already read/write this exact column name --
        # confirm before renaming.
        "Best_TransformerClassification_Path": str,
        "Best_Transformer_Regression_RMSE": float,
        "Best_Transformer_Regression_Path": str,
        "Best_XGBClassifier_Classification_Accuracy": float,
        "Best_XGBClassifier_Classification_Path": str,
        "Best_XGBRegressor_Regression_RMSE": float,
        "Best_XGBRegressor_Regression_Path": str
    }

    if os.path.isfile(csv_file_path):
        # Load the existing file. Read the ticker symbol as text so that
        # numeric-looking symbols are not turned into numbers (which would
        # drop leading zeros and break later `== ticker` lookups).
        ticker_df = pd.read_csv(
            csv_file_path,
            dtype={"Ticker_Symbol": column_types["Ticker_Symbol"]},
        )

        # Ensure all specified columns are present
        for column, dtype in column_types.items():
            if column not in ticker_df.columns:
                ticker_df[column] = pd.Series(dtype=dtype)
    else:
        # Create a new, empty DataFrame with the specified column types
        ticker_df = pd.DataFrame(columns=column_types.keys()).astype(column_types)

    # Apply the same column order on both paths: 'Ticker_Symbol' first, the
    # rest alphabetically. (Previously only the loaded table was reordered.)
    ordered_columns = ["Ticker_Symbol"] + sorted(
        col for col in ticker_df.columns if col != "Ticker_Symbol"
    )
    return ticker_df[ordered_columns]

# Location of the per-ticker best-model table; the same path is passed to the
# search functions, which write the table back to it after an update.
csv_file_path = "../ticker-best-model.csv"
# Load the existing table, or start from an empty one if the file is absent.
ticker_df = load_or_create_ticker_df(csv_file_path)

In [3]:
# Make sure the output directories for feature importances and pickled models
# exist before any search writes to them.
os.makedirs('../feature-importances/xbclassifier', exist_ok=True)
os.makedirs('../feature-importances/xbregressor', exist_ok=True)
os.makedirs('../models/xgboost/xbclassifier', exist_ok=True)
os.makedirs('../models/xgboost/xbregressor', exist_ok=True)

# Directory holding one training CSV per ticker.
path = '../data/train'

ticker_list = []

# Only list the path when it is really a directory (a plain file here would
# make os.listdir raise). os.listdir returns names in arbitrary order, so sort
# them to make the processing order the same on every run and platform.
if os.path.isdir(path):
    ticker_list = sorted(os.path.splitext(f)[0] for f in os.listdir(path) if f.endswith('.csv'))


In [4]:
def preprocess_data(df):
    """
    Clean a raw ticker frame and split it into features and targets.

    Infinite values are treated as missing, and every row containing a missing
    value is dropped. The price/target columns and 'Date' are then removed to
    form the feature matrix.

    Args:
    df (pd.DataFrame): Frame containing the columns listed in
        `non_feature_columns` below plus the feature columns.

    Returns:
    tuple: (X, y_classifier, y_regressor) where X is the feature frame,
        y_classifier is 1 when 'DAILY_CLOSEPRICE_CHANGE' is positive and 0
        otherwise, and y_regressor is 'DAILY_CLOSEPRICE_CHANGE' itself.
    """
    infinities = [float('inf'), float('-inf')]

    # Only rewrite the frame when there is something to clean up.
    needs_cleanup = df.isna().any().any() or df.isin(infinities).any().any()
    if needs_cleanup:
        df = df.replace(infinities, float('nan'))
    cleaned = df.dropna()

    non_feature_columns = [
        'NEXT_DAY_CLOSEPRICE', 'DAILY_CLOSEPRICE_CHANGE', 'CLOSEPRICE_DIRECTION',
        'DAILY_MIDPRICE', 'NEXT_DAY_MIDPRICE', 'DAILY_MIDPRICE_CHANGE', 'MIDPRICE_DIRECTION', 'Date'
    ]
    features = cleaned.drop(columns=non_feature_columns)

    price_change = cleaned['DAILY_CLOSEPRICE_CHANGE']
    direction = (price_change > 0).astype(int)

    return features, direction, price_change

In [5]:
def _run_xgb_search(model_cls, fixed_params, score_fn, direction, X, y, n_trials):
    """
    Tune an XGBoost estimator with Optuna and refit the best configuration.

    Args:
    model_cls: Estimator class to tune (XGBClassifier or XGBRegressor).
    fixed_params (dict): Constructor arguments that are the same in every trial.
    score_fn: Callable `score_fn(y_true, y_pred)` returning the trial score.
    direction (str): 'maximize' or 'minimize', passed to `optuna.create_study`.
    X, y: Features and target.
    n_trials (int): Number of Optuna trials.

    Returns:
    tuple: (best_value, best_model). `best_value` is the best validation score
        seen during the search. `best_model` is a new estimator built from the
        best trial's suggested parameters only and fitted on all of (X, y), so
        `best_value` was not measured on `best_model` itself.
    """
    # The split is seeded, so it is identical for every trial: compute it once.
    # NOTE(review): train_test_split shuffles rows; if the rows are time-ordered
    # the validation score may be optimistic -- confirm this is intended.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    def objective(trial):
        params = {
            **fixed_params,
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
            'early_stopping_rounds': 50,
        }
        model = model_cls(**params)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
        return score_fn(y_valid, model.predict(X_valid))

    study = optuna.create_study(direction=direction)
    study.optimize(objective, n_trials=n_trials)

    # Refit on the full data set with the winning hyperparameters.
    best_model = model_cls(**study.best_params)
    best_model.fit(X, y, eval_set=[(X, y)], verbose=False)
    return study.best_value, best_model


def _persist_if_improved(best_model, best_value, feature_names, ticker, ticker_df, csv_file_path,
                         score_column, path_column, artifact_dir, metric_label, higher_is_better):
    """
    Save the model and its feature importances if it beats the recorded score.

    The model is saved when the ticker has no row yet, when its recorded score
    is missing, or when `best_value` is strictly better than the recorded one.
    On save, the table is updated and written to `csv_file_path`, the model is
    pickled with joblib, and the feature importances are written as CSV.

    Returns:
    pd.DataFrame: The table to keep using. It is a new object when a row had to
        be appended; otherwise it is `ticker_df` itself (updated in place).
    """
    model_path = f'../models/xgboost/{artifact_dir}/{ticker}_best_model.pkl'
    importances_path = f'../feature-importances/{artifact_dir}/{ticker}_feature_importances.csv'

    ticker_rows = ticker_df['Ticker_Symbol'] == ticker
    if ticker_rows.any():
        current_score = ticker_df.loc[ticker_rows, score_column].values[0]
        # Check for a missing score first so it is never compared numerically.
        is_better = pd.isnull(current_score) or (
            best_value > current_score if higher_is_better else best_value < current_score
        )
        if not is_better:
            print(f"Previous model {metric_label}: {current_score} is better for {ticker} than {metric_label}: {best_value}")
            return ticker_df
        ticker_df.loc[ticker_rows, [score_column, path_column]] = [best_value, model_path]
    else:
        new_row = pd.DataFrame({'Ticker_Symbol': [ticker], score_column: [best_value], path_column: [model_path]})
        ticker_df = pd.concat([ticker_df, new_row], ignore_index=True)

    # Create the target directories on demand so this function does not depend
    # on another cell having been run first.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    os.makedirs(os.path.dirname(importances_path), exist_ok=True)

    joblib.dump(best_model, model_path)
    ticker_df.to_csv(csv_file_path, index=False)
    print(f"Best model for {ticker} saved with {metric_label}: {best_value}")

    # Save feature importances, most important first
    feature_importances = pd.DataFrame(best_model.feature_importances_, index=feature_names, columns=['importance'])
    feature_importances = feature_importances.sort_values(by='importance', ascending=False)
    feature_importances.to_csv(importances_path)
    print(f"Feature importances for {ticker} saved.")

    return ticker_df


def xbclassifier_hyperparameters_search(X, y, gpu_available, ticker, ticker_df, csv_file_path, n_trials=100):
    """
    Tune an XGBClassifier for one ticker and record it if it is the best so far.

    Args:
    X (pd.DataFrame): Feature matrix.
    y: Binary target.
    gpu_available (bool): Train trials on 'cuda' when True, otherwise on 'cpu'.
    ticker (str): Ticker symbol; used in file names and as the table key.
    ticker_df (pd.DataFrame): Best-model table with a 'Ticker_Symbol' column.
    csv_file_path (str): Where the table is written after an update.
    n_trials (int): Number of Optuna trials (default 100).

    Returns:
    pd.DataFrame: The updated table; callers must use the returned object.
    """
    # `use_label_encoder` is intentionally not passed: it is obsolete in the
    # XGBoost releases that accept the `device` argument used here.
    fixed_params = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'tree_method': 'hist',
        'device': 'cuda' if gpu_available else 'cpu',
    }
    best_value, best_model = _run_xgb_search(
        XGBClassifier, fixed_params, accuracy_score, 'maximize', X, y, n_trials
    )
    return _persist_if_improved(
        best_model, best_value, X.columns, ticker, ticker_df, csv_file_path,
        score_column='Best_XGBClassifier_Classification_Accuracy',
        path_column='Best_XGBClassifier_Classification_Path',
        artifact_dir='xbclassifier',
        metric_label='accuracy',
        higher_is_better=True,
    )


def xbregressor_hyperparameters_search(X, y, gpu_available, ticker, ticker_df, csv_file_path, n_trials=100):
    """
    Tune an XGBRegressor for one ticker and record it if it is the best so far.

    Args:
    X (pd.DataFrame): Feature matrix.
    y: Continuous target.
    gpu_available (bool): Train trials on 'cuda' when True, otherwise on 'cpu'.
    ticker (str): Ticker symbol; used in file names and as the table key.
    ticker_df (pd.DataFrame): Best-model table with a 'Ticker_Symbol' column.
    csv_file_path (str): Where the table is written after an update.
    n_trials (int): Number of Optuna trials (default 100).

    Returns:
    pd.DataFrame: The updated table; callers must use the returned object.
    """
    fixed_params = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'hist',
        'device': 'cuda' if gpu_available else 'cpu',
    }
    best_value, best_model = _run_xgb_search(
        XGBRegressor, fixed_params, root_mean_squared_error, 'minimize', X, y, n_trials
    )
    # The regressor is stored under 'xbregressor'. It used to be written to the
    # classifier's directory under the same file name, which overwrote the
    # classifier model saved for the same ticker. Table rows written before this
    # fix may still point at the old location.
    return _persist_if_improved(
        best_model, best_value, X.columns, ticker, ticker_df, csv_file_path,
        score_column='Best_XGBRegressor_Regression_RMSE',
        path_column='Best_XGBRegressor_Regression_Path',
        artifact_dir='xbregressor',
        metric_label='RMSE',
        higher_is_better=False,
    )

In [7]:
# Run both hyperparameter searches for every ticker that has training data.
for ticker in ticker_list:
    dataframe = pd.read_csv(f"../data/train/{ticker}.csv")
    X, y_classifier, y_regressor = preprocess_data(dataframe)

    # Classifier first, then regressor. Each search returns the table to carry
    # forward, so the result is fed into the next call.
    for _search_fn, _target in (
        (xbclassifier_hyperparameters_search, y_classifier),
        (xbregressor_hyperparameters_search, y_regressor),
    ):
        ticker_df = _search_fn(X, _target, gpu_available, ticker, ticker_df, csv_file_path)


[I 2024-09-07 15:43:57,822] A new study created in memory with name: no-name-e6378d11-42e1-405e-9e5d-f3bd7c5f524c
[I 2024-09-07 15:43:58,455] Trial 0 finished with value: 0.6056338028169014 and parameters: {'n_estimators': 932, 'max_depth': 10, 'learning_rate': 0.160530906905138, 'subsample': 0.7995899263617309, 'colsample_bytree': 0.775397549311085, 'gamma': 1.2318037929822678, 'lambda': 0.141448092178053, 'alpha': 0.011448866478510967}. Best is trial 0 with value: 0.6056338028169014.
[I 2024-09-07 15:43:58,790] Trial 1 finished with value: 0.6619718309859155 and parameters: {'n_estimators': 163, 'max_depth': 9, 'learning_rate': 0.12071778031698321, 'subsample': 0.9195232731071336, 'colsample_bytree': 0.7815784854893654, 'gamma': 1.8593423407261405, 'lambda': 0.3243026095382506, 'alpha': 6.860056353739401e-07}. Best is trial 1 with value: 0.6619718309859155.
[I 2024-09-07 15:43:59,347] Trial 2 finished with value: 0.6619718309859155 and parameters: {'n_estimators': 397, 'max_depth': 7

Best model for CL=F saved with accuracy: 0.704225352112676
Feature importances for CL=F saved.


[I 2024-09-07 15:44:45,152] Trial 0 finished with value: 1.6614497783817446 and parameters: {'n_estimators': 854, 'max_depth': 3, 'learning_rate': 0.2411711586402869, 'subsample': 0.546181969476544, 'colsample_bytree': 0.5718828814925896, 'gamma': 0.9115192300289998, 'lambda': 0.0431822970331042, 'alpha': 0.000997144755928802}. Best is trial 0 with value: 1.6614497783817446.
[I 2024-09-07 15:44:45,948] Trial 1 finished with value: 1.52909099280126 and parameters: {'n_estimators': 314, 'max_depth': 7, 'learning_rate': 0.0699999802005318, 'subsample': 0.7620379485212441, 'colsample_bytree': 0.700550441988826, 'gamma': 0.39335857360481563, 'lambda': 3.447237767817022e-05, 'alpha': 4.462039537187048e-05}. Best is trial 1 with value: 1.52909099280126.
[I 2024-09-07 15:44:46,511] Trial 2 finished with value: 1.5884277588586069 and parameters: {'n_estimators': 614, 'max_depth': 9, 'learning_rate': 0.0298954324322436, 'subsample': 0.9520735563728275, 'colsample_bytree': 0.6330190768845038, 'ga

Best model for CL=F saved with RMSE: 1.4742101366755396
Feature importances for CL=F saved.


[I 2024-09-07 15:45:40,488] Trial 0 finished with value: 0.6619718309859155 and parameters: {'n_estimators': 964, 'max_depth': 8, 'learning_rate': 0.204761815860461, 'subsample': 0.7117042514894225, 'colsample_bytree': 0.9404068312931564, 'gamma': 2.780412595179189, 'lambda': 0.008120595345021653, 'alpha': 5.038845982620284e-07}. Best is trial 0 with value: 0.6619718309859155.
[I 2024-09-07 15:45:40,682] Trial 1 finished with value: 0.5492957746478874 and parameters: {'n_estimators': 824, 'max_depth': 7, 'learning_rate': 0.24624448338247232, 'subsample': 0.6570315816245667, 'colsample_bytree': 0.7714883551427911, 'gamma': 2.8337761864699225, 'lambda': 5.131833794314674e-05, 'alpha': 4.544633117147972e-07}. Best is trial 0 with value: 0.6619718309859155.
[I 2024-09-07 15:45:40,918] Trial 2 finished with value: 0.6197183098591549 and parameters: {'n_estimators': 619, 'max_depth': 4, 'learning_rate': 0.16036959377402013, 'subsample': 0.5268838228197368, 'colsample_bytree': 0.5296050025807

Best model for NVDA saved with accuracy: 0.704225352112676
Feature importances for NVDA saved.


[I 2024-09-07 15:46:16,971] Trial 0 finished with value: 1.5441263942719918 and parameters: {'n_estimators': 187, 'max_depth': 7, 'learning_rate': 0.03888655050440744, 'subsample': 0.6720373295011091, 'colsample_bytree': 0.5379543836003501, 'gamma': 3.419179819713091, 'lambda': 2.511512986828567e-08, 'alpha': 0.3154242618850718}. Best is trial 0 with value: 1.5441263942719918.
[I 2024-09-07 15:46:17,193] Trial 1 finished with value: 1.4649343936999997 and parameters: {'n_estimators': 799, 'max_depth': 10, 'learning_rate': 0.21184491183681645, 'subsample': 0.8472726659211801, 'colsample_bytree': 0.9181760871451983, 'gamma': 1.3053458985955368, 'lambda': 0.0007176193830055852, 'alpha': 0.015002835449651697}. Best is trial 1 with value: 1.4649343936999997.
[I 2024-09-07 15:46:17,496] Trial 2 finished with value: 1.4890525111060924 and parameters: {'n_estimators': 543, 'max_depth': 8, 'learning_rate': 0.10989702469997017, 'subsample': 0.5509349477445775, 'colsample_bytree': 0.8536197179788

Best model for NVDA saved with RMSE: 1.4184179665765868
Feature importances for NVDA saved.


[I 2024-09-07 15:46:45,068] Trial 0 finished with value: 0.6351351351351351 and parameters: {'n_estimators': 814, 'max_depth': 3, 'learning_rate': 0.06463389136381073, 'subsample': 0.8281932456750135, 'colsample_bytree': 0.8963292733392156, 'gamma': 2.0785724889144856, 'lambda': 0.14479008241939517, 'alpha': 0.0021799159979326846}. Best is trial 0 with value: 0.6351351351351351.
[I 2024-09-07 15:46:45,326] Trial 1 finished with value: 0.6216216216216216 and parameters: {'n_estimators': 578, 'max_depth': 4, 'learning_rate': 0.2734296318043481, 'subsample': 0.5954148797678777, 'colsample_bytree': 0.8330523883305605, 'gamma': 0.3938301893895335, 'lambda': 1.0122359680850358e-06, 'alpha': 6.271989249396495e-06}. Best is trial 0 with value: 0.6351351351351351.
[I 2024-09-07 15:46:45,621] Trial 2 finished with value: 0.6081081081081081 and parameters: {'n_estimators': 479, 'max_depth': 10, 'learning_rate': 0.07497586178270461, 'subsample': 0.6054325858735565, 'colsample_bytree': 0.6653634058

Best model for SGDUSD=X saved with accuracy: 0.7297297297297297
Feature importances for SGDUSD=X saved.


[I 2024-09-07 15:47:16,409] Trial 0 finished with value: 0.0018226798089726458 and parameters: {'n_estimators': 613, 'max_depth': 3, 'learning_rate': 0.06182778636135682, 'subsample': 0.789312005378769, 'colsample_bytree': 0.5656915383851486, 'gamma': 2.453883183674355, 'lambda': 0.0001888732798543952, 'alpha': 6.943831238168346e-07}. Best is trial 0 with value: 0.0018226798089726458.
[I 2024-09-07 15:47:16,707] Trial 1 finished with value: 0.001821852899410882 and parameters: {'n_estimators': 600, 'max_depth': 9, 'learning_rate': 0.14234101531580418, 'subsample': 0.7482267803672994, 'colsample_bytree': 0.674407239056898, 'gamma': 0.2882535446021517, 'lambda': 0.06300601086733697, 'alpha': 1.4640140007557126e-07}. Best is trial 1 with value: 0.001821852899410882.
[I 2024-09-07 15:47:16,981] Trial 2 finished with value: 0.0018226778828216716 and parameters: {'n_estimators': 619, 'max_depth': 8, 'learning_rate': 0.10292748879544358, 'subsample': 0.6733464296683646, 'colsample_bytree': 0.

Best model for SGDUSD=X saved with RMSE: 0.001819421929577254
Feature importances for SGDUSD=X saved.


[I 2024-09-07 15:47:45,976] Trial 0 finished with value: 0.6351351351351351 and parameters: {'n_estimators': 692, 'max_depth': 10, 'learning_rate': 0.14466862351795834, 'subsample': 0.8839714319744785, 'colsample_bytree': 0.6868863986594489, 'gamma': 2.063201273682101, 'lambda': 1.073516166416603e-08, 'alpha': 4.859127890417185e-08}. Best is trial 0 with value: 0.6351351351351351.
[I 2024-09-07 15:47:46,199] Trial 1 finished with value: 0.5405405405405406 and parameters: {'n_estimators': 615, 'max_depth': 10, 'learning_rate': 0.26461236042963204, 'subsample': 0.9972490261559565, 'colsample_bytree': 0.8613098708208835, 'gamma': 0.6841505995145769, 'lambda': 0.0006575904543099559, 'alpha': 0.0015528161756816618}. Best is trial 0 with value: 0.6351351351351351.
[I 2024-09-07 15:47:46,484] Trial 2 finished with value: 0.6621621621621622 and parameters: {'n_estimators': 283, 'max_depth': 4, 'learning_rate': 0.12744999317625405, 'subsample': 0.8815337007090477, 'colsample_bytree': 0.54265373

Best model for USDSGD=X saved with accuracy: 0.7567567567567568
Feature importances for USDSGD=X saved.


[I 2024-09-07 15:48:14,253] Trial 0 finished with value: 0.003283195428258382 and parameters: {'n_estimators': 369, 'max_depth': 9, 'learning_rate': 0.15931796269859963, 'subsample': 0.6813595432316073, 'colsample_bytree': 0.6187112468323195, 'gamma': 2.507514312455043, 'lambda': 0.00025384518175037646, 'alpha': 3.452599240302028e-07}. Best is trial 0 with value: 0.003283195428258382.
[I 2024-09-07 15:48:14,556] Trial 1 finished with value: 0.003284778590237298 and parameters: {'n_estimators': 294, 'max_depth': 4, 'learning_rate': 0.252031090611467, 'subsample': 0.7309191600306024, 'colsample_bytree': 0.7825626853541006, 'gamma': 3.0342572835207084, 'lambda': 4.362253901128992e-07, 'alpha': 6.669198315842487e-06}. Best is trial 0 with value: 0.003283195428258382.
[I 2024-09-07 15:48:14,867] Trial 2 finished with value: 0.003285026268977276 and parameters: {'n_estimators': 362, 'max_depth': 3, 'learning_rate': 0.21876076329807462, 'subsample': 0.7422213464183547, 'colsample_bytree': 0.8

Best model for USDSGD=X saved with RMSE: 0.0032816143421020745
Feature importances for USDSGD=X saved.
