In [1]:
import os
import numpy as np
import pandas as pd
import xgboost

import optuna
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, root_mean_squared_error 
import joblib

# os.cpu_count() returns None when the core count cannot be determined;
# fall back to a single core so the integer division below cannot fail.
logical_cores = os.cpu_count() or 1
print(f"Number of logical CPU cores: {logical_cores}")

# Use half of the logical cores, but never fewer than one worker.
num_workers = max(1, logical_cores // 2)
print(f"Number of workers set to: {num_workers}")

def is_gpu_available():
    """
    Report whether PyTorch can see a CUDA device.

    Returns:
    bool: The result of ``torch.cuda.is_available()``, or False when an
    ImportError is raised (e.g. PyTorch is not installed).
    """
    cuda_ready = False
    try:
        # Imported lazily so the notebook still runs without PyTorch installed.
        import torch
        cuda_ready = torch.cuda.is_available()
    except ImportError:
        pass
    return cuda_ready

# Module-level flag; the hyperparameter search functions below receive it and
# use it to choose between device='cuda' and device='cpu'.
# NOTE(review): this reflects PyTorch's view of CUDA, not XGBoost's — check the
# 'USE_CUDA' entry in the build info printed below to confirm XGBoost itself
# was built with CUDA support.
gpu_available = is_gpu_available()
print(f"GPU available: {gpu_available}")

# Print the compile-time configuration of the installed XGBoost library.
print(xgboost.build_info())

Number of logical CPU cores: 16
Number of workers set to: 8
GPU available: True
{'BUILTIN_PREFETCH_PRESENT': False, 'CUDA_VERSION': [12, 4], 'DEBUG': False, 'MM_PREFETCH_PRESENT': True, 'THRUST_VERSION': [2, 3, 2], 'USE_CUDA': True, 'USE_DLOPEN_NCCL': False, 'USE_FEDERATED': False, 'USE_NCCL': False, 'USE_OPENMP': True, 'USE_RMM': False, 'libxgboost': 'C:\\Users\\ng_mi\\Anaconda\\envs\\pytorch-gpu\\Lib\\site-packages\\xgboost\\lib\\xgboost.dll'}


In [2]:
def load_or_create_ticker_df(csv_file_path):
    """
    Load the existing ticker DataFrame from a CSV file if it exists,
    otherwise create a new, empty DataFrame with predefined column types.

    When the file exists, any expected column that is missing is added (empty),
    and the columns are rearranged so 'Ticker_Symbol' comes first followed by
    every other column in alphabetical order. Columns present in the file but
    not in the predefined schema are kept.

    String-typed columns that are present in the file but completely empty are
    parsed by pandas as float64 (all NaN). They are cast to object dtype here so
    that later assignments of path strings into them do not rely on pandas'
    deprecated silent upcasting.

    Args:
    csv_file_path (str): The path to the CSV file.

    Returns:
    pd.DataFrame: The loaded or newly created DataFrame.
    """
    # Define the column types
    # NOTE(review): "Best_TransformerClassification_Path" lacks the underscore
    # used by every sibling key. It is left unchanged because the name is part
    # of the on-disk CSV schema; confirm against other writers before renaming.
    column_types = {
        "Ticker_Symbol": str,
        "Best_Cov1D_Classification_Accuracy": float,
        "Best_Cov1D_Classification_Path": str,
        "Best_Cov1D_Regression_RMSE": float,
        "Best_Cov1D_Regression_Path": str,
        "Best_LSTM_Classification_Accuracy": float,
        "Best_LSTM_Classification_Path": str,
        "Best_LSTM_Regression_RMSE": float,
        "Best_LSTM_Regression_Path": str,
        "Best_Transformer_Classification_Accuracy": float,
        "Best_TransformerClassification_Path": str,
        "Best_Transformer_Regression_RMSE": float,
        "Best_Transformer_Regression_Path": str,
        "Best_XGBClassifier_Classification_Accuracy": float,
        "Best_XGBClassifier_Classification_Path": str,
        "Best_XGBRegressor_Regression_RMSE": float,
        "Best_XGBRegressor_Regression_Path": str
    }

    if os.path.isfile(csv_file_path):
        # Load the existing file into a DataFrame
        ticker_df = pd.read_csv(csv_file_path)

        for column, dtype in column_types.items():
            if column not in ticker_df.columns:
                # Ensure all specified columns are present
                ticker_df[column] = pd.Series(dtype=dtype)
            elif dtype is str and pd.api.types.is_float_dtype(ticker_df[column]):
                # An all-empty text column is inferred as float64 by read_csv;
                # make it object so strings can be stored in it later.
                ticker_df[column] = ticker_df[column].astype(object)

        # Reorder columns alphabetically, excluding 'Ticker_Symbol'
        columns = ["Ticker_Symbol"] + sorted([col for col in ticker_df.columns if col != "Ticker_Symbol"])
        ticker_df = ticker_df[columns]
    else:
        # Create a new DataFrame with the specified column types
        # (columns stay in declaration order in this branch).
        ticker_df = pd.DataFrame(columns=column_types.keys()).astype(column_types)

    return ticker_df

# Location of the per-ticker summary CSV; the same path is passed to the search
# functions below, which write the updated table back to it.
csv_file_path = "../ticker-best-model.csv"
# Module-level table that the driver loop threads through every search call.
ticker_df = load_or_create_ticker_df(csv_file_path)

In [3]:
# Make sure every output folder exists before any model or importance file is written.
for output_dir in (
    '../feature-importances/xbclassifier',
    '../feature-importances/xbregressor',
    '../models/xgboost/xbclassifier',
    '../models/xgboost/xbregressor',
):
    os.makedirs(output_dir, exist_ok=True)

path = '../data/train'

# One ticker per CSV file in the training folder (file name without extension);
# the list is empty when the folder does not exist.
ticker_list = (
    [os.path.splitext(csv_name)[0] for csv_name in os.listdir(path) if csv_name.endswith('.csv')]
    if os.path.exists(path)
    else []
)


In [4]:
def preprocess_data(df):
    """
    Clean a per-ticker frame and split it into features and two targets.

    Rows containing NaN or +/-infinity in any column are removed. The feature
    matrix is what remains after dropping the price/target columns and 'Date'.

    Args:
    df (pd.DataFrame): Frame containing at least the columns dropped below.

    Returns:
    tuple: (X, y_classifier, y_regressor) where y_regressor is the
    'DAILY_CLOSEPRICE_CHANGE' column and y_classifier is 1 where that change
    is strictly positive, else 0.
    """
    non_finite = [float('inf'), float('-inf')]

    # Turn infinities into NaN first so one dropna() removes every unusable row.
    if df.isna().to_numpy().any() or df.isin(non_finite).to_numpy().any():
        df = df.replace(non_finite, float('nan'))
    df = df.dropna()

    # Everything listed here is a target, a target-derived value, or the date.
    non_feature_columns = [
        'NEXT_DAY_CLOSEPRICE', 'DAILY_CLOSEPRICE_CHANGE', 'CLOSEPRICE_DIRECTION',
        'DAILY_MIDPRICE', 'NEXT_DAY_MIDPRICE', 'DAILY_MIDPRICE_CHANGE', 'MIDPRICE_DIRECTION', 'Date'
    ]
    X = df.drop(columns=non_feature_columns)

    y_regressor = df['DAILY_CLOSEPRICE_CHANGE']
    y_classifier = (y_regressor > 0).astype(int)

    return X, y_classifier, y_regressor

In [5]:
def xbclassifier_hyperparameters_search(X, y, gpu_available, ticker_symbol, ticker_df, csv_file_path):
    """
    Tune an XGBClassifier with Optuna and persist it if it beats the recorded best.

    Runs 100 Optuna trials that maximise accuracy on a fixed 20% hold-out split
    (random_state=42). If the ticker has no recorded accuracy, or the best trial
    beats the recorded one, a model is refit on all of X/y with the best trial's
    sampled parameters, pickled, and the summary CSV and feature-importance CSV
    are written. Otherwise nothing is written.

    Args:
    X (pd.DataFrame): Feature matrix; its column names label the importances.
    y: Binary target aligned with X.
    gpu_available (bool): When True, trials train with device='cuda'.
    ticker_symbol (str): Ticker used in file names and as the row key.
    ticker_df (pd.DataFrame): Summary table with a 'Ticker_Symbol' column.
    csv_file_path (str): Where the summary table is saved.

    Returns:
    pd.DataFrame: The summary table. An existing row is updated in place on the
    passed object; a new object is returned when a row had to be appended.
    """
    score_column = 'Best_XGBClassifier_Classification_Accuracy'
    path_column = 'Best_XGBClassifier_Classification_Path'

    # The split is seeded, hence identical for every trial: compute it once
    # instead of once per trial.
    # NOTE(review): train_test_split shuffles rows by default. If the rows are
    # chronological, this mixes later rows into the training part — confirm
    # that this is intended.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    def objective(trial):
        """Train one candidate on the training part and return hold-out accuracy."""
        param = {
            'verbosity': 0,
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'tree_method': 'hist',
            'device': 'cuda' if gpu_available else 'cpu',
            # 'use_label_encoder' was dropped here: the final refit below never
            # passed it, and the `device` parameter already implies an XGBoost
            # version in which it is no longer used.
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
            'early_stopping_rounds': 50
        }

        model = XGBClassifier(**param)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
        preds = model.predict(X_valid)
        return accuracy_score(y_valid, preds)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)

    model_path = f'../models/xgboost/xbclassifier/{ticker_symbol}_best_model.pkl'

    # Decide first whether anything needs saving, so the refit is skipped
    # when the previously recorded model stays the best.
    ticker_known = ticker_symbol in ticker_df['Ticker_Symbol'].values
    if ticker_known:
        row_mask = ticker_df['Ticker_Symbol'] == ticker_symbol
        current_score = ticker_df.loc[row_mask, score_column].values[0]
        if not (pd.isnull(current_score) or study.best_value > current_score):
            print(f"Previous model accuracy: {current_score} is better for {ticker_symbol} than accuracy: {study.best_value}")
            return ticker_df

    # Refit on the full data with the sampled hyperparameters only; the fixed
    # settings used during the trials (device, early stopping, ...) are not part
    # of study.best_params, so this model uses the library defaults for them.
    best_model = XGBClassifier(**study.best_params)
    best_model.fit(X, y, eval_set=[(X, y)], verbose=False)

    # Update ticker_df
    if ticker_known:
        ticker_df.loc[row_mask, [score_column, path_column]] = [study.best_value, model_path]
    else:
        new_row = pd.DataFrame({'Ticker_Symbol': [ticker_symbol], score_column: [study.best_value], path_column: [model_path]})
        ticker_df = pd.concat([ticker_df, new_row], ignore_index=True)

    joblib.dump(best_model, model_path)
    ticker_df.to_csv(csv_file_path, index=False)
    print(f"Best model for {ticker_symbol} saved with accuracy: {study.best_value}")

    # Save feature importances
    feature_importances = pd.DataFrame(best_model.feature_importances_, index=X.columns, columns=['importance'])
    feature_importances = feature_importances.sort_values(by='importance', ascending=False)
    feature_importances.to_csv(f'../feature-importances/xbclassifier/{ticker_symbol}_feature_importances.csv')
    print(f"Feature importances for {ticker_symbol} saved.")

    return ticker_df


def xbregressor_hyperparameters_search(X, y, gpu_available, ticker_symbol, ticker_df, csv_file_path):
    """
    Tune an XGBRegressor with Optuna and persist it if it beats the recorded best.

    Runs 100 Optuna trials that minimise RMSE on a fixed 20% hold-out split
    (random_state=42). If the ticker has no recorded RMSE, or the best trial is
    lower than the recorded one, a model is refit on all of X/y with the best
    trial's sampled parameters, pickled under the xbregressor model folder, and
    the summary CSV and feature-importance CSV are written. Otherwise nothing is
    written.

    Args:
    X (pd.DataFrame): Feature matrix; its column names label the importances.
    y: Continuous target aligned with X.
    gpu_available (bool): When True, trials train with device='cuda'.
    ticker_symbol (str): Ticker used in file names and as the row key.
    ticker_df (pd.DataFrame): Summary table with a 'Ticker_Symbol' column.
    csv_file_path (str): Where the summary table is saved.

    Returns:
    pd.DataFrame: The summary table. An existing row is updated in place on the
    passed object; a new object is returned when a row had to be appended.
    """
    score_column = 'Best_XGBRegressor_Regression_RMSE'
    path_column = 'Best_XGBRegressor_Regression_Path'

    # The split is seeded, hence identical for every trial: compute it once
    # instead of once per trial.
    # NOTE(review): train_test_split shuffles rows by default. If the rows are
    # chronological, this mixes later rows into the training part — confirm
    # that this is intended.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    def objective(trial):
        """Train one candidate on the training part and return hold-out RMSE."""
        param = {
            'verbosity': 0,
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'tree_method': 'hist',
            'device': 'cuda' if gpu_available else 'cpu',
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
            'early_stopping_rounds': 50
        }

        model = XGBRegressor(**param)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
        preds = model.predict(X_valid)
        return root_mean_squared_error(y_valid, preds)

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=100)

    # Regressor models go into their own folder. Writing to the xbclassifier
    # folder with the same file name would overwrite the classifier pickle
    # saved for this ticker.
    model_path = f'../models/xgboost/xbregressor/{ticker_symbol}_best_model.pkl'

    # Decide first whether anything needs saving, so the refit is skipped
    # when the previously recorded model stays the best.
    ticker_known = ticker_symbol in ticker_df['Ticker_Symbol'].values
    if ticker_known:
        row_mask = ticker_df['Ticker_Symbol'] == ticker_symbol
        current_score = ticker_df.loc[row_mask, score_column].values[0]
        if not (pd.isnull(current_score) or study.best_value < current_score):
            print(f"Previous model RMSE: {current_score} is better for {ticker_symbol} than RMSE: {study.best_value}")
            return ticker_df

    # Refit on the full data with the sampled hyperparameters only; the fixed
    # settings used during the trials (device, early stopping, ...) are not part
    # of study.best_params, so this model uses the library defaults for them.
    best_model = XGBRegressor(**study.best_params)
    best_model.fit(X, y, eval_set=[(X, y)], verbose=False)

    # Update ticker_df
    if ticker_known:
        ticker_df.loc[row_mask, [score_column, path_column]] = [study.best_value, model_path]
    else:
        new_row = pd.DataFrame({'Ticker_Symbol': [ticker_symbol], score_column: [study.best_value], path_column: [model_path]})
        ticker_df = pd.concat([ticker_df, new_row], ignore_index=True)

    joblib.dump(best_model, model_path)
    ticker_df.to_csv(csv_file_path, index=False)
    print(f"Best model for {ticker_symbol} saved with RMSE: {study.best_value}")

    # Save feature importances
    feature_importances = pd.DataFrame(best_model.feature_importances_, index=X.columns, columns=['importance'])
    feature_importances = feature_importances.sort_values(by='importance', ascending=False)
    feature_importances.to_csv(f'../feature-importances/xbregressor/{ticker_symbol}_feature_importances.csv')
    print(f"Feature importances for {ticker_symbol} saved.")

    return ticker_df

In [6]:
# Driver: for every ticker found in the training folder, tune and (if better)
# save a classifier and then a regressor.
for ticker_symbol in ticker_list:
    dataframe = pd.read_csv(f"../data/train/{ticker_symbol}.csv")
    X, y_classifier, y_regressor = preprocess_data(dataframe)
    # ticker_df is reassigned after each call because the search functions may
    # return a new DataFrame object when they append a row for a new ticker.
    ticker_df = xbclassifier_hyperparameters_search(X, y_classifier, gpu_available, ticker_symbol, ticker_df, csv_file_path)
    ticker_df = xbregressor_hyperparameters_search(X, y_regressor, gpu_available, ticker_symbol, ticker_df, csv_file_path)


[I 2024-09-08 13:12:00,265] A new study created in memory with name: no-name-c86c43c2-c222-40ff-8055-eebf5e91b02e
[I 2024-09-08 13:12:00,755] Trial 0 finished with value: 0.5070422535211268 and parameters: {'n_estimators': 338, 'max_depth': 10, 'learning_rate': 0.16662137102816696, 'subsample': 0.7613491259293079, 'colsample_bytree': 0.6415892935807734, 'gamma': 3.9643134681883585, 'lambda': 1.5376939350309252e-08, 'alpha': 6.66776436315883e-07}. Best is trial 0 with value: 0.5070422535211268.
[I 2024-09-08 13:12:01,047] Trial 1 finished with value: 0.5070422535211268 and parameters: {'n_estimators': 311, 'max_depth': 5, 'learning_rate': 0.11859182361873374, 'subsample': 0.6277454366572328, 'colsample_bytree': 0.6932236135038277, 'gamma': 4.521968907825475, 'lambda': 3.339751204199491e-08, 'alpha': 3.27782904720509e-08}. Best is trial 0 with value: 0.5070422535211268.
[I 2024-09-08 13:12:01,344] Trial 2 finished with value: 0.5352112676056338 and parameters: {'n_estimators': 263, 'max_

Best model for CL=F saved with accuracy: 0.6197183098591549
Feature importances for CL=F saved.


[I 2024-09-08 13:12:43,820] Trial 0 finished with value: 1.6611487023055118 and parameters: {'n_estimators': 343, 'max_depth': 10, 'learning_rate': 0.16866903110142842, 'subsample': 0.5333226587216378, 'colsample_bytree': 0.8996816506879591, 'gamma': 1.3907612891272791, 'lambda': 0.03302806393426272, 'alpha': 4.78075963736231e-08}. Best is trial 0 with value: 1.6611487023055118.
[I 2024-09-08 13:12:44,245] Trial 1 finished with value: 1.6241483746118601 and parameters: {'n_estimators': 930, 'max_depth': 9, 'learning_rate': 0.0827500484946988, 'subsample': 0.5141209126931112, 'colsample_bytree': 0.7792511427016138, 'gamma': 2.313613788631889, 'lambda': 2.484525185289064e-06, 'alpha': 0.0006426461179965047}. Best is trial 1 with value: 1.6241483746118601.
[I 2024-09-08 13:12:44,795] Trial 2 finished with value: 1.6411228406437388 and parameters: {'n_estimators': 861, 'max_depth': 7, 'learning_rate': 0.043957265620633, 'subsample': 0.9231068649371075, 'colsample_bytree': 0.909830162152453

Best model for CL=F saved with RMSE: 1.6153158873300126
Feature importances for CL=F saved.


[I 2024-09-08 13:13:27,052] Trial 0 finished with value: 0.5774647887323944 and parameters: {'n_estimators': 293, 'max_depth': 5, 'learning_rate': 0.16160903613686534, 'subsample': 0.5339935822991497, 'colsample_bytree': 0.9878630261579233, 'gamma': 2.136318270704678, 'lambda': 0.001644177339532582, 'alpha': 1.1812633368768266e-08}. Best is trial 0 with value: 0.5774647887323944.
[I 2024-09-08 13:13:27,404] Trial 1 finished with value: 0.5633802816901409 and parameters: {'n_estimators': 775, 'max_depth': 3, 'learning_rate': 0.14272224721420318, 'subsample': 0.6527674387247777, 'colsample_bytree': 0.5303690106643391, 'gamma': 1.4097765465948953, 'lambda': 0.00010380723537184814, 'alpha': 9.955770905770583e-08}. Best is trial 0 with value: 0.5774647887323944.
[I 2024-09-08 13:13:27,787] Trial 2 finished with value: 0.5915492957746479 and parameters: {'n_estimators': 289, 'max_depth': 6, 'learning_rate': 0.17884918658646656, 'subsample': 0.5590991735585585, 'colsample_bytree': 0.564992121

Best model for NVDA saved with accuracy: 0.7183098591549296
Feature importances for NVDA saved.


[I 2024-09-08 13:14:13,970] Trial 0 finished with value: 1.6356088959767208 and parameters: {'n_estimators': 773, 'max_depth': 3, 'learning_rate': 0.06815192487775569, 'subsample': 0.5523608806464064, 'colsample_bytree': 0.5599961515621004, 'gamma': 2.4158334763469878, 'lambda': 0.7333341214973259, 'alpha': 2.2962666591336044e-06}. Best is trial 0 with value: 1.6356088959767208.
[I 2024-09-08 13:14:14,298] Trial 1 finished with value: 1.773440602410112 and parameters: {'n_estimators': 163, 'max_depth': 10, 'learning_rate': 0.2717839584540124, 'subsample': 0.9060432776210267, 'colsample_bytree': 0.7600473034702555, 'gamma': 3.3417985827242735, 'lambda': 0.004384151080477337, 'alpha': 0.012193350999031021}. Best is trial 0 with value: 1.6356088959767208.
[I 2024-09-08 13:14:14,681] Trial 2 finished with value: 1.6690831192041875 and parameters: {'n_estimators': 801, 'max_depth': 5, 'learning_rate': 0.027514499713965114, 'subsample': 0.7183027993163376, 'colsample_bytree': 0.5467466818096

Best model for NVDA saved with RMSE: 1.504827555721081
Feature importances for NVDA saved.


[I 2024-09-08 13:15:00,707] Trial 0 finished with value: 0.7972972972972973 and parameters: {'n_estimators': 940, 'max_depth': 10, 'learning_rate': 0.19812922784694828, 'subsample': 0.626901045753083, 'colsample_bytree': 0.6236407741116385, 'gamma': 1.8578840304468291, 'lambda': 1.3760961355390613e-07, 'alpha': 5.243741997584364e-08}. Best is trial 0 with value: 0.7972972972972973.
[I 2024-09-08 13:15:01,176] Trial 1 finished with value: 0.6756756756756757 and parameters: {'n_estimators': 297, 'max_depth': 7, 'learning_rate': 0.23231214280327028, 'subsample': 0.7405943363507572, 'colsample_bytree': 0.7509981454203156, 'gamma': 0.4937482328855297, 'lambda': 0.0008031495866039162, 'alpha': 0.0003796990632744085}. Best is trial 0 with value: 0.7972972972972973.
[I 2024-09-08 13:15:02,171] Trial 2 finished with value: 0.7297297297297297 and parameters: {'n_estimators': 741, 'max_depth': 9, 'learning_rate': 0.044672686199077695, 'subsample': 0.9237019644326276, 'colsample_bytree': 0.8099269

Best model for SGDUSD=X saved with accuracy: 0.7972972972972973
Feature importances for SGDUSD=X saved.


[I 2024-09-08 13:16:11,818] Trial 0 finished with value: 0.002156658817388229 and parameters: {'n_estimators': 843, 'max_depth': 7, 'learning_rate': 0.019331166835260122, 'subsample': 0.6509701454840875, 'colsample_bytree': 0.6353011018427672, 'gamma': 3.2083697413717527, 'lambda': 0.00021079218667732826, 'alpha': 0.0008800985624130733}. Best is trial 0 with value: 0.002156658817388229.
[I 2024-09-08 13:16:12,161] Trial 1 finished with value: 0.0021566588205090753 and parameters: {'n_estimators': 957, 'max_depth': 9, 'learning_rate': 0.21153693689240027, 'subsample': 0.8715288856664664, 'colsample_bytree': 0.5030210717762882, 'gamma': 1.3804392202262095, 'lambda': 1.8995587662022384e-05, 'alpha': 2.5708700782214554e-06}. Best is trial 0 with value: 0.002156658817388229.
[I 2024-09-08 13:16:12,491] Trial 2 finished with value: 0.0021566592735374353 and parameters: {'n_estimators': 954, 'max_depth': 8, 'learning_rate': 0.12934837976334104, 'subsample': 0.8065492821431499, 'colsample_bytr

Best model for SGDUSD=X saved with RMSE: 0.002156658817138088
Feature importances for SGDUSD=X saved.


[I 2024-09-08 13:17:04,237] Trial 0 finished with value: 0.7567567567567568 and parameters: {'n_estimators': 661, 'max_depth': 5, 'learning_rate': 0.08844247104235561, 'subsample': 0.9134175704026055, 'colsample_bytree': 0.5456459000804282, 'gamma': 1.3380619940263472, 'lambda': 0.7688827140400668, 'alpha': 0.05739620609245347}. Best is trial 0 with value: 0.7567567567567568.
[I 2024-09-08 13:17:04,818] Trial 1 finished with value: 0.7297297297297297 and parameters: {'n_estimators': 411, 'max_depth': 10, 'learning_rate': 0.258940744138602, 'subsample': 0.9182588288382294, 'colsample_bytree': 0.6031335102844704, 'gamma': 3.2953776435882625, 'lambda': 1.488203410273113e-06, 'alpha': 0.004613687597473981}. Best is trial 0 with value: 0.7567567567567568.
[I 2024-09-08 13:17:05,370] Trial 2 finished with value: 0.7162162162162162 and parameters: {'n_estimators': 113, 'max_depth': 3, 'learning_rate': 0.07742312662441474, 'subsample': 0.5440700692167217, 'colsample_bytree': 0.7309112466345603

Best model for USDSGD=X saved with accuracy: 0.7972972972972973
Feature importances for USDSGD=X saved.


[I 2024-09-08 13:18:32,192] Trial 0 finished with value: 0.0038940524673559723 and parameters: {'n_estimators': 489, 'max_depth': 7, 'learning_rate': 0.01338796616483856, 'subsample': 0.7624276446082241, 'colsample_bytree': 0.98543070229776, 'gamma': 0.2759133686285409, 'lambda': 1.841180900207308e-06, 'alpha': 7.412092125151189e-08}. Best is trial 0 with value: 0.0038940524673559723.
[I 2024-09-08 13:18:32,724] Trial 1 finished with value: 0.003894052480293797 and parameters: {'n_estimators': 739, 'max_depth': 3, 'learning_rate': 0.09830816641109773, 'subsample': 0.5039788585469591, 'colsample_bytree': 0.5171264321363547, 'gamma': 4.55637712751048, 'lambda': 1.9335745953655496e-08, 'alpha': 0.019790126673378015}. Best is trial 0 with value: 0.0038940524673559723.
[I 2024-09-08 13:18:33,038] Trial 2 finished with value: 0.003894052877438334 and parameters: {'n_estimators': 771, 'max_depth': 10, 'learning_rate': 0.12565264536688714, 'subsample': 0.6676950419809496, 'colsample_bytree': 0

Best model for USDSGD=X saved with RMSE: 0.0038940524672766872
Feature importances for USDSGD=X saved.
