In [1]:
import os
import numpy as np
import pandas as pd
import xgboost

import optuna
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, root_mean_squared_error 
import joblib

# os.cpu_count() returns None when the core count cannot be determined;
# fall back to one core so the integer division below never receives None.
logical_cores = os.cpu_count() or 1
print(f"Number of logical CPU cores: {logical_cores}")

# Use half of the logical cores, but never fewer than one worker.
num_workers = max(1, logical_cores // 2)
print(f"Number of workers set to: {num_workers}")

def is_gpu_available():
    """
    Report whether PyTorch can see a CUDA device.

    Returns:
    bool: torch.cuda.is_available() when torch can be imported,
    False when importing it raises ImportError.
    """
    try:
        # Imported lazily so the notebook still runs without torch installed.
        from torch import cuda
        has_cuda = cuda.is_available()
    except ImportError:
        has_cuda = False
    return has_cuda

# torch only tells us that a CUDA device is visible. The flag is later used to
# pick XGBoost's `device`, so also require that this XGBoost build was compiled
# with CUDA support; otherwise fall back to CPU training.
xgboost_build = xgboost.build_info()
gpu_available = is_gpu_available() and bool(xgboost_build.get('USE_CUDA', False))
print(f"GPU available: {gpu_available}")

print(xgboost_build)

Number of logical CPU cores: 16
Number of workers set to: 8
GPU available: True
{'BUILTIN_PREFETCH_PRESENT': False, 'CUDA_VERSION': [12, 4], 'DEBUG': False, 'MM_PREFETCH_PRESENT': True, 'THRUST_VERSION': [2, 3, 2], 'USE_CUDA': True, 'USE_DLOPEN_NCCL': False, 'USE_FEDERATED': False, 'USE_NCCL': False, 'USE_OPENMP': True, 'USE_RMM': False, 'libxgboost': 'C:\\Users\\ng_mi\\Anaconda\\envs\\pytorch-gpu\\Lib\\site-packages\\xgboost\\lib\\xgboost.dll'}


In [2]:
def load_or_create_ticker_df(csv_file_path):
    """
    Load the existing ticker DataFrame from a CSV file if it exists,
    otherwise create a new, empty DataFrame with predefined column types.
    In both cases the result contains every predefined column and has
    'Ticker_Symbol' first, followed by all other columns in alphabetical order.

    When loading, text columns (ticker symbol and model paths) are read as
    strings so that numeric-looking tickers such as "0700" are not turned into
    integers, and the historical misspelling
    'Best_TransformerClassification_Path' is renamed to
    'Best_Transformer_Classification_Path'. Columns in the file that are not
    predefined are kept.

    Args:
    csv_file_path (str): The path to the CSV file.

    Returns:
    pd.DataFrame: The loaded or newly created DataFrame.
    """
    # Define the column types
    column_types = {
        "Ticker_Symbol": str,
        "Best_Cov1D_Classification_Accuracy": float,
        "Best_Cov1D_Classification_Path": str,
        "Best_Cov1D_Regression_RMSE": float,
        "Best_Cov1D_Regression_Path": str,
        "Best_Hybrid_Cov1D_LSTM_Classification_Accuracy": float,
        "Best_Hybrid_Cov1D_LSTM_Classification_Path": str,
        "Best_Hybrid_Cov1D_LSTM_Regression_RMSE": float,
        "Best_Hybrid_Cov1D_LSTM_Regression_Path": str,
        "Best_Hybrid_Cov1D_Transformer_Classification_Accuracy": float,
        "Best_Hybrid_Cov1D_Transformer_Classification_Path": str,
        "Best_Hybrid_Cov1D_Transformer_Regression_RMSE": float,
        "Best_Hybrid_Cov1D_Transformer_Regression_Path": str,
        "Best_LSTM_Classification_Accuracy": float,
        "Best_LSTM_Classification_Path": str,
        "Best_LSTM_Regression_RMSE": float,
        "Best_LSTM_Regression_Path": str,
        "Best_Transformer_Classification_Accuracy": float,
        "Best_Transformer_Classification_Path": str,
        "Best_Transformer_Regression_RMSE": float,
        "Best_Transformer_Regression_Path": str,
        "Best_XGBClassifier_Classification_Accuracy": float,
        "Best_XGBClassifier_Classification_Path": str,
        "Best_XGBRegressor_Regression_RMSE": float,
        "Best_XGBRegressor_Regression_Path": str
    }

    # Earlier versions of this table misspelled one column (missing underscore).
    legacy_column_names = {
        "Best_TransformerClassification_Path": "Best_Transformer_Classification_Path",
    }

    if os.path.isfile(csv_file_path):
        # Read text columns as strings; without this, pandas infers ints for
        # numeric-looking tickers and floats for all-empty path columns.
        text_columns = {name: str for name, kind in column_types.items() if kind is str}
        text_columns.update({old_name: str for old_name in legacy_column_names})
        ticker_df = pd.read_csv(csv_file_path, dtype=text_columns)

        # Migrate the misspelled column unless the correct one already exists.
        renames = {
            old_name: new_name
            for old_name, new_name in legacy_column_names.items()
            if old_name in ticker_df.columns and new_name not in ticker_df.columns
        }
        if renames:
            ticker_df = ticker_df.rename(columns=renames)

        # Ensure all specified columns are present
        for column, dtype in column_types.items():
            if column not in ticker_df.columns:
                ticker_df[column] = pd.Series(dtype=dtype)
    else:
        # Create a new DataFrame with the specified column types
        ticker_df = pd.DataFrame(columns=list(column_types)).astype(column_types)

    # 'Ticker_Symbol' first, everything else alphabetically (both branches).
    columns = ["Ticker_Symbol"] + sorted(col for col in ticker_df.columns if col != "Ticker_Symbol")
    return ticker_df[columns]

# CSV that records, per ticker, the best score and saved-model path of each model type.
csv_file_path = "../ticker-best-model.csv"
# Load the existing table, or start an empty one if the file does not exist yet.
ticker_df = load_or_create_ticker_df(csv_file_path)

In [3]:
# Output folders for feature-importance CSVs and pickled models; created when absent.
for output_dir in (
    '../feature-importances/xbclassifier',
    '../feature-importances/xbregressor',
    '../models/xgboost/xbclassifier',
    '../models/xgboost/xbregressor',
):
    os.makedirs(output_dir, exist_ok=True)

path = '../data'

# One ticker per CSV file in the data folder: the ticker is the file name
# without its extension. The list stays empty when the folder does not exist.
ticker_list = []
if os.path.exists(path):
    for file_name in os.listdir(path):
        if file_name.endswith('.csv'):
            ticker_list.append(os.path.splitext(file_name)[0])


In [4]:
def preprocess_data(df):
    """
    Split a raw per-ticker frame into a feature matrix and two targets.

    Rows containing NaN or +/-infinity in any column are discarded first.

    Args:
    df (pd.DataFrame): Frame holding the feature columns plus the price/target
        columns listed in `non_feature_columns` below.

    Returns:
    tuple: (X, y_classifier, y_regressor) where X is `df` without the
    non-feature columns, y_classifier is 1 where 'DAILY_CLOSEPRICE_CHANGE' > 0
    and 0 otherwise, and y_regressor is 'DAILY_CLOSEPRICE_CHANGE' itself.
    """
    infinities = [float('inf'), float('-inf')]

    # Infinite values are turned into NaN so a single dropna() removes both kinds.
    needs_cleaning = df.isna().any().any() or df.isin(infinities).any().any()
    if needs_cleaning:
        df = df.replace(infinities, float('nan'))
    cleaned = df.dropna()

    # Targets, price-derived columns and the date are not model inputs.
    non_feature_columns = [
        'NEXT_DAY_CLOSEPRICE', 'DAILY_CLOSEPRICE_CHANGE', 'CLOSEPRICE_DIRECTION',
        'DAILY_MIDPRICE', 'NEXT_DAY_MIDPRICE', 'DAILY_MIDPRICE_CHANGE', 'MIDPRICE_DIRECTION', 'Date'
    ]
    close_change = cleaned['DAILY_CLOSEPRICE_CHANGE']

    features = cleaned.drop(columns=non_feature_columns)
    direction = (close_change > 0).astype(int)

    return features, direction, close_change

In [5]:
def process_xbclassifier(X, y, gpu_available, ticker, ticker_df, csv_file_path):
    """
    Tune an XGBClassifier with Optuna, refit the best configuration on all rows,
    and persist it if it beats the accuracy already recorded for this ticker.

    A new best model is pickled with joblib, its feature importances are
    written to CSV, and the best-model table is saved to `csv_file_path`.
    Nothing is written when the previously recorded accuracy is at least as good.

    Args:
    X (pd.DataFrame): Feature matrix; its column names label the saved importances.
    y (pd.Series): Binary target for the rows of X.
    gpu_available (bool): Train on 'cuda' when True, otherwise on 'cpu'.
    ticker (str): Ticker symbol, used for the table row and the output file names.
    ticker_df (pd.DataFrame): Best-model table with a 'Ticker_Symbol' column.
    csv_file_path (str): Where the updated table is written.

    Returns:
    pd.DataFrame: The table, updated in place for a known ticker or extended
    with a new row for an unseen one.
    """
    score_column = 'Best_XGBClassifier_Classification_Accuracy'
    path_column = 'Best_XGBClassifier_Classification_Path'

    # Settings shared by every trial AND the final refit. Optuna's best_params
    # only contains the suggested values, so these must be re-applied explicitly;
    # otherwise the final model silently falls back to library defaults (e.g. CPU).
    # 'use_label_encoder' is intentionally not passed: the `device` parameter
    # requires XGBoost 2.x, where that argument no longer exists.
    fixed_params = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'tree_method': 'hist',
        'device': 'cuda' if gpu_available else 'cpu',
    }

    # The split uses a fixed seed, so it is identical for every trial; do it once.
    # NOTE(review): train_test_split shuffles rows by default. If the rows are in
    # chronological order this mixes later days into the training set - confirm
    # that is intended.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    def objective(trial):
        param = {
            **fixed_params,
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
            # Only meaningful during tuning, where a held-out set exists.
            'early_stopping_rounds': 50,
        }

        model = XGBClassifier(**param)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
        preds = model.predict(X_valid)
        return accuracy_score(y_valid, preds)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)

    # Refit on every row with the tuned values plus the fixed settings.
    # Early stopping is left out: the only eval set here is the training data.
    best_model = XGBClassifier(**fixed_params, **study.best_params)
    best_model.fit(X, y, eval_set=[(X, y)], verbose=False)

    model_path = f'../models/xgboost/xbclassifier/{ticker}_best_model.pkl'

    # Update ticker_df
    ticker_rows = ticker_df['Ticker_Symbol'] == ticker
    if ticker_rows.any():
        current_score = ticker_df.loc[ticker_rows, score_column].values[0]
        if not (pd.isnull(current_score) or study.best_value > current_score):
            print(f"Previous model accuracy: {current_score} is better for {ticker} than accuracy: {study.best_value}")
            return ticker_df
        ticker_df.loc[ticker_rows, [score_column, path_column]] = [study.best_value, model_path]
    else:
        new_row = pd.DataFrame({'Ticker_Symbol': [ticker], score_column: [study.best_value], path_column: [model_path]})
        ticker_df = pd.concat([ticker_df, new_row], ignore_index=True)

    # Reached only when this run produced the best model seen so far.
    joblib.dump(best_model, model_path)
    ticker_df.to_csv(csv_file_path, index=False)
    print(f"Best model for {ticker} saved with accuracy: {study.best_value}")

    # Save feature importances
    feature_importances = pd.DataFrame(best_model.feature_importances_, index=X.columns, columns=['importance'])
    feature_importances = feature_importances.sort_values(by='importance', ascending=False)
    feature_importances.to_csv(f'../feature-importances/xbclassifier/{ticker}_feature_importances.csv')
    print(f"Feature importances for {ticker} saved.")

    return ticker_df


def process_xbregressor(X, y, gpu_available, ticker, ticker_df, csv_file_path):
    """
    Tune an XGBRegressor with Optuna, refit the best configuration on all rows,
    and persist it if it beats the RMSE already recorded for this ticker.

    A new best model is pickled with joblib into the regressor model folder,
    its feature importances are written to CSV, and the best-model table is
    saved to `csv_file_path`. Nothing is written when the previously recorded
    RMSE is at least as low.

    Args:
    X (pd.DataFrame): Feature matrix; its column names label the saved importances.
    y (pd.Series): Numeric target for the rows of X.
    gpu_available (bool): Train on 'cuda' when True, otherwise on 'cpu'.
    ticker (str): Ticker symbol, used for the table row and the output file names.
    ticker_df (pd.DataFrame): Best-model table with a 'Ticker_Symbol' column.
    csv_file_path (str): Where the updated table is written.

    Returns:
    pd.DataFrame: The table, updated in place for a known ticker or extended
    with a new row for an unseen one.
    """
    score_column = 'Best_XGBRegressor_Regression_RMSE'
    path_column = 'Best_XGBRegressor_Regression_Path'

    # Settings shared by every trial AND the final refit. Optuna's best_params
    # only contains the suggested values, so these must be re-applied explicitly;
    # otherwise the final model silently falls back to library defaults (e.g. CPU).
    fixed_params = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'hist',
        'device': 'cuda' if gpu_available else 'cpu',
    }

    # The split uses a fixed seed, so it is identical for every trial; do it once.
    # NOTE(review): train_test_split shuffles rows by default. If the rows are in
    # chronological order this mixes later days into the training set - confirm
    # that is intended.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    def objective(trial):
        param = {
            **fixed_params,
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
            # Only meaningful during tuning, where a held-out set exists.
            'early_stopping_rounds': 50,
        }

        model = XGBRegressor(**param)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
        preds = model.predict(X_valid)
        return root_mean_squared_error(y_valid, preds)

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=100)

    # Refit on every row with the tuned values plus the fixed settings.
    # Early stopping is left out: the only eval set here is the training data.
    best_model = XGBRegressor(**fixed_params, **study.best_params)
    best_model.fit(X, y, eval_set=[(X, y)], verbose=False)

    # Regressor models live in their own folder. Writing to the classifier
    # folder with the same file name would overwrite that ticker's classifier.
    model_path = f'../models/xgboost/xbregressor/{ticker}_best_model.pkl'

    # Update ticker_df
    ticker_rows = ticker_df['Ticker_Symbol'] == ticker
    if ticker_rows.any():
        current_score = ticker_df.loc[ticker_rows, score_column].values[0]
        if not (pd.isnull(current_score) or study.best_value < current_score):
            print(f"Previous model RMSE: {current_score} is better for {ticker} than RMSE: {study.best_value}")
            return ticker_df
        ticker_df.loc[ticker_rows, [score_column, path_column]] = [study.best_value, model_path]
    else:
        new_row = pd.DataFrame({'Ticker_Symbol': [ticker], score_column: [study.best_value], path_column: [model_path]})
        ticker_df = pd.concat([ticker_df, new_row], ignore_index=True)

    # Reached only when this run produced the best model seen so far.
    joblib.dump(best_model, model_path)
    ticker_df.to_csv(csv_file_path, index=False)
    print(f"Best model for {ticker} saved with RMSE: {study.best_value}")

    # Save feature importances
    feature_importances = pd.DataFrame(best_model.feature_importances_, index=X.columns, columns=['importance'])
    feature_importances = feature_importances.sort_values(by='importance', ascending=False)
    feature_importances.to_csv(f'../feature-importances/xbregressor/{ticker}_feature_importances.csv')
    print(f"Feature importances for {ticker} saved.")

    return ticker_df

In [6]:
# For every ticker: load its CSV, build features/targets, then tune the
# classifier followed by the regressor. Each step returns the updated
# best-model table, which is fed into the next one.
for ticker in ticker_list:
    dataframe = pd.read_csv(f"../data/{ticker}.csv")
    X, y_classifier, y_regressor = preprocess_data(dataframe)
    for trainer, target in (
        (process_xbclassifier, y_classifier),
        (process_xbregressor, y_regressor),
    ):
        ticker_df = trainer(X, target, gpu_available, ticker, ticker_df, csv_file_path)


[I 2024-09-06 15:56:22,439] A new study created in memory with name: no-name-291c4777-c645-4d53-8822-a890c1d4fcfc
[I 2024-09-06 15:56:22,855] Trial 0 finished with value: 0.42424242424242425 and parameters: {'n_estimators': 717, 'max_depth': 8, 'learning_rate': 0.2578726919794994, 'subsample': 0.6616908476372866, 'colsample_bytree': 0.504396762137086, 'gamma': 4.368638474884528, 'lambda': 2.624904758385946e-07, 'alpha': 2.021248677536832e-08}. Best is trial 0 with value: 0.42424242424242425.
[I 2024-09-06 15:56:23,163] Trial 1 finished with value: 0.3333333333333333 and parameters: {'n_estimators': 510, 'max_depth': 10, 'learning_rate': 0.2148062510088084, 'subsample': 0.6817039342416323, 'colsample_bytree': 0.657002729169024, 'gamma': 3.2899550453784436, 'lambda': 0.00012836752142067598, 'alpha': 6.107055806025914e-06}. Best is trial 0 with value: 0.42424242424242425.
[I 2024-09-06 15:56:23,572] Trial 2 finished with value: 0.5757575757575758 and parameters: {'n_estimators': 476, 'max

Best model for CL=F saved with accuracy: 0.7272727272727273
Feature importances for CL=F saved.


[I 2024-09-06 15:57:02,478] Trial 0 finished with value: 1.4204850240965485 and parameters: {'n_estimators': 764, 'max_depth': 10, 'learning_rate': 0.21671868331234706, 'subsample': 0.846439949668232, 'colsample_bytree': 0.6338814872090506, 'gamma': 4.143933436641692, 'lambda': 0.0003556429088955887, 'alpha': 3.029927180774255e-05}. Best is trial 0 with value: 1.4204850240965485.
[I 2024-09-06 15:57:02,847] Trial 1 finished with value: 1.3778397065456849 and parameters: {'n_estimators': 559, 'max_depth': 6, 'learning_rate': 0.1087570387815808, 'subsample': 0.6318591033503643, 'colsample_bytree': 0.6355828774283734, 'gamma': 3.040557364355423, 'lambda': 0.40893013826459534, 'alpha': 0.0017314605094636215}. Best is trial 1 with value: 1.3778397065456849.
[I 2024-09-06 15:57:03,209] Trial 2 finished with value: 1.3626614140718416 and parameters: {'n_estimators': 454, 'max_depth': 7, 'learning_rate': 0.08785036520384482, 'subsample': 0.6891393370405621, 'colsample_bytree': 0.58461681824564

Best model for CL=F saved with RMSE: 1.2381394242601436
Feature importances for CL=F saved.


[I 2024-09-06 15:57:42,469] Trial 0 finished with value: 0.5454545454545454 and parameters: {'n_estimators': 735, 'max_depth': 5, 'learning_rate': 0.2044479026867728, 'subsample': 0.8398413353965347, 'colsample_bytree': 0.9054848057172518, 'gamma': 3.363646129233185, 'lambda': 0.0027294046468829512, 'alpha': 0.9534639419896829}. Best is trial 0 with value: 0.5454545454545454.
[I 2024-09-06 15:57:42,771] Trial 1 finished with value: 0.5454545454545454 and parameters: {'n_estimators': 252, 'max_depth': 10, 'learning_rate': 0.26913068134358126, 'subsample': 0.852738877642929, 'colsample_bytree': 0.6957464063565572, 'gamma': 3.853643558615522, 'lambda': 1.7169968487832325e-08, 'alpha': 6.845675662553001e-06}. Best is trial 0 with value: 0.5454545454545454.
[I 2024-09-06 15:57:43,103] Trial 2 finished with value: 0.5454545454545454 and parameters: {'n_estimators': 515, 'max_depth': 10, 'learning_rate': 0.15137121297670503, 'subsample': 0.668434482371661, 'colsample_bytree': 0.57599680624896

Best model for NVDA saved with accuracy: 0.7272727272727273
Feature importances for NVDA saved.


[I 2024-09-06 15:58:22,856] Trial 0 finished with value: 4.465373603611204 and parameters: {'n_estimators': 669, 'max_depth': 7, 'learning_rate': 0.2895225324062545, 'subsample': 0.6570010091426481, 'colsample_bytree': 0.5591351592885463, 'gamma': 3.6329951744235442, 'lambda': 3.369998895023631e-05, 'alpha': 0.0014356952540480173}. Best is trial 0 with value: 4.465373603611204.
[I 2024-09-06 15:58:23,235] Trial 1 finished with value: 4.255805219044707 and parameters: {'n_estimators': 590, 'max_depth': 4, 'learning_rate': 0.18433361247618502, 'subsample': 0.7874039118780009, 'colsample_bytree': 0.8263434140680599, 'gamma': 1.4399807800432451, 'lambda': 0.002013943991164303, 'alpha': 0.02058926288374654}. Best is trial 1 with value: 4.255805219044707.
[I 2024-09-06 15:58:23,755] Trial 2 finished with value: 4.207241660799216 and parameters: {'n_estimators': 817, 'max_depth': 6, 'learning_rate': 0.12908132321386293, 'subsample': 0.5423432642433226, 'colsample_bytree': 0.9920306300888331, 

Best model for NVDA saved with RMSE: 4.061127541876111
Feature importances for NVDA saved.


[I 2024-09-06 15:59:09,411] Trial 0 finished with value: 0.6285714285714286 and parameters: {'n_estimators': 989, 'max_depth': 8, 'learning_rate': 0.03338192754890154, 'subsample': 0.7260908820959641, 'colsample_bytree': 0.6320755225704444, 'gamma': 4.187928452236503, 'lambda': 0.00011957815051716655, 'alpha': 0.25726572549547627}. Best is trial 0 with value: 0.6285714285714286.
[I 2024-09-06 15:59:09,794] Trial 1 finished with value: 0.42857142857142855 and parameters: {'n_estimators': 824, 'max_depth': 7, 'learning_rate': 0.1871988457977059, 'subsample': 0.5625350822879682, 'colsample_bytree': 0.5582635967097178, 'gamma': 0.8867125775921247, 'lambda': 7.185310865682394e-08, 'alpha': 0.0016741199038118784}. Best is trial 0 with value: 0.6285714285714286.
[I 2024-09-06 15:59:10,235] Trial 2 finished with value: 0.4 and parameters: {'n_estimators': 772, 'max_depth': 5, 'learning_rate': 0.11940057875374145, 'subsample': 0.5693790564122216, 'colsample_bytree': 0.9385525723006605, 'gamma':

Best model for SGDUSD=X saved with accuracy: 0.6857142857142857
Feature importances for SGDUSD=X saved.


[I 2024-09-06 15:59:55,144] Trial 0 finished with value: 0.0015893538979231923 and parameters: {'n_estimators': 339, 'max_depth': 9, 'learning_rate': 0.26109013031481215, 'subsample': 0.5025872403824398, 'colsample_bytree': 0.9908188589855036, 'gamma': 3.7970858880375675, 'lambda': 0.00665578662154125, 'alpha': 2.7079299235192305e-06}. Best is trial 0 with value: 0.0015893538979231923.
[I 2024-09-06 15:59:55,639] Trial 1 finished with value: 0.0015955198182957138 and parameters: {'n_estimators': 547, 'max_depth': 3, 'learning_rate': 0.12122720337226232, 'subsample': 0.8708086805498065, 'colsample_bytree': 0.6029220113447984, 'gamma': 1.516808522166656, 'lambda': 0.021231112580900997, 'alpha': 0.6721232610450881}. Best is trial 0 with value: 0.0015893538979231923.
[I 2024-09-06 15:59:55,989] Trial 2 finished with value: 0.001593198382365695 and parameters: {'n_estimators': 122, 'max_depth': 6, 'learning_rate': 0.15734811012735195, 'subsample': 0.8964910721183539, 'colsample_bytree': 0.7

Best model for SGDUSD=X saved with RMSE: 0.0015891294353580225
Feature importances for SGDUSD=X saved.


[I 2024-09-06 16:00:40,301] Trial 0 finished with value: 0.6285714285714286 and parameters: {'n_estimators': 575, 'max_depth': 3, 'learning_rate': 0.03493803704937833, 'subsample': 0.8250687451879282, 'colsample_bytree': 0.6128218446061977, 'gamma': 1.3403723469465993, 'lambda': 0.0003108922218504458, 'alpha': 0.6714383142861762}. Best is trial 0 with value: 0.6285714285714286.
[I 2024-09-06 16:00:40,696] Trial 1 finished with value: 0.5714285714285714 and parameters: {'n_estimators': 827, 'max_depth': 9, 'learning_rate': 0.11199612148842084, 'subsample': 0.5783990525795097, 'colsample_bytree': 0.6124589920769534, 'gamma': 2.996110847098133, 'lambda': 1.7228643035047987e-08, 'alpha': 0.06978703087783199}. Best is trial 0 with value: 0.6285714285714286.
[I 2024-09-06 16:00:41,091] Trial 2 finished with value: 0.4857142857142857 and parameters: {'n_estimators': 151, 'max_depth': 4, 'learning_rate': 0.11595714905864236, 'subsample': 0.591038902938311, 'colsample_bytree': 0.568642674881029

Best model for USDSGD=X saved with accuracy: 0.7142857142857143
Feature importances for USDSGD=X saved.


[I 2024-09-06 16:01:28,328] Trial 0 finished with value: 0.0028392896750757415 and parameters: {'n_estimators': 466, 'max_depth': 4, 'learning_rate': 0.18333132017368928, 'subsample': 0.7777660449067034, 'colsample_bytree': 0.9321237553530723, 'gamma': 1.7054193594847984, 'lambda': 1.3588815107122792e-06, 'alpha': 0.08158746721570494}. Best is trial 0 with value: 0.0028392896750757415.
[I 2024-09-06 16:01:28,617] Trial 1 finished with value: 0.002835855418514503 and parameters: {'n_estimators': 907, 'max_depth': 9, 'learning_rate': 0.15958547811975504, 'subsample': 0.8022998468837362, 'colsample_bytree': 0.6276329928581376, 'gamma': 3.294840161464678, 'lambda': 0.0003744538790496707, 'alpha': 0.006318089786709104}. Best is trial 1 with value: 0.002835855418514503.
[I 2024-09-06 16:01:28,912] Trial 2 finished with value: 0.0028302766260399656 and parameters: {'n_estimators': 311, 'max_depth': 9, 'learning_rate': 0.13734985776069264, 'subsample': 0.6354588989509005, 'colsample_bytree': 0

Best model for USDSGD=X saved with RMSE: 0.002828493019121821
Feature importances for USDSGD=X saved.
