In [1]:
%%time 

# General library imports:-
from IPython.display import display_html, clear_output, Markdown;
from gc import collect;

from copy import deepcopy;
import pandas as pd;
import numpy as np;
import joblib;
from os import system, getpid, walk;
from psutil import Process;
import ctypes;
libc = ctypes.CDLL("libc.so.6");

from pprint import pprint;
from colorama import Fore, Style, init;
from warnings import filterwarnings;
filterwarnings('ignore');

from tqdm.notebook import tqdm;

print();
collect();


CPU times: user 473 ms, sys: 64.1 ms, total: 537 ms
Wall time: 610 ms


In [2]:
%%time 

# Model development:-
from sklearn.model_selection import (RepeatedStratifiedKFold as RSKF, 
                                     StratifiedKFold as SKF,
                                     KFold, 
                                     RepeatedKFold as RKF, 
                                     cross_val_score);

from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR;
from xgboost import XGBRegressor as XGBR;
from catboost import CatBoostRegressor as CBR;
from sklearn.ensemble import HistGradientBoostingRegressor as HGBR;
from sklearn.metrics import mean_absolute_error as mae, make_scorer;

print();
collect();


CPU times: user 1.67 s, sys: 507 ms, total: 2.17 s
Wall time: 2.85 s


In [3]:
%%time

# Defining global configurations and functions:-

# Color printing    
def PrintColor(text:str, color = Fore.BLUE, style = Style.BRIGHT):
    """
    Print `text` to stdout preceded by the given colorama style and colour codes
    and followed by a style reset, so later output is unaffected.

    Inputs-
    text:- str -- message to print
    color:- colorama foreground code, bright blue text by default
    style:- colorama style code
    """
    pieces = (style, color, text, Style.RESET_ALL)
    print("".join(pieces))
    
def GetMemUsage():
    """
    Report the memory currently held by this kernel process.

    Looks up the running process by its pid, takes the first field of its
    memory_info() (a byte count) and converts it to GB by dividing by 2**30.

    Source-
    https://stackoverflow.com/questions/61366458/how-to-find-memory-usage-of-kaggle-notebook

    Returns-
    str -- "RAM memory GB usage = <value>" with the value shown to 4 significant digits
    """
    this_process = Process(getpid())
    gigabytes    = this_process.memory_info()[0] / 2. ** 30
    return f"RAM memory GB usage = {gigabytes :.4}"

# Making sklearn pipeline outputs as dataframe:-
from sklearn import set_config; 
set_config(transform_output = "pandas");
pd.set_option('display.max_columns', 50);
pd.set_option('display.max_rows', 50);

print();
collect();


CPU times: user 109 ms, sys: 186 µs, total: 109 ms
Wall time: 108 ms


In [4]:
%%time 

# Configuration class:-
class CFG:
    """
    Notebook-wide settings for data loading, model training, ensembling and inference.
    Switch-like options are compared against upper-case strings ("Y"/"N", "ON"/"OFF"),
    so please fill them in capital letters.
    """

    # Data preparation:-
    version_nb     = 5          # version tag embedded in saved / loaded model file names
    test_req       = "N"        # "Y" -> train on a sample and print matched model labels at inference
    test_frac      = 0.01       # float -> fraction of rows sampled, otherwise an absolute row count
    load_tr_data   = "N"        # "Y" -> load the training data
    gpu_switch     = "OFF"      # "ON" -> GPU variants of the boosted-tree models
    state          = 42         # random seed shared by CV splitters and models
    target         = 'target'

    path           = "/kaggle/input/optiver-memoryreduceddatasets/"
    test_path      = "/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv"
    df_choice      = "XTrIntCmpNewFtre.parquet"
    mdl_path       = '/kaggle/working/BaselineML/'             # models written by training
    inf_path       = '/kaggle/input/optiverbaselinemodels/'    # models read when ML != "Y"

    # Model Training:-
    methods        = ["LGBMR", "CBR", "HGBR"]   # model labels that are trained and blended
    ML             = "N"                        # "Y" -> run model training
    n_splits       = 5
    n_repeats      = 1
    nbrnd_erly_stp = 100                        # early-stopping rounds
    mdlcv_mthd     = 'KF'                       # key into the all_cv dictionary

    # Ensemble:-
    ensemble_req   = "N"
    enscv_mthd     = "KF"
    metric_obj     = 'minimize'
    ntrials        = 10 if test_req == "Y" else 200
    ens_weights    = [0.54, 0.44, 0.02]         # blend weights, in the same order as `methods`

    # Inference:-
    inference_req  = "Y"                        # "Y" -> run the inference cells

    # Global variables for plotting:-
    grid_specs = {
        'visible'   : True,
        'which'     : 'both',
        'linestyle' : '--',
        'color'     : 'lightgrey',
        'linewidth' : 0.75,
    }
    title_specs = {'fontsize': 9, 'fontweight': 'bold', 'color': 'tab:blue'}

print();
PrintColor(f"--> Configuration done!\n");
collect();

PrintColor(f"\n" + GetMemUsage(), color = Fore.RED);


[1m[34m--> Configuration done!
[0m
[1m[31m
RAM memory GB usage = 0.2539[0m
CPU times: user 106 ms, sys: 2.17 ms, total: 108 ms
Wall time: 106 ms


In [5]:
%%time 

# Lookup of ready-made CV splitters, keyed by the short names used in CFG
# (CFG.mdlcv_mthd / CFG.enscv_mthd). All share CFG.n_splits and the CFG.state seed:-
all_cv = dict(
    KF   = KFold(n_splits = CFG.n_splits, shuffle = True, random_state = CFG.state),
    RKF  = RKF(n_splits = CFG.n_splits, n_repeats = CFG.n_repeats, random_state = CFG.state),
    RSKF = RSKF(n_splits = CFG.n_splits, n_repeats = CFG.n_repeats, random_state = CFG.state),
    SKF  = SKF(n_splits = CFG.n_splits, shuffle = True, random_state = CFG.state),
)

# Competition metric:-
def ScoreMetric(ytrue, ypred)-> float:
    """Return the mean absolute error between the true targets and the predictions."""
    score = mae(ytrue, ypred)
    return score

# Custom scorer for sklearn model-selection helpers (cross_val_predict / cross_val_score) and
# for HGBR's `scoring` argument. greater_is_better = False tells sklearn that a lower MAE is better.
# `needs_proba` is intentionally not passed: False was its default, and scikit-learn deprecated
# the flag in 1.4 with removal scheduled for 1.6, so passing it breaks on newer releases.
myscorer = make_scorer(ScoreMetric, greater_is_better = False)

print();
collect();

PrintColor(f"\n" + GetMemUsage(), color = Fore.RED);


[1m[31m
RAM memory GB usage = 0.2539[0m
CPU times: user 112 ms, sys: 2.82 ms, total: 115 ms
Wall time: 113 ms


In [6]:
%%time

def goto_conversion(listOfOdds, total = 1, eps = 1e-6, isAmericanOdds = False):
    """
    Convert a set of betting odds into probabilities by stepping the inverse odds
    back in proportion to their standard errors.

    Source - https://www.kaggle.com/code/kaito510/goto-conversion-optiver-baseline-models

    Inputs-
    listOfOdds:- sequence of decimal odds (each >= 1), or American odds when
                 isAmericanOdds is True. The sequence is never modified; any
                 conversion is done on a private copy
    total:- value the sum of the inverse odds is pulled back towards
    eps:- lower bound applied to every output probability
    isAmericanOdds:- bool -- True when listOfOdds holds American odds

    Returns-
    list of probabilities, each clipped to the range [eps, 1]

    Raises-
    ValueError -- fewer than two odds, or any (decimal) odd below 1
    ZeroDivisionError -- every odd equals 1, so all standard errors are zero
    """

    # Work on a private list so the caller's odds are left untouched
    if isAmericanOdds:
        # Negative American odds -> 1 + 100/|odds|, non-negative -> 1 + odds/100
        decimalOdds = [1 + (100/(currOdds*-1)) if currOdds < 0 else 1 + (currOdds/100)
                       for currOdds in listOfOdds]
    else:
        decimalOdds = list(listOfOdds)

    #Error Catchers
    if len(decimalOdds) < 2:
        raise ValueError('len(listOfOdds) must be >= 2')
    if any(x < 1 for x in decimalOdds):
        raise ValueError('All odds must be >= 1, set isAmericanOdds parameter to True if using American Odds')

    #Computation:-
    #initialize probabilities using inverse odds
    listOfProbabilities = [1/x for x in decimalOdds]

    #compute the standard error (SE) for each probability
    listOfSe = [pow((x-x**2)/x,0.5) for x in listOfProbabilities]

    #compute how many steps of SE the probabilities should step back by
    step = (sum(listOfProbabilities) - total)/sum(listOfSe)
    outputListOfProbabilities = [min(max(x - (y*step),eps),1) for x,y in zip(listOfProbabilities, listOfSe)]
    return outputListOfProbabilities

def zero_sum(listOfPrices, listOfVolumes):
    """
    Shift the prices so that they sum to zero, moving each price in proportion
    to the square root of its volume.

    Source - https://www.kaggle.com/code/kaito510/goto-conversion-optiver-baseline-models

    Inputs-
    listOfPrices:- iterable of numbers to be re-centred
    listOfVolumes:- iterable of volumes, paired with the prices by position

    Returns-
    list of adjusted prices
    """

    # Standard errors, assuming the standard deviation is the same for all stocks
    weights = [volume ** 0.5 for volume in listOfVolumes]

    # Amount removed per unit of standard error
    shift_per_unit = sum(listOfPrices) / sum(weights)

    adjusted = []
    for price, weight in zip(listOfPrices, weights):
        adjusted.append(price - (weight * shift_per_unit))
    return adjusted

collect();

CPU times: user 129 ms, sys: 0 ns, total: 129 ms
Wall time: 128 ms


<a id="3"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #003380; border-bottom: 10px solid #80ffff"> DATA PROCESSING<br></div> 

<div class="alert alert-block alert-info" style = "font-family: Cambria Math;font-size: 115%; color: black; background-color: #e6f9ff; border: dashed black 1.0px; padding: 3.5px" >
In this version, we choose the int-float compressed dataset with new features as per the reference notebook <br>
</div>

In [30]:
# Debug-only loading settings. The dataset location is read from CFG so that the
# folder and file name are defined in a single place and cannot drift apart.
root_path = CFG.path
name_file = CFG.df_choice

# testing
debug = True             # True -> work on a small random sample of the training table
testing_sample = 100     # number of rows drawn when debug is True

In [31]:
# Load the training features; in debug mode keep only a small random sample
X = pd.read_parquet(root_path + name_file)
if debug:
    # Seeded so that the debug sample is the same after every kernel restart
    X = X.sample(n = testing_sample, random_state = CFG.state)

# Keep the targets whose index labels match the selected feature rows,
# then give both objects a fresh 0..n-1 index
y = pd.read_parquet(CFG.path + f"Ytrain.parquet").loc[X.index].squeeze()
X.index, y.index = range(len(X)), range(len(y))

In [7]:
%%time 

# Load the training data according to the CFG switches:
#   load_tr_data == "Y" or ML == "Y"  -> read features and targets
#   ... and test_req == "Y"           -> keep only a seeded random sample for quick code testing
#   otherwise                         -> nothing is loaded (inference-only run)
if CFG.load_tr_data == "Y" or CFG.ML == "Y":
    X = pd.read_parquet(CFG.path + CFG.df_choice)
    y = pd.read_parquet(CFG.path + f"Ytrain.parquet")

    if CFG.test_req == "Y":
        # A float test_frac is a fraction of rows, anything else an absolute row count
        if isinstance(CFG.test_frac, float):
            sample_args = {"frac": CFG.test_frac}
        else:
            sample_args = {"n": CFG.test_frac}

        # Seeded with CFG.state so the test sample is reproducible across runs
        X = X.sample(random_state = CFG.state, **sample_args)
        y = y.loc[X.index].squeeze()
        PrintColor(f"---> Sampled train shapes for code testing = {X.shape} {y.shape}",
                   color = Fore.RED)
        X.index, y.index = range(len(X)), range(len(y))

        PrintColor(f"\n---> Train set columns for model development")
        pprint(X.columns, width = 100, depth = 1, indent = 5)
        print()

    else:
        y = y.squeeze()
        PrintColor(f"---> Train shapes for code testing = {X.shape} {y.shape}")

else:
    PrintColor(f"---> Train data is not required as we are infering from the model")
    
print();
collect();
libc.malloc_trim(0);

PrintColor(f"\n" + GetMemUsage(), color = Fore.RED);

[1m[34m---> Train data is not required as we are infering from the model[0m

[1m[31m
RAM memory GB usage = 0.2523[0m
CPU times: user 122 ms, sys: 2.14 ms, total: 124 ms
Wall time: 120 ms


<a id="4"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #003380; border-bottom: 10px solid #80ffff"> MODEL TRAINING AND CV<br></div> 

In [None]:
%%time 

# Initializing model I-O:-

# Registry of unfitted regressors keyed by the method labels used in CFG.methods.
# It is only built when training is requested (CFG.ML == "Y").
if CFG.ML == "Y":
    Mdl_Master = {
        # CatBoost, optimised and evaluated on MAE
        'CBR': CBR(
            task_type           = "GPU" if CFG.gpu_switch == "ON" else "CPU",
            objective           = "MAE",
            eval_metric         = "MAE",
            bagging_temperature = 0.5,
            colsample_bylevel   = 0.7,
            iterations          = 500,
            learning_rate       = 0.065,
            od_wait             = 25,
            max_depth           = 7,
            l2_leaf_reg         = 1.5,
            min_data_in_leaf    = 1000,
            random_strength     = 0.65,
            verbose             = 0,
            use_best_model      = True,
        ),

        # LightGBM with the L1 regression objective
        'LGBMR': LGBMR(
            device           = "gpu" if CFG.gpu_switch == "ON" else "cpu",
            objective        = 'regression_l1',
            boosting_type    = 'gbdt',
            random_state     = CFG.state,
            colsample_bytree = 0.7,
            subsample        = 0.65,
            learning_rate    = 0.065,
            max_depth        = 6,
            n_estimators     = 500,
            num_leaves       = 150,
            reg_alpha        = 0.01,
            reg_lambda       = 3.25,
            verbose          = -1,
        ),

        # XGBoost with the absolute-error objective; early stopping is set on the estimator
        'XGBR': XGBR(
            tree_method           = "gpu_hist" if CFG.gpu_switch == "ON" else "hist",
            objective             = 'reg:absoluteerror',
            random_state          = CFG.state,
            colsample_bytree      = 0.7,
            learning_rate         = 0.07,
            max_depth             = 6,
            n_estimators          = 500,
            reg_alpha             = 0.025,
            reg_lambda            = 1.75,
            min_child_weight      = 1000,
            early_stopping_rounds = CFG.nbrnd_erly_stp,
        ),

        # sklearn histogram gradient boosting: squared-error loss, early stopping scored by myscorer
        "HGBR": HGBR(
            loss              = 'squared_error',
            learning_rate     = 0.075,
            early_stopping    = True,
            max_iter          = 200,
            max_depth         = 6,
            min_samples_leaf  = 1500,
            l2_regularization = 1.75,
            scoring           = myscorer,
            random_state      = CFG.state,
        ),
    }

print();
collect();

PrintColor(f"\n" + GetMemUsage(), color = Fore.RED);


In [None]:
%%time 

if CFG.ML == "Y":
    from os import makedirs

    # Initializing the models from configuration class:-
    methods = CFG.methods

    # Folder for the fitted models. It is created at CFG.mdl_path, the location the
    # training loop writes to; exist_ok lets this cell be re-run without an error:-
    makedirs(CFG.mdl_path, exist_ok = True)

    # Initializing the model path for storage:-
    model_path = CFG.mdl_path

    # Initializing the cv object:-
    cv = all_cv[CFG.mdlcv_mthd]

    # Score table: one row per CV fold, one float32 column per method:-
    Scores = pd.DataFrame(0,
                          index   = range(CFG.n_splits * CFG.n_repeats),
                          columns = methods,
                          dtype   = np.float32)

    # Feature-importance accumulator: one row per feature, one column per method.
    # `methods` is passed as-is; wrapping it in another list would create
    # MultiIndex columns, so FtreImp[method] would no longer be a plain column:-
    FtreImp = pd.DataFrame(0.0, index = X.columns, columns = methods)

print();
collect();
libc.malloc_trim(0);

PrintColor(f"\n" + GetMemUsage(), color = Fore.RED);

In [None]:
%%time 

# Cross-validated training: every method in `methods` is fitted on each fold, saved to
# CFG.mdl_path, scored on the held-out part, and its feature importances are averaged.
if CFG.ML == "Y":
    PrintColor(f"\n{'=' * 25} ML Training {'=' * 25}\n");
    
    # Initializing CV splitting:-       
    for fold_nb, (train_idx, dev_idx) in tqdm(enumerate(cv.split(X, y)), 
                                              f"{CFG.mdlcv_mthd} CV {CFG.n_splits}x{CFG.n_repeats}"
                                             ): 
        # Creating the cv folds:-    
        Xtr  = X.iloc[train_idx];   
        Xdev = X.iloc[dev_idx];
        ytr  = y.iloc[train_idx];
        ydev = y.iloc[dev_idx];
        
        PrintColor(f"-------> Fold{fold_nb} <-------");
        # Fitting the models:- 
        for method in methods:
            # The same estimator object is re-fitted on every fold; the fitted state of
            # each fold is kept through the joblib dump below
            model = Mdl_Master[method];
            if method == "LGBMR":
                # NOTE(review): newer LightGBM releases dropped the `verbose` fit argument
                # in favour of the log_evaluation callback -- confirm against the installed version
                model.fit(Xtr, ytr, 
                          eval_set = [(Xdev, ydev)], 
                          verbose = 0, 
                          eval_metric = "mae",
                          callbacks = [log_evaluation(0,), 
                                       early_stopping(CFG.nbrnd_erly_stp, verbose = False)], 
                         );

            elif method == "XGBR":
                model.fit(Xtr, ytr, 
                          eval_set = [(Xdev, ydev)], 
                          verbose = 0, 
                          eval_metric = "mae",
                         );  

            elif method == "CBR":
                model.fit(Xtr, ytr, 
                          eval_set = [(Xdev, ydev)], 
                          verbose = 0, 
                          early_stopping_rounds = CFG.nbrnd_erly_stp,
                         ); 

            else:
                model.fit(Xtr, ytr);

            #  Saving the model for later usage:-
            joblib.dump(model, CFG.mdl_path + f'{method}V{CFG.version_nb}Fold{fold_nb}.model');
            
            # Creating OOF scores:-
            score = ScoreMetric(ydev, model.predict(Xdev));
            Scores.at[fold_nb, method] = score;
            num_space = 6- len(method);
            PrintColor(f"---> {method} {' '* num_space} OOF = {score:.5f}", 
                       color = Fore.MAGENTA);  
            del num_space, score;
            
            # Collecting feature importances (averaged over all folds):-
            try:
                FtreImp[method] = \
                FtreImp[method].values + (model.feature_importances_ / (CFG.n_splits * CFG.n_repeats));
            except AttributeError:
                # Estimators without a feature_importances_ attribute are skipped
                pass;
            except Exception as err:
                # Best effort: importances are a by-product, so report the problem and keep training
                PrintColor(f"---> Feature importances were not collected for {method}: {err}", 
                           color = Fore.RED);
            
            collect();
            
        PrintColor(GetMemUsage());
        print();
        del Xtr, ytr, Xdev, ydev;
        collect();
    
    clear_output();
    PrintColor(f"\n---> OOF scores across methods <---\n");
    Scores.index.name = "FoldNb";
    Scores.index = Scores.index + 1;
    display(Scores.style.format(precision = 5).\
            background_gradient(cmap = "Pastel1")
           );
    
    PrintColor(f"\n---> Mean OOF scores across methods <---\n");
    display(Scores.mean());
    
    # Saving the feature importances is optional; a failure is reported, not raised
    try: 
        FtreImp.to_csv(CFG.mdl_path + f"FtreImp_V{CFG.version_nb}.csv");
    except Exception as err: 
        PrintColor(f"---> Feature importances were not saved: {err}", color = Fore.RED);
        
collect();
print();
libc.malloc_trim(0);

PrintColor(f"\n" + GetMemUsage(), color = Fore.GREEN);

<a id="5"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #003380; border-bottom: 10px solid #80ffff"> MODEL INFERENCING AND SUBMISSION<br></div> 

In [None]:
%%time 

def MakeFtre(df : pd.DataFrame, prices: list) -> pd.DataFrame:
    """
    Builds the model input table by adding imbalance features to the raw columns.
    This was used in a baseline notebook as below-
    https://www.kaggle.com/code/yuanzhezhou/baseline-lgb-xgb-and-catboost

    Note that the new columns are written into df itself (df is modified in place).

    Inputs-
    df:- pd.DataFrame -- input dataframe holding the size, price and median-volume columns
    prices:- list -- names of the price columns combined into pair and triplet features

    Returns-
    pd.DataFrame -- the fixed base columns followed by the pair features and then the
                    triplet features, in the order they are generated
    """

    selected = ['overall_medvol', "first5min_medvol", "last5min_medvol",
                'seconds_in_bucket', 'imbalance_buy_sell_flag',
                'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
                'reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap',
                'imb_s1', 'imb_s2'
               ]

    # Size imbalances: bid vs ask, and imbalance vs matched
    df['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)').astype(np.float32)
    df['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)').astype(np.float32)

    n_prices = len(prices)

    # Pair features: every price against each price listed before it
    for hi in range(1, n_prices):
        for lo in range(hi):
            a, b = prices[hi], prices[lo]
            pair_name = f'{a}_{b}_imb'
            df[pair_name] = df.eval(f'({a}-{b})/({a}+{b})')
            selected.append(pair_name)

    # Triplet features: (largest - middle) / (middle - smallest) of three price columns
    for hi in range(2, n_prices):
        for mid in range(1, hi):
            for lo in range(mid):
                a, b, c = prices[hi], prices[mid], prices[lo]
                trio    = df[[a, b, c]]
                top     = trio.max(axis=1)
                bottom  = trio.min(axis=1)
                middle  = trio.sum(axis=1) - bottom - top

                trio_name = f'{a}_{b}_{c}_imb2'
                df[trio_name] = ((top - middle)/(middle - bottom)).astype(np.float32)
                selected.append(trio_name)

    return df[selected]

print();
collect();

In [None]:
%%time 

# Creating the testing environment:-
if CFG.inference_req == "Y":
    from os.path import join as join_path

    # The training data is not needed any more; it may not exist in an inference-only run
    try:
        del X, y
    except NameError:
        pass

    prices = ['reference_price', 'far_price', 'near_price', 'bid_price', 'ask_price', 'wap']

    # Making the test environment for inferencing:-
    import optiver2023
    try:
        env = optiver2023.make_env()
        iter_test = env.iter_test()
        PrintColor(f"\n---> Curating the inference environment")
    except Exception as err:
        # Kept non-fatal so the cell can be re-run with an existing env / iter_test
        # (presumably make_env can only be called once per session -- TODO confirm)
        PrintColor(f"---> Inference environment was not created: {err}", color = Fore.RED)

    # Choosing where the fitted models are read from:-
    if CFG.ML != "Y":
        model_path = CFG.inf_path
        PrintColor(f"---> Loading models from the input data for the kernel - V{CFG.version_nb}\n",
                  color = Fore.RED)
    else:
        model_path = CFG.mdl_path
        PrintColor(f"---> Loading models from the working directory for the kernel\n")

    # Loading every "*.model" file below model_path, keyed by file name without the extension.
    # Other files in the folder (e.g. the feature-importance csv written by training) are skipped,
    # and each file is opened from the folder it was found in.
    # NOTE: joblib.load unpickles its input -- only point model_path at trusted files.
    model_dict = {}
    for folder, _, filenames in walk(model_path):
        for filename in filenames:
            if filename.endswith(".model"):
                model_dict[filename.replace(r".model", "")] = joblib.load(join_path(folder, filename))

    mdl_lbl = list(model_dict)
    models  = list(model_dict.values())
    PrintColor(f"\n---> Trained models\n")
    pprint(np.array(mdl_lbl), width = 100, indent = 10, depth = 1)
       
print();
collect();  
libc.malloc_trim(0);
PrintColor(f"\n" + GetMemUsage(), color = Fore.RED); 

In [None]:
%%time 

# Inference: for every batch served by the test iterator, build the features, average the
# fold models of each method, blend the methods with CFG.ens_weights, re-centre the
# predictions with zero_sum and submit them.
if CFG.inference_req == "Y":
    print()
    counter = 0

    # Median-volume lookup indexed by stock id; the csv may or may not carry its index
    # in an 'Unnamed: 0' column
    median_vol = pd.read_csv(CFG.path + f"MedianVolV2.csv")
    if 'Unnamed: 0' in median_vol.columns:
        median_vol = median_vol.set_index('Unnamed: 0')
    median_vol.index.name = "stock_id"
    median_vol = median_vol[['overall_medvol', "first5min_medvol", "last5min_medvol"]]

    # Group the loaded models by method once, instead of re-matching labels for every batch
    fold_models = {method: [(label, mdl) for label, mdl in model_dict.items()
                            if label.startswith(f"{method}V{CFG.version_nb}")]
                   for method in CFG.methods
                  }

    for test, revealed_targets, sample_prediction in iter_test:
        # Padding keeps the "Inference" text aligned for 1, 2 and 3 digit batch numbers
        if counter >= 99: num_space = 1
        elif counter >= 9: num_space = 2
        else: num_space = 3

        PrintColor(f"{counter + 1}. {' ' * num_space} Inference", color = Fore.MAGENTA)
        test  = test.merge(median_vol, how = "left", left_on = "stock_id", right_index = True)
        Xtest = MakeFtre(test, prices = prices)
        del num_space

        # Curating model predictions across methods and folds:-
        # each method's column is the mean over the models actually loaded for it
        # (a method without any model keeps a column of zeros)
        preds = pd.DataFrame(0.0, index = Xtest.index, columns = CFG.methods)
        for method, members in fold_models.items():
            for label, mdl in members:
                if CFG.test_req == "Y":
                    print(label)
                preds[method] = preds[method] + mdl.predict(Xtest) / len(members)

        # Curating the weighted average model predictions
        # (the weights follow the order of CFG.methods):-
        sample_prediction['target'] = \
        np.average(preds.values, weights= CFG.ens_weights, axis=1)

        # Source - https://www.kaggle.com/code/kaito510/goto-conversion-optiver-baseline-models
        sample_prediction['target'] = \
        zero_sum(sample_prediction['target'], test.loc[:,'bid_size'] + test.loc[:,'ask_size'])

        # Submission is best effort; Exception (not a bare except) keeps the loop interruptible
        try:
            env.predict(sample_prediction)
        except Exception:
            PrintColor(f"---> Submission did not happen as we have the file already")

        counter = counter+1
        collect()

    PrintColor(f"\n---> Submission file\n")
    display(sample_prediction.head(10))
            
print();
collect();  
libc.malloc_trim(0);
PrintColor(f"\n" + GetMemUsage(), color = Fore.RED); 

<a id="6"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #003380; border-bottom: 10px solid #80ffff"> OUTRO<br></div> 

<div class="alert alert-block alert-info" style = "font-family: Cambria Math;font-size: 115%; color: black; background-color: #e6f9ff; border: dashed black 1.0px; padding: 3.5px" >
<b>Next steps</b> <br>
1. Exploring better models and ensemble strategy <br>
2. Purging redundant features from the existing list of features <br>
3. Fostering improvements in the existing process based on public discussions and kernels<br>
</div>

<b>References</b> <br>
1. https://www.kaggle.com/code/yuanzhezhou/baseline-lgb-xgb-and-catboost <br>
2. https://www.kaggle.com/code/renatoreggiani/optv-lightgbm -- Median volume column <br> 
3. https://www.kaggle.com/code/kaito510/goto-conversion-optiver-baseline-models -- goto conversion <br>

<div class="alert alert-block alert-info" align = "center" style = "font-family: Calibri;font-size: 150%; color: black; background-color:#ccf2ff; border: solid black 2.5px; padding: 3.5px" >
    <b>If you find this useful, please upvote the kernel and the input kernel and dataset too. <br> Best regards!</b>
</div>