# 1. Hyperparameter optimization 
To find the best hyperparameter values, time-segmented k-fold evaluation is performed.

In [25]:
#### Loading the libraries
import itertools
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
def create_folds(number_of_folds,rolling_test_size_ratio,fold_size,df,x_columns=None):
    """
    Splits a DataFrame into training and testing sets for cross-validation with a rolling window approach.

    Fold ``f_i`` (1-based) covers ``date_id`` values in
    ``[(f_i - 1) * fold_size, f_i * fold_size)``. The last
    ``int(fold_size * rolling_test_size_ratio)`` days of that range form the
    test set, the days before them the training set. For the last fold the
    upper bound is inclusive, so a trailing ``date_id`` equal to
    ``number_of_folds * fold_size`` ends up in the last test set.

    Parameters:
    - number_of_folds: The number of folds or partitions to create for cross-validation.
    - rolling_test_size_ratio: The ratio of each fold to be used as the test set.
    - fold_size: The size of each fold, in ``date_id`` units.
    - df: The DataFrame to be split into folds. Must contain the columns
      ``date_id`` and ``target`` plus every feature column.
    - x_columns: Optional list of feature column names. When omitted, the
      notebook-level ``X_columns`` list is used.

    Returns:
    - fold_dict: A dictionary containing each fold's DataFrame.
    - X_train_dict, y_train_dict: Dictionaries containing the training data features and target values for each fold.
    - X_test_dict, y_test_dict: Dictionaries containing the testing data features and target values for each fold.
    All dictionaries are keyed by the fold number (1..number_of_folds) and hold
    independent copies of the selected rows.
    """
    if x_columns is None:
        # Backward-compatible default: fall back to the notebook global.
        x_columns = X_columns
    feature_columns = list(x_columns)

    rolling_test_size = int(fold_size*rolling_test_size_ratio)
    print("Creating",number_of_folds,'folds')

    date_ids = df['date_id']
    fold_dict = {}
    X_train_dict = {}
    y_train_dict = {}
    X_test_dict = {}
    y_test_dict = {}
    for f_i in range(1,number_of_folds+1):
        train_start_date = (f_i-1) * fold_size
        test_end_date = f_i * fold_size
        test_start_date = test_end_date - rolling_test_size

        # Build each boolean mask once instead of re-filtering the whole frame
        # for every output.
        from_train_start = date_ids >= train_start_date
        from_test_start = date_ids >= test_start_date
        if f_i == number_of_folds:
            # The last fold also keeps the final day (inclusive upper bound).
            up_to_test_end = date_ids <= test_end_date
        else:
            up_to_test_end = date_ids < test_end_date

        fold_mask = from_train_start & up_to_test_end
        train_mask = from_train_start & ~from_test_start
        test_mask = from_test_start & up_to_test_end

        # Copies keep the folds independent from ``df`` and from each other.
        fold_dict[f_i] = df.loc[fold_mask].copy()
        X_train_dict[f_i] = df.loc[train_mask, feature_columns].copy()
        y_train_dict[f_i] = df.loc[train_mask, ['target']].copy()
        X_test_dict[f_i] = df.loc[test_mask, feature_columns].copy()
        y_test_dict[f_i] = df.loc[test_mask, ['target']].copy()
    return fold_dict, X_train_dict, y_train_dict, X_test_dict, y_test_dict



In [None]:
def evaluate_folds(df,fold_dict,X_train_dict, y_train_dict,X_test_dict, y_test_dict,par_dict):
    """
    Evaluates model performance across all folds using Mean Absolute Error (MAE) as the metric.

    For every fold an XGBoost regressor is fitted on the fold's training data.
    The test period is then walked one ``date_id`` at a time: before predicting
    any day after the first test day, the test rows of ``date_id - 1`` are
    appended to the training data and the model is fitted again, continuing
    from its current booster. Predictions are made separately for every
    ``seconds_in_bucket`` value of the day.

    Parameters:
    - df: The original DataFrame (not read by this function; kept so existing calls keep working).
    - fold_dict: A dictionary of DataFrames representing each fold (not read by this function).
    - X_train_dict, y_train_dict: Dictionaries containing the training features and target values for each fold.
    - X_test_dict, y_test_dict: Dictionaries containing the testing features and target values for each fold.
      The feature frames must contain ``date_id`` and ``seconds_in_bucket``;
      the target frames must contain ``target``.
    - par_dict: A dictionary of model parameters with the keys ``n_estimators``,
      ``max_depth``, ``eta``, ``min_child``, ``subsample``, ``gamma`` and ``lambda``.

    Returns:
    - mae_test_list: A list of MAE scores for the test sets across all folds.
    - mae_train_list: A list of MAE scores for the training sets across all folds.

    Raises:
    - ValueError: If a fold has no test rows.
    """
    mae_test_list = []
    mae_train_list = []
    # Iterate over the folds actually supplied instead of a notebook global.
    for f_i in sorted(X_train_dict):
        start_fold = time.time()

        X_test_fold = X_test_dict[f_i]
        y_test_fold = y_test_dict[f_i]
        test_date_ids = sorted(X_test_fold['date_id'].unique())
        if not test_date_ids:
            raise ValueError('Fold ' + str(f_i) + ' has no test rows to evaluate')
        test_start_date_id = test_date_ids[0]
        test_end_date_id = test_date_ids[-1]

        # Model is pretrained with the training data
        model = xgb.XGBRegressor(base_score=0, booster='gbtree',
                                 n_estimators=par_dict['n_estimators'],
                                 objective='reg:squarederror',
                                 max_depth=par_dict['max_depth'],
                                 eta=par_dict['eta'],
                                 min_child_weight=par_dict['min_child'],
                                 subsample=par_dict['subsample'],
                                 gamma=par_dict['gamma'],
                                 reg_lambda=par_dict['lambda'])

        # Growing training set; the input dictionaries are never modified.
        X_current_dataset = X_train_dict[f_i].copy()
        y_current_dataset = y_train_dict[f_i].copy()

        # Initial fit and in-sample (training) MAE
        model.fit(X_current_dataset, y_current_dataset)
        train_abs_error = (y_current_dataset['target'] - model.predict(X_current_dataset)).abs()
        mae_train_list.append(np.round(train_abs_error.mean(),5))

        test_abs_errors = []
        for date_id in test_date_ids:
            # Retraining the model with previous day data
            if date_id > test_start_date_id:
                previous_day = X_test_fold['date_id'] == date_id - 1
                X_current_dataset = pd.concat([X_current_dataset, X_test_fold[previous_day]]).reset_index(drop=True)
                y_current_dataset = pd.concat([y_current_dataset, y_test_fold[previous_day]]).reset_index(drop=True)
                # Passing the current booster continues training from it
                # rather than starting from scratch.
                model.fit(X_current_dataset, y_current_dataset, xgb_model=model.get_booster())

            # Predicting the current day data
            current_day = X_test_fold['date_id'] == date_id
            X_day_test = X_test_fold[current_day]
            y_day_test = y_test_fold[current_day]

            # Predicting for each batch of 10 seconds
            for seconds_in_bucket in sorted(X_day_test['seconds_in_bucket'].unique()):
                in_bucket = X_day_test['seconds_in_bucket'] == seconds_in_bucket
                bucket_predictions = model.predict(X_day_test[in_bucket])
                test_abs_errors.append((y_day_test.loc[in_bucket, 'target'] - bucket_predictions).abs())

            end_fold = time.time()
            print('Date:',date_id,'from',test_start_date_id,'to', test_end_date_id,'| Total time spent on this fold',np.round((end_fold - start_fold)/60,2),'minutes ****')

        # Test MAE over every predicted row of the fold
        mae_test_list.append(np.round(pd.concat(test_abs_errors).mean(),5))
    return mae_test_list, mae_train_list

In [None]:
### A bit of data cleaning to replace the missing nan values
# Missing far/near prices fall back to the reference price of the same row
# (vectorised; gives the same result as a row-wise apply on float columns).
df['far_price'] = df['far_price'].fillna(df['reference_price'])
df['near_price'] = df['near_price'].fillna(df['reference_price'])

### Droping rows with missing values
df = df.dropna()

### Print statement to indicate the start of fold creation
print("Creating folds")

### Constants for fold creation
number_of_stocks = 200
number_of_bucket_iter = 55
number_of_folds = 3
total_number_of_days = 480
fold_size =  total_number_of_days/number_of_folds
rolling_test_size_ratio = 0.01
rolling_test_size = int(fold_size*rolling_test_size_ratio)

### Splitting the folds
# The folds do not depend on the hyperparameters, so they are built once and
# reused for every combination (the evaluation works on copies of the frames).
fold_dict,X_train_dict, y_train_dict,X_test_dict, y_test_dict = create_folds(number_of_folds,rolling_test_size_ratio,fold_size,df)

### Hyperparameter lists for tuning
max_depth_list = [1,2,3,4,5]
n_estimators_list = [3,10]
gamma_list = [0,100]
lambda_list = [0,100]
min_child_list = [0,100]
subsample_list = [1,0.5]
eta_list = [0.3,0.001]

### Total number of iterations for hyperparameter tuning
total_iter = len(max_depth_list) * len(n_estimators_list) * len(gamma_list) * len(min_child_list)*len(subsample_list)*len(eta_list)*len(lambda_list)

### Dictionaries for storing results
result_mean_dict = {}
result_std_dict = {}
results_df_dict =  {'max_depth':[],'n_estimators':[],'eta':[],'min_child':[],'subsample':[],'gamma':[],'lambda':[],'train_mean_mae':[],'train_std_mae':[],'test_mean_mae':[],'test_std_mae':[]}
count = 1

### Printing statement to indicate the start of hyperparameter evaluation
print('Hyperparameter evaluation starting')
### Dictionary to hold method-specific configurations
method_dict = {'model_name' : 'all_stocks_xgboost',
'retraining_freq' : 'daily_retraining',
'retraining_method' : 'on_full_data'}

### File that receives the results table after every evaluated combination
results_path = method_dict['model_name'] + '_' + method_dict['retraining_freq'] + '_' + method_dict['retraining_method']+ '_n_folds_' + str(number_of_folds) + '_test_size_' + str(rolling_test_size) + '.csv'

### Looping through all combinations of hyperparameters
# itertools.product varies the last list fastest, i.e. max_depth is the
# innermost "loop" and subsample the outermost.
hyperparameter_grid = itertools.product(subsample_list, min_child_list, gamma_list, lambda_list, eta_list, n_estimators_list, max_depth_list)
for subsample, min_child, gamma, lambda_p, eta, n_estimators, max_depth in hyperparameter_grid:
    start = time.time()
    ### Dictionary for current set of hyperparameters
    par_dict = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'gamma': gamma,
        'subsample': subsample,
        'min_child': min_child,
        'eta': eta,
        'lambda': lambda_p,
    }

    ### Obtaining test results
    mae_test_list, mae_train_list = evaluate_folds(df,fold_dict,X_train_dict, y_train_dict,X_test_dict, y_test_dict,par_dict.copy())

    test_mean = np.round(np.mean(mae_test_list),5)
    test_std = np.round(np.std(mae_test_list),5)
    train_mean = np.round(np.mean(mae_train_list),5)
    train_std = np.round(np.std(mae_train_list),5)

    ### Storing hyperparameters and scores together
    # Appending only after a successful evaluation keeps every column the
    # same length, so the results table can always be built.
    for par_name, par_value in par_dict.items():
        results_df_dict[par_name].append(par_value)
    results_df_dict['test_mean_mae'].append(test_mean)
    results_df_dict['test_std_mae'].append(test_std)
    results_df_dict['train_mean_mae'].append(train_mean)
    results_df_dict['train_std_mae'].append(train_std)

    # The CSV is rewritten after every combination so partial results
    # survive an interrupted run.
    result_df = pd.DataFrame(results_df_dict)
    result_df.to_csv(results_path)

    # Print progress update and timing
    end = time.time()
    print('**** Validation',count,'out of',total_iter,'Total time spent on these hyperparameters',np.round((end - start)/60,2),'minutes ****')
    count+=1

In [None]:
# Show the evaluated hyperparameter combinations ordered by mean test MAE
# (lowest first), with all numeric columns rounded to 4 decimals.
result_df.sort_values(by='test_mean_mae').round(4)