## 2. Submission Pipeline
In this section, the hyperparameters found in the first part are used to configure the model. The model is trained on the full training set and then used to predict prices dynamically on the data stream provided by the competition API.

In [25]:
#### Loading the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.impute import KNNImputer
import time

In [None]:
### Explicit dtypes for the id / flag columns when reading the CSV
dtypes = dict(
    stock_id=np.float32,
    date_id=np.uint16,
    seconds_in_bucket=np.uint16,
    imbalance_buy_sell_flag=np.int8,
    time_id=np.uint16,
)

### Feature columns, in the order they are fed to the imputer and the model
X_columns = (
    ['stock_id', 'date_id', 'seconds_in_bucket']
    + ['imbalance_size', 'imbalance_buy_sell_flag', 'reference_price', 'matched_size']
    + ['far_price', 'near_price']
    + ['bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap']
    + ['time_id']
)

### Columns actually passed to the model (currently every feature column);
### an independent copy, so it can be narrowed later without touching X_columns
X_predict = list(X_columns)

### Name of the regression target
y_columns = ['target']


### Loading the training data; row_id is dropped right after reading
df = pd.read_csv(
    '/kaggle/input/optiver-trading-at-the-close/train.csv', dtype=dtypes
).drop(['row_id'], axis=1)

### Transforming far/near price NaN values (before 300 seconds in bucket):
### a missing value is replaced by the reference_price of the same row.
### fillna with an index-aligned Series is the vectorized equivalent of the
### previous row-wise apply, which ran a Python lambda once per row.
df['far_price'] = df['far_price'].fillna(df['reference_price'])
df['near_price'] = df['near_price'].fillna(df['reference_price'])

### Removing the rows that still contain NaN values
### (the original author notes there are only about 200 of them)
df = df.dropna()

### Splitting the cleaned frame into features and target; copies are taken so
### later changes to the datasets do not write back into df
X_current_dataset, y_current_dataset = df.loc[:, X_columns].copy(), df.loc[:, y_columns].copy()

In [None]:
### XGB regressor; per the original author, the parameter values come from the
### hyperparameter optimization. The estimator is built first and fitted
### in a separate step on the prediction columns of the training data.
model = xgb.XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    base_score=0.5,
    n_estimators=100,
    max_depth=3,
    learning_rate=0.01,
)
model.fit(X_current_dataset[X_predict], y_current_dataset)

### KNN imputers used to fill NaN values in the test stream before prediction or retraining:
### one fitted on the feature columns, one on the target
x_imputer = KNNImputer(n_neighbors=5).fit(X_current_dataset[X_columns])
y_imputer = KNNImputer(n_neighbors=5).fit(y_current_dataset)

In [None]:
### Counter for each iteration
counter = 0

### Counter for retraining (or a "daily counter")
rt_i = 0

### Lists and dictionaries to store the data
test_history = []
revealed_targets_history = {}
prediction_history = []
sample_predictions_history = []

### How many counters to skip: while counter <= n_counts_to_skip the submitted target is 0
n_counts_to_skip = 54

### Retraining on revealed targets is switched off (it used to be disabled by a
### trailing "and False" in the condition below). The flag is tested first, so the
### rest of the condition is not evaluated while retraining is disabled.
### NOTE(review): number_of_bucket_iter is not defined in this cell; it is presumably
### the number of iterations per day (55, cf. the time_id formula) - confirm before enabling.
retraining_enabled = False

### NOTE(review): iter_test and env are not created in the cells visible here;
### they are assumed to come from the competition API set up earlier in the notebook.
for (test, revealed_targets, sample_prediction) in iter_test:
    print('Counter:', counter)
    if (retraining_enabled
            and counter > n_counts_to_skip
            and counter % number_of_bucket_iter == 0
            and len(revealed_targets) > 1):
        ### Storing the revealed targets
        revealed_targets_history[counter] = revealed_targets

        ### The revealed targets given in this iteration correspond to the previous day,
        ### i.e. to the rt_i-th block of number_of_bucket_iter stored test frames
        X_revealed = pd.concat(test_history[number_of_bucket_iter*rt_i:number_of_bucket_iter*(rt_i+1)])
        y_revealed = revealed_targets_history[number_of_bucket_iter*(rt_i+1)]['revealed_target']

        ### Making sure NaN values are filled (some other way should be done)
        X_revealed = pd.DataFrame(x_imputer.transform(X_revealed[X_columns]), columns=X_columns)
        y_revealed = pd.DataFrame(
            y_imputer.transform(y_revealed.to_frame().rename(columns={'revealed_target': 'target'})),
            columns=y_columns)

        ### The revealed target index (rt_i) is increased by 1
        rt_i += 1

        ### Appending the data to the entire dataset, if the date_id is not present in the current dataset
        date_id = X_revealed['date_id'].values[0]
        if date_id not in X_current_dataset['date_id'].values:
            X_current_dataset = pd.concat([X_current_dataset, X_revealed]).reset_index(drop=True).copy()
            y_current_dataset = pd.concat([y_current_dataset, y_revealed]).reset_index(drop=True).copy()

            #### Refitting the imputers and continuing the training of the model.
            #### The target imputer is fitted on the target columns: the target frame
            #### has no feature columns, so indexing it with X_columns raised a KeyError.
            x_imputer.fit(X_current_dataset[X_columns])
            y_imputer.fit(y_current_dataset[y_columns])
            #### Same feature selection as the initial fit and as predict();
            #### xgb_model continues training from the current booster
            model.fit(X_current_dataset[X_predict], y_current_dataset,
                      verbose=100, xgb_model=model.get_booster())

    ### Dealing with null values of the current iteration of the test data:
    ### missing far/near prices fall back to the reference price of the same row
    ### (vectorized instead of a row-wise apply)
    test.loc[:, 'far_price'] = test['far_price'].fillna(test['reference_price'])
    test.loc[:, 'near_price'] = test['near_price'].fillna(test['reference_price'])
    ### time_id = 55 * date_id + seconds_in_bucket / 10, truncated to a whole number
    ### (floor division matches int() truncation for non-negative values)
    test.loc[:, 'time_id'] = 55 * test['date_id'] + test['seconds_in_bucket'] // 10

    ### Using the imputer to deal with potential missing values in X_test
    X_test = test[X_columns]
    #X_test['target_class'] = model_target_class.predict(test[X_columns])
    X_test = pd.DataFrame(x_imputer.transform(X_test), columns=X_columns)

    ### Creating the prediction column by using the current iteration of test data
    sample_prediction['target'] = 0
    if counter > n_counts_to_skip:#164:
        sample_prediction['target'] = model.predict(X_test[X_predict])

    ### Storing the current iteration of the data in the lists
    test_history.append(X_test)
    prediction_history.append(sample_prediction)

    ### Submitting the prediction
    env.predict(sample_prediction)

    # Moving the iteration counter
    counter += 1