In [5]:
# Run-wide switches for the notebook.
DEBUG = True  # when True, a later cell truncates the training stock list to 5 stocks
MODE = 'TRAIN'  # later cells branch on MODE == 'TRAIN' and MODE == 'INFERENCE'
#MODE = 'INFERENCE'
MODEL_DIR = '../input/optiver-lgb-and-te-baseline'  # only referenced from a commented-out read_pickle call in the visible cells

In [6]:
import numpy as np
import pandas as pd
import gc
import pathlib
from tqdm.auto import tqdm # Widget progress bar
import json
from multiprocessing import Pool, cpu_count
import time
import requests as re
from datetime import datetime
from dateutil.relativedelta import relativedelta, FR


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import pyarrow.parquet as pq
import glob
import os
from sklearn import model_selection
import joblib
import lightgbm as lgb

import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.style as style
from matplotlib_venn import venn2, venn3
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
# Global plotting look-and-feel.
sns.set_context('talk')
# NOTE(review): newer matplotlib releases renamed the seaborn style sheets
# (e.g. 'seaborn-v0_8-colorblind') — confirm against the installed version.
style.use('seaborn-colorblind')

import warnings
# Silences every warning for the rest of the session, deprecation warnings included.
warnings.simplefilter('ignore')

# NOTE(review): get_option only reads the current setting; if the intent was to
# show more columns, pd.set_option is the call that changes it — confirm.
pd.get_option('display.max_columns')

In [7]:
class CFG:
    # Root of the competition data: the CSV files and the partitioned parquet directories are read from here.
    INPUT_DIR = '../input/optiver-realized-volatility-prediction'
    # Directory the log file is written to.
    OUTPUT_DIR = './'

In [8]:
# Logging = essential 
# import logging
# logging.basicConfig(
#     level = logging.DEBUG,
#     format='{asctime} {levelname:<8} {message}',
#     styel='{'
# )

def init_logger(log_file='train.log'):
    """
    Build the notebook logger: bare messages go to the console stream and to ``log_file``.

    Safe to call repeatedly (for example when the cell is re-run). Handlers left
    on the logger by an earlier call are closed and removed first, so each
    message is emitted once per destination instead of once per call.

    Parameters
    ----------
    log_file : str
        Path of the log file; it is opened by ``logging.FileHandler``.

    Returns
    -------
    logging.Logger
        The logger named after the current module, with level INFO.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger hands back the same object for the same name, so handlers added
    # by a previous call are still attached; drop them before adding new ones.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter('%(message)s'))
        logger.addHandler(handler)
    return logger

# Create the shared logger; messages go to the console and to baseline.log under CFG.OUTPUT_DIR.
logger = init_logger(log_file=f'{CFG.OUTPUT_DIR}/baseline.log')
logger.info(f'Start Logging...')

In [9]:
# Load the three competition CSV files from CFG.INPUT_DIR.
train = pd.read_csv(os.path.join(CFG.INPUT_DIR, 'train.csv'))  # carries the 'target' column that the model is trained to predict
test = pd.read_csv(os.path.join(CFG.INPUT_DIR, 'test.csv'))
ss = pd.read_csv(os.path.join(CFG.INPUT_DIR, 'sample_submission.csv'))

In [10]:
train

In [15]:
train['target'].describe()

In [16]:
# train['stock_id'].value_counts()
# train['time_id'].value_counts()

In [17]:
test.head()

In [18]:
ss.head()

In [19]:
sns.displot(data=train['target'])

In [None]:
# One target histogram per stock on a grid that is 7 panels wide.
# The row count follows the number of stocks instead of being fixed at 16,
# so the loop cannot index past the last axis.
stock_ids_to_plot = train['stock_id'].unique()
n_cols = 7
n_rows = max(1, -(-len(stock_ids_to_plot) // n_cols))  # ceiling division
fig, ax = plt.subplots(n_rows, n_cols, figsize=(20, 3.75 * n_rows), squeeze=False)  # 3.75 per row == 60 for 16 rows
ax = ax.flatten()

for i, stock_id in tqdm(enumerate(stock_ids_to_plot), total=len(stock_ids_to_plot)):
    ax[i].hist(train.query('stock_id == @stock_id')['target'], bins=100)
    ax[i].set_title(stock_id)

# Hide the panels left over when the stock count is not a multiple of n_cols.
for unused_ax in ax[len(stock_ids_to_plot):]:
    unused_ax.set_visible(False)
plt.tight_layout()

In [20]:
# os.listdir returns entries in arbitrary order; sorting makes the DEBUG subset
# (and the order stocks are processed in) the same on every run.
train_book_stocks = sorted(os.listdir(os.path.join(CFG.INPUT_DIR, 'book_train.parquet')))
train_trade_stocks = sorted(os.listdir(os.path.join(CFG.INPUT_DIR, 'trade_train.parquet')))
if DEBUG:
    logger.info('Debug mode: using 5 stocks only')
    train_book_stocks = train_book_stocks[:5]
    # Keep the trade listing aligned with the truncated book listing.
    train_trade_stocks = [s for s in train_trade_stocks if s in train_book_stocks]

logger.info('{:,} train book stocks: {}'.format(len(train_book_stocks), train_book_stocks))

In [21]:
# Sort both listings: os.listdir order is arbitrary, and the equality check
# below compares lists element by element.
test_book_stocks = sorted(os.listdir(os.path.join(CFG.INPUT_DIR, 'book_test.parquet')))
test_trade_stocks = sorted(os.listdir(os.path.join(CFG.INPUT_DIR, 'trade_test.parquet')))
print('{:,} test book stocks: {}'.format(len(test_book_stocks), test_book_stocks))
print('{:,} test trade stocks: {}'.format(len(test_trade_stocks), test_trade_stocks))
# True when book and trade data cover exactly the same stock partitions.
print(test_trade_stocks == test_book_stocks)

In [22]:
#load stock_id=0
def load_book(stock_id, data_type='train'):
    """
    Read the order-book parquet partition of a single stock.

    The partition read is ``book_{data_type}.parquet/stock_id={stock_id}`` under
    ``CFG.INPUT_DIR``. A ``stock_id`` column holding the requested id is written
    onto the result and stored as int8 (range -128 to 127).
    """
    partition_path = os.path.join(CFG.INPUT_DIR, f'book_{data_type}.parquet/stock_id={stock_id}')
    return (
        pd.read_parquet(partition_path)
        .assign(stock_id=stock_id)
        .astype({'stock_id': np.int8})
    )

In [23]:
def load_trade(stock_id=0, data_type='train'):
    """
    Read the trade parquet partition of a single stock.

    The partition read is ``trade_{data_type}.parquet/stock_id={stock_id}`` under
    ``CFG.INPUT_DIR``. A ``stock_id`` column holding the requested id is written
    onto the result and stored as int8 (range -128 to 127).
    """
    partition_path = os.path.join(CFG.INPUT_DIR, f'trade_{data_type}.parquet/stock_id={stock_id}')
    return (
        pd.read_parquet(partition_path)
        .assign(stock_id=stock_id)
        .astype({'stock_id': np.int8})
    )

In [24]:
def fix_jsonerr(df):
    """
    Make column labels safe for LightGBM by rewriting them in place.

    Each label is converted to ``str`` and every character that is not
    alphanumeric (per ``str.isalnum``) is replaced with an underscore.
    The same DataFrame object is returned.
    """
    def _sanitize(label):
        return "".join(map(lambda ch: ch if ch.isalnum() else "_", str(label)))

    df.columns = [_sanitize(label) for label in df.columns]
    return df

In [25]:
def log_return(list_stock_prices):
    """Difference of consecutive log prices; the first entry has no predecessor and is NaN."""
    log_prices = np.log(list_stock_prices)
    return log_prices - log_prices.shift(1)


def realized_volatility(stock_price):
    """Square root of the sum of squared log returns of a price series."""
    squared_returns = log_return(stock_price).pow(2)
    return np.sqrt(squared_returns.sum())

In [26]:
def fe_row(book):
    """
    Add row-wise order-book features to ``book`` in place and return it.

    Reads the level-1 and level-2 bid/ask price and size columns and writes
    these columns, in this order: ``book_wap1``, ``book_wap2``,
    ``book_wap_mean``, ``book_wap_diff``, ``book_price_spread``,
    ``book_bid_spread``, ``book_ask_spread``, ``book_total_volume``,
    ``book_volume_imbalance``, ``book_bid_ask_vol_imbalance1`` and
    ``book_bid_ask_vol_imbalance2``.
    """
    levels = (1, 2)

    # weighted average price per level: each side's price weighted by the opposite side's size
    for lvl in levels:
        bid_px = book[f'bid_price{lvl}']
        ask_px = book[f'ask_price{lvl}']
        bid_sz = book[f'bid_size{lvl}']
        ask_sz = book[f'ask_size{lvl}']
        book[f'book_wap{lvl}'] = (bid_px * ask_sz + ask_px * bid_sz) / (bid_sz + ask_sz)

    # mean and difference of the two WAPs
    wap1, wap2 = book['book_wap1'], book['book_wap2']
    book['book_wap_mean'] = (wap1 + wap2) / 2
    book['book_wap_diff'] = wap1 - wap2

    # spreads around the best quotes
    best_bid, best_ask = book['bid_price1'], book['ask_price1']
    book['book_price_spread'] = (best_ask - best_bid) / (best_ask + best_bid)
    book['book_bid_spread'] = best_bid - book['bid_price2']
    book['book_ask_spread'] = best_ask - book['ask_price2']

    # volume over both levels: total, and ask side minus bid side
    book['book_total_volume'] = book['ask_size1'] + book['ask_size2'] + book['bid_size1'] + book['bid_size2']
    book['book_volume_imbalance'] = (book['ask_size1'] + book['ask_size2']) - (book['bid_size1'] + book['bid_size2'])

    # ask size minus bid size per level
    for lvl in levels:
        book[f'book_bid_ask_vol_imbalance{lvl}'] = book[f'ask_size{lvl}'] - book[f'bid_size{lvl}']

    return book

In [27]:
def fe_agg(book_df):
    """
    Aggregate the row-level frame to one row per (time_id, stock_id).

    Two groups of features are produced and joined on the grouping keys:

    * every column whose name starts with ``book_`` is reduced with
      ``realized_volatility`` (column name: ``"<col> realized_volatility"``);
    * ``price``, ``size``, ``order_count`` and ``seconds_in_bucket`` are reduced
      with sum / mean / std / max / min (column name: ``"<col> <stat>"``).

    Both aggregates are flattened to single-level string columns before the
    merge. Merging a flat frame with a two-level one is rejected by recent
    pandas releases, and on older ones it left tuple-valued column labels.

    Parameters
    ----------
    book_df : pandas.DataFrame
        Merged book/trade rows that already carry the ``book_*`` columns.

    Returns
    -------
    pandas.DataFrame
        ``time_id``, ``stock_id``, the volatility features, then the trade statistics.
    """
    keys = ['time_id', 'stock_id']

    # features
    book_feats = book_df.columns[book_df.columns.str.startswith('book_')].values.tolist()
    trade_feats = ['price', 'size', 'order_count', 'seconds_in_bucket']

    grouped = book_df.groupby(keys)

    # agg trade features, then flatten ('price', 'sum') -> 'price sum'; the keys become plain 'time_id' / 'stock_id'
    trade_df = grouped[trade_feats].agg(['sum', 'mean', 'std', 'max', 'min']).reset_index()
    trade_df.columns = [" ".join(col).strip() for col in trade_df.columns.values]

    # agg volatility features, flattened the same way
    fe_df = grouped[book_feats].agg([realized_volatility]).reset_index()
    fe_df.columns = [" ".join(col).strip() for col in fe_df.columns.values]

    # merge
    return fe_df.merge(trade_df, how='left', on=keys)

In [28]:
%time
#stock_ids = [int(i.split('=')[-1]) for i in train_book_stocks]
stock_ids = [0]
book_df = list(tqdm(map(load_book, stock_ids), total=len(stock_ids)))
book_df = pd.concat(book_df)

In [30]:
book_df #lots of data on single stock

In [31]:
%time
trade_df = list(tqdm(map(load_trade, stock_ids), total=len(stock_ids)))
trade_df = pd.concat(trade_df)

In [33]:
trade_df

In [34]:
# Just for inner observation, don't run
# book_df = book_df.merge(trade_df, how='inner', on=['time_id','seconds_in_bucket','stock_id'])
# book_df.head()

In [35]:
# Outer join keeps every book snapshot and every trade; a row present on only
# one side gets NaN in the other side's columns.
# NOTE(review): book_df is reassigned here, so running this cell a second time
# merges the trade columns onto the already-merged frame.
book_df = book_df.merge(trade_df, how='outer', on=['time_id','seconds_in_bucket','stock_id'])
book_df.head(15)

In [36]:
# Add the row-wise book_* columns computed by fe_row (it modifies the frame it is given and returns it).
book_df = fe_row(book_df) # more features

In [37]:
book_df

In [38]:
# Realized Volatility calculation based on book_wap1 example
# Narrow the frame to the raw quote/trade columns plus the two WAP columns for the worked example below.
book_df1 = book_df[['time_id','seconds_in_bucket','bid_price1','ask_price1','bid_size1','ask_size1','bid_price2','ask_price2','book_wap1','book_wap2','stock_id','price','size','order_count']]
book_df1

In [39]:
# Attach the train rows of stock 0 (which carry 'target') to every row of the matching time_id.
# The outer join also keeps time_ids that appear on only one side.
book_df1 = book_df1.merge(train[train['stock_id'] == 0], how='outer', on=['stock_id','time_id'])

In [40]:
book_df1

In [41]:
# Slices of time_id 5 used by the plot in the next cell.
b1 = book_df1[book_df1['time_id'] == 5]
b2 = b1[['seconds_in_bucket','price']].dropna()  # rows that have a traded price
b3 = b1[['seconds_in_bucket', 'bid_price1','ask_price1']]  # level-1 quotes
b4 = b1[['seconds_in_bucket', 'bid_price2','ask_price2']]  # level-2 quotes

In [42]:
# Size the figure when it is created. Calling plt.figure(figsize=...) after
# plotting opens a second, empty figure and leaves this one at the default size.
fig, ax = plt.subplots(figsize=(20, 16))

line1 = ax.plot(b2['seconds_in_bucket'], b2['price'], label='Traded Price',  color='black')
line2 = ax.plot(b3['seconds_in_bucket'],b3['bid_price1'], label='bid1', dashes=[6,2])
line3 = ax.plot(b3['seconds_in_bucket'],b3['ask_price1'], label='ask1', dashes=[6,2])
line4 = ax.plot(b4['seconds_in_bucket'],b4['bid_price2'], label='bid2', dashes=[6,2])
line5 = ax.plot(b4['seconds_in_bucket'],b4['ask_price2'], label='ask2', dashes=[6,2], color='purple')

ax.set_title('Stock:0, Time_id: 5, Bids vs Traded Price vs Asks')
ax.legend(prop={'size': 8})
# Lay out the figure that actually holds the lines.
fig.tight_layout()

In [45]:
# Rows of stock 0 in time_id 5, used for the hand calculation in the next cell.
df = book_df1[(book_df1['stock_id']==0) & (book_df1['time_id']==5)]

In [46]:
# Realized Volatility calculation, Root Sum of Squares
# Realized Volatility calculation, Root Sum of Squares:
# the same steps realized_volatility performs, written out one at a time on book_wap1.
log_df = np.log(df['book_wap1']).diff()  # log returns between consecutive rows
log_df = np.power(log_df,2)  # squared returns
log_df = np.sum(log_df)  # sum of squares
np.sqrt(log_df)

In [47]:
# Realized volatility of every book_* column present in book_df1, per (time_id, stock_id).
book_feats = book_df1.columns[book_df1.columns.str.startswith('book_')].values.tolist()
fe_df = book_df1.groupby(['time_id', 'stock_id'])[book_feats].agg([realized_volatility]).reset_index()
# Flatten the two-level column labels, e.g. ('book_wap1', 'realized_volatility') -> 'book_wap1 realized_volatility'.
fe_df.columns = [" ".join(col).strip() for col in fe_df.columns.values]
fe_df

In [48]:
fe_df['book_wap1 realized_volatility'].describe()

In [49]:
train[train['stock_id']==0]['target'].describe()

In [50]:
# Overlay two histograms on the same axes: the recomputed WAP1 realized volatility
# (drawn first) and the train target of stock 0 (drawn second).
fe_df['book_wap1 realized_volatility'].hist(bins=100) #blue
train[train['stock_id']==0]['target'].hist(bins=100) #green
plt.title(label='Stock 0: Calculated Realized Volatility and Stock 0: Train Target Histogram')

In [51]:
# plt.boxplot(fe_df['book_wap1 realized_volatility'],vert=False)
# plt.title(label='Stock 0: Realized Volatility Box Plot')
# Use x instead of y argument for horizontal plot
#fig.add_trace(go.Box(x=x1))

# plotly.graph_objects is already imported as `go` in the imports cell; this repeats it.
import plotly.graph_objects as go

# Horizontal box plot (values passed as x) of the recomputed WAP1 realized volatility,
# with every point drawn next to the box.
fig = go.Figure()
fig.add_trace(go.Box(x=fe_df['book_wap1 realized_volatility'],boxpoints='all',jitter=0.44))
fig.show()

In [53]:
# This is where the previous functions come together
def fe_all(book_df):
    """
    Run the two feature-engineering stages back to back.

    ``fe_row`` first adds the row-wise ``book_*`` columns to ``book_df``;
    ``fe_agg`` then aggregates the result by stock_id x time_id. The aggregated
    frame is returned.
    """
    return fe_agg(fe_row(book_df))

In [54]:
def book_fe_by_stock(stock_id=0):
    """
    Build the aggregated training features of one stock.

    Loads the stock's train order-book and trade partitions, outer-joins them on
    (time_id, seconds_in_bucket, stock_id), orders the rows by time, carries the
    last known book quote forward onto rows that only have a trade, and passes
    the result to ``fe_all``.

    The forward fill is done per ``time_id``, so a quote from one time bucket is
    never copied into the first rows of the next one.

    Parameters
    ----------
    stock_id : int
        Stock whose partitions are loaded.

    Returns
    -------
    pandas.DataFrame
        Output of ``fe_all`` for this stock.
    """
    keys = ['time_id', 'seconds_in_bucket', 'stock_id']

    # load data
    book_df = load_book(stock_id, 'train')
    trade_df = load_trade(stock_id, 'train')
    # columns to fill: everything the book contributes except the join keys,
    # which are present on every merged row
    book_cols = [c for c in book_df.columns.values.tolist() if c not in keys]

    # merge, then sort by time
    book_df = (
        book_df
        .merge(trade_df, how='outer', on=keys)
        .sort_values(by=['time_id', 'seconds_in_bucket'])
    )

    # forward-fill book columns within each time_id
    book_df[book_cols] = book_df.groupby('time_id')[book_cols].ffill()

    # feature engineering
    return fe_all(book_df)

In [55]:
def book_fe_by_stock_test(stock_id=0):
    """
    Build the aggregated test features of one stock.

    Same pipeline as the training variant but reading the 'test' partitions:
    outer-join book and trade rows on (time_id, seconds_in_bucket, stock_id),
    order by time, carry the last known book quote forward onto trade-only rows,
    then call ``fe_all``.

    The forward fill is done per ``time_id``, so a quote from one time bucket is
    never copied into the first rows of the next one.

    Parameters
    ----------
    stock_id : int
        Stock whose partitions are loaded.

    Returns
    -------
    pandas.DataFrame
        Output of ``fe_all`` for this stock.
    """
    keys = ['time_id', 'seconds_in_bucket', 'stock_id']

    # load data
    book_df = load_book(stock_id, 'test')
    trade_df = load_trade(stock_id, 'test')
    # columns to fill: everything the book contributes except the join keys,
    # which are present on every merged row
    book_cols = [c for c in book_df.columns.values.tolist() if c not in keys]

    # merge, then sort by time
    book_df = (
        book_df
        .merge(trade_df, how='outer', on=keys)
        .sort_values(by=['time_id', 'seconds_in_bucket'])
    )

    # forward-fill book columns within each time_id
    book_df[book_cols] = book_df.groupby('time_id')[book_cols].ffill()

    # feature engineering
    return fe_all(book_df)

In [56]:
"""
So, the primary reasons to use imap/imap_unordered over map/map_async are:
Your iterable is large enough that converting it to a list would cause you to run out of/use too much memory.
You want to be able to start processing the results before all of them are completed.
"""

In [57]:
def book_fe_all(stock_ids, data_type='train'):
    """
    Build features for many stocks in parallel worker processes.

    Per-stock features are computed through a ``multiprocessing.Pool`` and
    concatenated. The realized-volatility WAP features are then summarised
    (mean / std / max / min) per ``time_id`` and merged back onto every row.
    For ``data_type='train'`` the same summary per ``stock_id`` is also built
    and returned separately, so it can be saved and reused at test time.
    Column labels are passed through ``fix_jsonerr``.

    Parameters
    ----------
    stock_ids : list of int
        Stocks to process.
    data_type : {'train', 'test'}
        Which partitions to read.

    Returns
    -------
    (fe_df, stock_df) when ``data_type == 'train'``; ``fe_df`` when ``data_type == 'test'``.

    Raises
    ------
    ValueError
        If ``data_type`` is neither 'train' nor 'test'.
    """
    per_stock_fn = {'train': book_fe_by_stock, 'test': book_fe_by_stock_test}
    if data_type not in per_stock_fn:
        raise ValueError(f"data_type must be 'train' or 'test', got {data_type!r}")

    # feature engineering agg by stock_id x time_id, one task per stock
    with Pool(cpu_count()) as p:
        feature_dfs = list(tqdm(p.imap(per_stock_fn[data_type], stock_ids), total=len(stock_ids)))

    fe_df = pd.concat(feature_dfs)

    stats = ['mean', 'std', 'max', 'min']
    volatility_feats = [f for f in fe_df.columns if ('realized' in f) and ('wap' in f)]

    # feature engineering agg by stock_id (train only)
    if data_type == 'train':
        stock_df = fe_df.groupby('stock_id')[volatility_feats].agg(stats).reset_index()

        # fix column names
        stock_df.columns = ['stock_id'] + [f'{f}_stock' for f in stock_df.columns.values.tolist()[1:]]
        stock_df = fix_jsonerr(stock_df)

    # feature engineering agg by time_id
    time_df = fe_df.groupby('time_id')[volatility_feats].agg(stats).reset_index()
    time_df.columns = ['time_id'] + [f'{f}_time' for f in time_df.columns.values.tolist()[1:]]

    # merge
    fe_df = fe_df.merge(time_df, how='left', on='time_id')

    # make sure to fix json error for lightgbm
    fe_df = fix_jsonerr(fe_df)

    # out
    if data_type == 'train':
        return fe_df, stock_df
    return fe_df

In [58]:
%%time
MODE = 'TRAIN'
if MODE == 'TRAIN':
    # all book data feature engineering
    stock_ids = [int(i.split('=')[-1]) for i in train_book_stocks]
    book_df, stock_df = book_fe_all(stock_ids, data_type='train')
    
    assert book_df['stock_id'].nunique() > 2
    assert book_df['time_id'].nunique() > 2
    
    # save stock_df for the test
    stock_df.to_pickle('train_stock_df.pkl')
    logger.info('train stock df saved!')
    
    # merge book_df,stock_df, and train
    book_df = book_df.merge(stock_df, how='left', on='stock_id').merge(train, how='left', on=['stock_id', 'time_id']).replace([np.inf, -np.inf], np.nan).fillna(method='ffill')
    
    # make row_id
    book_df['row_id'] = book_df['stock_id'].astype(str) + '-' + book_df['time_id'].astype(str)
    
    print(book_df.shape)
    book_df.head()

In [59]:
book_df

In [60]:
stock_df.columns

In [61]:
stock_df

In [62]:
# test
test_book_stocks = os.listdir(os.path.join(CFG.INPUT_DIR, 'book_test.parquet'))

logger.info('{:,} test book stocks: {}'.format(len(test_book_stocks), test_book_stocks))

# all book data feature engineering
test_stocks_ids = [int(i.split('=')[-1]) for i in test_book_stocks]
test_book_df = book_fe_all(test_stocks_ids, data_type='test')

# load stock_df, if inference; in TRAIN mode the stock_df built by the training cell is reused
#MODE = 'INFERENCE'
if MODE == 'INFERENCE':
    stock_df = pd.read_pickle('./train_stock_df.pkl')
    #stock_df = pd.read_pickle(f'{MODEL_DIR}/train_stock_df.pkl')

# merge: start from the rows of test.csv, add the per-stock summary, then the book features.
# .ffill() replaces the deprecated fillna(method='ffill') and gives the same result.
test_book_df = (
    test
    .merge(stock_df, how='left', on='stock_id')
    .merge(test_book_df, how='left', on=['stock_id', 'time_id'])
    .replace([np.inf, -np.inf], np.nan)
    .ffill()
)

# make row_id
test_book_df['row_id'] = test_book_df['stock_id'].astype(str) + '-' + test_book_df['time_id'].astype(str)

print(test_book_df.shape)
test_book_df.head()

In [63]:
#prepares model input parameters for testing/inference
# LIGHTGBM will ignore missing values during a split, then allocate them to whichever side reduces the loss the most
target = 'target'
drops = [target, 'row_id', 'time_id']
# Keep a test column only if it is not dropped, also exists in the training
# frame, and has no missing value in either frame. `and` short-circuits, so
# book_df[f] is never looked up for a column that was already rejected.
features = [
    f for f in test_book_df.columns.values.tolist()
    if f not in drops
    and f in book_df.columns
    and test_book_df[f].isna().sum() == 0
    and book_df[f].isna().sum() == 0
]
cats = ['stock_id']

logger.info('{:,} features ({:,} categorical): {}'.format(len(features), len(cats), features))

In [64]:
#training
target = 'target'
drops = [target, 'row_id', 'time_id']
cats = ['stock_id']

# Every training column that is not dropped and has no missing value.
train_features = []
for col in book_df.columns.values.tolist():
    if col in drops:
        continue
    if book_df[col].isna().sum() == 0:
        train_features.append(col)

logger.info('{:,} features ({:,} categorical): {}'.format(len(train_features), len(cats), train_features))

In [65]:
features

In [66]:
train_features

In [None]:
# evaluation metric
def RMSPEMetric():
    """
    Return the custom evaluation function handed to ``lgb.train`` as ``feval``.

    The returned function takes the predictions and the dataset, reads the
    labels with ``get_label()``, and returns the tuple
    ``('RMSPE', value, False)`` where value is the root mean squared percentage
    error and False marks the metric as lower-is-better.
    """
    def RMSPE(yhat, dtrain):
        labels = dtrain.get_label()
        squared_pct_error = np.square((labels - yhat) / labels)
        score = np.sqrt(np.sum(squared_pct_error) / len(labels))
        return 'RMSPE', float(score), False

    return RMSPE

def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``."""
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))

In [None]:
def fit_model(params, X_train, y_train, X_test, features=features, cats=[], era='stock_id', fold_type='kfold', n_fold = 5, seed=42):
    """
    Train one LightGBM model per cross-validation split and collect predictions.

    Each split trains on inverse-squared-target sample weights, is evaluated
    with the RMSPE custom metric, saves its model to ``model_fold{i}.pkl`` in
    the working directory, writes its validation predictions into the
    out-of-fold frame, and adds ``1 / n_fold`` of its test prediction to the
    averaged test prediction.

    Rows are selected by position (``iloc``) everywhere, matching the positional
    indices that scikit-learn splitters yield, so the result does not depend on
    ``X_train`` having a default 0..n-1 index.

    Parameters
    ----------
    params : dict
        Passed to ``lgb.train`` unchanged.
    X_train : pandas.DataFrame
        Training frame; must contain ``time_id``, ``stock_id``, the column named
        by the global ``target`` and every name in ``features``.
    y_train : pandas.Series
        Target values, row-aligned with ``X_train``.
    X_test : pandas.DataFrame
        Frame to predict; must contain every name in ``features``.
    features : list of str
        Model input columns.
    cats : list of str
        Columns declared categorical to LightGBM.
    era : str
        Column of ``X_train`` used as the stratification label when
        ``fold_type`` is 'stratifiedshuffle'; unused for 'kfold'.
    fold_type : {'kfold', 'stratifiedshuffle'}
        Splitter to use.
    n_fold : int
        Number of splits.
    seed : int
        Random state of the splitter.

    Returns
    -------
    oof_df : pandas.DataFrame
        ``time_id``, ``stock_id``, target and ``pred``; rows that never fall in
        a validation split keep ``pred`` = NaN.
    y_preds : numpy.ndarray
        Test predictions averaged over the splits.
    models : list
        The trained boosters, in split order.
    fi_df : pandas.DataFrame
        Gain importance per feature, per split and summed.

    Raises
    ------
    ValueError
        If ``fold_type`` is not one of the supported values.
    """
    models = []
    oof_df = X_train[['time_id', 'stock_id', target]].copy()
    oof_df['pred'] = np.nan
    pred_col = oof_df.columns.get_loc('pred')
    y_preds = np.zeros((len(X_test),))

    if fold_type == 'stratifiedshuffle':
        cv = model_selection.StratifiedShuffleSplit(n_splits=n_fold, random_state=seed)
        kf = cv.split(X_train, X_train[era])
    elif fold_type == 'kfold':
        cv = model_selection.KFold(n_splits=n_fold, shuffle=True, random_state=seed)
        kf = cv.split(X_train, y_train)
    else:
        raise ValueError(f"fold_type must be 'kfold' or 'stratifiedshuffle', got {fold_type!r}")

    fi_df = pd.DataFrame()
    fi_df['features'] = features
    fi_df['importance'] = 0

    for fold_id, (train_index, valid_index) in tqdm(enumerate(kf), total=n_fold):
        # split (positional indices -> iloc)
        X_tr = X_train.iloc[train_index][features]
        X_val = X_train.iloc[valid_index][features]
        y_tr = y_train.iloc[train_index]
        y_val = y_train.iloc[valid_index]

        # model, note inverse weighting: weight 1 / y^2 turns squared error into squared relative error
        train_set = lgb.Dataset(X_tr, y_tr, categorical_feature = cats, weight = 1/np.power(y_tr, 2))
        val_set = lgb.Dataset(X_val, y_val, categorical_feature = cats, weight = 1/np.power(y_val, 2))
        # model training
        # NOTE(review): verbose_eval is not accepted by every LightGBM release — confirm against the installed version.
        model = lgb.train(params, train_set, valid_sets = [train_set, val_set], feval=RMSPEMetric(), verbose_eval=250)

        # feature importance
        fi_df[f'importance_fold{fold_id}'] = model.feature_importance(importance_type='gain')
        fi_df['importance'] += fi_df[f'importance_fold{fold_id}'].values

        # save model
        joblib.dump(model, f'model_fold{fold_id}.pkl')
        logger.debug('model saved!')

        # predict; a single positional assignment avoids the chained
        # oof_df['pred'].iloc[...] = ... form, which may write to a temporary copy
        oof_df.iloc[valid_index, pred_col] = model.predict(X_val)
        y_pred = model.predict(X_test[features])
        y_preds += y_pred / n_fold
        models.append(model)

    return oof_df, y_preds, models, fi_df

In [None]:
%%time

#Overfitting on the training data

# Parameters handed unchanged to lgb.train inside fit_model.
# NOTE(review): boosting_type is 'dart' while early_stopping_rounds is set;
# LightGBM is understood not to early-stop in dart mode — confirm the setting has any effect.
params = {
    "device": "gpu",
    "gpu_platform_id": 0,
    "gpu_device_id": 0,
    'n_estimators' : 500,
    'objective': 'rmse',
    'boosting_type': 'dart',
    'max_depth': -1,
    'learning_rate': 0.3,
    'num_leaves': 128,
    'min_data_in_leaf': 1000,
    'max_bin': 70,
    'subsample': 0.82,
    'subsample_freq': 7,
    'feature_fraction': 0.6,
    'lambda_l1': 0.5,
    'lambda_l2': 1,
    #'min_gain_to_split': 0.009275035155177913,
    'seed': 42,
    'early_stopping_rounds': 50,
    'verbose': -1
}

# First run: shuffle splits stratified on the 'stock_id' column (passed as era).
if MODE == 'TRAIN':
    oof_df, y_preds, models, fi_df = fit_model(params,
                                              book_df,
                                              book_df[target],
                                              test_book_df,
                                              features=train_features,
                                              cats = cats,
                                              era = 'stock_id',
                                              fold_type = 'stratifiedshuffle',
                                              n_fold = 5,
                                              seed = 42)

In [None]:
fi_df.sort_values(by=['importance'],ascending=False)

In [None]:
from sklearn.metrics import r2_score
# Same definition as the rmspe in the evaluation-metric cell; repeated here.
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))

if MODE == 'TRAIN':
    # Drop rows without an out-of-fold prediction (pred is initialised to NaN in fit_model).
    oof_df.dropna(inplace=True)
    y_true = oof_df[target].values
    y_pred = oof_df['pred'].values
    
    # Target and prediction histograms, drawn on the same axes.
    oof_df[target].hist(bins=100)
    oof_df['pred'].hist(bins=100)
    
    # Overall scores, rounded to three decimals.
    R2 = round(r2_score(y_true, y_pred), 3)
    RMSPE = round(rmspe(y_true, y_pred), 3)
    logger.info(f'Performance of the prediction: R2 score: {R2}, RMSPE: {RMSPE}')

In [None]:
if MODE == 'TRAIN':
    # Score each stock separately on its own out-of-fold rows.
    for stock_id in oof_df['stock_id'].unique():
        stock_rows = oof_df[oof_df['stock_id'] == stock_id]
        y_true = stock_rows[target].values
        y_pred = stock_rows['pred'].values

        R2 = round(r2_score(y_true, y_pred), 3)
        RMSPE = round(rmspe(y_true, y_pred), 3)
        logger.info(f'Performance by stock_id={stock_id}: R2 score: {R2}, RMSPE: {RMSPE}')

In [None]:
# Forces TRAIN mode again for the cells below, whatever MODE held before this point.
MODE= 'TRAIN'

In [None]:
%%time

#Overfitting on the training data

# Second parameter set: max_depth capped at 8 and none of the sampling or
# regularisation settings of the first run.
# NOTE(review): 'dart' together with early_stopping_rounds — confirm early stopping has any effect.
params = {
    "device": "gpu",
    "gpu_platform_id": 0,
    "gpu_device_id": 0,
    'n_estimators' : 500,
    'objective': 'rmse',
    'boosting_type': 'dart',
    'max_depth': 8,
    'learning_rate': 0.3,
    'num_leaves': 128,
    'max_bin': 68,
    'seed': 42,
    'early_stopping_rounds': 50,
    'verbose': -1
}

# Plain shuffled 5-fold split; era is unused for fold_type='kfold'.
# This overwrites oof_df, y_preds, models and fi_df from the previous run.
if MODE == 'TRAIN':
    oof_df, y_preds, models, fi_df = fit_model(params,
                                              book_df,
                                              book_df[target],
                                              test_book_df,
                                              features=train_features,
                                              cats = cats,
                                              era = None,
                                              fold_type = 'kfold',
                                              n_fold = 5,
                                              seed = 42)

In [None]:
from sklearn.metrics import r2_score
# Same definition as the rmspe in the evaluation-metric cell; repeated here.
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))

if MODE == 'TRAIN':
    # Drop rows without an out-of-fold prediction (pred is initialised to NaN in fit_model).
    oof_df.dropna(inplace=True)
    y_true = oof_df[target].values
    y_pred = oof_df['pred'].values
    
    # Target and prediction histograms, drawn on the same axes.
    oof_df[target].hist(bins=100) #blue
    oof_df['pred'].hist(bins=100) #green
    
    # Overall scores, rounded to three decimals.
    R2 = round(r2_score(y_true, y_pred), 3)
    RMSPE = round(rmspe(y_true, y_pred), 3)
    logger.info(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

In [None]:
if MODE == 'TRAIN':
    # Score each stock separately on its own out-of-fold rows.
    for stock_id in oof_df['stock_id'].unique():
        stock_rows = oof_df[oof_df['stock_id'] == stock_id]
        y_true = stock_rows[target].values
        y_pred = stock_rows['pred'].values

        R2 = round(r2_score(y_true, y_pred), 3)
        RMSPE = round(rmspe(y_true, y_pred), 3)
        logger.info(f'Performance by stock_id={stock_id}: R2 score: {R2}, RMSPE: {RMSPE}')

In [None]:
%%time

#Overfitting on the training data

# Third parameter set: unlimited depth again, with min_data_in_leaf raised to 1000.
# NOTE(review): 'dart' together with early_stopping_rounds — confirm early stopping has any effect.
params = {
    "device": "gpu",
    "gpu_platform_id": 0,
    "gpu_device_id": 0,
    'n_estimators' : 500,
    'objective': 'rmse',
    'min_data_in_leaf': 1000,
    'boosting_type': 'dart',
    'max_depth': -1,
    'learning_rate': 0.3,
    'num_leaves': 128,
    'max_bin': 68,
    'seed': 42,
    'early_stopping_rounds': 50,
    'verbose': -1
}

# Plain shuffled 5-fold split; era is unused for fold_type='kfold'.
# This overwrites oof_df, y_preds, models and fi_df from the previous run.
if MODE == 'TRAIN':
    oof_df, y_preds, models, fi_df = fit_model(params,
                                              book_df,
                                              book_df[target],
                                              test_book_df,
                                              features=train_features,
                                              cats = cats,
                                              era = None,
                                              fold_type = 'kfold',
                                              n_fold = 5,
                                              seed = 42)

In [None]:
# Uses r2_score and rmspe as imported/defined in earlier cells.
if MODE == 'TRAIN':
    # Drop rows without an out-of-fold prediction (pred is initialised to NaN in fit_model).
    oof_df.dropna(inplace=True)
    y_true = oof_df[target].values
    y_pred = oof_df['pred'].values
    
    # Target and prediction histograms, drawn on the same axes.
    oof_df[target].hist(bins=100) #blue
    oof_df['pred'].hist(bins=100) #green
    
    # Overall scores, rounded to three decimals.
    R2 = round(r2_score(y_true, y_pred), 3)
    RMSPE = round(rmspe(y_true, y_pred), 3)
    logger.info(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

In [None]:
if MODE == 'TRAIN':
    # Score each stock separately on its own out-of-fold rows.
    for stock_id in oof_df['stock_id'].unique():
        stock_rows = oof_df[oof_df['stock_id'] == stock_id]
        y_true = stock_rows[target].values
        y_pred = stock_rows['pred'].values

        R2 = round(r2_score(y_true, y_pred), 3)
        RMSPE = round(rmspe(y_true, y_pred), 3)
        logger.info(f'Performance by stock_id={stock_id}: R2 score: {R2}, RMSPE: {RMSPE}')

In [None]:
import optuna


def objective(trial, features=features, X=book_df, y=book_df[target], cats=cats):
    """
    Optuna objective: mean validation RMSPE of a LightGBM model over a 2-fold split.

    Hyper-parameters are sampled from ``trial``. A model is trained on each
    fold with inverse-squared-target weights and scored on the held-out fold;
    the value returned to Optuna is the average over both folds, so a trial is
    judged on all of the data rather than on the first fold only.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Source of the sampled hyper-parameters.
    features : list of str
        Model input columns of ``X``.
    X : pandas.DataFrame
        Training frame.
    y : pandas.Series
        Target values, row-aligned with ``X``.
    cats : list of str
        Columns declared categorical to LightGBM.

    Returns
    -------
    float
        Mean RMSPE across the folds (lower is better).
    """
    param_grid = {
        #"n_estimators": trial.suggest_categorical("n_estimators", [813]),
        "n_estimators": 500,
        "device": "gpu",
        "gpu_platform_id": 0,
        "gpu_device_id": 0,
        "learning_rate": 0.3,
        "boosting": trial.suggest_categorical("boosting",['dart','gbdt','goss']),
        #"learning_rate": trial.suggest_float("learning_rate", 0.1, 0.2, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 128, 1024),
        "max_depth": trial.suggest_int("max_depth", 7, 10),
        # candidates 500 / 750 / 1000; the step must be given by keyword
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 500, 1000, step=250),
        "max_bin": trial.suggest_int("max_bin", 64, 70),
        'early_stopping_rounds': 10,
        #"lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        #"lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        #"min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
    }

    n_splits = 2
    cv = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=46)
    cv_scores = np.empty(n_splits)

    # each fold is used once for validation while the model trains on the other
    for fold_id, (train_index, valid_index) in tqdm(enumerate(cv.split(X, y)), total=n_splits):
        # split (the splitter yields positions -> iloc)
        X_tr = X.iloc[train_index][features]
        X_val = X.iloc[valid_index][features]
        y_tr = y.iloc[train_index]
        y_val = y.iloc[valid_index]

        # model, note inverse weighting
        train_set = lgb.Dataset(X_tr, y_tr, categorical_feature = cats, weight = 1/np.power(y_tr, 2))
        val_set = lgb.Dataset(X_val, y_val, categorical_feature = cats, weight = 1/np.power(y_val, 2))
        # NOTE(review): verbose_eval is not accepted by every LightGBM release — confirm against the installed version.
        model = lgb.train(param_grid, train_set, valid_sets = [train_set, val_set], feval=RMSPEMetric(), verbose_eval=250)

        # score this fold
        y_pred = model.predict(X_val)
        cv_scores[fold_id] = np.sqrt(np.mean(np.square((y_val - y_pred) / y_val)))

    return float(np.mean(cv_scores))

In [None]:
#optimization_function = partial(objective, X=x, y=y)
# Bound the search: with neither n_trials nor timeout, optimize() keeps
# starting new trials until the kernel is interrupted.
N_TRIALS = 20
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=N_TRIALS)

In [None]:
# Summary of the search: how many trials ran and the parameters of the best one.
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Number of finished trials: 20
# Best trial: {'boosting': 'dart', 'num_leaves': 288, 'max_depth': 9, 'min_data_in_leaf': 500, 'max_bin': 68}
#Trial 304 finished with value: 0.2411297275201126 and parameters: {'boosting': 'dart', 'num_leaves': 661, 'max_depth': 8, 'min_data_in_leaf': 500, 'max_bin': 68}. Best is trial 304 with value: 0.2411297275201126.

In [None]:
%%time

# Parameter set for the INFERENCE branch: gbdt boosting, unlimited depth.
params = {
    "device": "gpu",
    "gpu_platform_id": 0,
    "gpu_device_id": 0,
    'n_estimators' : 500,
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'learning_rate': 0.3,
    'num_leaves': 128,
    'max_bin': 70,
    'seed': 42,
    'early_stopping_rounds': 50,
    'verbose': -1
}

# Runs only when MODE is 'INFERENCE'; earlier cells set MODE to 'TRAIN', so as
# written this branch is skipped.
# Unlike the TRAIN runs it passes `features` (columns complete in both train and
# test) rather than `train_features`, and it calls fit_model, i.e. it trains
# rather than loading saved models.
if MODE == 'INFERENCE':
    oof_df, y_preds, models, fi_df = fit_model(params,
                                              book_df,
                                              book_df[target],
                                              test_book_df,
                                              features=features,
                                              cats = cats,
                                              era = None,
                                              fold_type = 'kfold',
                                              n_fold = 5,
                                              seed = 42)