In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found,
# so the available input datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

from transformers import RobertaConfig, AdamW
from transformers import RobertaTokenizerFast
from torch.utils.data import Dataset, DataLoader
from torch import nn
import pickle
import torch
from torch import cuda

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Bin width passed to book_preprocessor: the clipped wap1 value is converted to
# the integer bin index int((wap1 - wap1_min) / delta).
delta = 0.0002
# Lower / upper bounds that book_preprocessor clips wap1 to before binning.
# NOTE(review): presumably the wap1 range observed on the training data — confirm.
wap1_min = 0.8830628395080566
wap1_max = 1.1270768642425537

In [None]:
def read_test():
    """Load the competition's test.csv and add a ``row_id`` column.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus ``row_id``, built as ``"<stock_id>-<time_id>"``.
    """
    frame = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
    stock_part = frame['stock_id'].astype(str)
    time_part = frame['time_id'].astype(str)
    frame['row_id'] = stock_part + '-' + time_part
    return frame

class Triage(Dataset):
    """Map-style dataset that tokenizes the ``history`` string of one row.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``history`` column, plus ``row_id`` when ``isSubmit``
        is true or ``target`` otherwise.
    tokenizer
        Object exposing ``encode_plus`` and returning ``input_ids`` and
        ``attention_mask``.
    max_len : int
        Every sample is padded / truncated to exactly this many tokens.
    isSubmit : bool, default False
        When true, samples carry the row's ``row_id`` instead of a
        ``targets`` tensor.
    """

    def __init__(self, dataframe, tokenizer, max_len, isSubmit = False):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.isSubmit = isSubmit

    def __getitem__(self, index):
        """Return the tokenized sample stored at position ``index``.

        Returns a dict with ``ids`` and ``mask`` (long tensors of length
        ``max_len``) and either ``row_id`` or ``targets`` (float tensor).
        """
        # Samplers hand out positions 0..len-1, so rows are read positionally
        # (.iloc). Label-based lookup breaks as soon as the frame's index is
        # not the default 0..n-1 range (e.g. after filtering or splitting).
        history = str(self.data['history'].iloc[index])
        inputs = self.tokenizer.encode_plus(
            history,
            None,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            truncation=True
        )
        sample = {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }
        if self.isSubmit:
            sample['row_id'] = self.data['row_id'].iloc[index]
        else:
            sample['targets'] = torch.tensor(self.data['target'].iloc[index], dtype=torch.float)
        return sample

    def __len__(self):
        """Number of rows in the wrapped dataframe at construction time."""
        return self.len
    
class LitModel(nn.Module):
    """RoBERTa encoder followed by attention pooling and a linear regressor.

    The constructor reads the module-level ``model_name`` (a path or model id)
    for both the configuration and the pretrained weights, so that name must
    be defined before an instance is created.
    """

    def __init__(self):
        super().__init__()
        # The notebook's import cell does not bring RobertaForMaskedLM into
        # scope; import it here so constructing the model does not raise
        # NameError.
        from transformers import RobertaForMaskedLM

        config = RobertaConfig.from_pretrained(model_name)
        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})

        self.roberta = RobertaForMaskedLM.from_pretrained(model_name, config=config)

        # Size the heads from the loaded config instead of hard-coding 768, so
        # they always match the width of the encoder's hidden states.
        hidden_size = config.hidden_size

        # Scores every sequence position, then normalizes the scores across
        # the sequence dimension (dim=1) so they sum to 1 per sample.
        self.attention = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, 512),
            torch.nn.Tanh(),
            torch.nn.Linear(512, 1),
            torch.nn.Softmax(dim=1)
        )

        self.regressor = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, 1)
        )

    def forward(self, input_ids, attention_mask):
        """Return one regression value per sample, shape (batch, 1)."""
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)

        # hidden_states holds the embedding output followed by one entry per
        # encoder layer; the last entry is the final layer's output,
        # shape (batch, seq_len, hidden).
        last_layer_hidden_states = roberta_output.hidden_states[-1]

        # One weight per position, shape (batch, seq_len, 1).
        # NOTE(review): the weights are computed from the hidden states only;
        # attention_mask is not applied here, so padded positions also receive
        # weight. Left as-is because changing it would alter the outputs of
        # already-trained weights.
        weights = self.attention(last_layer_hidden_states)

        # Weighted sum over the sequence dimension -> (batch, hidden).
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)

        # Project the pooled vector down to the prediction.
        return self.regressor(context_vector)
    
def create_optimizer(model):
    """Build an AdamW optimizer with per-tensor settings for the backbone.

    Parameter groups, in order:

    1. all parameters whose name starts with ``attention.`` (optimizer defaults)
    2. all parameters whose name starts with ``regressor.`` (optimizer defaults)
    3. one group per backbone tensor, for the first 197 non-head tensors:
       weight decay 0.0 if the name contains "bias", else 0.01; learning rate
       2e-5 for positions 0-68, 5e-5 for 69-132 and 1e-4 from 133 on.

    The two heads are selected by name rather than by fixed positions in
    ``named_parameters()``, so the grouping no longer depends on how many
    extra tensors (e.g. a pooler or LM head) the backbone registers after its
    first 197 parameters. Any such extra backbone tensors are left out of the
    optimizer.

    Parameters
    ----------
    model : torch.nn.Module
        Expected to expose ``attention`` and ``regressor`` submodules.

    Returns
    -------
    torch.optim.AdamW
    """
    # NOTE(review): 197 presumably equals the embedding + encoder tensor count
    # of roberta-base, and 69 / 133 the first tensors of encoder layers 4 and
    # 8 — confirm before using a different backbone.
    backbone_size = 197
    head_prefixes = ("attention.", "regressor.")

    named_parameters = list(model.named_parameters())

    attention_group = [params for (name, params) in named_parameters
                       if name.startswith("attention.")]
    regressor_group = [params for (name, params) in named_parameters
                       if name.startswith("regressor.")]
    roberta_parameters = [(name, params) for (name, params) in named_parameters
                          if not name.startswith(head_prefixes)][:backbone_size]

    parameters = [{"params": attention_group},
                  {"params": regressor_group}]

    for layer_num, (name, params) in enumerate(roberta_parameters):
        weight_decay = 0.0 if "bias" in name else 0.01

        # Later tensors (closer to the output) get larger learning rates.
        if layer_num >= 133:
            lr = 1e-4
        elif layer_num >= 69:
            lr = 5e-5
        else:
            lr = 2e-5

        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})

    return torch.optim.AdamW(parameters)

def calc_wap1(df):
    """Return the level-1 weighted average price for every row of ``df``.

    Each side's price is weighted by the opposite side's size:
    ``(bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)``.
    """
    bid_px = df['bid_price1']
    ask_sz = df['ask_size1']
    ask_px = df['ask_price1']
    bid_sz = df['bid_size1']

    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    total_size = bid_sz + ask_sz
    return cross_weighted / total_size

def log_return(series):
    """Return the first difference of the natural log of ``series``.

    The first element of the result has no predecessor and is therefore NaN.
    """
    log_values = np.log(series)
    return log_values.diff()

def book_preprocessor(delta, stock_id):
    """Turn one stock's test order book into one bin-index string per ``row_id``.

    Reads the stock's ``book_test.parquet`` partition under the module-level
    ``data_dir``, computes wap1, clips it to ``[wap1_min, wap1_max]``
    (module-level constants), converts it to the integer bin index
    ``int((wap1 - wap1_min) / delta)`` and serializes each ``row_id`` group
    with ``convert_to_str``.

    Parameters
    ----------
    delta : float
        Width of one wap1 bin.
    stock_id
        Stock whose partition is read; also the prefix of ``row_id``.

    Returns
    -------
    The result of ``groupby('row_id').apply(convert_to_str)``: one string per
    ``row_id``.
    """
    wanted_columns = ['time_id', 'seconds_in_bucket', 'bid_price1', 'ask_price1', 'bid_size1', 'ask_size1']
    book_path = data_dir + "book_test.parquet/stock_id=" + str(stock_id)
    book = pd.read_parquet(book_path)[wanted_columns]

    book['wap1'] = calc_wap1(book)

    # Clip wap1 into the supported range so every value maps to a valid bin.
    too_low = book['wap1'] < wap1_min
    book.loc[too_low, 'wap1'] = wap1_min
    too_high = book['wap1'] > wap1_max
    book.loc[too_high, 'wap1'] = wap1_max

    book['row_id'] = str(stock_id) + '-' + book['time_id'].astype(str)

    offset_from_min = book['wap1'] - wap1_min
    book['i'] = (offset_from_min / delta).astype(int)

    return book.groupby(['row_id']).apply(convert_to_str)
    
def convert_to_str(g):
    """Serialize the ``i`` column of group ``g`` into one text line.

    The values are rendered with NumPy's default array formatting, line breaks
    and the surrounding brackets are removed, and ``' . '`` is appended.
    Groups with more than 590 rows drop their first 10 values.

    NOTE(review): NumPy abbreviates arrays longer than its print threshold
    with ``...``; groups are assumed to stay below that size — confirm.
    """
    skip = 10 if len(g) > 590 else 0
    rendered = str(g.i.values[skip:]).replace('\n', '')
    # rendered looks like "[v0 v1 ...]"; strip the leading "[" and trailing "]".
    return rendered[1:-1] + ' . '

In [None]:
# Root of the competition data; book_preprocessor reads this module-level name.
data_dir = '../input/optiver-realized-volatility-prediction/'

test = read_test()
test_stock_ids = test['stock_id'].unique()

# Build one (row_id, history) frame per stock and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated frame on every iteration.
stock_frames = []
for stock_id in test_stock_ids:
    df = pd.DataFrame(book_preprocessor(delta, stock_id), columns = ['history']).reset_index()
    stock_frames.append(df)

# pd.concat rejects an empty list, so fall back to an empty frame when the
# test set contains no stocks.
test_dataset = pd.concat(stock_frames) if stock_frames else pd.DataFrame()
test_dataset = test_dataset.reset_index(drop = True)

In [None]:
# Token length every sample is padded / truncated to.
MAX_LEN = 600

tokenizer = RobertaTokenizerFast.from_pretrained('../input/krv-tokenizer')

# Submission-mode dataset: samples carry row_id instead of a target.
sibmiting_test = Triage(
    dataframe=test_dataset,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    isSubmit=True,
)
# One sample per batch, loaded in the main process.
submit_params = dict(batch_size=1, num_workers=0)
submit_loader = DataLoader(sibmiting_test, **submit_params)

In [None]:
# Use the GPU when one is available, otherwise run on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model_name = '../input/optiver-ml-final'

# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load model files from a trusted source.
# pickle looks up the pickled object's class by name while loading, so that
# class must already be defined in this session.
with open('../input/optiver-ml-final/model_new.pickle', 'rb') as f:
    model = pickle.load(f)
model.to(device)

def submit(model, submit_loader):
    """Run ``model`` over ``submit_loader`` and collect one prediction per row.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(ids, mask)``; its output is reduced with a max over
        dim 1 to one value per sample.
    submit_loader : iterable
        Yields batches (dicts) with ``ids``, ``mask`` and ``row_id`` entries,
        where ``row_id`` holds one identifier per sample in the batch.

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id`` and ``target``, one row per distinct ``row_id``
        (a repeated ``row_id`` keeps its last prediction). Empty, with the
        same two columns, when the loader yields nothing.

    Notes
    -----
    Tensors are moved to the module-level ``device``.
    """
    result = {}
    model.eval()
    with torch.no_grad():
        for data in submit_loader:
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)

            outputs = model(ids, mask)
            batch_values = torch.max(outputs, dim=1).values.cpu().numpy()

            # Record every sample of the batch, not only the first one, so
            # batch sizes above 1 do not silently drop predictions.
            for row_id, value in zip(data['row_id'], batch_values):
                result[row_id] = value

    if not result:
        # from_dict + reset_index on an empty dict yields a single column, so
        # the two-name column assignment below would raise.
        return pd.DataFrame(columns=['row_id', 'target'])

    result = pd.DataFrame.from_dict(result, orient = 'index').reset_index()
    result.columns = ['row_id', 'target']
    return result

# Run inference over every sample in submit_loader and write the resulting
# (row_id, target) table to submission.csv in the current working directory,
# without the dataframe index.
result = submit(model, submit_loader)
result.to_csv('submission.csv', index = False)
