In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Kaggle boilerplate: walk /kaggle/input recursively and print the full path of
# every attached file so the dataset locations show up in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

from transformers import RobertaConfig, AdamW
from transformers import RobertaForMaskedLM, get_cosine_schedule_with_warmup
from transformers import RobertaTokenizerFast
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from joblib import Parallel, delayed
import pickle
from torch import nn
import torch
import math
from torch import cuda
import datetime

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def read_train_test():
    """Load the competition train/test CSVs and tag every row with a `row_id`.

    `row_id` is the string "<stock_id>-<time_id>" built from those two columns.
    As a side effect the number of training rows is printed.

    Returns:
        tuple: (train, test) DataFrames, each carrying the extra `row_id` column.
    """
    csv_paths = (
        '../input/optiver-realized-volatility-prediction/train.csv',
        '../input/optiver-realized-volatility-prediction/test.csv',
    )

    frames = []
    for csv_path in csv_paths:
        frame = pd.read_csv(csv_path)
        # Same key format for both splits so they can be joined on row_id later.
        frame['row_id'] = frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
        frames.append(frame)

    train, test = frames
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

def calc_wap1(df):
    """Return the level-1 weighted average price for each row of `df`.

    Each price is weighted by the size on the opposite side of the book:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).

    Args:
        df: object indexable by 'bid_price1', 'ask_price1', 'bid_size1' and
            'ask_size1' (e.g. a DataFrame).

    Returns:
        The element-wise weighted average price.
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']

    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def log_return(series):
    """Return the first difference of the natural log of `series`.

    Args:
        series: a pandas Series of values.

    Returns:
        Series of log(x[t]) - log(x[t-1]); the first element is NaN.
    """
    logged = np.log(series)
    return logged.diff()

def book_preprocessor(delta, stock_id, is_train = True):
    """Turn one stock's order book into one string of price-bin indices per row_id.

    Reads the stock's parquet partition under the module-level `data_dir`,
    computes the level-1 WAP, converts it to an integer bin index relative to
    the module-level `wap1_min`, and serialises each time bucket with
    `convert_to_str`.

    Args:
        delta: width of one WAP bin.
        stock_id: stock whose partition is read.
        is_train: read from book_train.parquet when True, book_test.parquet otherwise.

    Returns:
        The result of `groupby(['row_id']).apply(convert_to_str)`: one string per row_id.
    """
    partition = "book_train.parquet/stock_id=" if is_train else "book_test.parquet/stock_id="
    file_path_book = data_dir + partition + str(stock_id)

    wanted_columns = ['time_id', 'seconds_in_bucket', 'bid_price1', 'ask_price1', 'bid_size1', 'ask_size1']
    book = pd.read_parquet(file_path_book)[wanted_columns]

    book['wap1'] = calc_wap1(book)
    book['row_id'] = str(stock_id) + '-' + book['time_id'].astype(str)
    # Number of delta-wide steps above wap1_min; astype(int) truncates toward zero.
    book['i'] = ((book['wap1'] - wap1_min) / delta).astype(int)

    return book.groupby(['row_id']).apply(convert_to_str)
    
def convert_to_str(g):
    """Serialise the `i` column of one group into a single-line string.

    The string is numpy's text rendering of the values with the surrounding
    brackets and any line breaks removed, followed by ' . '.  When the group has
    more than 590 rows, the first 10 values are left out.

    NOTE: the spacing between numbers is whatever numpy's array formatting
    produces (values are padded to a common width), and numpy abbreviates arrays
    longer than its print threshold with '...'.

    Args:
        g: DataFrame group exposing an `i` column.

    Returns:
        str: the serialised bin indices terminated by ' . '.
    """
    skip = 10 if len(g) > 590 else 0
    rendered = str(g.i.values[skip:]).replace('\n', '')
    # Strip the leading '[' and trailing ']' of the numpy repr.
    return rendered[1:-1] + ' . '

def regression_calculate_rmse(y_pred, y_true):
    """Root-mean-square of the relative errors between two tensors.

    Despite the name, each error is divided by the true value, i.e. the result
    is sqrt(mean(((y_true - y_pred) / y_true) ** 2)) over the paired elements.

    Args:
        y_pred: tensor of predictions (moved to CPU internally).
        y_true: tensor of targets, paired element-wise with `y_pred`.

    Returns:
        The square root of the summed squared relative errors divided by len(y_pred).
    """
    preds = y_pred.cpu().numpy()
    truths = y_true.cpu().numpy()

    # Accumulate pair by pair, starting from 0, in iteration order.
    squared_rel_errors = (
        np.square((truth - pred) / truth) for pred, truth in zip(preds, truths)
    )
    return np.sqrt(sum(squared_rel_errors) / len(y_pred))

def rmspe(y_pred, y_true):
    """Root mean squared percentage error.

    Args:
        y_pred: array-like of predictions supporting element-wise arithmetic.
        y_true: array-like of targets (used as the denominator).

    Returns:
        sqrt(mean(((y_true - y_pred) / y_true) ** 2)).
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def train(model, optimizer, epoch, lr_scheduler = None, loader = None):
    """Run one training epoch and print a running error metric.

    Reads the module-level `device` and `loss_function`.

    Args:
        model: network called as model(input_ids=..., attention_mask=...).
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the final log line.
        lr_scheduler: optional scheduler stepped once per batch after the optimizer.
        loader: optional iterable of batches (dicts with 'ids', 'mask', 'targets').
            Defaults to the module-level `training_loader`, which is what this
            function always used before the parameter existed.

    Returns:
        None.

    NOTE(review): the printed figure is sqrt(sum of per-batch
    regression_calculate_rmse values / number of samples seen).  It is not an
    exact epoch-level RMSPE; it is left unchanged so numbers stay comparable
    with earlier runs.
    """
    if loader is None:
        loader = training_loader

    nb_tr_examples = 0
    rmse = 0

    model.train()
    for data in loader:
        optimizer.zero_grad()
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(input_ids = ids, attention_mask = mask)

        loss = loss_function(outputs.view(-1), targets.view(-1))
        # Reduce over dim 1 of the detached outputs to get one value per sample.
        big_val, _ = torch.max(outputs.data, dim=1)
        rmse += regression_calculate_rmse(big_val, targets)

        nb_tr_examples += targets.size(0)

        # Progress line whenever the running sample count is a multiple of 10000.
        if nb_tr_examples % 10000 == 0:
            epoch_rmse = math.sqrt(rmse/nb_tr_examples)
            print(f"{datetime.datetime.now().time()} : The number of trained samples ({nb_tr_examples}): {epoch_rmse}")

        loss.backward()
        optimizer.step()
        if lr_scheduler is not None:
            lr_scheduler.step()

    if nb_tr_examples == 0:
        # An empty loader (e.g. drop_last=True with fewer rows than one batch)
        # used to end in ZeroDivisionError; report it instead.
        print(f"Training Epoch({epoch}): no batches were produced by the loader")
        return

    epoch_rmse = math.sqrt(rmse/nb_tr_examples)
    print(f"Training RMSE Epoch({epoch}): {epoch_rmse}")

    return

def valid(models, testing_loader):
    """Evaluate one model, or the average of several, on `testing_loader`.

    Reads the module-level `device`.  All models are switched to eval mode and
    run without gradient tracking; when several models are given, their outputs
    are averaged per batch.

    Args:
        models: non-empty list of networks called as
            model(input_ids=..., attention_mask=...).
        testing_loader: iterable of batches (dicts with 'ids', 'mask', 'targets').

    Returns:
        float: sqrt(sum of per-batch regression_calculate_rmse values /
        number of samples seen).

    Raises:
        ValueError: if `models` is empty.

    NOTE(review): the returned figure is not an exact RMSPE over the set; the
    aggregation is left unchanged because thresholds elsewhere in the notebook
    are compared against it.
    """
    if not models:
        raise ValueError("valid() needs at least one model")

    rmse = 0
    nb_tr_examples = 0
    if len(models) > 1:
        print(f'valid: the number of good models is {len(models)}')

    for model in models:
        model.eval()

    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            # Start from the FIRST model explicitly.  The previous code reused the
            # leaked loop variable `model`, so with several models the first one
            # was never evaluated and the last one was counted twice.
            outputs = models[0](input_ids = ids, attention_mask = mask)
            for extra_model in models[1:]:
                outputs += extra_model(input_ids = ids, attention_mask = mask)

            outputs /= len(models)

            big_val, _ = torch.max(outputs.data, dim=1)
            rmse += regression_calculate_rmse(big_val, targets)

            nb_tr_examples += targets.size(0)

            if nb_tr_examples % 10000 == 0:
                epoch_rmse = math.sqrt(rmse/nb_tr_examples)
                print(f"{datetime.datetime.now().time()} : The number of validated samples ({nb_tr_examples}): {epoch_rmse}")

    epoch_rmse = math.sqrt(rmse/nb_tr_examples)
    return epoch_rmse
    
class Triage(Dataset):
    """Dataset that tokenizes the `history` column of a DataFrame on access.

    Each item is a dict with 'ids' and 'mask' tensors (padded/truncated to
    `max_len`) plus either 'targets' (training/validation) or 'row_id'
    (when `isSubmit` is True).
    """

    def __init__(self, dataframe, tokenizer, max_len, isSubmit = False):
        """Store the frame, tokenizer and settings.

        Args:
            dataframe: frame with a `history` column and either `target` or
                `row_id`; rows are looked up by index label, so it should
                carry a 0..n-1 index.
            tokenizer: object exposing `encode_plus`.
            max_len: length every sequence is padded/truncated to.
            isSubmit: return `row_id` instead of `targets` when True.
        """
        self.data = dataframe
        self.len = len(dataframe)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.isSubmit = isSubmit

    def __len__(self):
        """Number of rows captured at construction time."""
        return self.len

    def __getitem__(self, index):
        """Tokenize row `index` and return its tensors (and target or row_id)."""
        encoded = self.tokenizer.encode_plus(
            str(self.data.history[index]),
            None,
            max_length=self.max_len,
            padding = 'max_length',
            return_attention_mask=True,
            truncation=True
        )

        sample = {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
        }
        if self.isSubmit:
            sample['row_id'] = self.data.row_id[index]
        else:
            sample['targets'] = torch.tensor(self.data.target[index], dtype=torch.float)
        return sample
    
class LitModel(nn.Module):
    """RoBERTa backbone with attention pooling and a linear regression head.

    The backbone is loaded from the module-level `model_name`.  forward()
    returns one value per input sequence.

    NOTE: the order in which `roberta`, `attention` and `regressor` are
    assigned fixes the order of `named_parameters()`, which `create_optimizer`
    slices by position.
    """
    def __init__(self):
        """Load the backbone and build the pooling and regression heads."""
        super().__init__()

        config = RobertaConfig.from_pretrained(model_name)
        # Expose per-layer hidden states (forward() reads them) and turn off
        # hidden-layer dropout.
        config.update({"output_hidden_states":True, 
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})                       
        
        self.roberta = RobertaForMaskedLM.from_pretrained(model_name, config=config)  
            
        # Scores every position with a small MLP, then normalises the scores
        # across dim=1 with a softmax.
        self.attention = torch.nn.Sequential(            
            torch.nn.Linear(768, 512),            
            torch.nn.Tanh(),                       
            torch.nn.Linear(512, 1),
            torch.nn.Softmax(dim=1)
        )        

        # Maps the pooled 768-dim vector to a single prediction.
        self.regressor = torch.nn.Sequential(                        
            torch.nn.Linear(768, 1)                        
        )
        

    def forward(self, input_ids, attention_mask):
        """Return the regressor output for the attention-pooled last hidden layer.

        `attention_mask` is only passed to the backbone; the pooling softmax
        below is computed over every position of the last hidden layer.
        """
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)        

        # There are a total of 13 layers of hidden states.
        # 1 for the embedding layer, and 12 for the 12 Roberta layers.
        # We take the hidden states from the last Roberta layer.
        last_layer_hidden_states = roberta_output.hidden_states[-1]

        # The number of cells is MAX_LEN.
        # The size of the hidden state of each cell is 768 (for roberta-base).
        # In order to condense hidden states of all cells to a context vector,
        # we compute a weighted average of the hidden states of all cells.
        # We compute the weight of each cell, using the attention neural network.
        weights = self.attention(last_layer_hidden_states)
                
        # weights.shape is BATCH_SIZE x MAX_LEN x 1
        # last_layer_hidden_states.shape is BATCH_SIZE x MAX_LEN x 768        
        # Now we compute context_vector as the weighted average.
        # context_vector.shape is BATCH_SIZE x 768
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)        
        # Now we reduce the context vector to the prediction score.
        return self.regressor(context_vector)
    
def create_optimizer(model):
    """Build an AdamW optimizer with position-dependent learning rates.

    Parameters are selected purely by their position in
    `model.named_parameters()`:
      * positions 0-196 each get their own group with lr 2e-5 (positions < 69),
        5e-5 (69-132) or 1e-4 (>= 133), and weight decay 0.01 unless the name
        contains "bias" (then 0.0);
      * positions 199-202 and 203+ form two groups that use AdamW's defaults;
      * positions 197-198 are not handed to the optimizer.

    NOTE(review): these indices are hard-coded; whether they line up with the
    backbone / attention / regressor boundaries depends on the parameter layout
    of the model actually passed in. Confirm against named_parameters().

    Args:
        model: module whose parameters are optimized.

    Returns:
        torch.optim.AdamW over the groups described above.
    """
    named = list(model.named_parameters())
    backbone_slice = named[:197]
    attention_slice = named[199:203]
    regressor_slice = named[203:]

    def _backbone_lr(position):
        # Later positions get larger learning rates.
        if position >= 133:
            return 1e-4
        if position >= 69:
            return 5e-5
        return 2e-5

    groups = [
        {"params": [tensor for _, tensor in attention_slice]},
        {"params": [tensor for _, tensor in regressor_slice]},
    ]
    groups.extend(
        {
            "params": tensor,
            "weight_decay": 0.0 if "bias" in name else 0.01,
            "lr": _backbone_lr(position),
        }
        for position, (name, tensor) in enumerate(backbone_slice)
    )

    return torch.optim.AdamW(groups)

def save_model(model):
    """Pickle `model` to 'model.pickle' in the current working directory.

    An existing file of that name is overwritten.
    """
    # Later need to upload to optiver-ml-final as new version
    with open('model.pickle', 'wb') as handle:
        pickle.Pickler(handle).dump(model)

In [None]:
# Width of one price bin: book_preprocessor() divides (wap1 - wap1_min) by this.
delta = 0.0002
# Lower / upper WAP bounds for the binning. Presumably the min and max wap1
# observed in the training books — TODO confirm where these were measured.
wap1_min = 0.8830628395080566
wap1_max = 1.1270768642425537
# Number of delta-wide bins between the two bounds, plus 2 extra slots.
vocab_size = int((wap1_max - wap1_min)/delta) + 2
# Not referenced elsewhere in the visible notebook; presumably the number of
# seconds in one time bucket — TODO confirm.
num_secs_max = 600

In [None]:
# first_run is consulted again further down to choose between building a fresh
# LitModel and unpickling a previously saved one.
first_run = False
model_name = '../input/bert-for-optiver'
tokenizer = RobertaTokenizerFast.from_pretrained('../input/krv-tokenizer')

# The backbone config is only loaded here on a first run.
if first_run:
    config = RobertaConfig.from_pretrained(model_name)

In [None]:
# Build the (row_id, history, target) table from the raw order books.
data_dir = '../input/optiver-realized-volatility-prediction/'

# Deliberately NOT named `train` / `test`: `train` is the training function
# defined above, and rebinding it to a DataFrame makes the later
# `train(model, optimizer, ...)` calls fail with "'DataFrame' object is not callable".
train_df, test_df = read_train_test()
train_stock_ids = train_df['stock_id'].unique()

# One frame per stock, concatenated once. DataFrame.append in a loop copies the
# accumulated frame on every iteration and no longer exists in pandas 2.x.
stock_frames = [
    pd.DataFrame(book_preprocessor(delta, stock_id), columns = ['history']).reset_index()
    for stock_id in train_stock_ids
]
data = pd.concat(stock_frames, ignore_index = True)

# Attach the target to every row_id (merge on the shared `row_id` column).
data = data.merge(train_df[['target', 'row_id']])

In [None]:
# Replace `data` with a random 30000-row sample of the precomputed CSV (this
# overwrites whatever an earlier cell assigned to `data`).
# NOTE(review): .sample() is called without random_state, so the subset differs
# on every run.
data = pd.read_csv('../input/prepared-optiver-data/optiver_new.csv', index_col = 0).sample(30000).reset_index(drop = True)

In [None]:
# 'mTrain' trains on every row with no hold-out; any other value splits the
# data 80/20 into train and validation parts.
mode = 'mAll'

if mode == 'mTrain':
    train_dataset = data.reset_index(drop=True)
else:
    # Fixed random_state keeps the split stable for a given `data`.
    train_dataset, test_dataset = (
        part.reset_index(drop=True)
        for part in train_test_split(data, test_size = 0.2, random_state=200)
    )

In [None]:
# ---- Sequence length and batch sizes ----
MAX_LEN = 600
TRAIN_BATCH_SIZE = 1 if mode == 'mTrain' else 16
VALID_BATCH_SIZE = 2

# ---- Training loader (incomplete final batch dropped, reshuffled each epoch) ----
training_set = Triage(train_dataset, tokenizer, MAX_LEN)
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'drop_last': True,
    'shuffle': True,
    'num_workers': 2,
}
training_loader = DataLoader(training_set, **train_params)

# ---- Validation loader (only when a hold-out split exists) ----
if mode != 'mTrain':
    testing_set = Triage(test_dataset, tokenizer, MAX_LEN)
    test_params = {
        'batch_size': VALID_BATCH_SIZE,
        'shuffle': True,
        'num_workers': 0,
    }
    testing_loader = DataLoader(testing_set, **test_params)

# ---- Loss, epochs, device ----
loss_function = torch.nn.MSELoss()

EPOCHS = 1

device = 'cuda' if cuda.is_available() else 'cpu'

# ---- Model: fresh on a first run, otherwise restored from a pickle ----
if first_run:
    model = LitModel()
else:
    # SECURITY: pickle.load runs arbitrary code embedded in the file; only load
    # checkpoints that come from a trusted source.
    with open('../input/optiver-ml-final/model_new.pickle', 'rb') as f:
        model = pickle.load(f)
model.to(device)

# ---- Optimizer and cosine schedule with 50 warm-up steps ----
optimizer = create_optimizer(model)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=50,
    num_training_steps=EPOCHS * len(training_loader))

In [None]:
# Re-create the optimizer and scheduler with the same calls that end the setup
# cell. Running this cell discards any optimizer/scheduler state accumulated so
# far, e.g. before starting another training pass on the same model.
optimizer = create_optimizer(model)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_training_steps=EPOCHS * len(training_loader),
    num_warmup_steps=50)    

In [None]:
# Main fine-tuning loop: one train() pass per epoch. Unless mode is 'mTrain',
# each epoch is followed by validation on the hold-out loader and the model is
# pickled via save_model().
# NOTE(review): save_model() sits inside the `mode != 'mTrain'` branch, so in
# 'mTrain' mode the trained model is never written to disk — confirm intended.
for epoch in range(EPOCHS):
    train(model, optimizer, epoch, lr_scheduler = scheduler)
    if mode != 'mTrain':
        epoch_rmse = valid([model], testing_loader)
        save_model(model)
        
        print(f"Validation RMSE Epoch({epoch}): {epoch_rmse}\n")

In [None]:
import copy
def run_new_learn_strategy(model, nsamples = 30000, first_run = False):
    """Repeatedly fine-tune `model` on fresh random samples, keeping only improvements.

    Each round draws `nsamples` rows from the prepared CSV, splits them 80/20,
    builds new loaders, optimizer and scheduler, then trains for EPOCHS epochs
    with validation after every epoch.  If the validation metric is below the
    best value so far (initially 0.41), the weights are snapshotted and the
    rejection counter is reset.  Otherwise the model is rolled back to the
    snapshot and the counter is incremented.  The loop ends once the counter
    exceeds 5 after a round.

    Reads the module-level `tokenizer`, `MAX_LEN`, `TRAIN_BATCH_SIZE`,
    `VALID_BATCH_SIZE` and `EPOCHS`, and rebinds the module-level
    `training_loader` (see below).

    Args:
        model: network to fine-tune.  It is updated in place; rollbacks copy the
            best weights back into this same object, so when the function
            returns the caller's model holds the best accepted weights (or the
            initial ones if no epoch was accepted).
        nsamples: number of rows sampled per round.
        first_run: unused; kept so existing calls keep working.

    Returns:
        None.
    """
    # train() iterates over the MODULE-LEVEL `training_loader`.  Without this
    # declaration the loader built below would be a local that train() never
    # sees, and every round would silently retrain on the loader created in the
    # setup cell instead of the freshly sampled data.
    global training_loader

    best_valid_rmse = 0.41
    validat_counter = 0
    model0 = copy.deepcopy(model)   # snapshot of the best weights so far

    while validat_counter <= 5:
        data = pd.read_csv('../input/prepared-optiver-data/optiver_new.csv', index_col = 0).sample(nsamples).reset_index(drop = True)
        train_dataset, test_dataset = train_test_split(data, test_size = 0.2)
        train_dataset = train_dataset.reset_index(drop=True)
        test_dataset = test_dataset.reset_index(drop=True)

        training_set = Triage(train_dataset, tokenizer, MAX_LEN)
        train_params = {'batch_size': TRAIN_BATCH_SIZE,
                        'drop_last': True,
                        'shuffle': True,
                        'num_workers': 2
                        }
        training_loader = DataLoader(training_set, **train_params)

        testing_set = Triage(test_dataset, tokenizer, MAX_LEN)
        test_params = {'batch_size': VALID_BATCH_SIZE,
                       'shuffle': True,
                       'num_workers': 0
                       }
        testing_loader = DataLoader(testing_set, **test_params)

        optimizer = create_optimizer(model)
        scheduler = get_cosine_schedule_with_warmup(optimizer, num_training_steps=EPOCHS * len(training_loader), num_warmup_steps=50)

        for epoch in range(EPOCHS):
            train(model, optimizer, epoch, lr_scheduler = scheduler)
            epoch_rmse = valid([model], testing_loader)
            if epoch_rmse < best_valid_rmse:
                best_valid_rmse = epoch_rmse
                validat_counter = 0
                model0 = copy.deepcopy(model)
                #save_model(model0)
            else:
                # Roll back IN PLACE.  Rebinding `model` to a deep copy would
                # leave the optimizer pointing at the discarded parameters for
                # any remaining epochs and would leave the caller's model
                # holding the rejected weights.
                model.load_state_dict(model0.state_dict())
                validat_counter += 1
            print(f"Validation RMSE Epoch({epoch}, counter = {validat_counter}): {epoch_rmse}\n")

In [None]:
# Launch the sample / train / validate / roll-back loop on the current model
# with the default 30000 rows per round.
run_new_learn_strategy(model)