## Preparation

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
# Show up to 300 rows / columns when a DataFrame is displayed.
# The fully-qualified option names are required: option keys are matched as
# patterns, and a bare 'max_rows' / 'max_columns' also matches the
# 'styler.render.*' options in newer pandas, which makes set_option raise
# OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 300)

import os
import glob

## data

In [2]:
# Directory that holds the input data files. Paths are built from it by plain
# string concatenation, so the trailing slash must be kept.
data_dir = '../input/'

In [3]:
# Read the test and train tables from the shared data directory (data_dir
# already ends with a slash, so plain concatenation yields a valid path).
test = pd.read_csv(data_dir + 'test.csv')
train = pd.read_csv(data_dir + 'train.csv')

# row_id is the string "<stock_id>-<time_id>", built identically for both tables.
train["row_id"] = train["stock_id"].astype(str)+"-"+train["time_id"].astype(str)
test["row_id"] = test["stock_id"].astype(str)+"-"+test["time_id"].astype(str)

### book_train.parquet  
オーダーbookデータは予約状況  
この値段まで下がってきたら、買う  
上がってきたら売るみたいな予約情報

### trade_train.parquet
リアルタイムで実際に取引された量  
買い手がこの値段でこの数量買い、売り手がこの値段でこの数量売ったなど  


stock_id : 株の銘柄（どの株か）  
time_id : どの時間の情報かのid (submissionファイルのtime_idと連動しています)  
seconds_in_bucket : time_idの中で、0からスタートして何秒後か。たぶん予測するのは、10分のtotalなので、seconds_in_bucketは、最大600 secのはず  
bid_price1,2 : 買い注文のうち最も競争力のある（＝最も高い）価格と、２番目に高い価格。値は正規化されている（Normalized prices of the most/second most competitive buy level）。  
→ askはその逆で、ask_price1,2 は最も安い売り注文の価格と、２番目に安い価格。  

ask_price1,2 : 株の売り値の希望値  
bid_size1,2 : 買うのを希望している側の１番目と２番目の株式数  
ask_size1,2 : 売るのを希望している側の１番目と２番目の株式数  

In [4]:
# book_example = pd.read_parquet('../input/book_train.parquet/stock_id=1')
# book_example


## Functions for preprocess

In [5]:
def calc_wap1(df):
    """Return the level-1 weighted average price (WAP) for each row of ``df``.

    Each price is weighted by the size on the opposite side of the book:
    ``(bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1)``.

    Parameters
    ----------
    df : mapping of column name -> column (e.g. a pandas DataFrame)
        Must provide ``bid_price1``, ``ask_price1``, ``bid_size1`` and
        ``ask_size1``.

    Returns
    -------
    The element-wise result of the formula above.
    """
    bid_px, ask_px = df["bid_price1"], df["ask_price1"]
    bid_sz, ask_sz = df["bid_size1"], df["ask_size1"]
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

In [6]:
def calc_wap2(df):
    """Return the level-2 weighted average price (WAP) for each row of ``df``.

    Same formula as the level-1 version but on the second book level:
    ``(bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2)``.

    Parameters
    ----------
    df : mapping of column name -> column (e.g. a pandas DataFrame)
        Must provide ``bid_price2``, ``ask_price2``, ``bid_size2`` and
        ``ask_size2``.

    Returns
    -------
    The element-wise result of the formula above.
    """
    bid_px, ask_px = df["bid_price2"], df["ask_price2"]
    bid_sz, ask_sz = df["bid_size2"], df["ask_size2"]
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

In [7]:
def log_return(series):
    """Return the difference between consecutive natural logs of ``series``.

    ``series`` is passed through ``np.log`` and the result's ``.diff()`` is
    returned, so the first element of the output has no predecessor and is NaN.
    """
    logged = np.log(series)
    return logged.diff()

# 1行目はNullになるので注意

In [8]:
def realized_vocatility(series):
    """Return the square root of the sum of squared values in ``series``.

    NOTE(review): for a pandas Series, ``np.sum`` appears to defer to
    ``Series.sum``, which skips NaN by default, so a leading NaN would not
    propagate into the result -- confirm if this matters.
    """
    squared = series ** 2
    sum_of_squares = np.sum(squared)
    return np.sqrt(sum_of_squares)
    

In [9]:
def count_unique(series):
    """Return how many distinct values ``np.unique`` finds in ``series``."""
    distinct_values = np.unique(series)
    return len(distinct_values)

## Feature Engineering

In [10]:
# Summary statistics of `target` per stock_id (one row per stock).
stock = (
    train.groupby("stock_id")["target"]
    .agg(["mean", "median", "std", "count", "sum"])
    .reset_index()
)

# Summary statistics of `target` per time_id (one row per time bucket).
# NOTE(review): the name `time` would shadow the stdlib module if it is ever
# imported in this notebook.
time = (
    train.groupby(["time_id"])["target"]
    .agg(["mean", "median", "std", "count", "sum"])
    .reset_index()
)
# row = train.groupby(["row_id"])["target"].agg(["mean","median","std","count","sum"]).reset_index()

In [11]:

# Copy of `train` with the per-stock and per-time target statistics attached
# to every row: <key>_mean, <key>_median, <key>_std for key in stock_id, time_id.
# NOTE(review): these statistics are computed from `target` itself, so each
# row's feature includes its own label -- confirm this is intended.
train_info = train.assign(**{
    f"{key}_{stat}": train[key].map(table.set_index(key)[stat])
    for key, table in (("stock_id", stock), ("time_id", time))
    for stat in ("mean", "median", "std")
})

# train_info['row_id_mean']      = train['row_id'].map(dict(zip(row['row_id'], row['mean'])))
# train_info['row_id_median']    = train['row_id'].map(dict(zip(row['row_id'], row['median'])))
# train_info['row_id_std']        = train['row_id'].map(dict(zip(row['row_id'], row['std'])))


In [12]:
# Load the order-book partition for a single stock (stock_id=15) and preview
# its first rows.
book_train = pd.read_parquet(f"{data_dir}book_train.parquet/stock_id=15")
book_train.head()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2
0,5,0,0.999519,0.999839,0.999454,0.999904,2,166,2,12
1,5,1,0.999711,1.000225,0.999647,1.000289,100,20,100,20
2,5,2,0.999775,1.000225,0.999711,1.000289,1,20,400,20
3,5,3,0.999839,1.000225,0.999775,1.000289,100,20,1,20
4,5,4,0.999839,1.000225,0.999711,1.000289,1,20,400,20
