# An example of loading data 

In this document, we give an example of instantiating the PyTorch Dataset subclass that a PyTorch DataLoader needs in order to load the data during training loops. 

In [23]:
import sys 
import importlib
import numpy as np
import pandas as pd
import torch

sys.path.append("../")
from proj_mod import training, data_processing
importlib.reload(training);
importlib.reload(data_processing);

## Load in pre-transformed data 

### Recovered time id order 

In [3]:
# Read the stored (recovered) time_id ordering from disk.
list_time = np.load(file="../processed_data/recovered_time_id_order.npy")

In [9]:
list_time

array([ 4294, 31984, 31570, ..., 29316, 32195, 10890])

### Timeseries input

In [4]:
# Read the pre-computed time-series feature table (one row per sub-interval).
df_RV_ts = pd.read_parquet(path="../processed_data/book_RV_ts_60_si.parquet")

In [10]:
df_RV_ts

Unnamed: 0,time_id,sub_int_RV,sub_int_num,stock_id,row_id
0,5,0.000015,1,0,0-5
1,11,0.000004,1,0,0-11
2,16,0.000432,1,0,0-16
3,31,0.000000,1,0,0-31
4,62,0.000235,1,0,0-62
...,...,...,...,...,...
25735915,6410,0.000000,60,99,99-6410
25735916,10421,0.000000,60,99,99-10421
25735917,25639,0.000000,60,99,99-25639
25735918,25680,0.000000,60,99,99-25680


### Tabular data

In [15]:
# Read the pre-computed tabular feature table (one row per row_id).
df_total = pd.read_parquet(path="../processed_data/total_trade_values.parquet")

In [16]:
df_total

Unnamed: 0,time_id,price_mean,price_std,size_sum,size_mean,size_std,order_count_sum,order_count_mean,order_count_std,row_id,...,size_sum_time_id_std,order_count_sum_time_id_mean,order_count_sum_time_id_std,price_mean_stock_id_mean,price_mean_stock_id_std,size_sum_stock_id_mean,size_sum_stock_id_std,order_count_sum_stock_id_mean,order_count_sum_stock_id_std,emb_id
0,5,1.003722,0.000578,3179,79.475000,118.375107,110,2.750000,2.467741,0-5,...,55732.656856,444.491071,537.485453,1.000034,0.003438,3212.919843,2060.85147,102.862141,52.711362,0
1,11,1.000206,0.000304,1289,42.966667,77.815203,57,1.900000,1.446756,0-11,...,17787.844183,211.642857,183.786297,1.000034,0.003438,3212.919843,2060.85147,102.862141,52.711362,0
2,16,0.999204,0.000932,2161,86.440000,113.587000,68,2.720000,2.300725,0-16,...,19486.725826,215.535714,174.054767,1.000034,0.003438,3212.919843,2060.85147,102.862141,52.711362,0
3,31,0.999020,0.000729,1962,130.800000,144.828569,59,3.933333,4.043808,0-31,...,18627.884667,177.794643,161.292479,1.000034,0.003438,3212.919843,2060.85147,102.862141,52.711362,0
4,62,0.999618,0.000182,1791,81.409091,117.914682,89,4.045455,4.099678,0-62,...,14562.028835,179.819820,146.265214,1.000034,0.003438,3212.919843,2060.85147,102.862141,52.711362,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428908,32751,1.000635,0.000431,48894,474.699029,534.042552,494,4.796117,4.598293,99-32751,...,27428.487093,311.312500,256.636492,0.999968,0.002766,50414.991384,55170.68466,553.923238,516.955784,111
428909,32753,1.000694,0.000664,48635,528.641304,717.778741,585,6.358696,6.732864,99-32753,...,29343.947115,312.000000,288.377662,0.999968,0.002766,50414.991384,55170.68466,553.923238,516.955784,111
428910,32758,0.998246,0.000552,50438,600.452381,1177.948006,457,5.440476,7.490317,99-32758,...,22949.301781,176.258929,202.530378,0.999968,0.002766,50414.991384,55170.68466,553.923238,516.955784,111
428911,32763,0.998905,0.000309,47020,283.253012,372.698606,588,3.542169,3.248501,99-32763,...,64263.609236,449.339286,348.921648,0.999968,0.002766,50414.991384,55170.68466,553.923238,516.955784,111


### Target

In [5]:
# Read the raw training targets (stock_id, time_id, target).
df_target = pd.read_csv(filepath_or_buffer="../raw_data/kaggle_ORVP/train.csv")

In [13]:
# Build the "<stock_id>-<time_id>" key so the targets carry the same row_id
# format as the feature tables loaded above (e.g. "0-5").
stock_part = df_target["stock_id"].astype(int).astype(str)
time_part = df_target["time_id"].astype(int).astype(str)
df_target["row_id"] = stock_part + "-" + time_part
df_target

Unnamed: 0,stock_id,time_id,target,row_id
0,0,5,0.004136,0-5
1,0,11,0.001445,0-11
2,0,16,0.002168,0-16
3,0,31,0.002195,0-31
4,0,62,0.001747,0-62
...,...,...,...,...
428927,126,32751,0.003461,126-32751
428928,126,32753,0.003113,126-32753
428929,126,32758,0.004070,126-32758
428930,126,32763,0.003357,126-32763


## Create train test split and pytorch Dataset subclass 

In [17]:
# Ask for a single fold (n_split=1, percent_val_size=10) and unpack its
# train / test time ids.
time_split_list = data_processing.time_cross_val_split(
    list_time=list_time,
    n_split=1,
    percent_val_size=10,
    list_output=True,
)
first_fold = time_split_list[0]
train_time_id, test_time_id = first_fold[0], first_fold[1]

# Tabular feature columns, grouped by the level they are aggregated at.
# The concatenation order below is the order they are handed to RVdataset.
row_level_features = [
    'price_mean', 'price_std',
    'size_sum', 'size_mean', 'size_std',
    'order_count_sum', 'order_count_mean', 'order_count_std',
]
time_level_features = [
    'price_mean_time_id_mean', 'price_mean_time_id_std',
    'size_sum_time_id_mean', 'size_sum_time_id_std',
    'order_count_sum_time_id_mean', 'order_count_sum_time_id_std',
]
stock_level_features = [
    'price_mean_stock_id_mean', 'price_mean_stock_id_std',
    'size_sum_stock_id_mean', 'size_sum_stock_id_std',
    'order_count_sum_stock_id_mean', 'order_count_sum_stock_id_std',
]
tab_features = (
    row_level_features
    + time_level_features
    + stock_level_features
    + ['emb_id']
)

# One None entry per tabular feature, with emb_id left out of normalization.
# NOTE(review): None presumably tells RVdataset to compute the mean/std itself;
# confirm in proj_mod/training.py.
norm_feature_dict = {col: None for col in tab_features if col != "emb_id"}


def _build_rv_dataset(time_ids):
    """Create an RVdataset for ``time_ids`` using the shared feature setup.

    The same ``norm_feature_dict`` object is passed on every call, exactly as
    when the two constructor calls are written out by hand.
    """
    return training.RVdataset(
        time_id_list=time_ids,
        ts_features=["sub_int_RV"],
        tab_features=tab_features,
        df_ts_feat=df_RV_ts,
        df_target=df_target,
        df_tab_feat=df_total,
        norm_feature_dict=norm_feature_dict,
    )


# Train set first, then test set (same construction order as before).
train_dataset = _build_rv_dataset(train_time_id)
test_dataset = _build_rv_dataset(test_time_id)

In fold 0 :

Train set end at 8117 .

Test set start at 15516 end at 10890 .

Notice: price_mean has been normalized.
The mean and std of this feature has been stored in feat_norm_dict
Notice: price_std has been normalized.
The mean and std of this feature has been stored in feat_norm_dict
Notice: size_sum has been normalized.
The mean and std of this feature has been stored in feat_norm_dict
Notice: size_mean has been normalized.
The mean and std of this feature has been stored in feat_norm_dict
Notice: size_std has been normalized.
The mean and std of this feature has been stored in feat_norm_dict
Notice: order_count_sum has been normalized.
The mean and std of this feature has been stored in feat_norm_dict
Notice: order_count_mean has been normalized.
The mean and std of this feature has been stored in feat_norm_dict
Notice: order_count_std has been normalized.
The mean and std of this feature has been stored in feat_norm_dict
Notice: price_mean_time_id_mean has been normalized.
The

Let's take one "getitem" example: The first element is the input feature values, and the last is the target. 

In [22]:
# Subscripting the dataset invokes its __getitem__, so index it directly
# instead of calling the dunder method by name.
train_dataset[0]

(tensor([ 3.4084e-04,  0.0000e+00,  2.3243e-05,  0.0000e+00,  1.6951e-04,
          3.8188e-07,  8.9407e-05,  5.5245e-04,  1.2384e-05,  0.0000e+00,
          4.4959e-04,  4.1774e-06,  0.0000e+00,  4.8719e-04,  7.8863e-05,
          1.3869e-04,  1.8932e-04,  1.7912e-04,  7.7147e-05,  0.0000e+00,
          1.2311e-04,  8.0507e-07,  0.0000e+00,  6.2301e-04,  3.8759e-08,
          1.2902e-06,  2.0088e-06,  1.8896e-08,  0.0000e+00,  9.5118e-07,
          8.9181e-06,  0.0000e+00,  1.7741e-04,  2.3867e-06,  4.5465e-04,
          8.7020e-07,  5.1658e-04,  1.9658e-04,  2.0350e-04,  2.4453e-04,
          0.0000e+00,  2.2738e-04,  2.3521e-06,  1.2019e-04,  2.6251e-04,
          1.1614e-04,  1.2996e-05,  2.3797e-04,  5.6143e-05,  6.8022e-04,
          2.6529e-04,  0.0000e+00,  2.1426e-04,  3.0215e-06,  0.0000e+00,
          1.1849e-04,  2.3133e-04,  1.0609e-05,  1.1056e-04,  2.8797e-04,
         -3.1516e-01, -5.0998e-01, -4.4019e-01, -3.0861e-01, -2.9697e-01,
         -5.0341e-01, -9.7707e-01, -5.

Below, one can check the position (index range) of each feature within the whole input.

In [18]:
train_dataset.featureplace

{'sub_int_RV': (0, 60),
 'price_mean': (60, 61),
 'price_std': (61, 62),
 'size_sum': (62, 63),
 'size_mean': (63, 64),
 'size_std': (64, 65),
 'order_count_sum': (65, 66),
 'order_count_mean': (66, 67),
 'order_count_std': (67, 68),
 'price_mean_time_id_mean': (68, 69),
 'price_mean_time_id_std': (69, 70),
 'size_sum_time_id_mean': (70, 71),
 'size_sum_time_id_std': (71, 72),
 'order_count_sum_time_id_mean': (72, 73),
 'order_count_sum_time_id_std': (73, 74),
 'price_mean_stock_id_mean': (74, 75),
 'price_mean_stock_id_std': (75, 76),
 'size_sum_stock_id_mean': (76, 77),
 'size_sum_stock_id_std': (77, 78),
 'order_count_sum_stock_id_mean': (78, 79),
 'order_count_sum_stock_id_std': (79, 80),
 'emb_id': (80, 81)}

In [19]:
# (start, stop) index ranges of each feature group inside one input vector,
# matching the train_dataset.featureplace listing:
#   time series -> 0:60, row-level -> 60:68, time_id-level -> 68:74,
#   stock_id-level -> 74:80, emb_id -> 80:81.
ts_place = (0, 60)
row_param_place = (60, 68)
time_param_place = (68, 74)
stock_param_place = (74, 80)
emb_id_place = (80, 81)

# Backward-compatible alias: the name was originally spelled "palce".
row_param_palce = row_param_place

Finally, here is an example of creating PyTorch dataloaders from the above PyTorch Dataset subclass instances. 

In [None]:
# Both loaders use identical batching / worker settings; only the dataset differs.
# NOTE(review): the evaluation loader is shuffled as well; shuffle=False is the
# usual choice for validation data. Confirm whether this is intended.
loader_options = dict(batch_size=512, shuffle=True, num_workers=4, pin_memory=True)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, **loader_options)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, **loader_options)

The dataloaders are now ready to be fed into training.reg_training_loop_rmspe as parameters for training. The following is an example of where they should go: 

training.reg_training_loop_rmspe(train_loader=train_loader,val_loader=test_loader,...)

## Final remark

For detailed documentation of the PyTorch Dataset subclass, see "../data_processing/create_datasets.ipynb"; the source code is in "../proj_mod/training.py". 