# Notebook for running all the networks in one place

## Initialization and Data Loading

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [18]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pandas import DataFrame, Series
from sklearn.preprocessing import RobustScaler, MinMaxScaler

from datatools import extract_market_data, data_quantization
from pipeline import Dataset
from pipeline.backtest import evaluation_for_submission, cross_validation
from visualization.metric import plot_performance

import xarray as xr
import seaborn as sns
# Apply seaborn's "whitegrid" theme to the figures drawn in this notebook.
sns.set_theme(style="whitegrid")

In [8]:
from qids_lib import QIDS

# dataset = Dataset.load('../../data/parsed') # Deprecated
# Create the QIDS helper object; path_prefix is a path relative to this notebook
# (presumably the project root that holds the data directory — TODO confirm).
qids = QIDS(path_prefix='../../')

In [12]:
# Open the two NetCDF files and combine their variables into one dataset `ds`,
# which the rest of the notebook works on.
base_ds = xr.open_dataset('../../data/nc/base.nc')
market_brief_ds = xr.open_dataset('../../data/nc/market_brief.nc')
ds = base_ds.merge(market_brief_ds)

In [14]:
ds.dims

Frozen({'day': 1000, 'asset': 54, 'timeslot': 50})

In [13]:
ds # examine the new format of the data

## Neural Network Model

In [15]:
import torch
import torch.nn as nn
from torch import nn, Tensor
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim

### MLP

In [17]:
class MLP(nn.Module): # type1 - fully connected layers
    """Fully connected network applied to a flattened input.

    Each sample is flattened to a vector and passed through three hidden
    linear layers (512, 256 and 128 units), each followed by batch
    normalisation and ReLU, and finally through a linear output layer.

    Args:
        D_out: number of values produced per sample.
        input_shape: indexable with at least three entries; only
            ``input_shape[1] * input_shape[2]`` is read, as the width of the
            flattened per-sample input.
    """

    def __init__(self, D_out, input_shape):
        super().__init__()
        flat_width = input_shape[1] * input_shape[2]

        # Linear stack: flat_width -> 512 -> 256 -> 128 -> D_out
        self.fc1 = nn.Linear(flat_width, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, D_out)

        # Regularisation / normalisation layers
        self.dropout = nn.Dropout(p=0.2)
        self.bn1 = nn.BatchNorm1d(512)
        self.bn2 = nn.BatchNorm1d(256)
        self.bn3 = nn.BatchNorm1d(128)

        # Kept as a registered submodule; forward() uses F.relu instead.
        self.relu = nn.ReLU()

    def forward(self, x):
        """Flatten ``x`` per sample and return the ``(batch, D_out)`` output."""
        flat = x.contiguous().view(-1, self.num_flat_features(x))

        hidden = F.relu(self.bn1(self.fc1(flat)))
        hidden = F.relu(self.bn2(self.fc2(hidden)))

        # Dropout is applied to the fc3 activations *before* batch norm.
        dropped = self.dropout(self.fc3(hidden))
        hidden = F.relu(self.bn3(dropped))

        return self.fc4(hidden)

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first."""
        per_sample_shape = x.size()[1:]  # everything but the batch dimension
        return per_sample_shape.numel()


### LSTM

In [16]:
class LSTM(nn.Module):
    """LSTM encoder followed by dropout and a linear output layer.

    The input sequence is run through an ``nn.LSTM`` (``batch_first=True``);
    the final hidden state of the last LSTM layer is passed through dropout
    (p=0.2) and a linear layer to produce ``num_output`` values per sample.

    Args:
        num_output: number of values produced per sample.
        num_features: size of the feature dimension of the input.
        hidden_size: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        is_cuda: when True, the module's parameters are moved to CUDA at
            construction time. The caller is responsible for passing inputs
            that live on the same device.
    """

    def __init__(self, num_output, num_features,
                 hidden_size, num_layers, is_cuda: bool = False):
        super(LSTM, self).__init__()
        self.num_output = num_output
        self.num_layers = num_layers
        self.num_features = num_features
        self.hidden_size = hidden_size

        # LSTM input layout (batch_first): (batch, seq_length, feature)
        self.lstm = nn.LSTM(input_size=num_features, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.dropout = nn.Dropout(p=0.2)
        self.fc = nn.Linear(hidden_size, num_output)

        self.is_cuda = is_cuda

        if is_cuda:
            # Moves every submodule (lstm, dropout, fc) to the GPU.
            self.cuda()

    def forward(self, x: torch.Tensor):
        """Return a ``(batch, num_output)`` tensor for ``x`` of shape
        ``(batch, seq_length, num_features)``."""
        # No explicit (h_0, c_0) is passed: nn.LSTM then starts from zero
        # states that match the input's device and dtype. The previous
        # hand-built CPU/float32 states failed for any other device or dtype
        # and needlessly tracked gradients for constant zeros.
        _, (h_out, _) = self.lstm(x)

        # h_out is (num_layers, batch, hidden); keep only the last layer.
        h_out = h_out[-1, :, :].view(-1, self.hidden_size)

        return self.fc(self.dropout(h_out))


### Wrapper of the network

In [84]:
class NN_wrapper():
    """Scaling + training + prediction wrapper around the LSTM or MLP network.

    ``fit_predict`` scales the features, cuts them into rolling windows and
    trains the network on them; ``predict`` scales a single window with the
    already-fitted scaler and returns clipped predictions.

    The network, optimizer and learning-rate scheduler are created once in
    ``__init__`` and are reused by every ``fit_predict`` call, so repeated
    calls keep training the same weights.
    """

    def __init__(self, preprocess, lr=0.001, criterion=nn.MSELoss(), n_epoch=5, n_feature=7, train_lookback=32, per_eval_lookback=16, n_asset=54, hidden_size=64, network = 'LSTM'):
        """Build the network, optimizer and scheduler.

        Args:
            preprocess: scaler object providing ``fit_transform`` and
                ``transform`` on 2-D arrays (the notebook passes a
                scikit-learn scaler).
            lr: Adam learning rate; also the ``max_lr`` of the cyclic
                schedule, whose ``base_lr`` is ``lr / 10``.
            criterion: loss called as ``criterion(outputs, targets)``.
            n_epoch: number of full-batch optimisation steps performed by
                each ``fit_predict`` call; must be at least 1.
            n_feature: number of input features per time step.
            train_lookback: number of rolling windows built per
                ``fit_predict`` call.
            per_eval_lookback: number of days in each window.
            n_asset: number of assets (only used to describe the MLP input).
            hidden_size: LSTM hidden size (ignored by the MLP).
            network: ``'LSTM'`` or ``'MLP'``.

        Raises:
            ValueError: if ``network`` is not supported or ``n_epoch < 1``.
        """
        if network == 'LSTM':
            self.net = LSTM(num_output=1, num_features=n_feature, hidden_size=hidden_size, num_layers=1, is_cuda=False)
        elif network == 'MLP':
            self.net = MLP(D_out = 1, input_shape=[n_asset*train_lookback, per_eval_lookback, n_feature])
        else:
            raise ValueError('Network architecture not supported')

        if n_epoch < 1:
            # fit_predict returns the outputs of its last epoch, so it needs one.
            raise ValueError('n_epoch must be at least 1')

        self.n_epoch = n_epoch
        self.criterion = criterion
        self.train_lookback = train_lookback
        self.per_eval_lookback = per_eval_lookback
        self.n_asset = n_asset

        # Define the optimizer and the cyclic learning-rate schedule.
        self.optimizer = optim.Adam(self.net.parameters(), lr=lr, betas=(0.9, 0.999))
        # step_size_up must be positive: n_epoch // 2 is 0 for n_epoch == 1,
        # which makes CyclicLR divide by zero.
        self.scheduler = optim.lr_scheduler.CyclicLR(self.optimizer, base_lr=lr/10, max_lr=lr, step_size_up=max(1, n_epoch//2), cycle_momentum=False)

        self.preprocess = preprocess
        self.is_learning = True

    def fit_predict(self, X, y):
        """Fit the scaler, train the network and return the training outputs.

        Args:
            X: xarray Dataset of features with ``day`` and ``asset``
                dimensions. Assumes ``day`` holds consecutive integers so
                that every label-based window below has
                ``per_eval_lookback`` entries — TODO confirm against caller.
            y: xarray DataArray of targets with ``day`` and ``asset``
                dimensions. Assumes its days are the last day of each window,
                in the same order — TODO confirm against caller.

        Returns:
            xarray DataArray with ``day`` and ``asset`` dimensions holding
            the network outputs of the last epoch (computed in training mode,
            before that epoch's optimizer step).
        """
        self.net.train()
        start_day = X.day.min().to_numpy().item()

        # Scale the features through a (day, asset)-indexed DataFrame round trip.
        X_pd = X.to_dataframe(dim_order=['day', 'asset'])
        X_pd[X_pd.columns] = self.preprocess.fit_transform(X_pd[X_pd.columns].values)
        X_transformed = xr.Dataset.from_dataframe(X_pd)

        # Build one window per training step; each window is labelled on a new
        # 'batch' dimension with its last day.
        X_list = []
        for i in range(self.train_lookback):
            X_slice = X_transformed.sel(day=slice(start_day + i, start_day + i + self.per_eval_lookback - 1)).expand_dims(
                batch=[start_day + i + self.per_eval_lookback - 1])
            X_slice.coords['offset'] = X_slice.day - start_day - i # position of each day inside its window
            X_slice_o = X_slice.swap_dims({'day': 'offset'}) # index windows by offset so they align when concatenated
            X_list.append(X_slice_o.reset_coords(drop=True)) # drop all non-index coordinates
        X_concat = xr.concat(X_list, dim='batch') # concat along the batch dimension
        # Flatten (batch, asset) into one sample axis:
        # (batch_asset, offset, feature) == (sample, seq_length, feature)
        X_arr = X_concat.stack({'batch_asset': ['batch', 'asset']}).to_array('feature')\
            .transpose('batch_asset', 'offset', 'feature')
        X_tensor = torch.from_numpy(X_arr.to_numpy()).to(torch.float)

        # Targets flattened in the same (day, asset) order as the samples.
        y_tensor = torch.from_numpy(y.stack({'batch_asset': ['day', 'asset']}).values).to(torch.float)

        # Full-batch training: one optimizer step and one scheduler step per epoch.
        for epoch in range(self.n_epoch):
            self.optimizer.zero_grad()
            # squeeze(-1) drops only the output dimension, so a single-sample
            # batch keeps its sample axis.
            outputs = self.net(X_tensor).squeeze(-1)
            loss = self.criterion(outputs, y_tensor)
            loss.backward()
            self.optimizer.step()
            self.scheduler.step()

        return  xr.DataArray(data=outputs.cpu().detach().numpy(),coords=dict(batch_asset=X_arr.batch_asset))\
            .unstack('batch_asset').rename({'batch': 'day'})

    def predict(self, X):
        """Predict for one window using the already-fitted scaler.

        Args:
            X: xarray Dataset of features with ``day`` and ``asset``
                dimensions, covering a single window.

        Returns:
            numpy array of shape ``(1, n_samples)`` with the network outputs
            clipped to ``[-0.2, 0.2]``, where ``n_samples`` is the length of
            the ``asset`` dimension.
        """
        self.net.eval()

        # Scale with the statistics learned in fit_predict (transform, not fit).
        X_pd = X.to_dataframe(dim_order=['day', 'asset'])
        X_pd[X_pd.columns] = self.preprocess.transform(X_pd[X_pd.columns].values)
        X_transformed = xr.Dataset.from_dataframe(X_pd)

        # (asset, day, feature) == (sample, seq_length, feature)
        X_torch = torch.from_numpy(X_transformed.to_array('feature').transpose('asset','day', 'feature')
                                   .to_numpy()).to(torch.float)

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            y = self.net(X_torch).squeeze(-1)

        return np.clip(y.detach().cpu().numpy(), -0.2, 0.2)[np.newaxis,:] # return a numpy array

## Training Framework

### Cross Validation

In [86]:
# Features fed to the network and the rolling-window lengths used for
# training (train_lookback windows of eval_lookback days each).
feature = ['turnoverRatio', 'transactionAmount', 'pb', 'ps', 'pe_ttm', 'pe', 'pcf']
train_lookback = 16
eval_lookback = 8
prerpocess = MinMaxScaler()

model = NN_wrapper(preprocess=prerpocess, lr=5e-3, n_epoch=8, train_lookback=train_lookback, per_eval_lookback=eval_lookback, hidden_size=64,
                   n_feature=len(feature), n_asset=54, network='LSTM')
idx = pd.IndexSlice
# Keep days from 200 onwards and drop the intraday 'timeslot' dimension.
# Dataset.sizes is the name -> length mapping; using Dataset.dims as a mapping
# is deprecated in xarray.
ds_cv = ds.sel(day=slice(200,ds.sizes['day'])).drop_dims('timeslot')
# NOTE(review): the recorded run of this cell stopped with
# "ValueError: Input contains NaN." — check the selected features/targets for
# missing values before relying on the results.
performance, cum_y_df = cross_validation(model, feature, ds=ds_cv, train_lookback=train_lookback, per_eval_lookback=eval_lookback)

  0%|          | 0/777 [00:00<?, ?it/s]

ValueError: Input contains NaN.

In [40]:
# lr=5e-4, cyclicLR, nepoch=8, trainlook=16, evallook=8, hidden=512
# NOTE(review): the settings listed above differ from the cross-validation cell
# in this notebook (lr=5e-3, hidden_size=64) — confirm which run this figure shows.
plt.figure(0)
# plot_performance(performance, metrics_selected=['train_r2', 'test_cum_r2', 'test_cum_pearson'])
# Plot the selected metrics from the `performance` result of cross-validation.
plot_performance(performance, metrics_selected=['train_r2', 'val_cum_pearson','val_cum_r2'])
plt.show()

In [47]:
# Day-level view of the data: drop the 'timeslot' dimension from `ds`.
ds_day = ds.drop_dims('timeslot')

In [65]:
ds.day

In [59]:
# Select the 'return' variable for days 207 through 222 (inclusive).
y_train_true = ds['return'].sel(day=list(range(207, 223)))

In [60]:
y_train_true.coords

Coordinates:
  * day      (day) int32 207 208 209 210 211 212 213 ... 217 218 219 220 221 222
  * asset    (asset) int32 0 1 2 3 4 5 6 7 8 9 ... 44 45 46 47 48 49 50 51 52 53