# In this notebook, we attempt in creating an attention based NN to predict the target on the timeseries alone. 

We first create an example, then we will generalize it and include it into training.py. 

# Import and preparations 

In [2]:
import sys, importlib
import torch 
import torch.nn as nn
import numpy as np
import pandas as pd
import copy
import time

sys.path.append("../")
from proj_mod import training, data_processing, visualization
importlib.reload(training);
importlib.reload(data_processing);
importlib.reload(visualization);

In [3]:
#Only run this cell if needed. AMD gpus might need this. 
from dotenv import load_dotenv
import os

load_dotenv("../dotenv_env/deep_learning.env")

print(os.environ.get("HSA_OVERRIDE_GFX_VERSION"))

10.3.0


In [4]:
device=(torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))
print(f"Using device {device}")

Using device cuda


# Data preparation 

### Load time id order

In [5]:
list_time=np.load("../processed_data/recovered_time_id_order.npy")

### Load timeseries 

In [6]:
df_RV_ts=pd.read_parquet("../processed_data/book_RV_ts_60_si.parquet")

### Load target 

In [7]:
df_target=pd.read_csv("../raw_data/kaggle_ORVP/train.csv")
df_target["row_id"]=df_target["stock_id"].astype(int).astype(str)+"-"+df_target["time_id"].astype(int).astype(str)
df_target

Unnamed: 0,stock_id,time_id,target,row_id
0,0,5,0.004136,0-5
1,0,11,0.001445,0-11
2,0,16,0.002168,0-16
3,0,31,0.002195,0-31
4,0,62,0.001747,0-62
...,...,...,...,...
428927,126,32751,0.003461,126-32751
428928,126,32753,0.003113,126-32753
428929,126,32758,0.004070,126-32758
428930,126,32763,0.003357,126-32763


### Create datasets 

In [9]:
time_split_list=data_processing.time_cross_val_split(list_time=list_time,n_split=1,percent_val_size=10,list_output=True)
train_time_id,test_time_id=time_split_list[0][0],time_split_list[0][1]

train_dataset=training.RVdataset(time_id_list=train_time_id,ts_features=["sub_int_RV"],tab_features=["emb_id"],df_ts_feat=df_RV_ts,df_target=df_target)
test_dataset=training.RVdataset(time_id_list=test_time_id,ts_features=["sub_int_RV"],tab_features=["emb_id"],df_ts_feat=df_RV_ts,df_target=df_target)

In fold 0 :

Train set end at 8117 .

Test set start at 15516 end at 10890 .



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tab_copy["sub_int_num"]=np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tab_copy["sub_int_num"]=np.nan


In [12]:
embed=nn.Embedding(num_embeddings=4,embedding_dim=2)
out=embed(torch.tensor([[0,1,2,3],[0,1,2,3]]))

In [13]:
out

tensor([[[ 0.1853, -1.0454],
         [-0.4169, -0.8853],
         [ 2.0273, -0.7891],
         [ 0.1283,  0.6013]],

        [[ 0.1853, -1.0454],
         [-0.4169, -0.8853],
         [ 2.0273, -0.7891],
         [ 0.1283,  0.6013]]], grad_fn=<EmbeddingBackward0>)

In [14]:
out.shape

torch.Size([2, 4, 2])

In [19]:
torch.tensor([0,1,2,3]).expand(2,4)

tensor([[0, 1, 2, 3],
        [0, 1, 2, 3]])

# Create example (encoder based only) transformer model 

In [44]:
class ts_encoder_example(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder_attn=nn.MultiheadAttention(embed_dim=32,num_heads=4,dropout=0.1,batch_first=True)
        self.encoder_norm1=nn.LayerNorm(32)
        self.encoder_feedforward=nn.Sequential(
            nn.Linear(in_features=32,out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64,out_features=32)
        )
        self.encoder_norm2=nn.LayerNorm(32)
        
    def forward(self,x):
        attn,_=self.encoder_attn(x,x,x)
        x=self.encoder_norm1(x+attn)
        attn=self.encoder_feedforward(x)
        return self.encoder_norm2(x+attn)
        

class ts_trans_example(nn.Module): 
    # An example where only encoder is used, the logic behind this is that we only want one value output, it might not be needed to add in decoders (which )
    def __init__(self):
        super().__init__()
        #Frozen conv 
        self.frozen_conv=training.frozen_diff_conv(n_diff=2) 
        #Position embedding 
        self.pos_emb=nn.Embedding(num_embeddings=60,embedding_dim=32) # 60 is the length of our (default) timeseries. 
        self.ts_proj=nn.Linear(in_features=3,out_features=32)
        self.pos_attn=nn.MultiheadAttention(embed_dim=32,batch_first=True,dropout=0.1,num_heads=4)
        self.pos_norm=nn.LayerNorm(32) 
        #Encoder stacking 
        self.encoder_layers=nn.ModuleList([
            ts_encoder_example()
            for _ in range(4)
        ])
        #Final feedforward 
        self.final_linear=nn.Linear(in_features=32,out_features=1)
        
        #scaler
        self.input_scaler=10000
        
    def forward(self,x): 
        #Create and reshape the timeseries tensor 
        x*=self.input_scaler
        x=torch.unsqueeze(x,dim=1)
        x=self.frozen_conv(x)
        x=x.permute(0,2,1) 
        # print(x.shape)
        x=self.ts_proj(x) # (N, 60, 32) 60 is the timeseries length 
        #Adding in position for positional impact 
        pos_id=torch.arange(60, device=x.device).expand(x.shape[0],60)
        pos_emb=self.pos_emb(pos_id)
        pos,_=self.pos_attn(x,pos_emb,pos_emb)
        x=x+pos
        x=self.pos_norm(x)
        #Run though the encoder layers 
        for layer in self.encoder_layers: 
            x=layer(x)
        return torch.sum(x,dim=1)/self.input_scaler # (N,1)
        
        
        

### Create loaders

In [36]:
train_loader=torch.utils.data.DataLoader(dataset=train_dataset,batch_size=512,shuffle=True, num_workers=4, pin_memory=True)
test_loader=torch.utils.data.DataLoader(dataset=test_dataset,batch_size=512,shuffle=True, num_workers =4, pin_memory=True)

# Loss tracking
train_loss = []
val_loss = []

### Init model 

In [45]:
trans_example_mod=ts_trans_example().to(device=device)

import torch.optim as optim

optimizer = optim.Adam(trans_example_mod.parameters(), lr=1e-3)

In [46]:
from torchinfo import summary
summary(trans_example_mod)

Layer (type:depth-idx)                                       Param #
ts_trans_example                                             --
├─frozen_diff_conv: 1-1                                      --
│    └─Conv1d: 2-1                                           (2)
├─Embedding: 1-2                                             1,920
├─Linear: 1-3                                                128
├─MultiheadAttention: 1-4                                    3,168
│    └─NonDynamicallyQuantizableLinear: 2-2                  1,056
├─LayerNorm: 1-5                                             64
├─ModuleList: 1-6                                            --
│    └─ts_encoder_example: 2-3                               --
│    │    └─MultiheadAttention: 3-1                          4,224
│    │    └─LayerNorm: 3-2                                   64
│    │    └─Sequential: 3-3                                  4,192
│    │    └─LayerNorm: 3-4                                   64
│    └─ts_encoder_

Oh fucks that is a shit ton of tranable parameters. 

### Training loop 

In [47]:
training.reg_training_loop_rmspe(optimizer=optimizer,model=trans_example_mod,train_loader=train_loader,val_loader=test_loader,ot_steps=20,report_interval=5,n_epochs=100,list_train_loss=train_loss,list_val_loss=val_loss,device=device,eps=1e-6)

A new best validation loss at epoch  1  with validation loss of  tensor(1.8792, device='cuda:0')
At  18.355358362197876  epoch  1 has training loss  tensor(3.6192, device='cuda:0')  and validation loss  tensor(1.8792, device='cuda:0') .

A new best validation loss at epoch  2  with validation loss of  tensor(1.5531, device='cuda:0')
A new best validation loss at epoch  3  with validation loss of  tensor(1.4722, device='cuda:0')
A new best validation loss at epoch  4  with validation loss of  tensor(1.4472, device='cuda:0')
A new best validation loss at epoch  5  with validation loss of  tensor(1.4031, device='cuda:0')
At  95.56986117362976  epoch  5 has training loss  tensor(1.5026, device='cuda:0')  and validation loss  tensor(1.4031, device='cuda:0') .

A new best validation loss at epoch  7  with validation loss of  tensor(1.3791, device='cuda:0')
A new best validation loss at epoch  8  with validation loss of  tensor(1.3670, device='cuda:0')
A new best validation loss at epoch  10 

KeyboardInterrupt: 