In [1]:
import numpy as np
import pandas as pd
import pickle
from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
import dateutil.easter as easter

In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [3]:
from torch.utils.data import Dataset, DataLoader

In [4]:
from accelerate import Accelerator
import torch.optim as optim

In [5]:
import time

In [6]:
from tqdm.notebook import tqdm

In [7]:
import gc

## Global Variables ###

In [8]:
# Use the first CUDA device when torch can see one; otherwise run on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [9]:
# Echo the selected device so the saved notebook records where the run executed.
device

device(type='cuda', index=0)

In [10]:
from colorama import Fore, Back, Style
# Short aliases for colorama foreground colours used in log messages:
# r=red, b=blue, c=cyan, g=green, y=yellow, m=magenta.
r_, b_, c_, g_, y_, m_ = (
    Fore.RED,
    Fore.BLUE,
    Fore.CYAN,
    Fore.GREEN,
    Fore.YELLOW,
    Fore.MAGENTA,
)
# Emitting sr_ restores the terminal's default styling.
sr_ = Style.RESET_ALL

In [11]:
# Project root; every other directory below is derived from it.
BASE_DIR = '/sharedHDD/rohit/timeseries_learning/ubiquant/'
DATA_DIR = f'{BASE_DIR}data/parquet/'   # parquet training data
INPUT_DIR = f'{BASE_DIR}input/'         # auxiliary inputs
WEIGHTS_DIR = f'{BASE_DIR}weights/'     # saved model weights, one file per fold
OUTPUT_DIR = f'{BASE_DIR}output/'       # per-fold prediction CSVs

In [12]:
# input_chunk_length: number of past rows per investment that form one model input
# window (it is used as the LSTM sequence length and as the context length).
# output_chunk_length: declared here; not referenced in the visible cells.
input_chunk_length, output_chunk_length = 30, 1
embedding_dim = 50  # width of the learned investment-id embedding

In [13]:
# Names of the 300 feature columns: 'f_0' ... 'f_299'.
features = ['f_%d' % idx for idx in range(300)]

## Load data ##

In [14]:
%%time
# Read the whole training table into memory from its parquet file.
train_df = pd.read_parquet(f'{DATA_DIR}train_low_mem.parquet')

CPU times: user 8.55 s, sys: 14.7 s, total: 23.3 s
Wall time: 4.14 s


In [15]:
# Every distinct investment id in the full table, as plain Python values.
# These are the keys of the per-investment context dictionary built for each fold.
investment_ids = train_df['investment_id'].unique().tolist()

In [16]:
# Cross-validation split definition: folds[k]['train'] and folds[k]['test'] hold the
# train_df index labels belonging to fold k.
# The path is built from INPUT_DIR so it follows the directory constants defined above.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only load a
# folds file that was produced by a trusted run of this project.
with open(INPUT_DIR + 'folds.pickle', 'rb') as f:
    folds = pickle.load(f)

## Model ##

In [17]:
# Hyper-parameters and run options. The architecture entries must match the settings
# used when the saved weights were trained, otherwise load_state_dict will fail.
config = {
    'seq_length' : input_chunk_length,
    'num_epochs' : 100,
    'lr' : 0.00001,
    # LSTM input width = id embedding + the 'target' column + the feature columns
    # (50 + 1 + 300 = 351). Derived so it cannot drift from embedding_dim / features.
    'input_size' : embedding_dim + 1 + len(features),
    'hidden_size' : 351,
    'num_layers' : 1,
    'num_classes' :1, ## This is  output dimension
    'train_shuffle': True,
    'val_shuffle': True,
    'batch_size' : 4096*2,
    'best_model_name' : 'lstm',   # suffix of the weight / output file names
    'bidirectional' : False,
    'only_last_hidden': True      # feed only the last time step's LSTM output to the head
}

In [18]:
# Expose the hyper-parameters as module-level names; the model class and the
# prediction loop read several of them as globals.
num_epochs, lr = config['num_epochs'], config['lr']
input_size, hidden_size = config['input_size'], config['hidden_size']
num_layers, num_classes = config['num_layers'], config['num_classes']
seq_length = config['seq_length']
bidirectional, only_last_hidden = config['bidirectional'], config['only_last_hidden']

In [19]:
class LstmTsModel(nn.Module):
    """LSTM regressor over a window of per-investment rows.

    Column 0 of every input row holds the investment id. It is replaced by a
    learned embedding, the remaining columns are appended to that embedding, and
    the result is fed to an LSTM. A small fully connected head maps the LSTM
    output to ``num_classes`` values per sample.

    Submodule names (``embedding``, ``lstm``, ``fc``) and the layer order inside
    ``fc`` define the state-dict keys and must not change, or saved weights will
    no longer load.

    Args:
        num_classes: number of output values per sample.
        input_size: LSTM input width; must equal the embedding width plus the
            number of non-id input columns.
        hidden_size: LSTM hidden-state width.
        num_layers: number of stacked LSTM layers.
        seq_length: time steps per window; only used to size the head when the
            outputs of all time steps are kept.
        emb_dim: embedding width. ``None`` (default) uses the notebook-level
            ``embedding_dim``.
        num_embeddings: number of rows in the embedding table; every investment
            id must be smaller than this.
        use_bidirectional: run the LSTM in both directions. ``None`` (default)
            uses the notebook-level ``bidirectional``.
        last_hidden_only: feed only the last time step's LSTM output to the head.
            ``None`` (default) uses the notebook-level ``only_last_hidden``.
    """

    def __init__(self, num_classes, input_size, hidden_size, num_layers,seq_length,
                 *, emb_dim=None, num_embeddings=3774,
                 use_bidirectional=None, last_hidden_only=None):
        super(LstmTsModel, self).__init__()

        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.seq_length = seq_length

        # Capture the options at construction time so that forward() always agrees
        # with the head built below, even if the notebook globals change later.
        self.embedding_dim = embedding_dim if emb_dim is None else emb_dim
        self.bidirectional = bidirectional if use_bidirectional is None else use_bidirectional
        self.only_last_hidden = only_last_hidden if last_hidden_only is None else last_hidden_only

        self.embedding = nn.Embedding(num_embeddings, self.embedding_dim)

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True,
                            bidirectional=self.bidirectional)

        # A bidirectional LSTM emits forward and backward states side by side.
        directions = 2 if self.bidirectional else 1

        if self.only_last_hidden:
            input_fc_dim = hidden_size * directions
        else:
            input_fc_dim = self.seq_length * hidden_size * directions

        print(input_fc_dim)

        # Layer positions inside the Sequential are part of the state-dict keys.
        self.fc = nn.Sequential(nn.Linear(input_fc_dim, input_fc_dim//2),
                                nn.BatchNorm1d(num_features=input_fc_dim//2),
                                nn.Dropout(0.2),
                                nn.ReLU(),
                                nn.Linear(input_fc_dim//2, self.num_classes)
                                )

    def forward(self, x):
        """Run one batch of windows through the network.

        Args:
            x: float tensor indexed as (batch, time step, column) whose column 0
                holds the investment id and whose remaining columns are the
                numeric inputs.

        Returns:
            Tensor of shape (batch, num_classes).
        """
        # .long() converts in place on x's own device; the previous
        # .type(torch.LongTensor) copied the ids to the CPU and back on every call
        # and tied the module to the global `device`.
        ids = x[:, :, 0].long()
        embedded = self.embedding(ids)

        # Swap the raw id column for its embedding vector.
        x = torch.cat([embedded, x[:, :, 1:]], dim=2)

        h_out, _ = self.lstm(x)
        if self.only_last_hidden:
            h_out = h_out[:, -1:, :]

        # Collapse (time step, hidden) into a single feature axis for the head.
        h_out = h_out.flatten(start_dim=1)

        return self.fc(h_out)

## Prediction ###

In [20]:
# Column layout of every model input row: the id first (the model embeds column 0),
# then the target, then the feature columns.
all_features_columns = ['investment_id', 'target', *features]

In [21]:
def create_context_for_investment_id(df,iid,context_length):
    """Build the initial model input window for one investment.

    Args:
        df: rows of a single investment id; must contain 'time_id' and every
            column listed in ``all_features_columns``.
        iid: the investment id, used to fill column 0 when ``df`` is empty.
        context_length: number of rows the window must contain.

    Returns:
        Array of shape (1, context_length, len(all_features_columns)).

        * Enough history: the last ``context_length`` rows in time order.
        * No history: all zeros except column 0, which is set to ``iid``.
        * Short history: the available rows expanded with ``np.resize``, which
          fills the window by repeating the rows cyclically from the first one.
          NOTE(review): the padded window therefore need not end on the most
          recent row — confirm this matches how training windows were padded.
    """
    history = df.sort_values('time_id').reset_index(drop=True)
    n_rows = len(history)
    needs_padding = n_rows < context_length

    if needs_padding and n_rows == 0:
        # Nothing observed yet: a zero window that still carries the id.
        window = np.zeros((context_length, len(all_features_columns)))
        window[:, 0] = iid
    else:
        recent = history.iloc[-context_length:, :].reset_index(drop=True)
        window = recent[all_features_columns].values
        if needs_padding:
            window = np.resize(window, (context_length, window.shape[-1]))

    # Add the leading batch axis expected by the model.
    return np.expand_dims(window, axis=0)
    
    

In [22]:
# Example: create_context_for_investment_id(train_f_df[train_f_df.investment_id == 1], 1, input_chunk_length)

In [23]:

# Out-of-fold inference. For every fold: seed a rolling window per investment from
# that fold's training rows, load the fold's weights, then walk the test rows in
# (investment_id, time_id) order predicting one row at a time. Each prediction is
# written into the row's 'target' slot before the row is pushed into the window, so
# later predictions for the same investment see earlier predictions, not true targets.
# for fold in folds.keys():
for fold in [2,3,4]:
    # The colour code goes first and is reset at the end so it colours this
    # message only instead of bleeding into whatever is printed next.
    print(f'{r_}Starting for fold: {fold}{sr_}')
    print(f'{m_}Preparing data for fold: {fold}{sr_}')
    train_indxs = folds[fold]['train']
    test_indxs = folds[fold]['test']
    test_f_df = train_df[train_df.index.isin(test_indxs)].reset_index(drop=True)
    train_f_df = train_df[train_df.index.isin(train_indxs)].reset_index(drop=True)

    # Keep the true target as 'target_o'; 'target' will hold the predictions.
    test_f_df = (test_f_df.sort_values(['investment_id','time_id'])
                 .rename(columns = {'target': 'target_o'})
                 .reset_index(drop=True))
    test_f_df['target'] = -111.1  # placeholder, overwritten below

    #### Create Context #####
    # Split the training rows once instead of scanning the frame per investment id.
    history_by_iid = {key: grp for key, grp in train_f_df.groupby('investment_id', sort=False)}
    no_history = train_f_df.iloc[0:0]  # empty frame with the right columns
    context = {
        iid: create_context_for_investment_id(history_by_iid.get(iid, no_history),
                                              iid, input_chunk_length)
        for iid in investment_ids
    }

    #### Load model #####
    model = LstmTsModel(num_classes, input_size, hidden_size, num_layers,seq_length).to(device)
    # map_location lets weights saved on a GPU load on whatever `device` is now.
    model.load_state_dict(torch.load(WEIGHTS_DIR+str(fold)+'_'+config['best_model_name'],
                                     map_location=device))
    model.eval()

    #### Prediction ####
    # Pull the model inputs out of the frame once; per-row DataFrame access
    # (iterrows / .loc[i]) builds an object-dtype Series for every step.
    # float32 is lossless here because the model input is cast to float32 anyway.
    target_col = all_features_columns.index('target')
    row_iids = test_f_df['investment_id'].to_numpy()
    row_values = test_f_df[all_features_columns].to_numpy(dtype=np.float32)
    predictions = np.empty(len(test_f_df), dtype=float)

    with torch.no_grad():
        for i in range(len(test_f_df)):
            inv_id = int(row_iids[i])
            ctx = context[inv_id]
            out = model(torch.Tensor(ctx.astype(float)).to(device)).squeeze().cpu().item()
            predictions[i] = out

            ### Update Context #####
            # Drop the oldest step and append this row with its predicted target.
            row_values[i, target_col] = out
            new_in_ctx = row_values[i].reshape(1, 1, -1)
            context[inv_id] = np.concatenate((ctx[:,1:,:], new_in_ctx),axis=1)

    test_f_df['target'] = predictions

    test_f_df[['row_id','time_id','investment_id','target_o','target']].to_csv(OUTPUT_DIR+str(fold)+'_'+config['best_model_name']+'.csv', index=False)
    
    
    
        
        

Starting for fold: 2[31m
Preparing data for fold: 2[35m
351
Starting for fold: 3[31m
Preparing data for fold: 3[35m
351


KeyboardInterrupt: 

In [None]:
# Histogram of the predicted target for index labels 0..500 (loc slices are
# inclusive) of the fold processed last.
test_f_df.loc[:500, 'target'].hist()