In [1]:
import numpy as np
import pandas as pd
import pickle
from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
import dateutil.easter as easter

In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [3]:
from torch.utils.data import Dataset, DataLoader

In [4]:
from accelerate import Accelerator
import torch.optim as optim

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
import time

In [7]:
from tqdm.notebook import tqdm

In [8]:
import gc

## Global Variables ###

In [9]:
# Use the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [10]:
device

device(type='cuda', index=0)

In [11]:
from colorama import Fore, Back, Style
# Short aliases for colorama foreground colours used in console messages;
# `sr_` is the reset code that restores the default style.
r_, b_, c_ = Fore.RED, Fore.BLUE, Fore.CYAN
g_, y_, m_ = Fore.GREEN, Fore.YELLOW, Fore.MAGENTA
sr_ = Style.RESET_ALL

In [12]:
# Project directory layout. Every path keeps its trailing slash so that file
# names can be appended to it directly.
BASE_DIR = '/sharedHDD/rohit/timeseries_learning/ubiquant/'
DATA_DIR = f'{BASE_DIR}data/parquet/'
INPUT_DIR = f'{BASE_DIR}input/'
WEIGHTS_DIR = f'{BASE_DIR}weights/'
OUTPUT_DIR = f'{BASE_DIR}output/'

In [13]:
# Names of the feature columns selected from the data frame: f_0 ... f_299.
features = list(map('f_{}'.format, range(300)))

## Load data ##

In [14]:
%%time
# Load the training table from the parquet file under DATA_DIR. Later cells read
# its investment_id column and the columns listed in `features`.
train_df = pd.read_parquet(DATA_DIR+'train_low_mem.parquet')

CPU times: user 9.05 s, sys: 15.4 s, total: 24.5 s
Wall time: 4.25 s


In [15]:
# Distinct investment_id values found in the training table, as a plain list.
investment_ids = train_df['investment_id'].unique().tolist()

In [16]:
max(investment_ids)

3773

In [17]:
# Load the precomputed fold definitions. The inference loop reads
# folds[fold]['test'] to select the rows of each fold's held-out split.
# INPUT_DIR is used rather than re-assembling BASE_DIR + 'input/' by hand, so
# the input location is defined in exactly one place.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open fold files that were produced by this project.
with open(INPUT_DIR+'folds.pickle', 'rb') as f:
    folds = pickle.load(f)

#### Utility functions ######

In [18]:
gc.collect()

80

## Modeling ##

In [19]:
# Run settings shared by the cells below.
config = dict(
    num_epochs=100,
    lr=0.00026,
    input_size=300,                    # width of the model's first Linear layer
    num_classes=1,                     # output dimension of the model
    train_shuffle=True,
    val_shuffle=False,
    batch_size=2 * 4096,
    best_model_name='baseline_mse',    # suffix of the weight and prediction file names
    early_stopping_patience=10,
)

In [20]:
class TSDataset(Dataset):
    """Map-style dataset that serves each entry of ``x`` as a float32 tensor."""

    def __init__(self, x):
        """
        Args:
            x: Indexable collection of samples that supports ``len()``. The
                inference loop passes the 2-D NumPy array of feature values.
        """
        self.x = x

    def __len__(self):
        """Return the number of samples, i.e. ``len(x)``."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``x[idx]`` converted to a float32 tensor.

        ``torch.tensor(data, dtype=...)`` replaces the legacy
        ``torch.Tensor(...)`` constructor: the legacy form interprets a bare
        integer argument as a *size* and hands back uninitialised memory
        instead of the value itself.
        """
        return torch.tensor(self.x[idx], dtype=torch.float32)

#### Model ####

In [21]:
# Pull the frequently used settings out of `config` into module-level names.
num_epochs, lr = config['num_epochs'], config['lr']
input_size, num_classes = config['input_size'], config['num_classes']
early_stopping_patience = config['early_stopping_patience']

#### Loss function ######

In [22]:
def pearson_loss(x, y, eps=1e-12):
    """Return ``1 - r``, where ``r`` is the Pearson correlation of ``x`` and ``y``.

    Each tensor is centred on its own mean taken over all of its elements, so
    the correlation is computed across the whole tensor, not per row.

    Args:
        x: Tensor of floating-point values.
        y: Tensor whose element-wise product with ``x`` is defined.
        eps: Lower bound applied to the denominator. When either input is
            constant the denominator is exactly 0 and the unguarded ratio is
            0/0 = NaN; with the bound the ratio becomes 0 and the loss is 1
            ("no correlation").
            NOTE(review): a bound this small rounds to 0 in float16, so
            half-precision inputs are presumably still unprotected - confirm
            if the loss is ever evaluated in fp16.

    Returns:
        Scalar tensor: about 0 for perfectly correlated inputs, 1 for
        uncorrelated ones and about 2 for perfectly anti-correlated ones.
    """
    x_centered = x - x.mean()
    y_centered = y - y.mean()
    covariance = (x_centered * y_centered).sum()
    scale = (x_centered.pow(2).sum() * y_centered.pow(2).sum()).sqrt()
    return 1 - covariance / scale.clamp_min(eps)

In [23]:
class BaselineModel(nn.Module):
    """Fully connected regressor: input_size -> 508 -> 405 -> num_classes."""

    def __init__(self, num_classes, input_size):
        """Build the network.

        Args:
            num_classes: Width of the final Linear layer (the output dimension).
            input_size: Number of input features per sample.
        """
        super().__init__()

        self.num_classes = num_classes
        self.input_size = input_size

        print(self.input_size)

        # A module's position inside the Sequential becomes part of its
        # state_dict key (the Linear layers are fc.0, fc.3 and fc.6), so the
        # order below must stay as it is for saved weights to keep loading.
        layers = [
            nn.Linear(self.input_size, 508),
            nn.ReLU(),
            nn.Dropout(0.42),
            nn.Linear(508, 405),
            nn.Dropout(0.42),
            nn.ReLU(),
            nn.Linear(405, self.num_classes),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the fully connected stack and return the result."""
        return self.fc(x)

In [24]:
gc.collect()

120

In [25]:
# Out-of-fold inference: for every fold, rebuild the model, load that fold's
# saved weights, predict the fold's held-out rows and write the predictions
# (next to row_id / time_id / investment_id / target) to a CSV in OUTPUT_DIR.
# for fold in folds.keys():
for fold in tqdm([0,1,2,3,4]):
    preds = []
    # Colour only the fold number and reset afterwards; without the reset code
    # the red carries over into everything printed later.
    print(f'Starting for fold: {r_}{fold}{sr_}')

    # Rows whose index is listed in this fold's 'test' entry.
    test_indxs = folds[fold]['test']
    test_f_df = train_df[train_df.index.isin(test_indxs)].reset_index(drop=True)

    X_val = test_f_df[features].values

    model = BaselineModel(num_classes, input_size).to(device)
    weights_path = WEIGHTS_DIR+str(fold)+'_'+config['best_model_name']
    # map_location remaps the stored tensors onto `device`, so weights saved
    # from a GPU run can also be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(weights_path, map_location=device))
    model.eval()  # switch the Dropout layers off for inference

    test_dl = DataLoader(TSDataset(X_val), batch_size=config['batch_size'], shuffle=config['val_shuffle'], num_workers=0)
    with torch.no_grad():
        for inputs in tqdm(test_dl):
            dataX = inputs.to(device)
            # squeeze(-1) drops only the trailing output dimension. A plain
            # squeeze() would also drop the batch dimension when the final
            # batch holds a single row, leaving a bare float that
            # preds.extend() cannot iterate over.
            outputs = model(dataX).squeeze(-1).cpu().numpy().tolist()
            preds.extend(outputs)

    test_f_df['predicted'] = preds
    test_f_df[['row_id','time_id','investment_id','target','predicted']].to_csv(OUTPUT_DIR+str(fold)+'_'+config['best_model_name']+'.csv', index=False)

    gc.collect()
    

  0%|          | 0/5 [00:00<?, ?it/s]

Starting for fold: 0[31m
300


  0%|          | 0/64 [00:00<?, ?it/s]

Starting for fold: 1[31m
300


  0%|          | 0/64 [00:00<?, ?it/s]

Starting for fold: 2[31m
300


  0%|          | 0/64 [00:00<?, ?it/s]

Starting for fold: 3[31m
300


  0%|          | 0/64 [00:00<?, ?it/s]

Starting for fold: 4[31m
300


  0%|          | 0/64 [00:00<?, ?it/s]