In [1]:
import os
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
import torch

In [3]:
from pytorch_tabnet.tab_model import TabNetRegressor

In [4]:
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
import time

In [7]:
from tqdm.notebook import tqdm

In [8]:
import gc

## Global Variables

In [9]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [10]:
from colorama import Fore, Back, Style
# Short aliases for the colorama foreground colour codes used in progress messages.
r_, b_, c_ = Fore.RED, Fore.BLUE, Fore.CYAN
g_, y_, m_ = Fore.GREEN, Fore.YELLOW, Fore.MAGENTA
# Code that resets all styling back to the default.
sr_ = Style.RESET_ALL

In [11]:
# Root of the project tree. Defaults to the original shared-drive location, but
# can be redirected by setting the UBIQUANT_BASE_DIR environment variable so the
# notebook runs on another machine without editing this cell.
# os.path.join(..., '') guarantees the trailing separator that the string
# concatenations below (and elsewhere in the notebook) rely on.
BASE_DIR = os.path.join(
    os.environ.get('UBIQUANT_BASE_DIR') or '/sharedHDD/rohit/timeseries_learning/ubiquant/',
    '',
)
DATA_DIR = BASE_DIR + 'data/parquet/'   # parquet copy of the training table
INPUT_DIR = BASE_DIR + 'input/'         # auxiliary inputs
WEIGHTS_DIR = BASE_DIR + 'weights/'     # saved model weights, one per fold
OUTPUT_DIR = BASE_DIR + 'output/'       # per-fold prediction CSVs

In [12]:
# Names of the 300 feature columns: f_0 ... f_299.
features = ['f_' + str(idx) for idx in range(300)]

## Read Data

In [13]:
%%time
# Load the full training table from its parquet file.
train_df = pd.read_parquet(f'{DATA_DIR}train_low_mem.parquet')

CPU times: user 8.01 s, sys: 14.3 s, total: 22.3 s
Wall time: 4.18 s


In [14]:
# Preview the first five rows to sanity-check the loaded columns.
train_df.head(n=5)

Unnamed: 0,row_id,time_id,investment_id,target,f_0,f_1,f_2,f_3,f_4,f_5,...,f_290,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299
0,0_1,0,1,-0.300875,0.932573,0.113691,-0.402206,0.378386,-0.203938,-0.413469,...,0.366028,-1.09562,0.200075,0.819155,0.941183,-0.086764,-1.087009,-1.044826,-0.287605,0.321566
1,0_2,0,2,-0.23104,0.810802,-0.514115,0.742368,-0.616673,-0.194255,1.77121,...,-0.154193,0.912726,-0.734579,0.819155,0.941183,-0.387617,-1.087009,-0.929529,-0.97406,-0.343624
2,0_6,0,6,0.568807,0.393974,0.615937,0.567806,-0.607963,0.068883,-1.083155,...,-0.13802,0.912726,-0.551904,-1.220772,-1.060166,-0.219097,-1.087009,-0.612428,-0.113944,0.243608
3,0_7,0,7,-1.06478,-2.343535,-0.01187,1.874606,-0.606346,-0.586827,-0.815737,...,0.382201,0.912726,-0.266359,-1.220772,0.941183,-0.609113,0.104928,-0.783423,1.15173,-0.773309
4,0_8,0,8,-0.53194,0.842057,-0.262993,2.33003,-0.583422,-0.618392,-0.742814,...,-0.170365,0.912726,-0.741355,-1.220772,0.941183,-0.588445,0.104928,0.753279,1.345611,-0.737624


In [15]:
# Distinct investment_id values present in the training table, as a plain Python list.
investment_ids = train_df['investment_id'].unique().tolist()

In [16]:
# Size of the investment_id vocabulary: one past the largest id observed,
# so every raw id value is a valid index.
cat_vocab_size = 1 + max(investment_ids)

In [17]:
# Load the precomputed cross-validation split. The training cell reads it as
# folds[fold]['train'] / folds[fold]['test'].
# INPUT_DIR already points at BASE_DIR + 'input/', so reuse it instead of
# rebuilding the same path by hand.
# NOTE: pickle.load can execute arbitrary code; only open files you created
# or otherwise trust.
with open(INPUT_DIR + 'folds.pickle', 'rb') as f:
    folds = pickle.load(f)

## Modeling

In [18]:
# Identifier column, label column and all feature columns in a single list.
all_features_columns = ['investment_id', 'target', *features]

### Loss function

In [19]:
def pearson_loss(x, y, eps=1e-12):
    """Return ``1 - r``, where ``r`` is the Pearson correlation of ``x`` and ``y``.

    Both inputs are flattened first, so a column vector of shape ``(N, 1)``
    and a flat vector of shape ``(N,)`` are compared element by element
    instead of being broadcast into an ``(N, N)`` matrix (which would make
    the numerator collapse to zero).

    Args:
        x: floating-point torch tensor, any shape.
        y: floating-point torch tensor with the same number of elements as ``x``.
        eps: lower bound applied to the product of the two sums of squared
            deviations before the square root is taken, so a constant input
            cannot cause a division by zero.

    Returns:
        Scalar tensor: 0 for perfect positive correlation, 2 for perfect
        negative correlation, and 1 when either input has zero variance.
    """
    x = x.reshape(-1)
    y = y.reshape(-1)
    xd = x - x.mean()
    yd = y - y.mean()
    nom = (xd * yd).sum()
    # Clamp before the square root: the denominator stays non-zero and the
    # gradient of sqrt stays finite when one of the inputs is constant.
    denom = ((xd ** 2).sum() * (yd ** 2).sum()).clamp(min=eps).sqrt()
    return 1 - nom / denom

### TabNet Preparation

In [20]:
# Column groups for the model matrix, which the training cell assembles as
# cat_names + cont_names (categorical columns first).
cont_names = features  # same list object as `features`, not a copy
cat_names = ['investment_id']

In [21]:
# Positions of the categorical columns inside the model matrix, which the
# training cell builds as cat_names + cont_names. Deriving the positions
# (currently [0]) keeps them correct if either list is edited.
cat_idxs = [(cat_names + cont_names).index(name) for name in cat_names]

In [22]:
# Number of distinct values per categorical column (one entry per cat_idxs entry).
cat_dims = [
    cat_vocab_size,
]

In [23]:
# Embedding width per categorical column (one entry per cat_idxs entry).
cat_emb_dim = [
    100,
]

In [24]:
# Keyword arguments for TabNetRegressor(**tabnet_params).
tabnet_params = dict(
    # Categorical handling: column positions, cardinalities and embedding widths.
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=cat_emb_dim,
    # Network size.
    n_d = 16,
    n_a = 16,
    n_steps = 2,
    gamma = 2,
    n_independent = 2,
    n_shared = 2,
    # 0 disables the sparsity regularisation term.
    lambda_sparse = 0,
    optimizer_fn = Adam,
    optimizer_params = dict(lr = (2e-2)),
    mask_type = "entmax",
    # `verbose` is intentionally not passed: False was already the default and
    # the argument is deprecated in recent PyTorch releases.
    # NOTE(review): with T_0=200 and the 15-epoch budget used in the training
    # cell, the cosine schedule presumably never reaches a restart - confirm
    # this is intended.
    scheduler_params = dict(T_0=200, T_mult=1, eta_min=1e-4, last_epoch=-1),
    scheduler_fn = CosineAnnealingWarmRestarts,
    seed = 42,
#     verbose = 10,
#     device_name = 'cpu'
)

In [25]:
# Number of target values, i.e. rows in the training table.
train_df['target'].to_numpy().shape

(3141410,)

## Training

In [26]:
# Train one TabNet regressor per selected fold, then save that fold's
# validation predictions and the fitted weights.
# NOTE(review): fit() is called without a custom loss and early-stops on
# validation MSE; the pearson_loss defined earlier in this notebook is not
# used here.
# for fold in folds.keys():
for fold in [4]:
    # Emit the colour code before the text and reset it afterwards, so only
    # the message itself is coloured and later output keeps the default style.
    print(f'{r_}Starting for fold: {fold}{sr_}')
    print(f'{m_}Preparing training data for fold: {fold}{sr_}')

    # Select rows whose index label appears in the fold's train / test lists.
    train_indxs = folds[fold]['train']
    test_indxs = folds[fold]['test']
    test_f_df = train_df[train_df.index.isin(test_indxs)].reset_index(drop=True)
    train_f_df = train_df[train_df.index.isin(train_indxs)].reset_index(drop=True)

    # Model matrix layout: categorical columns first, then continuous ones.
    X_train = train_f_df[cat_names+cont_names].values
    y_train = train_f_df['target'].values

    X_val = test_f_df[cat_names+cont_names].values
    y_val = test_f_df['target'].values

    # Targets are reshaped to a single column for fit().
    model =  TabNetRegressor(**tabnet_params)
    model.fit(
          X_train, y_train.reshape(-1, 1),
          eval_set=[(X_val, y_val.reshape(-1, 1))],
          eval_name=['valid'],
          max_epochs = 15,
          patience = 10,
          batch_size = 2048,
          virtual_batch_size = 128,
          num_workers = 4,
          drop_last = False,
          eval_metric=['mse'],
          )

    # Predict on the held-out rows and store the result next to the true target.
    preds = model.predict(X_val)

    test_f_df['predicted'] = preds.reshape(-1).tolist()
    test_f_df[['row_id','time_id','investment_id','target','predicted']].to_csv(OUTPUT_DIR+str(fold)+'_tabnet.csv', index=False)

    model.save_model(WEIGHTS_DIR+str(fold)+'_tabnet')

Starting for fold: 4[31m
Preparing training data for fold: 4[35m
Device used : cuda
epoch 0  | loss: 0.85929 | valid_mse: 0.7516199946403503|  0:01:22s
epoch 1  | loss: 0.8453  | valid_mse: 0.7481099963188171|  0:02:45s
epoch 2  | loss: 0.84114 | valid_mse: 0.7482200264930725|  0:04:11s
epoch 3  | loss: 0.83688 | valid_mse: 0.7537699937820435|  0:05:35s
epoch 4  | loss: 0.83196 | valid_mse: 0.7517300248146057|  0:06:57s
epoch 5  | loss: 0.82604 | valid_mse: 0.7563300132751465|  0:08:17s
epoch 6  | loss: 0.81979 | valid_mse: 0.7572000026702881|  0:09:38s
epoch 7  | loss: 0.8124  | valid_mse: 0.7581999897956848|  0:10:59s
epoch 8  | loss: 0.80523 | valid_mse: 0.7734900116920471|  0:12:20s
epoch 9  | loss: 0.79794 | valid_mse: 0.7721199989318848|  0:13:43s
epoch 10 | loss: 0.79107 | valid_mse: 0.779699981212616|  0:15:04s
epoch 11 | loss: 0.78474 | valid_mse: 0.7825199961662292|  0:16:27s

Early stopping occurred at epoch 11 with best_epoch = 1 and best_valid_mse = 0.7481099963188171
Be