In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import dask.dataframe as dd

In [4]:
import gc

In [5]:
import pickle

In [6]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [7]:
from torch.utils.data import Dataset, DataLoader
from accelerate import Accelerator
import torch.optim as optim
from sklearn.model_selection import train_test_split


In [8]:
# import sys
# sys.path.append("../data/")

In [9]:
# import ubiquant

##### Global Variables ####

In [10]:
from colorama import Fore, Back, Style
# Short aliases for colorama foreground colours and the style reset code.
# NOTE(review): none of these are referenced in the cells visible here;
# presumably they colour log output further down — confirm or remove.
r_ = Fore.RED
b_ = Fore.BLUE
c_ = Fore.CYAN
g_ = Fore.GREEN
y_ = Fore.YELLOW
m_ = Fore.MAGENTA
sr_ = Style.RESET_ALL

In [11]:
gc.collect()

0

In [12]:
# Directory holding the parquet inputs. File names are appended with plain
# string concatenation, so the trailing slash is required.
DATA_DIR = '../data/parquet/'

In [13]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [14]:
device

device(type='cuda', index=0)

In [15]:
# Window geometry and network width; these are copied into `config` and used
# when cutting sliding windows out of each investment's series.
input_chunk_length = 30   # rows per input window
output_chunk_length = 1   # rows of target taken right after each window
hidden_size = 50          # stored as config['hidden_size']

In [16]:
# Names of the 300 anonymised feature columns: 'f_0' ... 'f_299'.
features = [f'f_{i}' for i in range(300)]

In [17]:
# Single place for the run's hyper-parameters and bookkeeping values.
config = {
    # Window geometry (mirrors the module-level constants).
    'seq_length': input_chunk_length,
    'pred_length': output_chunk_length,

    # Optimisation.
    'num_epochs': 1,
    'lr': 0.001,

    # Network shape.
    # NOTE(review): 'input_size' is 180, but the feature matrix built from
    # `all_features_columns` has 302 columns — confirm which is intended.
    'input_size': 180,
    'hidden_size': hidden_size,
    'num_layers': 2,
    'num_classes': 1,  # output dimension
    'train_shuffle': True,
    'val_shuffle': True,
    'batch_size': 128,

    # Checkpoint name encodes window length, horizon and hidden size.
    'best_model_name': f'lstm_{input_chunk_length}_{output_chunk_length}_{hidden_size}',
    'bidirectional': False,
    'only_last_hidden': False,
}

#### Read Data #####

In [18]:
%%time
# Load the full training table (row_id, time_id, investment_id, target, f_0..f_299).
train_df = pd.read_parquet(DATA_DIR+'train_low_mem.parquet')

CPU times: user 26.8 s, sys: 9.26 s, total: 36 s
Wall time: 6.08 s


In [19]:
train_df.head()

Unnamed: 0,row_id,time_id,investment_id,target,f_0,f_1,f_2,f_3,f_4,f_5,...,f_290,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299
0,0_1,0,1,-0.300875,0.932573,0.113691,-0.402206,0.378386,-0.203938,-0.413469,...,0.366028,-1.09562,0.200075,0.819155,0.941183,-0.086764,-1.087009,-1.044826,-0.287605,0.321566
1,0_2,0,2,-0.23104,0.810802,-0.514115,0.742368,-0.616673,-0.194255,1.77121,...,-0.154193,0.912726,-0.734579,0.819155,0.941183,-0.387617,-1.087009,-0.929529,-0.97406,-0.343624
2,0_6,0,6,0.568807,0.393974,0.615937,0.567806,-0.607963,0.068883,-1.083155,...,-0.13802,0.912726,-0.551904,-1.220772,-1.060166,-0.219097,-1.087009,-0.612428,-0.113944,0.243608
3,0_7,0,7,-1.06478,-2.343535,-0.01187,1.874606,-0.606346,-0.586827,-0.815737,...,0.382201,0.912726,-0.266359,-1.220772,0.941183,-0.609113,0.104928,-0.783423,1.15173,-0.773309
4,0_8,0,8,-0.53194,0.842057,-0.262993,2.33003,-0.583422,-0.618392,-0.742814,...,-0.170365,0.912726,-0.741355,-1.220772,0.941183,-0.588445,0.104928,0.753279,1.345611,-0.737624


In [20]:
####### Filter out only Fold 1 data ###########

In [21]:
# Load the precomputed CV folds. `folds` is indexed by fold number and each
# entry exposes 'train' and 'test' collections of row index labels.
# SECURITY: pickle.load executes arbitrary code embedded in the file — only
# load fold files that were produced by a trusted source.
with open('../input/folds.pickle', 'rb') as f:
    folds = pickle.load(f)

In [22]:
# Row index labels of the first fold (position 0) for the train and held-out parts.
train_indxs = folds[0]['train']
test_indxs = folds[0]['test']

In [23]:
# Split the loaded frame by index label into held-out and training parts.
# Order matters: test_df must be taken first because the second statement
# rebinds `train_df` to the filtered subset.
# NOTE: this cell is not idempotent — after reset_index the labels no longer
# match the fold indices, so re-running it without reloading `train_df`
# filters the already-filtered frame by position and gives wrong rows.
test_df = train_df[train_df.index.isin(test_indxs)].reset_index(drop=True)
train_df = train_df[train_df.index.isin(train_indxs)].reset_index(drop=True)

In [24]:
gc.collect()

0

In [25]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531075 entries, 0 to 531074
Columns: 304 entries, row_id to f_299
dtypes: float32(301), object(1), uint16(2)
memory usage: 615.9+ MB


In [26]:
# train_df = train_df.sample(frac=0.2)

In [27]:
# Distinct investment ids in the training fold, in order of first appearance.
# NOTE(review): make_sequences recomputes its own local `investment_ids`, so
# this module-level list is unused in the cells visible here — confirm it is
# needed later.
investment_ids = train_df.investment_id.unique().tolist()

In [28]:
# Number of rows per investment_id; column `c` holds the count
# (len of each group's time_id values).
count_df = train_df.groupby('investment_id',as_index=False).agg(c = ('time_id', len))

In [29]:
count_df.c.describe()

count    3577.000000
mean      148.469388
std        52.384007
min         2.000000
25%       116.000000
50%       170.000000
75%       191.000000
max       206.000000
Name: c, dtype: float64

In [30]:
# Investments whose history is too short to yield even one training sample.
# A sample needs `input_chunk_length` input rows followed by
# `output_chunk_length` target rows, so the threshold is their sum rather than
# a hard-coded `+ 1` (identical while output_chunk_length == 1).
to_exclude_ids = count_df[count_df.c < input_chunk_length + output_chunk_length].investment_id.unique().tolist()

In [31]:
train_df.shape

(531075, 304)

## Modeling ##

##### Prepare data ######

In [32]:
# Column order of the per-row input matrix built by get_feature_array_train.
# NOTE(review): this puts `investment_id` and `target` into the model inputs
# next to the 300 features (302 columns in total), while config['input_size']
# is 180 — confirm that feeding the raw id and the lagged target is intended.
all_features_columns = ['investment_id', 'target'] + features

In [33]:
def get_feature_array_train(rows):
    """Convert a block of rows into an input matrix and a target column.

    Parameters
    ----------
    rows : pandas.DataFrame
        Must contain a ``target`` column and every column listed in the
        module-level ``all_features_columns``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``inp_vec`` with one row per input row and one column per entry of
        ``all_features_columns`` (which itself includes ``target``), and
        ``target_val`` holding the targets as a column of shape ``(n_rows, 1)``.
    """
    feature_matrix = rows[all_features_columns].to_numpy()
    target_column = rows["target"].to_numpy().reshape(-1, 1)
    return feature_matrix, target_column

In [34]:
def sliding_windows(X,Y,seq_length,prediction_length):
    """Cut aligned (input window, future target) pairs out of one ordered series.

    For every start offset ``i`` the input is rows ``i .. i+seq_length-1`` of
    ``X`` and the label is the ``prediction_length`` rows of ``Y`` that follow
    immediately after that window.

    Parameters
    ----------
    X : numpy.ndarray
        Per-row inputs, indexed as ``X[row, :]``.
    Y : numpy.ndarray
        Per-row targets, indexed as ``Y[row, :]`` and aligned with ``X``.
    seq_length : int
        Number of rows in each input window.
    prediction_length : int
        Number of target rows taken after each window.

    Returns
    -------
    (list, list)
        Two equally long lists: input windows of shape ``(seq_length, -1)`` and
        target slices of ``prediction_length`` rows. Both are empty when the
        series has fewer than ``seq_length + prediction_length`` rows.
    """
    # The last valid start offset is len(X) - seq_length - prediction_length,
    # so the number of windows is that value plus one. The previous bound left
    # out the "+ 1" and silently dropped the final window of every series.
    # A negative count makes range() empty, which covers too-short series.
    n_windows = len(X) - (seq_length + prediction_length) + 1

    x = []
    y = []
    for i in range(n_windows):
        window_end = i + seq_length
        x.append(X[i:window_end, :].reshape(seq_length, -1))
        y.append(Y[window_end:window_end + prediction_length, :])

    return x, y

In [35]:
def make_sequences(df):
    """Build the stacked window/target arrays for every usable investment.

    Each investment's rows are sorted by ``time_id``, converted with
    ``get_feature_array_train`` and cut into samples with ``sliding_windows``
    using the module-level ``input_chunk_length`` / ``output_chunk_length``.
    Investments listed in the module-level ``to_exclude_ids`` are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``investment_id``, ``time_id`` and the columns required by
        ``get_feature_array_train``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        All input windows and all target slices, concatenated across
        investments in order of each investment's first appearance in ``df``.

    Notes
    -----
    NOTE(review): ``to_exclude_ids`` is derived from ``train_df``; when this is
    called on another frame the exclusion list may not match it — confirm.
    The result materialises every window, so its size grows with
    n_windows * input_chunk_length * n_columns.
    """
    # O(1) membership checks instead of scanning a list once per investment.
    excluded = set(to_exclude_ids)

    x = []
    y = []
    # One pass over the frame instead of a full boolean mask per investment.
    # sort=False keeps groups in order of first appearance, matching the order
    # that `df.investment_id.unique()` produced before.
    for iid, df_tmp in df.groupby('investment_id', sort=False):
        if iid in excluded:
            continue
        df_tmp = df_tmp.sort_values('time_id')
        tmp_X, tmp_Y = get_feature_array_train(df_tmp)
        _x, _y = sliding_windows(tmp_X, tmp_Y, input_chunk_length, output_chunk_length)
        x.extend(_x)
        y.extend(_y)

    # Collect once after the loop; a full collection per investment only added
    # overhead without freeing anything reference counting had not already freed.
    gc.collect()

    return np.array(x), np.array(y)
            
            
        

In [None]:
# Build the training windows and their targets from the fold's training rows.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# may never have completed — check memory use before running on the full fold.
X_train, Y_train = make_sequences(train_df)

In [None]:
X_train.shape

In [None]:
Y_train.shape