### pytorch-tabnet
* 5-fold cross-validation
* best result so far among single-model algorithms

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

from pytorch_tabnet.tab_model import TabNetRegressor

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# globals: notebook-wide configuration constants

# Figure size tuple.
# NOTE(review): presumably (width, height) for matplotlib; it is not referenced
# by any cell visible here — confirm it is still needed.
FIGSIZE = (9, 6)

# Random seed; passed as random_state to the K-fold splitter.
SEED = 42

# Path of the training CSV loaded with pd.read_csv.
FILE_TRAIN = "train.csv"

In [3]:
# Load the training set and derive the calendar features in one chain.
#
# Feature engineering adds two columns taken from the parsed timestamp:
# hour of day and year.
# - month is deliberately not added: it is effectively a duplicate of `season`.
# - day of month is not used either.
data_orig = pd.read_csv(FILE_TRAIN).assign(
    # Parse the raw timestamp strings first; the next entries read the parsed column.
    datetime=lambda frame: pd.to_datetime(frame['datetime']),
    hour=lambda frame: frame['datetime'].dt.hour,
    year=lambda frame: frame['datetime'].dt.year,
)

In [4]:
all_columns = data_orig.columns

# Columns to ignore.
# atemp and temp are strongly correlated (0.98), so only one of them is kept.
del_columns = ['datetime', 'casual', 'registered', 'temp']

TARGET = "count"

cat_cols = ['season', 'holiday','workingday', 'weather', 'hour', 'year']

# Numeric columns are whatever is left once the target, the ignored columns and
# the categorical columns are removed. Filtering all_columns in order (rather
# than chaining set differences) keeps num_cols in the frame's column order, so
# the list is identical on every kernel start instead of depending on set
# iteration order.
not_numeric = {TARGET, *del_columns, *cat_cols}
num_cols = [col for col in all_columns if col not in not_numeric]

# Sorted so that the feature order (and hence the column order of the matrices
# built from it) is stable.
features = sorted(cat_cols + num_cols)

# Summary of the column bookkeeping.
print('Tutte le colonne:', len(all_columns))
print('Colonne ignorate:', len(del_columns))
print('target:', len([TARGET]))
print('Colonne cat:', len(cat_cols))
print('Colonne num:', len(num_cols))
print('Num. features', len(features))

Tutte le colonne: 14
Colonne ignorate: 4
target: 1
Colonne cat: 6
Colonne num: 3
Num. features 9


In [5]:
# Working frame without the ignored columns; data_orig itself is left untouched.
data_used = data_orig.drop(columns=del_columns)

In [8]:
# Label-encode every categorical column of the working frame.
#   categorical_dims: column name -> number of distinct classes
#   vet_lenc:         fitted encoders, in the same order as cat_cols
categorical_columns = cat_cols
categorical_dims =  {}
vet_lenc = []

for col in cat_cols:
    # Always fit on the raw values from data_orig (restricted to the rows
    # present in data_used), never on data_used itself: data_used is overwritten
    # below, so fitting on it would make a second run of this cell learn the
    # already-encoded codes as classes and break the encoding of new data.
    raw_values = data_orig.loc[data_used.index, col]
    print(col, raw_values.nunique(), raw_values.unique())
    l_enc = LabelEncoder()
    data_used[col] = l_enc.fit_transform(raw_values.values)
    vet_lenc.append(l_enc)
    categorical_dims[col] = len(l_enc.classes_)

season 4 [1 2 3 4]
holiday 2 [0 1]
workingday 2 [0 1]
weather 4 [1 2 3 4]
hour 24 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
year 2 [2011 2012]


In [9]:
# Locate the categorical features inside the (sorted) feature list once, then
# split the result into positions and cardinalities.
categorical_positions = [
    (position, name)
    for position, name in enumerate(features)
    if name in categorical_columns
]

# Index of each categorical feature within `features`.
cat_idxs = [position for position, _ in categorical_positions]

# Number of classes of each categorical feature, in the same order as cat_idxs.
cat_dims = [categorical_dims[name] for _, name in categorical_positions]

In [15]:
%%time

FOLDS = 5

# Shuffled K-fold splitter; SEED fixes which rows land in which fold.
skf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# Constructor arguments shared by the TabNetRegressor of every fold.
params = dict(
    n_steps=2,
    cat_dims=cat_dims,
    cat_idxs=cat_idxs,
    verbose=50,
)

# One fitted model per fold, in fold order.
best_models = []

for fold_no, (train_idx, valid_idx) in enumerate(skf.split(data_used), start=1):
    print()
    print('Processing fold:', fold_no)

    data_train = data_used.iloc[train_idx]
    data_valid = data_used.iloc[valid_idx]

    # Feature matrices, plus targets reshaped to column vectors of shape (n, 1).
    x_train = data_train[features].values
    y_train = data_train[TARGET].values.reshape(-1, 1)

    x_valid = data_valid[features].values
    y_valid = data_valid[TARGET].values.reshape(-1, 1)

    model = TabNetRegressor(**params)

    # Both rmse and rmsle are reported on the validation fold; in the recorded
    # run the early-stopping message refers to val_0_rmsle.
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        eval_metric=['rmse', 'rmsle'],
        max_epochs=1000,
        patience=100,
        batch_size=256,
    )

    best_models.append(model)


Processing fold: 1
Device used : cuda
epoch 0  | loss: 64915.54318| val_0_rmse: 207.72345| val_0_rmsle: 3.33527 |  0:00:00s
epoch 50 | loss: 3398.20264| val_0_rmse: 55.69292| val_0_rmsle: 0.32845 |  0:00:47s
epoch 100| loss: 2923.74165| val_0_rmse: 59.0454 | val_0_rmsle: 0.30486 |  0:01:35s
epoch 150| loss: 3126.23728| val_0_rmse: 54.4858 | val_0_rmsle: 0.42791 |  0:02:21s
epoch 200| loss: 2352.15818| val_0_rmse: 50.90203| val_0_rmsle: 0.27902 |  0:03:09s
epoch 250| loss: 2164.53267| val_0_rmse: 49.10467| val_0_rmsle: 0.26385 |  0:03:57s
epoch 300| loss: 2100.76719| val_0_rmse: 48.56569| val_0_rmsle: 0.25323 |  0:04:44s
epoch 350| loss: 1951.568| val_0_rmse: 48.71729| val_0_rmsle: 0.27985 |  0:05:31s
epoch 400| loss: 2008.18462| val_0_rmse: 46.48582| val_0_rmsle: 0.21245 |  0:06:18s
epoch 450| loss: 1850.84148| val_0_rmse: 57.4932 | val_0_rmsle: 0.33263 |  0:07:07s

Early stopping occurred at epoch 495 with best_epoch = 395 and best_val_0_rmsle = 0.21042
Best weights from best epoch a

### Submission

In [16]:
# Path of the test CSV, named like FILE_TRAIN for the training file.
FILE_TEST = "test.csv"

test_orig = pd.read_csv(FILE_TEST)

In [17]:
# Derive the same engineered features on the test set as on the training set:
# parse the timestamp, then add hour of day and year (two features; month and
# day of month are not used).
test_orig = test_orig.assign(
    # Parse first; the following entries read the parsed column.
    datetime=lambda frame: pd.to_datetime(frame['datetime']),
    hour=lambda frame: frame['datetime'].dt.hour,
    year=lambda frame: frame['datetime'].dt.year,
)

In [18]:
# Encode the categorical test columns with the encoders fitted on the training
# data; vet_lenc holds one encoder per entry of cat_cols, in the same order.
# NOTE: test_orig is overwritten in place, so running this cell a second time
# would hand already-encoded values to the encoders.
for col, l_enc in zip(cat_cols, vet_lenc):
    column = test_orig[col]
    print(col, column.nunique(), column.unique())
    test_orig[col] = l_enc.transform(column.values)

season 4 [1 2 3 4]
holiday 2 [0 1]
workingday 2 [1 0]
weather 4 [1 2 3 4]
hour 24 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
year 2 [2011 2012]


In [22]:
# Ensemble prediction: unweighted mean of the per-fold models' test predictions.
x_test = test_orig[features].values

# Divide by the number of models actually held in best_models rather than by the
# FOLDS constant, so the average stays correct even if the two get out of sync
# (e.g. FOLDS edited without retraining).
n_models = len(best_models)

# Accumulator with one row per test sample and a single column; the predictions
# are added to it directly, so they are expected to have the same shape.
avg_score = np.zeros((x_test.shape[0], 1))

for model_no, model in enumerate(best_models, start=1):
    print()
    print('Predictions from model', model_no)

    score_test = model.predict(x_test)

    # Each model contributes an equal share to the mean.
    avg_score += score_test / n_models


Predictions from model 1

Predictions from model 2

Predictions from model 3

Predictions from model 4

Predictions from model 5


In [23]:
# The sample submission file is used as the template for the submission frame.
FILE_SAMPLE_SUB = "sampleSubmission.csv"

df_sub = pd.read_csv(FILE_SAMPLE_SUB)

In [24]:
# avg_score is an (n, 1) array; flatten it to one value per submission row.
df_sub["count"] = np.ravel(avg_score)

In [25]:
# Replace negative predicted counts with zero.
condition = df_sub["count"].lt(0)

df_sub["count"] = df_sub["count"].mask(condition, 0)

In [26]:
# Output file for this submission; the same name is passed to the kaggle CLI.
FILE_SUB = "submission33.csv"

# Write the submission without the DataFrame index column.
df_sub.to_csv(path_or_buf=FILE_SUB, index=False)

In [27]:
# Upload the file named by FILE_SUB to the "bike-sharing-demand" competition
# through the kaggle command-line tool, with a short submission message.
!kaggle competitions submit -c "bike-sharing-demand" -f $FILE_SUB -m "sub33 tabnet cv"

100%|█████████████████████████████████████████| 244k/244k [00:01<00:00, 149kB/s]
Successfully submitted to Bike Sharing Demand