### pytorch-tabnet
* 7-fold CV (see `FOLDS` in the training cell)
* best result for a single-model algorithm

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

from pytorch_tabnet.tab_model import TabNetRegressor

In [2]:
# globals

# input data: the labelled training set
FILE_TRAIN = "train.csv"

# seed handed to the cross-validation splitter so folds are reproducible
SEED = 42

# mini-batch size passed to the TabNet fit call
BATCH_SIZE = 256

# default (width, height) for figures
FIGSIZE = (9, 6)

In [4]:
# for adding features
def add_features(df):
    """Return a copy of ``df`` with time-derived features added.

    The 'datetime' column is parsed with ``pd.to_datetime`` and two
    engineered columns are appended, in this order:

    * 'hour' - hour of day taken from the parsed timestamp
    * 'year' - calendar year taken from the parsed timestamp

    The input frame is not modified.
    """
    timestamps = pd.to_datetime(df['datetime'])

    # assign() works on a copy: 'datetime' is replaced in place,
    # the new columns are appended at the end
    return df.assign(
        datetime=timestamps,
        hour=timestamps.dt.hour,
        year=timestamps.dt.year,
    )

In [5]:
# load the raw training data from FILE_TRAIN
data_orig = pd.read_csv(FILE_TRAIN)

# feature engineering
# add_features works on a copy, so data_orig stays exactly as loaded
data_added = add_features(data_orig)

In [None]:
# Take the column inventory from the engineered frame so that the derived
# 'hour' and 'year' columns are counted; with the raw frame the totals
# printed below would not add up.
all_columns = data_added.columns

# columns to ignore
# atemp and temp are strongly correlated (0.98) taking only one
del_columns = ['datetime', 'casual', 'registered', 'temp']

TARGET = "count"

cat_cols = ['season', 'holiday','workingday', 'weather', 'hour', 'year']

# numerical columns: whatever is left once target, ignored and categorical
# columns are removed (sorted, because set ordering is arbitrary)
num_cols = sorted(set(all_columns) - {TARGET} - set(del_columns) - set(cat_cols))

# sorted so the feature order (and therefore cat_idxs) is deterministic
features = sorted(cat_cols + num_cols)

print('Tutte le colonne:', len(all_columns))
print('Colonne ignorate:', len(del_columns))
print('target:', len([TARGET]))
print('Colonne cat:', len(cat_cols))
print('Colonne num:', len(num_cols))
print('Num. features', len(features))

In [None]:
# Build the modelling frame from the engineered data: cat_cols contains
# 'hour' and 'year', which exist only in data_added (data_orig lacks them
# and would raise a KeyError as soon as those columns are accessed).
data_used = data_added.drop(columns=del_columns)

In [None]:
# this part is required for TabNet:
# every categorical column is replaced by integer codes 0..n_classes-1

# fitted encoders, in cat_cols order, kept for encoding the test set later
vet_lenc = []

for col in cat_cols:
    print(col)
    encoder = LabelEncoder()
    codes = encoder.fit_transform(data_used[col].values)
    data_used[col] = codes
    vet_lenc.append(encoder)

# number of distinct classes per categorical column
categorical_dims = {
    col: len(encoder.classes_) for col, encoder in zip(cat_cols, vet_lenc)
}

In [None]:
# positions of the categorical columns inside `features` ...
cat_idxs = [pos for pos, name in enumerate(features) if name in cat_cols]
# ... and, in the same order, how many classes each of them has
cat_dims = [categorical_dims[features[pos]] for pos in cat_idxs]

### Training

In [None]:
%%time

FOLDS = 7

# plain (non-stratified) K-fold, shuffled with a fixed seed so that the
# folds are the same on every run
skf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# tried it: n_steps = 2 seems to work better,
# maybe because larger values overfit
params = {
    'n_steps': 2,
    'cat_dims': cat_dims,
    'cat_idxs': cat_idxs,
    'verbose': 50,
}

# one fitted model per fold is collected here
best_models = []

EPOCHS = 1000
PATIENCE = 100

# folds are numbered from 1 in the log output
for i, (train_idx, valid_idx) in enumerate(skf.split(data_used), start=1):
    print()
    print('Processing fold:', i)

    data_train = data_used.iloc[train_idx]
    data_valid = data_used.iloc[valid_idx]

    # targets are reshaped to (n_samples, 1): TabNetRegressor wants 2-D targets
    x_train = data_train[features].values
    y_train = data_train[TARGET].values.reshape(-1, 1)

    x_valid = data_valid[features].values
    y_valid = data_valid[TARGET].values.reshape(-1, 1)

    model = TabNetRegressor(**params)

    # try to select the best epoch directly on rmsle: pytorch-tabnet uses the
    # LAST entry of eval_metric for early stopping, so 'rmsle' must stay last
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        eval_metric=['rmse', 'rmsle'],
        max_epochs=EPOCHS,
        patience=PATIENCE,
        batch_size=BATCH_SIZE,
    )

    best_models.append(model)

### Submission

In [None]:
# load the test set that predictions are produced for
test_orig = pd.read_csv("test.csv")

In [None]:
# add engineered features
# (same add_features used for the training data, so the test set gets the same derived columns)
test_orig = add_features(test_orig)

In [None]:
# code categorical
# vet_lenc holds the encoders fitted on the training data, in cat_cols order,
# so pairing the two lists applies the matching encoder to each column
for col, encoder in zip(cat_cols, vet_lenc):
    print(col)
    test_orig[col] = encoder.transform(test_orig[col].values)

In [None]:
x_test = test_orig[features].values

# Average the predictions of all fold models. The divisor is the number of
# models actually available, not the FOLDS constant, so the mean stays
# correct even if training stopped early or FOLDS was changed afterwards.
n_models = len(best_models)

# accumulator with one column, matching the (n_samples, 1) targets used in training
avg_score = np.zeros((x_test.shape[0], 1))

for i, model in enumerate(best_models, start=1):
    print()
    print('Predictions from model', i)

    avg_score += model.predict(x_test) / n_models

In [None]:
# load the sample submission and reuse its layout for our own submission
df_sub = pd.read_csv("sampleSubmission.csv")

In [None]:
# overwrite the "count" column with the averaged predictions
# (avg_score was created with shape (n_rows, 1), i.e. a single column)
df_sub["count"] = avg_score

In [None]:
# replace negative with zero
# (a regressor can output values below zero, but a count cannot be negative)
negative_rows = df_sub["count"] < 0
df_sub["count"] = df_sub["count"].mask(negative_rows, 0)

In [None]:
# name of the submission file written below
FILE_SUB = "submission33bis.csv"

# index=False: write only the DataFrame columns, without the row index
df_sub.to_csv(FILE_SUB, index=False)

In [None]:
# submit the file to the "bike-sharing-demand" competition with the kaggle CLI
# (IPython shell escape: $FILE_SUB is expanded to the Python variable's value)
!kaggle competitions submit -c "bike-sharing-demand" -f $FILE_SUB -m "sub33bis tabnet cv"