In [None]:
import os

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import seaborn as sns
import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Global plot styling. Every option is passed by keyword so it is clear which
# seaborn setting each value controls.
sns.set()
sns.set_style(style="whitegrid", rc={'grid.linestyle': '--'})
sns.set_context(context="paper", font_scale=1.5, rc={"lines.linewidth": 4})
sns.set_palette(palette="winter_r", n_colors=8, desat=1)
# NOTE(review): this last call sets context, style and palette again, so it
# presumably supersedes the calls above -- confirm, then drop the redundant ones.
sns.set(
    context='talk',
    style='whitegrid',
    palette='dark',
    font_scale=1.,
    rc={"lines.linewidth": 2, 'grid.linestyle': '--'},
)

In [None]:
# --- Input location -------------------------------------------------------
# The preprocessed data lives in ../input/preprocessed relative to this notebook.
DATA_PATH = os.path.join(os.pardir, 'input', 'preprocessed')
MODELING_DATA_FILE = os.path.join(DATA_PATH, 'modeling.csv')

# --- Model settings -------------------------------------------------------
SEED = 22      # seed handed to the TabNet models
N_STEPS = 3    # number of TabNet decision steps (also the number of mask plots)

# --- Columns --------------------------------------------------------------
TARGET_COL = 'Survived'
FEATURE_COLS = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'in_cabin',
    'cabin_shared',
    'familySize',
    'is_alone',
    'is_large_family',
    'salutation_impact',
    'is_family_boy',
    'golden_ticket',
    'fare_med',
    'high_fare',
    'Embarked_n',
]

In [None]:
# Read the preprocessed table and keep only the target followed by the model
# features, in that order. The bare name on the last line displays the frame.
modeling_columns = [TARGET_COL, *FEATURE_COLS]
df_modeling = pd.read_csv(MODELING_DATA_FILE).loc[:, modeling_columns]

df_modeling

Label Encoding

In [None]:
# le = dict()
# for col in df_modeling.columns:
#     le_ = preprocessing.LabelEncoder()
#     le_.fit(df_modeling[col])
#     df_modeling[col] = le_.transform(df_modeling[col])
#     le.update({col: le_})

# df_modeling

In [None]:
# def label_decode(df):
#     for col in df.columns:
#         df[col] = le[col].inverse_transform(df[col])
#     return df

# df_modeling_orig = label_decode(df_modeling)
# df_modeling_orig

In [None]:
# Separate the feature matrix from the target column.
X = df_modeling.drop(columns=TARGET_COL)
y = df_modeling[TARGET_COL]

# Hold out 20% of the rows for validation. train_test_split is already imported
# in the imports cell at the top of the notebook, so it is not re-imported here.
# NOTE(review): random_state is fixed at 0 rather than SEED; left unchanged so
# the split stays identical to earlier runs -- confirm whether SEED was intended.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# cat_dims = list()

# for col, le_ in le.items():
#     if col == 'Survived':
#         continue  # skip the target column; a bare `pass` would fall through and still record it
#     print(col, len(le_.classes_))
#     cat_dims.append(len(le_.classes_))

Pretraining

In [None]:
# Hyper-parameters for the self-supervised TabNet pretrainer.
tabnet_params = dict(
    n_d=8, n_a=8, n_steps=N_STEPS, gamma=1.3, n_independent=2, n_shared=2,
    seed=SEED, lambda_sparse=1e-3,
    optimizer_fn=torch.optim.Adam, optimizer_params=dict(lr=2e-2),
    mask_type="entmax",
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    scheduler_params=dict(mode="min", patience=5, min_lr=1e-5, factor=0.9),
    verbose=10,
)

pretrainer = TabNetPretrainer(**tabnet_params)

# Pretraining only needs features (no labels). The evaluation set is the
# held-out validation features: monitoring the training rows themselves would
# let early stopping follow the training loss and never detect over-fitting.
pretrainer.fit(
    X_train=X_train.values,
    eval_set=[X_valid.values],
    max_epochs=200, patience=20, batch_size=256, virtual_batch_size=128, num_workers=1, drop_last=True
)

Main Training

In [None]:
# Batch size and epoch budget are named once because the OneCycleLR schedule
# and model.fit() have to agree on them: the schedule is laid out over exactly
# epochs * steps_per_epoch optimizer steps.
batch_size = 256
max_epochs = 100
# fit() is called with drop_last=False, so the final partial batch is kept and
# the number of batches per epoch is the ceiling of n_rows / batch_size.
steps_per_epoch = (X_train.shape[0] + batch_size - 1) // batch_size

# Hyper-parameters for the supervised classifier. n_steps uses the shared
# N_STEPS constant so it matches the pretrainer and the mask plots.
tabnet_params = dict(n_d=8, n_a=8, n_steps=N_STEPS, gamma=1.3, n_independent=2, n_shared=2,
                     seed=SEED, lambda_sparse=1e-3,
                     optimizer_fn=torch.optim.Adam,
                     optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                     mask_type="entmax",
                     scheduler_params=dict(max_lr=0.05, steps_per_epoch=steps_per_epoch,
                                           epochs=max_epochs, is_batch_level=True),
                     scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
                     verbose=10,
                     # cat_idxs=cat_idxs,     # comment out when Unsupervised
                     # cat_dims=cat_dims,     # comment out when Unsupervised
                     cat_emb_dim=1          # comment out when Unsupervised
                    )

model = TabNetClassifier(**tabnet_params)

# Train on the training split and report AUC and accuracy on both splits.
# from_unsupervised hands the pretrainer fitted earlier in the notebook to fit().
model.fit(X_train=X_train.values,
          y_train=y_train.values,
          eval_set=[(X_train.values, y_train.values), (X_valid.values, y_valid.values)],
          eval_name = ['train', 'valid'],
          eval_metric = ['auc', 'accuracy'],
          max_epochs=max_epochs,
          patience=20, batch_size=batch_size, virtual_batch_size=128,
          num_workers=0, drop_last=False,
          from_unsupervised=pretrainer # comment out when Unsupervised
         )

Result

In [None]:
# One row per feature holding the importance reported by the fitted model,
# ordered ascending, then drawn as a horizontal bar chart.
df_feature = (
    pd.DataFrame(
        data=model.feature_importances_,
        index=FEATURE_COLS,
        columns=['feature_importance'],
    )
    .sort_values(by='feature_importance', ascending=True)
)
df_feature.plot(kind='barh', figsize=(12, 12), legend=False)

In [None]:
# Masks (local interpretability)
# A mask is created at every decision step, i.e. each time the model decides
# which features to use.
# The horizontal axis is the feature.

explain_matrix, masks = model.explain(X_valid.values)

# squeeze=False always returns a 2-D grid of axes, so taking row 0 gives a 1-D
# sequence that can be indexed even when N_STEPS == 1.
fig, axes_grid = plt.subplots(1, N_STEPS, figsize=(20, 20), squeeze=False)
axs = axes_grid[0]
feature_ticks = list(range(len(FEATURE_COLS)))

for i in range(N_STEPS):
    axs[i].imshow(masks[i][:25])  # only the first 25 rows are drawn
    axs[i].set_title('mask {}'.format(i))
    # Pin one tick per feature column first, then label those ticks; labelling
    # before the positions are fixed attaches the names to whatever ticks the
    # automatic locator happened to choose and triggers a matplotlib warning.
    axs[i].xaxis.set_major_locator(ticker.FixedLocator(feature_ticks))
    axs[i].set_xticklabels(FEATURE_COLS, rotation=90)

In [None]:
# Run the fitted model on the validation features; the bare name on the last
# line displays the predictions as the cell output.
valid_features = X_valid.values
y_pred = model.predict(valid_features)

y_pred

In [None]:
# Accuracy on the validation split. accuracy_score is imported in the imports
# cell at the top of the notebook. Arguments follow the scikit-learn
# convention (y_true first, then y_pred).
accuracy_score(y_valid.values, y_pred)

In [None]:
# Validation AUC recorded in the model's training history, with labelled axes
# so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(model.history['valid_auc'])
ax.set_xlabel('epoch')
ax.set_ylabel('valid AUC')
ax.set_title('Validation AUC during training');