In [1]:
from pytorch_tabnet.tab_model import TabNetClassifier

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import pandas as pd
import numpy as np
np.random.seed(0)


import os
import wget
from pathlib import Path

from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
!git clone https://github.com/pianonyy/VCR_uplift.git

In [None]:
!pip install -e VCR_uplift

# Load data and split

In [2]:
# функция для уменьшения объёма датасета за счёт изменения типов
def reduce_mem_usage(df, verbose=True): 
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
import sys
from sklearn.model_selection import train_test_split
import logging
import pickle

log_format = '[%(asctime)s] %(name)-25s %(levelname)-8s %(message)s'
logging.basicConfig(
    format=log_format,
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

from load_and_prepare import (
    prepare_clients,
    prepare_products,
    prepare_purchases,
    load_train,
    load_test,
)


from config import RANDOM_STATE, SUBMISSIONS_PATH
logger.info('Loading features...')
with open('features.pkl', 'rb') as f:
    features: pd.DataFrame = pickle.load(f)
logger.info('Features are loaded')

logger.info(f'Features shape: {features.shape}')

logger.info('Preparing data sets...')
features.set_index('client_id', inplace=True)

features = reduce_mem_usage(features)


features = features.replace([np.inf, -np.inf], np.nan).fillna(0)

train = load_train()
test = load_test()
indices_train = train.index
indices_test = test.index

X_train = features.loc[indices_train, :]
treatment_train = train.loc[indices_train, 'treatment_flg'].values
target_train = train.loc[indices_train, 'target'].values
# y_valid = make_z(treatment_train, target_train)

X_test = features.loc[indices_test, :]

RANDOM_STATE = 12

indices_learn, indices_valid = train_test_split(
    train.index,
    test_size=0.3,
    random_state=RANDOM_STATE + 1,
)

X_learn = features.loc[indices_learn, :]
treatment_learn = train.loc[indices_learn, 'treatment_flg'].values
target_learn = train.loc[indices_learn, 'target'].values
# y_learn = make_z(treatment_learn, target_learn)

X_valid = features.loc[indices_valid, :]
treatment_valid = train.loc[indices_valid, 'treatment_flg'].values
target_valid = train.loc[indices_valid, 'target'].values
# y_valid = make_z(treatment_valid, target_valid)
logger.info('Data sets prepared')

# eval_set = [(np.column_stack((X_valid,treatment_valid)), target_valid)]



[2021-02-25 01:48:07,497] __main__                  INFO     Loading features...
[2021-02-25 01:48:07,909] __main__                  INFO     Features are loaded
[2021-02-25 01:48:07,910] __main__                  INFO     Features shape: (400162, 334)
[2021-02-25 01:48:07,911] __main__                  INFO     Preparing data sets...


Mem. usage decreased to 255.69 Mb (68.1% reduction)


[2021-02-25 01:48:51,311] __main__                  INFO     Data sets prepared


In [4]:

clf = TabNetClassifier()
                      

Device used : cpu


In [5]:
clf

TabNetClassifier(n_d=8, n_a=8, n_steps=3, gamma=1.3, cat_idxs=[], cat_dims=[], cat_emb_dim=1, n_independent=2, n_shared=2, epsilon=1e-15, momentum=0.02, lambda_sparse=0.001, seed=0, clip_value=1, verbose=1, optimizer_fn=<class 'torch.optim.adam.Adam'>, optimizer_params={'lr': 0.02}, scheduler_fn=None, scheduler_params={}, mask_type='sparsemax', input_dim=None, output_dim=None, device_name='auto')

In [6]:
print(X_valid.shape)

(60011, 333)


# Training

In [7]:
T_train = treatment_learn


# print(X_learn.shape)
# print(T_train.shape)
# print(target_learn.shape)



clf.fit(
    X_train=X_learn.values,T_train=T_train, y_train=target_learn,
#     eval_set=[(X_learn, T_train, target_learn)],
#     eval_name=['train'],
#     eval_metric=['auc'],
    max_epochs=10 , patience=20,batch_size=8573,

    num_workers=0,
    weights=1,
    drop_last=False
) 

0
[]
No early stopping will be performed, last training weights will be used.
epoch 0  | loss: 1.61136 |  0:02:01s
epoch 1  | loss: 1.5867  |  0:04:01s
epoch 2  | loss: 1.57931 |  0:06:16s
epoch 3  | loss: 1.5734  |  0:08:15s
epoch 4  | loss: 1.57665 |  0:10:29s
epoch 5  | loss: 1.57731 |  0:12:45s
epoch 6  | loss: 1.57601 |  0:14:47s
epoch 7  | loss: 1.57814 |  0:16:45s
epoch 8  | loss: 1.5808  |  0:19:02s
epoch 9  | loss: 1.57564 |  0:21:30s


In [8]:
preds = clf.predict_proba(X_valid.values)
print(preds)

torch.Size([8573])
torch.Size([8573])
torch.Size([8573])
torch.Size([8573])
torch.Size([8573])
torch.Size([8573])
torch.Size([8573])
[[ 2.27187574e-03 -9.75102186e-04 -4.27961349e-03 ...  1.26665831e-03
  -8.98659229e-04  0.00000000e+00]
 [-1.55508518e-04  9.53200459e-03  8.69047642e-03 ... -1.56462193e-04
  -1.31529570e-03 -6.95466995e-04]
 [-1.32936239e-03  5.12838364e-04 -3.96585464e-03 ...  9.19494033e-02
   3.98688912e-02  4.38457727e-03]
 ...
 [ 5.87695837e-03  2.24673748e-03 -3.42264771e-03 ... -4.06968594e-03
  -2.77078152e-03  1.33920372e-01]
 [ 1.26067400e-02 -9.79834795e-03  1.12593174e-04 ...  1.13189220e-03
   9.24453139e-04  1.93595886e-04]
 [ 1.27652287e-02  1.93535089e-02  1.84084773e-02 ...  5.46997786e-02
  -1.31040812e-04  8.12250376e-03]]


In [9]:
print(preds.reshape(60011,1))

[[ 0.00227188]
 [-0.0009751 ]
 [-0.00427961]
 ...
 [ 0.05469978]
 [-0.00013104]
 [ 0.0081225 ]]


In [10]:
from uplift.metrics import uplift_at_k
from uplift.metrics import qini_auc_score
pred_uplift = preds[:,0]
 
print(uplift_at_k(target_valid, preds.reshape(60011,1), treatment_valid))

nan



Mean of empty slice.


invalid value encountered in double_scalars



In [11]:
preds = clf.predict_proba(X_valid.values)
print(preds)

torch.Size([8573])
torch.Size([8573])
torch.Size([8573])
torch.Size([8573])
torch.Size([8573])
torch.Size([8573])
torch.Size([8573])
[[ 2.27187574e-03 -9.75102186e-04 -4.27961349e-03 ...  1.26665831e-03
  -8.98659229e-04  0.00000000e+00]
 [-1.55508518e-04  9.53200459e-03  8.69047642e-03 ... -1.56462193e-04
  -1.31529570e-03 -6.95466995e-04]
 [-1.32936239e-03  5.12838364e-04 -3.96585464e-03 ...  9.19494033e-02
   3.98688912e-02  4.38457727e-03]
 ...
 [ 5.87695837e-03  2.24673748e-03 -3.42264771e-03 ... -4.06968594e-03
  -2.77078152e-03  1.33920372e-01]
 [ 1.26067400e-02 -9.79834795e-03  1.12593174e-04 ...  1.13189220e-03
   9.24453139e-04  1.93595886e-04]
 [ 1.27652287e-02  1.93535089e-02  1.84084773e-02 ...  5.46997786e-02
  -1.31040812e-04  8.12250376e-03]]


In [12]:
T_train = np.random.randint(2, size=X_train.shape[0])
clf.fit(
    X_train=X_train,T_train=T_train, y_train=y_train,
    eval_set=[(X_train, T_train, y_train)],
    eval_name=['train'],
    eval_metric=['auc'],
    max_epochs=10 , patience=20,

    num_workers=0,
    weights=1,
    drop_last=False
) 

NameError: name 'y_train' is not defined

In [None]:
# plot losses
plt.plot(clf.history['loss'])

In [None]:
# plot auc
plt.plot(clf.history['train_auc'])


In [None]:
# plot learning rates
plt.plot(clf.history['lr'])

In [None]:
preds = clf.predict_proba(X_test)
print(preds)

In [None]:
print(preds[:,1]-preds[:,0])

## Predictions

In [None]:
preds = clf.predict_proba(X_test)
test_auc = roc_auc_score(y_score=preds[:,1], y_true=y_test)


preds_valid = clf.predict_proba(X_valid)
valid_auc = roc_auc_score(y_score=preds_valid[:,1], y_true=y_valid)

print(f"BEST VALID SCORE FOR {dataset_name} : {clf.best_cost}")
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_auc}")

In [None]:
# check that best weights are used
assert np.isclose(valid_auc, np.max(clf.history['valid_auc']), atol=1e-6)

In [None]:
clf.predict(X_test)

# Save and load Model

In [None]:
# save tabnet model
saving_path_name = "./tabnet_model_test_1"
saved_filepath = clf.save_model(saving_path_name)

In [None]:
# define new model with basic parameters and load state dict weights
loaded_clf = TabNetClassifier()
loaded_clf.load_model(saved_filepath)

In [None]:
loaded_preds = loaded_clf.predict_proba(X_test)
loaded_test_auc = roc_auc_score(y_score=loaded_preds[:,1], y_true=y_test)

print(f"FINAL TEST SCORE FOR {dataset_name} : {loaded_test_auc}")

In [None]:
assert(test_auc == loaded_test_auc)

In [None]:
loaded_clf.predict(X_test)

# Global explainability : feat importance summing to 1

In [None]:
clf.feature_importances_

# Local explainability and masks

In [None]:
explain_matrix, masks = clf.explain(X_test)

In [None]:
fig, axs = plt.subplots(1, 3, figsize=(20,20))

for i in range(3):
    axs[i].imshow(masks[i][:50])
    axs[i].set_title(f"mask {i}")


# XGB

In [None]:
from xgboost import XGBClassifier

clf_xgb = XGBClassifier(max_depth=8,
    learning_rate=0.1,
    n_estimators=1000,
    verbosity=0,
    silent=None,
    objective='binary:logistic',
    booster='gbtree',
    n_jobs=-1,
    nthread=None,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.7,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=0,
    seed=None,)

clf_xgb.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=40,
        verbose=10)

In [None]:
preds = np.array(clf_xgb.predict_proba(X_valid))
valid_auc = roc_auc_score(y_score=preds[:,1], y_true=y_valid)
print(valid_auc)

preds = np.array(clf_xgb.predict_proba(X_test))
test_auc = roc_auc_score(y_score=preds[:,1], y_true=y_test)
print(test_auc)