##### Copyright 2019 Faculty Authors.

# Tabby-nets

**TF 2.0 `input_fn`** for feeding `.csv` files or `pd.DataFrame`s to [TabNet](https://arxiv.org/abs/1908.07442) for TensorFlow 2.0, whose original codebase is available at https://github.com/titu1994/tf-TabNet.

In [None]:
# Install the `tabnet` package with pip, upgrading it if it is already present.
!pip install --upgrade tabnet

In [3]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import pandas as pd
import tensorflow as tf
import os
import shutil
import tensorflow as tf
import tensorflow_datasets as tfds
import tabnet
import numpy as np
from sklearn.model_selection import train_test_split

In [4]:
# Path of the CSV file that holds the input data.
csv_file = 'train.csv'

### Read the csv file using pandas.

In [8]:
# Load the CSV at `csv_file` into a pandas DataFrame.
df = pd.read_csv(csv_file)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Convert the `dtype: object` column to discrete numerical codes.

In [245]:
# Encode `pat_gender` as integer category codes.
# Step 1 turns the column into a pandas Categorical; step 2 replaces it with
# the codes of that same column. The codes must be read from `pat_gender`
# itself, not from any other column of the frame.
df['pat_gender'] = pd.Categorical(df['pat_gender'])
df['pat_gender'] = df['pat_gender'].cat.codes

In [None]:
# Preview the frame again to inspect the result of the previous cell.
df.head()

### Class balancing with random undersampling.

In [None]:
from collections import Counter

from imblearn.under_sampling import RandomUnderSampler, TomekLinks

# Randomly under-sample using `patient_type` as the class label.
# `sampling_strategy` replaces the deprecated `ratio` argument, and the
# selected row positions are read from `sample_indices_` after fitting
# instead of the removed `return_indices=True` flag.
rus = RandomUnderSampler(random_state=0, sampling_strategy=0.1)
rus.fit_resample(df, df.patient_type)

# Keep only the rows the sampler selected; `sample_indices_` holds positional
# indices, hence `iloc`.
df = df.iloc[rus.sample_indices_, :]

### Scale features with `StandardScaler`.

In [248]:
# Name of the label column.
target_col = 'patient_type'

# Remove the label column from the feature frame and one-hot encode it in a
# single expression.
target = pd.get_dummies(df.pop(target_col))

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper

# Standardise every remaining column of `df` with a single StandardScaler.
# The column selector is passed as a plain list of names.
mapper = DataFrameMapper([(list(df.columns), StandardScaler())])

# Fit on a copy so `df` itself stays unscaled. No `y` is passed: the scaler is
# unsupervised, so a label argument would carry no meaning here.
scaled_features = mapper.fit_transform(df.copy())

# Re-wrap the scaled array with the original row index and column names.
scaled_df = pd.DataFrame(scaled_features, index=df.index, columns=df.columns)

## Load data using `tf.data.Dataset`

Use `tf.data.Dataset.from_tensor_slices` to read the values from a pandas dataframe. 

One of the advantages of using `tf.data.Dataset` is it allows you to write simple, highly efficient data pipelines. Read the [loading data guide](https://www.tensorflow.org/guide/data) to find out more.

In [249]:
# Build a dataset of (feature row, one-hot target row) pairs from the raw arrays.
# NOTE(review): this slices the unscaled `df`, not `scaled_df` — confirm that is intended.
dataset = tf.data.Dataset.from_tensor_slices((df.values, target.values))

In [250]:
# Hold out 20% of the rows of `df` / `target` as a test split.
splits = train_test_split(df, target, test_size=0.2)
X_train, X_test, y_train, y_test = splits
# train, val = train_test_split(train, test_size=0.2)

# Report the size of each split.
for part, caption in ((X_train, 'train examples'), (X_test, 'test examples')):
    print(len(part), caption)

80 train examples
21 test examples


In [251]:
# Print the first five (features, target) pairs of the dataset.
for features, label in dataset.take(5):
    print('Features: {}, Target: {}'.format(features, label))

Features: [5.1 3.5 1.4 0.2], Target: 0
Features: [4.9 3.  1.4 0.2], Target: 0
Features: [4.7 3.2 1.3 0.2], Target: 0
Features: [4.6 3.1 1.5 0.2], Target: 0
Features: [5.  3.6 1.4 0.2], Target: 0


Since a `pd.Series` implements the `__array__` protocol, it can be used transparently nearly anywhere you would use a `np.array` or a `tf.Tensor`.

In [252]:
# Pass one column of the scaled frame (a pd.Series) directly to tf.constant.
tf.constant(scaled_df['pat_gocard'])

<tf.Tensor: id=114675, shape=(101,), dtype=float64, numpy=
array([0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.2, 0.1,
       0.1, 0.2, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.4, 0.2, 0.5, 0.2, 0.2,
       0.4, 0.2, 0.2, 0.2, 0.2, 0.4, 0.1, 0.2, 0.2, 0.2, 0.2, 0.1, 0.2,
       0.2, 0.3, 0.3, 0.2, 0.6, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 1.4, 1.5,
       1.5, 1.3, 1.5, 1.3, 1.6, 1. , 1.3, 1.4, 1. , 1.5, 1. , 1.4, 1.3,
       1.4, 1.5, 1. , 1.5, 1.1, 1.8, 1.3, 1.5, 1.2, 1.3, 1.4, 1.4, 1.7,
       1.5, 1. , 1.1, 1. , 1.2, 1.6, 1.5, 1.6, 1.5, 1.3, 1.3, 1.3, 1.2,
       1.4, 1.2, 1. , 1.3, 1.2, 1.3, 1.3, 1.1, 1.3, 2.5])>

Shuffle the dataset.

In [253]:
# Shuffle the dataset, using the number of rows in `df` as the shuffle buffer size.
dataset = dataset.shuffle(len(df))

In [None]:
# Split the scaled features and one-hot targets, holding out 22% for testing.
splits = train_test_split(scaled_df, target, test_size=0.22)
X_train, X_test, y_train, y_test = splits

# Report the size of each split.
for part, caption in ((X_train, 'train examples'), (X_test, 'test examples')):
    print(len(part), caption)

## Alternative to feature columns

The easiest way to preserve the column structure of a `pd.DataFrame` when used with `tf.data` is to convert the `pd.DataFrame` to a `dict`, and slice that dictionary. Batch the train and test sets.

In [254]:
# Batch size: 10% of the training rows, but never below one so that `.batch`
# always receives a valid size even for very small training sets.
batch_size = max(1, int(len(X_train) * 0.1))

# Convert each frame to a {column name: list of values} dict so the column
# structure is preserved, pair it with the targets, and batch both splits
# with the same `batch_size` computed above.
train_slices = tf.data.Dataset.from_tensor_slices(
    (X_train.to_dict('list'), y_train)
).batch(batch_size)

test_slices = tf.data.Dataset.from_tensor_slices(
    (X_test.to_dict('list'), y_test)
).batch(batch_size)

In [None]:
# Show the first training batch to inspect its structure.
for first_batch in train_slices.take(1):
    print(first_batch)

### Feature columns (numeric)
The output of a feature column becomes the input to the model. A [numeric column](https://www.tensorflow.org/api_docs/python/tf/feature_column/numeric_column) is the simplest type of column. It is used to represent real valued features. When using this column, the model will receive the column value from the dataframe unchanged.

In [256]:
# One numeric feature column per column of the scaled frame, in frame order.
col_names = list(scaled_df.columns.values)
feature_columns = [
    tf.feature_column.numeric_column(name) for name in col_names
]

In [257]:
# Hyper-parameters forwarded unchanged to the TabNet classifier.
tabnet_params = dict(
    num_classes=2,
    feature_dim=32,
    output_dim=32,
    num_decision_steps=2,
    relaxation_factor=1.0,
    sparsity_coefficient=1e-5,
    batch_momentum=0.98,
    virtual_batch_size=None,
    norm_type='batch',
    num_groups=1,
)

# Build the classifier on top of the numeric feature columns.
model = tabnet.TabNetClassification(feature_columns, **tabnet_params)

In [258]:
# Exponentially decaying learning-rate schedule (initial rate 0.01,
# decay_rate 0.9 over decay_steps 100, staircase disabled).
lr = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01,
    decay_steps=100,
    decay_rate=0.9,
    staircase=False,
)

# Adam optimizer driven by the schedule above.
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)


In [None]:
# Use the backend bundled with TensorFlow: the model and optimizer in this
# notebook are built with `tf.keras`, so the metric helpers should run on the
# same Keras implementation rather than the standalone `keras` package.
from tensorflow.keras import backend as K

def recall_m(y_true, y_pred):
    """Return recall: rounded true positives over rounded actual positives.

    Both products and labels are clipped to [0, 1] and rounded before being
    summed; `K.epsilon()` is added to the denominator to avoid division by
    zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Return precision: rounded true positives over rounded predicted positives.

    Both products and predictions are clipped to [0, 1] and rounded before
    being summed; `K.epsilon()` is added to the denominator to avoid division
    by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """Return the F1 score built from `precision_m` and `recall_m`.

    Computed as 2 * (p * r) / (p + r + epsilon), where the epsilon guards
    against a zero denominator.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p*r)/(p+r+K.epsilon())
    return 2*ratio

# Metrics reported during training: built-in accuracy plus the custom
# F1 / precision / recall helpers.
tracked_metrics = ['accuracy', f1_m, precision_m, recall_m]

# Compile with categorical cross-entropy, matching the one-hot targets.
model.compile(
    optimizer,
    loss='categorical_crossentropy',
    metrics=tracked_metrics,
)

In [None]:
# Train for 100 epochs on the batched training set, using the batched test
# set as validation data; the returned object is kept in `history`.
history = model.fit(train_slices, epochs=100, validation_data=test_slices, verbose=2)

### Model Summary
Print TabNet custom architecture

In [260]:

# Print the layer-by-layer summary of the model.
model.summary()

Model: "tab_net_classification_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
tab_net_14 (TabNet)          multiple                  208       
_________________________________________________________________
dense_89 (Dense)             multiple                  8         
Total params: 216
Trainable params: 216
Non-trainable params: 0
_________________________________________________________________



### Save the images of the feature masks
Force eager execution mode to generate the masks

In [229]:

# Emit an empty line before the log messages below.
print()

# Start from an empty log directory.
if os.path.exists('logs/'):
    shutil.rmtree('logs/')

# Run one training batch through the model — presumably so that the mask
# attributes read below are populated (see the markdown note above; confirm
# against the tabnet package).
x, y = next(iter(train_slices))
_ = model(x)

writer = tf.summary.create_file_writer("logs/")
with writer.as_default():
    # One image summary per feature-selection mask, numbered from 1.
    masks = model.tabnet.feature_selection_masks
    for mask_no, mask in enumerate(masks, start=1):
        print("Saving mask {} of shape {}".format(mask_no, mask.shape))
        tf.summary.image(
            'mask_at_iter_{}'.format(mask_no),
            step=0,
            data=mask,
            max_outputs=1,
        )
        writer.flush()

    # The aggregate mask is written as a separate image summary.
    agg_mask = model.tabnet.aggregate_feature_selection_mask
    print("Saving aggregate mask of shape", agg_mask.shape)
    tf.summary.image("Aggregate Mask", step=0, data=agg_mask, max_outputs=1)
    writer.flush()

writer.close()




Saving mask 1 of shape (1, 16, 4, 1)
Saving aggregate mask of shape (1, 16, 4, 1)


## Evaluation



In [None]:
# `average_precision_score` is not imported anywhere else in the notebook,
# so bring it into scope here.
from sklearn.metrics import average_precision_score

# Predict on the batched test split and score against the test targets.
# The variable names say "train" for historical reasons, but both the
# predictions and `y_test` come from the held-out test set.
tab_train_preds = model.predict(test_slices)
trainset_score = average_precision_score(y_test, tab_train_preds)

# NOTE(review): `config` is not defined in the visible notebook — confirm it
# is created elsewhere (e.g. an experiment-tracking config) before running.
config.APS_train = trainset_score
print('test set average precision score:', trainset_score)