Required libraries.

In [1]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd

Cleaned dataset.

In [2]:
DATASET_PATH = './dataset/generated_dataset.csv'
df = pd.read_csv(DATASET_PATH)
df

Unnamed: 0,Disease,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38480,decubitus ulcer,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38481,decubitus ulcer,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38482,decubitus ulcer,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38483,decubitus ulcer,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38485 entries, 0 to 38484
Columns: 405 entries, Disease to yellow sputum
dtypes: int64(404), object(1)
memory usage: 118.9+ MB


Definition of independent variables.

In [4]:
features = df.loc[:, df.columns != 'Disease']
features


Unnamed: 0,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,abscess bacterial,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38481,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38482,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38483,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Definition of dependent variable.

In [5]:
target = df['Disease']
target


0        hypertensive disease
1        hypertensive disease
2        hypertensive disease
3        hypertensive disease
4        hypertensive disease
                 ...         
38480         decubitus ulcer
38481         decubitus ulcer
38482         decubitus ulcer
38483         decubitus ulcer
38484         decubitus ulcer
Name: Disease, Length: 38485, dtype: object

Splitting the dataset into training, testing, and validation data.

In [6]:
train_ratio = 0.60
validation_ratio = 0.20
test_ratio = 0.20

X_train, X_val, Y_train, Y_val = train_test_split(features, target, test_size=(1 - train_ratio), random_state=1)
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=(test_ratio/(test_ratio + validation_ratio)), random_state=1)

### Tabnet

Convert all splits to numpy formatted values.

In [7]:
X_train = X_train.to_numpy()
Y_train = Y_train.to_numpy()
X_val = X_val.to_numpy()
Y_val = Y_val.to_numpy()

Importing the TabNet libraries.

In [8]:
import pytorch_tabnet
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

Defining the model.

In [9]:
clf1_nopreproc = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                                  optimizer_params=dict(lr=2e-2),
                                  scheduler_params={"step_size": 10,  # how to use learning rate scheduler
                                                    "gamma": 0.9},
                                  scheduler_fn=torch.optim.lr_scheduler.StepLR,
                                  mask_type='entmax'  # "sparsemax"
                                  )

Device used : cuda


Fitting the model.

In [10]:
clf1_nopreproc.fit(
    X_train, Y_train,
    eval_set=[(X_train, Y_train), (X_val, Y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy'],
    max_epochs=1000, patience=50,
    batch_size=256, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False
)

epoch 0  | loss: 5.02433 | train_accuracy: 0.00918 | valid_accuracy: 0.00838 |  0:00:08s
epoch 1  | loss: 4.36788 | train_accuracy: 0.0181  | valid_accuracy: 0.01838 |  0:00:12s
epoch 2  | loss: 3.59726 | train_accuracy: 0.06687 | valid_accuracy: 0.06113 |  0:00:17s
epoch 3  | loss: 3.01652 | train_accuracy: 0.14145 | valid_accuracy: 0.13681 |  0:00:21s
epoch 4  | loss: 2.51228 | train_accuracy: 0.25769 | valid_accuracy: 0.24646 |  0:00:26s
epoch 5  | loss: 2.02746 | train_accuracy: 0.31997 | valid_accuracy: 0.30772 |  0:00:30s
epoch 6  | loss: 1.66783 | train_accuracy: 0.51841 | valid_accuracy: 0.49045 |  0:00:35s
epoch 7  | loss: 1.43211 | train_accuracy: 0.57124 | valid_accuracy: 0.55164 |  0:00:39s
epoch 8  | loss: 1.21104 | train_accuracy: 0.59342 | valid_accuracy: 0.5684  |  0:00:44s
epoch 9  | loss: 1.07165 | train_accuracy: 0.68159 | valid_accuracy: 0.64993 |  0:00:48s
epoch 10 | loss: 0.91498 | train_accuracy: 0.69511 | valid_accuracy: 0.66084 |  0:00:53s
epoch 11 | loss: 0.85

Testing the model.

In [15]:
from sklearn import metrics

preds = clf1_nopreproc.predict(X_test.to_numpy())
accuracy = metrics.accuracy_score(Y_test, preds)
print('TEST ACCURACY: {}%'.format(accuracy))

TEST ACCURACY: 0.8038281655984757%
