Required libraries.

In [1]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
import numpy as np

Cleaned dataset.

In [2]:
DATASET_PATH = './dataset/generated_dataset.csv'
df = pd.read_csv(DATASET_PATH)
df

Unnamed: 0,Disease,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38480,decubitus ulcer,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38481,decubitus ulcer,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38482,decubitus ulcer,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38483,decubitus ulcer,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38485 entries, 0 to 38484
Columns: 405 entries, Disease to yellow sputum
dtypes: int64(404), object(1)
memory usage: 118.9+ MB


Definition of independent variables.

In [4]:
features = df.loc[:, df.columns != 'Disease']
features


Unnamed: 0,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,abscess bacterial,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38481,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38482,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38483,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Definition of dependent variable.

In [5]:
target = df['Disease']
target


0        hypertensive disease
1        hypertensive disease
2        hypertensive disease
3        hypertensive disease
4        hypertensive disease
                 ...         
38480         decubitus ulcer
38481         decubitus ulcer
38482         decubitus ulcer
38483         decubitus ulcer
38484         decubitus ulcer
Name: Disease, Length: 38485, dtype: object

Splitting the dataset into training, testing, and validation data.

In [6]:
train_ratio = 0.80
test_ratio = 0.20

X, X_test, Y, Y_test = train_test_split(features, target, test_size=(1 - train_ratio), random_state=1)

### Tabnet

Convert all splits to numpy formatted values.

In [7]:
X = X.to_numpy()
Y = Y.to_numpy()
X_test = X_test.to_numpy()
Y_test = Y_test.to_numpy()

Perform K-fold.

In [8]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, random_state=42, shuffle=True)

Importing the TabNet libraries.

In [9]:
import pytorch_tabnet
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

Defining the model.

In [10]:
clf1_nopreproc = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                                  optimizer_params=dict(lr=2e-2),
                                  scheduler_params={"step_size": 10,  # how to use learning rate scheduler
                                                    "gamma": 0.9},
                                  scheduler_fn=torch.optim.lr_scheduler.StepLR,
                                  mask_type='entmax'  # "sparsemax"
                                  )

Device used : cuda


Fitting the model.

In [11]:
i = 1
CV_score_array = []
for train_index, test_index in kf.split(X):
    print("K-FOLD: {}".format(i))
    X_train, X_val = X[train_index], X[test_index]
    Y_train, Y_val = Y[train_index], Y[test_index]
    clf1_nopreproc.fit(
        X_train, Y_train,
        eval_set=[(X_train, Y_train), (X_val, Y_val)],
        eval_name=['train', 'valid'],
        eval_metric=['accuracy'],
        max_epochs=1000, patience=50,
        batch_size=256, virtual_batch_size=128,
        num_workers=0,
        weights=1,
        drop_last=False
    )
    i += 1
    CV_score_array.append(clf1_nopreproc.best_cost)

print("MODEL PERFORMANCE")
i = 1
for score in CV_score_array:
    print("K-FOLD {} \t {}".format(i, score))
CV_mean = np.mean(CV_score_array)
print("AVERAGE: {}".format(CV_mean))

K-FOLD: 1
epoch 0  | loss: 4.76244 | train_accuracy: 0.06829 | valid_accuracy: 0.07161 |  0:00:08s
epoch 1  | loss: 3.62136 | train_accuracy: 0.16529 | valid_accuracy: 0.16223 |  0:00:15s
epoch 2  | loss: 2.9229  | train_accuracy: 0.311   | valid_accuracy: 0.31146 |  0:00:22s
epoch 3  | loss: 2.03055 | train_accuracy: 0.48226 | valid_accuracy: 0.48084 |  0:00:32s
epoch 4  | loss: 1.49355 | train_accuracy: 0.60528 | valid_accuracy: 0.5885  |  0:00:40s
epoch 5  | loss: 1.1318  | train_accuracy: 0.65773 | valid_accuracy: 0.64177 |  0:00:47s
epoch 6  | loss: 0.91469 | train_accuracy: 0.73272 | valid_accuracy: 0.71111 |  0:00:55s
epoch 7  | loss: 0.75478 | train_accuracy: 0.73341 | valid_accuracy: 0.71338 |  0:01:02s
epoch 8  | loss: 0.65843 | train_accuracy: 0.76654 | valid_accuracy: 0.74602 |  0:01:10s
epoch 9  | loss: 0.59323 | train_accuracy: 0.79184 | valid_accuracy: 0.76941 |  0:01:17s
epoch 10 | loss: 0.53933 | train_accuracy: 0.80487 | valid_accuracy: 0.7837  |  0:01:24s
epoch 11 | 

Testing the model.

In [12]:
from sklearn import metrics

preds = clf1_nopreproc.predict(X_test)
accuracy = metrics.accuracy_score(Y_test, preds)
print('TEST ACCURACY: {}%'.format(accuracy))

TEST ACCURACY: 0.8205794465376121%
