# Testing

In [138]:
%load_ext watermark
%watermark -v -n -m -p numpy,scipy,sklearn,pandas

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Mon Oct 22 2018 

CPython 3.6.4
IPython 7.0.1

numpy 1.15.2
scipy 1.1.0
sklearn 0.20.0
pandas 0.23.4

compiler   : GCC 7.2.0
system     : Linux
release    : 4.15.0-36-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 12
interpreter: 64bit


In [139]:
# Magic commands must be in separate cells 
# to properly display light background for 
# plots with JupyterLab dark theme 

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [140]:
import numpy as np
import pandas as pd
import sys
import os

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as mtick

import seaborn as sns
plt.style.use('ggplot')
sns.set()
# plt.style.use('seaborn')
mpl.style.use('seaborn')

pd.options.mode.chained_assignment = None  # default='warn'
pd.options.display.max_columns = 999
pd.options.display.max_rows = 40
np.set_printoptions(precision=6)

print(pd.__version__)

# UGLY HACK - not for production
sys.path.append('..')

0.23.4


## Params

In [141]:
import utils

PARAM_FILE = '../experiments/00_baseline.json'

params = utils.Params()
params.load(PARAM_FILE)

print(params)

{'name': '00_baseline', 'dataset': 'baseline', 'batch_size': 256, 'model': {'dropout_emb': 0.2, 'dropout_fc': 0.5, 'size_fc': [512, 256], 'size_final': 64}, 'optimizer': {'type': 'Adam', 'weight_decay': 1e-09}, 'scheduler': {'type': 'OneCycle', 'max_epochs': 10, 'eta_min': 0.001, 'eta_max': 0.01, 'epsilon': 0.0, 'end_fraction': 0.1}}


## Features

In [142]:
DATA_DIR = '../data/processed'

FEAT_FILE = os.path.join(DATA_DIR, params['dataset'], 'features.json')
features = utils.Features()
features.load(FEAT_FILE)

print(features)

{'continuous': ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses'], 'categorical': ['race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin', 'change', 'diabetesMed'], 'output': ['readmitted'], 'embedding_sizes': [[6, 3], [3, 2], [10, 5], [10, 5], [8, 4], [23, 12], [13, 7], [17, 9], [63, 32], [536, 50], [539, 50], [592, 50], [4, 2], [4, 2], [4, 2], [4, 2], [4, 2], [4, 2], [4, 2], [4, 2], [2, 1], [2, 1]]}


## Dataset

In [143]:
TRAIN = os.path.join(DATA_DIR, params['dataset'], 'train.csv')
train_df = pd.read_csv(TRAIN)
display(train_df.head(10))

Unnamed: 0,readmitted,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,payer_code,medical_specialty,diag_1,diag_2,diag_3,max_glu_serum,A1Cresult,metformin,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed
0,1,2,37,0,7,0,0,0,5,2,1,6,9,1,0,0,9,62,39,20,57,2,2,1,1,2,1,1,1,1,1
1,1,3,68,0,4,0,0,2,6,2,0,2,9,0,0,10,16,10,75,154,135,2,1,1,1,1,1,1,3,0,1
2,1,11,49,0,12,0,0,4,9,2,0,5,9,1,9,4,16,62,131,114,297,2,2,1,2,1,1,1,1,1,1
3,0,4,41,0,3,0,0,0,2,2,0,7,9,5,9,10,16,17,92,189,591,3,2,1,1,1,1,1,1,1,0
4,1,4,54,0,28,0,0,2,9,2,1,7,9,2,19,0,6,62,375,194,559,2,2,1,1,2,1,1,0,0,1
5,0,3,63,0,12,0,0,0,8,2,0,6,9,0,0,10,0,62,211,55,190,2,2,1,1,1,2,1,1,1,1
6,0,4,49,0,10,0,0,0,5,0,0,5,9,0,0,10,11,62,307,121,57,2,2,1,1,1,1,1,1,1,0
7,0,7,38,2,10,0,0,0,2,2,1,8,9,1,8,0,16,62,223,55,591,2,2,1,1,1,1,1,1,1,0
8,1,2,1,2,9,1,0,1,9,0,0,3,9,2,0,0,16,59,520,191,320,2,2,1,1,1,1,1,1,1,0
9,0,3,68,0,13,0,0,1,5,2,0,1,9,0,0,10,3,10,331,58,102,2,2,1,1,1,1,1,3,0,1


In [144]:
import model.data_loader as data_loader

train_set = data_loader.DiabetesDataset(train_df,
                                        continuous_features=features['continuous'],
                                        categorical_features=features['categorical'],
                                        output_features=features['output'])

print(train_set.n)
print(train_set[np.arange(10)])

18172
[array([[1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.]], dtype=float32), array([[ 2., 37.,  0.,  7.,  0.,  0.,  0.,  5.],
       [ 3., 68.,  0.,  4.,  0.,  0.,  2.,  6.],
       [11., 49.,  0., 12.,  0.,  0.,  4.,  9.],
       [ 4., 41.,  0.,  3.,  0.,  0.,  0.,  2.],
       [ 4., 54.,  0., 28.,  0.,  0.,  2.,  9.],
       [ 3., 63.,  0., 12.,  0.,  0.,  0.,  8.],
       [ 4., 49.,  0., 10.,  0.,  0.,  0.,  5.],
       [ 7., 38.,  2., 10.,  0.,  0.,  0.,  2.],
       [ 2.,  1.,  2.,  9.,  1.,  0.,  1.,  9.],
       [ 3., 68.,  0., 13.,  0.,  0.,  1.,  5.]], dtype=float32), array([[  2,   1,   6,   9,   1,   0,   0,   9,  62,  39,  20,  57,   2,
          2,   1,   1,   2,   1,   1,   1,   1,   1],
       [  2,   0,   2,   9,   0,   0,  10,  16,  10,  75, 154, 135,   2,
          1,   1,   1,   1,   1,   1,   3,   0,   1],
       [  2,   0,   5,   9,   1,   9,   4,  16,  62, 131, 114, 297,   2,
          2,

## Dataloader

In [145]:
import model.data_loader as data_loader

dataloaders = data_loader.fetch_dataloaders(DATA_DIR, features, params)

print(len(dataloaders['train']))
print(len(dataloaders['val']))
print(len(dataloaders['test']))

# for i, (target, x_cont, x_cat) in enumerate(dataloaders['train']):
#     print(i, x_cont)

71
9
9


## Model

In [146]:
import torch
import model.net as net

SEED = 42

params.update(features)

# set device
params['cuda'] = torch.cuda.is_available()
params['device'] = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# set random seeds
torch.manual_seed(SEED)
if params['cuda']: torch.cuda.manual_seed(SEED)
    
model = net.Network(params)
model.to(params['device'])

print(model)

Network(
  (embeddings): ModuleList(
    (0): Embedding(6, 3)
    (1): Embedding(3, 2)
    (2): Embedding(10, 5)
    (3): Embedding(10, 5)
    (4): Embedding(8, 4)
    (5): Embedding(23, 12)
    (6): Embedding(13, 7)
    (7): Embedding(17, 9)
    (8): Embedding(63, 32)
    (9): Embedding(536, 50)
    (10): Embedding(539, 50)
    (11): Embedding(592, 50)
    (12): Embedding(4, 2)
    (13): Embedding(4, 2)
    (14): Embedding(4, 2)
    (15): Embedding(4, 2)
    (16): Embedding(4, 2)
    (17): Embedding(4, 2)
    (18): Embedding(4, 2)
    (19): Embedding(4, 2)
    (20): Embedding(2, 1)
    (21): Embedding(2, 1)
  )
  (dropout_emb): Dropout(p=0.2)
  (bn_continuous): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc_layers): ModuleList(
    (0): FCUnit(
      (linear): Linear(in_features=255, out_features=512, bias=True)
      (batchnorm): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (dropout): Dropout(p=0.5)
    )

In [193]:
dataloader = dataloaders['train']

print(0.1 * len(dataloader))

for i, (target, X_cont, X_cat) in enumerate(dataloader):
    target = target.to(params['device'])
    X_cont = X_cont.to(params['device'])
    X_cat = X_cat.to(params['device'])
    
    output = model(X_cont, X_cat)
    
    if i % int(1 / 0.2) == 0:
        print(i)

7.1000000000000005
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70


## Metrics

In [179]:
from evaluate import metrics, accuracy, auroc

# target = np.array([0, 0, 1, 1])
# output = np.array([0.1, 0.4, 0.35, 0.8])
target = target.data.cpu().numpy()
output = output.data.cpu().numpy()


# acc = accuracy(output, target)
# auc = auroc(output, target)
acc = metrics['accuracy'](output, target)
auc = metrics['auroc'](output, target)

print(f'Accuracy: {acc}')
print(f'AUROC: {auc}')


Accuracy: 0.4921875
AUROC: 0.49033520919990214
