In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from MortalityPrediction.src.ml_utils import read_and_extract_features, print_metrics_binary, Reader
import numpy as np

In [5]:
# Root directory of the preprocessed dataset. The trailing slash is required:
# the reader cell appends sub-paths to it with plain string concatenation.
DATA = "MortalityPrediction/data/preprocessed_data/"

In [6]:
# Build one reader per split. The validation reader points at the same "train"
# directory as the training reader and differs only in its listfile.
train_reader = Reader(dataset_dir=DATA + "train", listfile=DATA + "train_listfile.csv")
val_reader = Reader(dataset_dir=DATA + "train", listfile=DATA + "val_listfile.csv")
test_reader = Reader(dataset_dir=DATA + "test", listfile=DATA + "test_listfile.csv")

print('Reading data and extracting features ...')
(train_X, train_y) = read_and_extract_features(train_reader)
(val_X, val_y) = read_and_extract_features(val_reader)
(test_X, test_y) = read_and_extract_features(test_reader)

print('Imputing missing values ...')
# `verbose` is deliberately not passed: it was deprecated in scikit-learn 1.1 and
# removed in 1.3, where supplying it raises TypeError. Its old default was 0, so
# omitting it behaves the same on earlier releases.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean', copy=True)
# The column means are learned from the training split only and then reused for
# validation and test, so no statistics from held-out data enter the pipeline.
imputer.fit(train_X)
train_X = np.array(imputer.transform(train_X), dtype=np.float32)
val_X = np.array(imputer.transform(val_X), dtype=np.float32)
test_X = np.array(imputer.transform(test_X), dtype=np.float32)

print('Normalizing the data to have zero mean and unit variance ...')
# Same rule as above: the scaler is fitted on the training split only.
scaler = StandardScaler()
scaler.fit(train_X)
train_X = scaler.transform(train_X)
val_X = scaler.transform(val_X)
test_X = scaler.transform(test_X)

Reading data and extracting features ...


Imputing missing values ...


Normalizing the data to have zero mean and unit variance ...


In [7]:
# L2-penalised logistic regression; fit() returns the estimator, so construction
# and training are chained into a single assignment.
clf = LogisticRegression(penalty="l2", C=0.001, solver='lbfgs').fit(train_X, train_y)
print("coef_: ", clf.coef_)

# Train and validation splits are scored on the full predict_proba output.
for split_title, split_X, split_y in (
    ('\nResults on train set', train_X, train_y),
    ('\nResults on eval set', val_X, val_y),
):
    print(split_title)
    print_metrics_binary(split_y, clf.predict_proba(split_X))

# The test split is scored on column index 1 of predict_proba only. The call is
# kept as the cell's final expression so its return value is displayed.
prediction = clf.predict_proba(test_X)[:, 1]
print('\nResults on test set')
print_metrics_binary(test_y, prediction)

coef_:  [[ 3.37299253e-03  1.11993325e-02  2.51913479e-03  1.57538515e-02
   3.44756841e-03 -4.09424638e-03 -9.79094171e-03 -1.35300967e-03
  -5.24794373e-03  2.77960175e-02 -7.32993336e-03 -1.19106652e-02
  -3.39308511e-03 -2.06850480e-03 -1.62253533e-03 -1.78561541e-04
  -8.03801117e-03 -9.60640747e-03  5.58016786e-03 -1.45475598e-03
  -3.27842269e-03 -8.06867913e-03  9.65633504e-03 -8.15270638e-03
   2.80288111e-03  2.80288111e-03  2.80288111e-03  0.00000000e+00
   0.00000000e+00 -2.31222994e-03  2.70083470e-03  2.80288111e-03
   2.77748572e-03  7.70452439e-04 -7.70454819e-04 -6.18040085e-03
   7.31750511e-03  5.56722502e-03  3.78890949e-03 -1.72219402e-04
   9.34765242e-03 -7.12878477e-03 -4.19469807e-03  9.00873562e-03
  -1.85442766e-02  7.04086747e-03  1.44587575e-02 -2.72915891e-03
  -3.49995701e-02  1.28529702e-02 -5.06999435e-03  5.33914260e-03
   2.29388551e-02 -2.36049653e-02 -2.51435677e-02  1.02685966e-02
  -8.39128787e-03  4.38327140e-03 -1.27278415e-02 -7.56885088e-03
  

{'acc': 0.9011125,
 'auprc': 0.4823380792491313,
 'auroc': 0.848530626277574,
 'prec0': 0.9161755,
 'prec1': 0.64835167,
 'rec0': 0.977638,
 'rec1': 0.315508}

In [8]:
# 5-nearest-neighbours classifier; fit() returns the estimator, so construction
# and training are chained into a single assignment.
clf = KNeighborsClassifier(n_neighbors=5).fit(train_X, train_y)

# Train and validation splits are scored on the full predict_proba output.
for split_title, split_X, split_y in (
    ('\nResults on train set', train_X, train_y),
    ('\nResults on eval set', val_X, val_y),
):
    print(split_title)
    print_metrics_binary(split_y, clf.predict_proba(split_X))

# The test split is scored on column index 1 of predict_proba only. The call is
# kept as the cell's final expression so its return value is displayed.
prediction = clf.predict_proba(test_X)[:, 1]
print('\nResults on test set')
print_metrics_binary(test_y, prediction)


Results on train set


confusion matrix:
[[12539   155]
 [ 1373   614]]
accuracy = 0.895919919013977
precision class 0 = 0.9013082385063171
precision class 1 = 0.7984395027160645
recall class 0 = 0.987789511680603
recall class 1 = 0.30900856852531433
AUC of ROC = 0.9277144039058354
AUC of PRC = 0.6872543865092984

Results on eval set


confusion matrix:
[[2716   70]
 [ 355   81]]
accuracy = 0.8680943250656128
precision class 0 = 0.8844024538993835
precision class 1 = 0.5364238619804382
recall class 0 = 0.9748743772506714
recall class 1 = 0.18577980995178223
AUC of ROC = 0.7125116078426208
AUC of PRC = 0.3764556193816045



Results on test set
confusion matrix:
[[2793   69]
 [ 308   66]]
accuracy = 0.8834981322288513
precision class 0 = 0.9006772041320801
precision class 1 = 0.4888888895511627
recall class 0 = 0.9758909940719604
recall class 1 = 0.1764705926179886
AUC of ROC = 0.7122108992253275
AUC of PRC = 0.35072461277637296


{'acc': 0.88349813,
 'auprc': 0.35072461277637296,
 'auroc': 0.7122108992253275,
 'prec0': 0.9006772,
 'prec1': 0.4888889,
 'rec0': 0.975891,
 'rec1': 0.1764706}

In [11]:
# RBF-kernel SVM. probability=True enables predict_proba, which scikit-learn
# calibrates with an internal, randomly shuffled cross-validation; random_state
# pins that shuffle so the reported metrics are identical on every re-run.
clf = SVC(C=1.0, kernel="rbf", probability=True, random_state=42)
clf.fit(train_X, train_y)

# Train and validation splits are scored on the full predict_proba output.
print('\nResults on train set')
print_metrics_binary(train_y, clf.predict_proba(train_X))
print('\nResults on eval set')
print_metrics_binary(val_y, clf.predict_proba(val_X))

# The test split is scored on column index 1 of predict_proba only. The call is
# kept as the cell's final expression so its return value is displayed.
prediction = clf.predict_proba(test_X)[:, 1]
print('\nResults on test set')
print_metrics_binary(test_y, prediction)




Results on train set


confusion matrix:
[[12605    89]
 [ 1130   857]]
accuracy = 0.916967511177063
precision class 0 = 0.9177284240722656
precision class 1 = 0.9059196710586548
recall class 0 = 0.9929888248443604
recall class 1 = 0.431303471326828
AUC of ROC = 0.959215838827596
AUC of PRC = 0.8483804599892494

Results on eval set


confusion matrix:
[[2719   67]
 [ 322  114]]
accuracy = 0.8792675137519836
precision class 0 = 0.8941137790679932
precision class 1 = 0.6298342347145081
recall class 0 = 0.9759511947631836
recall class 1 = 0.26146790385246277
AUC of ROC = 0.8245334635167977
AUC of PRC = 0.5040367460352914



Results on test set
confusion matrix:
[[2807   55]
 [ 267  107]]
accuracy = 0.9004944562911987
precision class 0 = 0.9131425023078918
precision class 1 = 0.6604938507080078
recall class 0 = 0.980782687664032
recall class 1 = 0.2860962450504303
AUC of ROC = 0.821703438379354
AUC of PRC = 0.476884175815348


{'acc': 0.90049446,
 'auprc': 0.476884175815348,
 'auroc': 0.821703438379354,
 'prec0': 0.9131425,
 'prec1': 0.66049385,
 'rec0': 0.9807827,
 'rec1': 0.28609625}

In [12]:
# Fully grown decision tree (no depth limit, minimum split/leaf sizes).
# scikit-learn permutes candidate features at every split, so ties between
# equally good splits are broken randomly; random_state fixes that choice and
# makes the tree, and therefore the metrics, reproducible across re-runs.
clf = DecisionTreeClassifier(
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
clf.fit(train_X, train_y)

# Train and validation splits are scored on the full predict_proba output.
print('\nResults on train set')
print_metrics_binary(train_y, clf.predict_proba(train_X))
print('\nResults on eval set')
print_metrics_binary(val_y, clf.predict_proba(val_X))

# The test split is scored on column index 1 of predict_proba only. The call is
# kept as the cell's final expression so its return value is displayed.
prediction = clf.predict_proba(test_X)[:, 1]
print('\nResults on test set')
print_metrics_binary(test_y, prediction)


Results on train set
confusion matrix:
[[12694     0]
 [    0  1987]]
accuracy = 1.0
precision class 0 = 1.0
precision class 1 = 1.0
recall class 0 = 1.0
recall class 1 = 1.0
AUC of ROC = 1.0
AUC of PRC = 1.0

Results on eval set
confusion matrix:
[[2466  320]
 [ 272  164]]
accuracy = 0.8162631988525391
precision class 0 = 0.9006574153900146
precision class 1 = 0.3388429880142212
recall class 0 = 0.8851400017738342
recall class 1 = 0.3761467933654785
AUC of ROC = 0.6306433873166619
AUC of PRC = 0.3997046896716547

Results on test set
confusion matrix:
[[2548  314]
 [ 236  138]]
accuracy = 0.8300370573997498
precision class 0 = 0.915229856967926
precision class 1 = 0.3053097426891327
recall class 0 = 0.8902865052223206
recall class 1 = 0.3689839541912079
AUC of ROC = 0.6296352350736368
AUC of PRC = 0.37361161718888336


{'acc': 0.83003706,
 'auprc': 0.37361161718888336,
 'auroc': 0.6296352350736368,
 'prec0': 0.91522986,
 'prec1': 0.30530974,
 'rec0': 0.8902865,
 'rec1': 0.36898395}

In [13]:
# Random forest of 50 unpruned trees. Bootstrap sampling and per-split feature
# sub-sampling are both random; random_state seeds them so a fresh
# "Restart & Run All" reproduces the same forest and the same metrics.
clf = RandomForestClassifier(
    n_estimators=50,
    max_depth=None,
    min_samples_split=2,
    random_state=42,
)
clf.fit(train_X, train_y)

# Train and validation splits are scored on the full predict_proba output.
print('\nResults on train set')
print_metrics_binary(train_y, clf.predict_proba(train_X))
print('\nResults on eval set')
print_metrics_binary(val_y, clf.predict_proba(val_X))

# The test split is scored on column index 1 of predict_proba only. The call is
# kept as the cell's final expression so its return value is displayed.
prediction = clf.predict_proba(test_X)[:, 1]
print('\nResults on test set')
print_metrics_binary(test_y, prediction)


Results on train set


confusion matrix:
[[12694     0]
 [    7  1980]]
accuracy = 0.9995232224464417
precision class 0 = 0.999448835849762
precision class 1 = 1.0
recall class 0 = 1.0
recall class 1 = 0.9964771270751953
AUC of ROC = 1.0
AUC of PRC = 1.0

Results on eval set
confusion matrix:
[[2738   48]
 [ 318  118]]
accuracy = 0.8864059448242188
precision class 0 = 0.8959423899650574
precision class 1 = 0.7108433842658997
recall class 0 = 0.9827709794044495
recall class 1 = 0.2706421911716461
AUC of ROC = 0.8439342024671193
AUC of PRC = 0.5397045091170963

Results on test set
confusion matrix:
[[2813   49]
 [ 284   90]]
accuracy = 0.8970952033996582
precision class 0 = 0.9082983732223511
precision class 1 = 0.6474820375442505
recall class 0 = 0.982879102230072
recall class 1 = 0.24064171314239502
AUC of ROC = 0.8415028008535221
AUC of PRC = 0.48067137423321316


{'acc': 0.8970952,
 'auprc': 0.48067137423321316,
 'auroc': 0.8415028008535221,
 'prec0': 0.9082984,
 'prec1': 0.64748204,
 'rec0': 0.9828791,
 'rec1': 0.24064171}

In [14]:
# AdaBoost with 50 boosting rounds and a shrinkage of 0.5. random_state seeds
# the base estimators created at each boosting iteration, so repeated runs of
# the notebook yield the same ensemble and the same metrics.
clf = AdaBoostClassifier(n_estimators=50, learning_rate=0.5, random_state=42)
clf.fit(train_X, train_y)

# Train and validation splits are scored on the full predict_proba output.
print('\nResults on train set')
print_metrics_binary(train_y, clf.predict_proba(train_X))
print('\nResults on eval set')
print_metrics_binary(val_y, clf.predict_proba(val_X))

# The test split is scored on column index 1 of predict_proba only. The call is
# kept as the cell's final expression so its return value is displayed.
prediction = clf.predict_proba(test_X)[:, 1]
print('\nResults on test set')
print_metrics_binary(test_y, prediction)


Results on train set


confusion matrix:
[[12362   332]
 [ 1362   625]]
accuracy = 0.8846127390861511
precision class 0 = 0.9007577896118164
precision class 1 = 0.653082549571991
recall class 0 = 0.973845899105072
recall class 1 = 0.31454452872276306
AUC of ROC = 0.8782041121393357
AUC of PRC = 0.5568575675871068

Results on eval set
confusion matrix:
[[2702   84]
 [ 299  137]]
accuracy = 0.8811297416687012
precision class 0 = 0.9003665447235107
precision class 1 = 0.6199095249176025
recall class 0 = 0.9698492288589478
recall class 1 = 0.3142201900482178
AUC of ROC = 0.8544039002351207
AUC of PRC = 0.524463321878077

Results on test set
confusion matrix:
[[2784   78]
 [ 260  114]]
accuracy = 0.8955500721931458
precision class 0 = 0.914586067199707
precision class 1 = 0.59375
recall class 0 = 0.9727463126182556
recall class 1 = 0.30481284856796265
AUC of ROC = 0.8522484370153627
AUC of PRC = 0.48666948629288304


{'acc': 0.8955501,
 'auprc': 0.48666948629288304,
 'auroc': 0.8522484370153627,
 'prec0': 0.91458607,
 'prec1': 0.59375,
 'rec0': 0.9727463,
 'rec1': 0.30481285}