In [1]:
%pylab inline
import pandas as pd
from sklearn import metrics

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the test-set prediction scores of all three models alongside the true label.
test_multiple_models = pd.read_csv(
    filepath_or_buffer='../data/test_score_multiple_models.csv',
)
# Preview the first five rows as the cell output.
test_multiple_models.head(5)

Unnamed: 0,pred_rf,pred_gb,pred_lr,label
0,0.157483,0.050373,0.0122,False
1,0.007528,0.017296,0.021629,False
2,0.010931,0.019073,0.013231,False
3,0.014331,0.026936,0.109715,False
4,0.002645,0.017125,0.005357,False


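### Bootstrap Test
- Randomly resample the test set with replacement (a bootstrap rather than a permutation test, since the labels are never shuffled)
- Compute the AUC of each model on every resample
- Compute the proportion of resamples in which model A beats model B: $P(AUC_A > AUC_B)$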

In [3]:
# Paired bootstrap of the test-set AUC for every model.
# Each replicate draws N rows with replacement and scores all models on that
# same resample, so the per-replicate AUCs can be compared pairwise afterwards.
np.random.seed(12345)  # fixed seed so the resampling is reproducible
N = test_multiple_models.shape[0]
bootstraps = 10000
models = ['rf', 'gb', 'lr']
res_list = []
for b in range(bootstraps):
    # An integer population is sampled as if it were np.arange(N), which avoids
    # converting range(N) to an array on every replicate.
    idx = np.random.choice(N, N, replace=True)
    # The resampled rows and their labels are shared by all models: build once.
    sample = test_multiple_models.iloc[idx]
    y = sample['label']
    res = {}
    for model in models:
        probas = sample['pred_' + model]
        fpr, tpr, _ = metrics.roc_curve(y, probas)
        try:
            auc = metrics.auc(fpr, tpr)
        except ValueError:
            # metrics.auc raises ValueError for curves it cannot integrate
            # (e.g. fewer than two points). Record the replicate as missing,
            # but do not swallow KeyboardInterrupt the way a bare except would.
            auc = np.nan
        res[model] = auc
    res_list.append(res)
# One row per bootstrap replicate, one column per model.
auc_bootstrapped = pd.DataFrame(res_list)

In [4]:
# Bootstrap mean AUC and the 2.5th / 97.5th percentile bounds for each model
for name in models:
    scores = auc_bootstrapped[name]
    bounds = np.percentile(scores, [2.5, 97.5])
    print(name, scores.mean(), bounds)

rf 0.8753544399213361 [0.85685892 0.89239396]
gb 0.8674011848429615 [0.84809157 0.88577818]
lr 0.8560106915352218 [0.83510814 0.87601817]


$P(AUC_{\text{logistic regression}} > AUC_{\text{random forest}})$

In [5]:
# Share of bootstrap replicates in which logistic regression out-scores random forest
auc_bootstrapped['lr'].gt(auc_bootstrapped['rf']).mean()

0.0189

$P(AUC_{\text{logistic regression}} > AUC_{\text{gradient boosting}})$

In [6]:
# Share of bootstrap replicates in which logistic regression out-scores gradient boosting
auc_bootstrapped['lr'].gt(auc_bootstrapped['gb']).mean()

0.1213

$P(AUC_{\text{gradient boosting}} > AUC_{\text{random forest}})$

In [7]:
# Share of bootstrap replicates in which gradient boosting out-scores random forest
auc_bootstrapped['gb'].gt(auc_bootstrapped['rf']).mean()

0.02

With Bonferroni correction for the $m = 3$ pairwise comparisons,
$$\begin{aligned}
\alpha_{\text{per comparison}} &= \alpha / m \\
&= 0.05 / 3 \\
&\approx 0.017
\end{aligned}$$

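Conclusion: taking the multiple comparisons into account, even the difference between the best (random forest) and worst (logistic regression) performing models does not reach significance, since $p = 0.0189 > \alpha_{\text{per comparison}} \approx 0.017$, although it is below the uncorrected $\alpha = 0.05$.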