In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

# Seed NumPy's global random generator so the simulated scores are reproducible.
np.random.seed(1234)

# Class encoding used throughout: positive = 1, negative = 0.

# Class sizes: four negative examples for every positive one.
nP = 2000
nN = 4 * nP
n = nP + nN

# Simulated classifier scores, one normal distribution per class.
# Negatives are drawn first, then positives, from the same seeded stream.
scoresN = np.random.normal(0.4, 0.12, nN)
scoresP = np.random.normal(0.6, 0.12, nP)

def plotScoreDistributions(scores, labels):
    """Overlay one density histogram of the scores per class, with a legend.

    scores: scores that can be indexed with a boolean mask.
    labels: class labels aligned with `scores`; 0 selects the negative class
        and 1 the positive class.
    """
    # (label value, legend text, bar colour); the negative class is drawn first.
    class_styles = (
        (0, "Negative class", "skyblue"),
        (1, "Positive class", "green"),
    )
    for class_value, legend_text, bar_colour in class_styles:
        plt.hist(scores[labels == class_value], density=True, bins=30,
                 alpha=0.5, label=legend_text, color=bar_colour)
    plt.legend(loc='upper left')


# Class labels: 0 for every negative example, 1 for every positive example.
labelsN = np.zeros(nN)
labelsP = np.ones(nP)

# Stack negatives first, then positives, so scores and labels stay aligned.
scores = np.concatenate((scoresN, scoresP))
labels = np.concatenate((labelsN, labelsP))

# One row per example, ordered by ascending score, with a fresh 0..n-1 index.
df = (
    pd.DataFrame(data={'score': scores, 'label': labels})
    .set_index("score")
    .sort_values('score')
    .reset_index()
)

# Sanity check: range of the raw scores.
print(df['score'].min())
print(df['score'].max())

print('class distribution:', nP/(nN+nP))

# Histograms of the raw scores (before they are clamped below).
plotScoreDistributions(df.score, df.label)

# Normal draws can fall outside [0, 1]; report the range, clamp, report again.
print('Min:', min(df.score), " Max: ", max(df.score))

df['score'] = df['score'].clip(lower=0, upper=1)
print('Min:', min(df.score), " Max: ", max(df.score))


In [None]:
# calculate AUC

from sklearn import metrics
# Rows without a label or score (such as the extra end-of-curve row that the
# TPR/FPR cell appends to df) cannot be scored. Excluding them keeps this cell
# re-runnable after the rest of the notebook has been executed; on a fresh run
# no row is dropped.
labelled = df.dropna(subset=["label", "score"])

auc = metrics.roc_auc_score(labelled['label'], labelled['score'])
print("AUC:", auc)

In [None]:
# calibrate scores
from scipy.stats import norm

def myCalibrate(score, locN=0.4, locP=0.6, scale=0.12):
    """Convert raw scores to calibrated scores P(positive | score) and plot the mapping.

    Applies Bayes' rule with one normal score density per class, weighted by
    the class priors derived from the module-level counts nP and nN.

    The density parameters default to the values used to simulate scoresN and
    scoresP. They have to describe the distributions the scores were actually
    drawn from; with any other values the output is still a monotone transform
    of the score, but it is not the posterior probability of the positive class.

    Parameters
    ----------
    score : array-like
        Raw scores to calibrate.
    locN, locP : float
        Mean of the negative / positive class score distribution.
    scale : float
        Standard deviation shared by both class score distributions.

    Returns
    -------
    Calibrated scores, one per input score.

    Side effect: adds a scatter plot of calibrated against original score
    (with axis labels) via pyplot.
    """
    piP = nP/(nN+nP)

    # Prior-weighted class likelihoods of each score.
    weightedP = piP*norm.pdf(score, locP, scale)
    weightedN = (1-piP)*norm.pdf(score, locN, scale)

    scoresCal = weightedP/(weightedN + weightedP)

    plt.scatter(score, scoresCal)
    plt.xlabel('Original score')
    plt.ylabel('Calibrated score')

    return scoresCal

# Keep the calibrated version of every score next to the raw score.
df['scoreCal'] = myCalibrate(df['score'])

df


In [None]:
# Per-class histograms of the calibrated scores.

plotScoreDistributions(df['scoreCal'], df['label'])


In [None]:
# calculate TPR and FPR

# Re-running this cell must not keep stacking rows: drop the score-less
# end-of-curve row appended by a previous run before recomputing.
df = df.dropna(subset=['score']).reset_index(drop=True)

score_values = df['score'].to_numpy()
label_values = df['label'].to_numpy()

# Running class counts over the examples in ascending score order:
# cum_*[k] is the number of examples of that class among the k lowest scores.
order = np.argsort(score_values, kind='stable')
sorted_scores = score_values[order]
cum_neg = np.concatenate(([0], np.cumsum(label_values[order] == 0)))
cum_pos = np.concatenate(([0], np.cumsum(label_values[order] == 1)))

# Use each example's score as the threshold: every example with
# score >= threshold is predicted positive. side='left' yields the number of
# examples strictly below the threshold, so tied scores count as predicted positive.
n_below = np.searchsorted(sorted_scores, score_values, side='left')
fp = cum_neg[-1] - cum_neg[n_below]
tp = cum_pos[-1] - cum_pos[n_below]

# Assign real columns (attribute assignment such as `df.fpr = ...` would only
# set an attribute on the frame object and hide the column from `df.fpr`).
df['fpr'] = fp / nN
df['tpr'] = tp / nP

# One more ROC point past the highest score, where no example is predicted
# positive: (FPR, TPR) = (0, 0). The other columns of this row are left as NaN.
numrows = df.shape[0]
df.loc[numrows,'fpr'] = 0
df.loc[numrows,'tpr'] = 0

print(df.head(n=10))
print(df.tail(n=10))

In [None]:


# ROC curve: TPR against FPR, one point per row of df.
ax = df.plot.line(x='fpr', y='tpr', legend=False, color='skyblue')
ax.set_ylabel("TPR")
ax.set_xlabel("FPR")




In [None]:
###
### plotting the cost curve

import matplotlib.pyplot as plt 

# Class priors used to weight the two kinds of error.
piN = nN/(nN+nP)
piP = 1 - piN

# NOTE(review): b, c_0 and c are not read by any visible cell - confirm before removing.
b = 2
c_0 = 1
c = c_0 / 2

fig, ax = plt.subplots()

# Fix the visible region once, before drawing; the limits do not depend on the lines.
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

# Each ROC point (fpr, tpr) becomes one straight line in cost space,
#   loss = intercept + gradient * cost
# with
#   intercept = 2*piP*(1-tpr)
#   gradient  = 2*(piN*fpr - piP*(1-tpr))
# Every row is drawn, including row 0 (the lowest threshold).
intercepts = 2*piP * (1-df['tpr'])
gradients = 2*(piN*df['fpr'] - piP*(1-df['tpr']))
for intercept, gradient in zip(intercepts, gradients):
    ax.axline((0, intercept), slope=gradient, color='C0')




In [None]:
# Cost curve without plotting all the lines

# For every cost on a 0.01 grid, keep only the smallest loss reached by any row of df.
loss_costs = []
for c in np.arange(0,1.01, 0.01):

    # Loss of every row at this cost: weighted misses plus weighted false alarms.
    miss_term = (1-c)*piP*(1-df['tpr'])
    false_alarm_term = c*piN*df['fpr']
    loss = 2*(miss_term + false_alarm_term)

    best_row = np.argmin(loss)

    # the point with lowest loss doesn't have c==t
    print(c, ': ', df.score[best_row])

    loss_costs.append({'cost':c, 'loss':loss.min()})

dfLossCost = pd.DataFrame(loss_costs)

ax = dfLossCost.plot.line(x='cost', y='loss', legend=False, color='skyblue', ylim=[0,1])
ax.set_ylabel("Loss")
ax.set_xlabel("Cost")



In [None]:
# Brier-curve loss of each row, once with the raw score and once with the
# calibrated score taking the role of the cost:
#   2*(1-s)*piP*(1-tpr) + 2*s*piN*fpr
# The column names must be exactly 'brier_loss' and 'brier_loss_cal': those are
# the names the Brier-curve plotting cells read.

df['brier_loss'] = 2*(1-df['score'])*piP*(1-df['tpr']) + 2*df['score']*piN*df['fpr']
df['brier_loss_cal'] = 2*(1-df['scoreCal'])*piP*(1-df['tpr']) + 2*df['scoreCal']*piN*df['fpr']



In [None]:
# Cost curve overlaid with the Brier curve of the raw (uncalibrated) scores.
ax = dfLossCost.plot.line(x='cost', y='loss', color='skyblue', ylim=[0,1], label='Model cost curve')

brier_style = dict(ax=ax, color='C7', linestyle='--', label='Model Brier curve')
df.plot.line(x='score', y='brier_loss', **brier_style)

ax.set_ylabel("Loss")
ax.set_xlabel("Cost")



In [None]:

# Same plot, with the Brier curve of the calibrated scores added to check
# that it coincides with the cost curve.

ax = dfLossCost.plot.line(x='cost', y='loss', legend=False, color='skyblue', ylim=[0,1])

# (x column, y column, colour, line style, legend text) for each Brier curve.
brier_curves = (
    ('score', 'brier_loss', 'C7', '-', 'Model Brier curve'),
    ('scoreCal', 'brier_loss_cal', 'C8', '--', 'Model Brier curve cal'),
)
for x_col, y_col, colour, dash, legend_text in brier_curves:
    df.plot(kind='line', x=x_col, y=y_col, ax=ax, color=colour, linestyle=dash, label=legend_text)

ax.set_ylabel("Loss")
ax.set_xlabel("Cost")


In [None]:
# calculate net benefit for DCA

# Each row's score acts as the threshold probability; its odds score/(1-score)
# weight the false-positive share against the true-positive share.
odds = df['score']/(1-df['score'])

# Model, "treat none" (no true or false positives) and "treat all"
# (every example counted as positive).
df['netben'] = df['tpr']*nP/n - (df['fpr']*nN/n)*odds
df['netben_treatnone'] = 0/n - (0/n)*odds
df['netben_treatall'] = nP/n - (nN/n)*odds

df

In [None]:
# Decision curve: net benefit against threshold for the model and for
# "treat all", with "treat none" as a horizontal line at zero.
fig, ax = plt.subplots()
ax.set_xlim(0, 1)
ax.set_ylim(-1, 1)

# (net-benefit column, line style, legend text) for the curves read from df.
dca_curves = (
    ('netben', '-', 'Model, standard DCA'),
    ('netben_treatall', '--', 'Treat all standard DCA'),
)
for y_col, dash, legend_text in dca_curves:
    df.plot(kind='line', x='score', y=y_col, ax=ax, color='C2', linestyle=dash, label=legend_text)

ax.axline((0, 0), slope=0, color='C3', markersize=1, linestyle='-', linewidth=1, label="Treat none")
ax.legend();


In [None]:
#    df.loc[i,'brier_loss_treatall'] = 2*(1-df.loc[i,'score'])*pi_0*(1-1) + 2*df.loc[i,'score']*pi_1*1
#    df.loc[i,'brier_loss_treatnone'] = 2*(1-df.loc[i,'score'])*pi_0*(1-0) + 2*df.loc[i,'score']*pi_1*0

    # df.loc[i,'NB_loss'] = 1*pi_0*(1-df.loc[i,'f0']) + ((1-df.loc[i,'score'])/(df.loc[i,'score']))*pi_1*df.loc[i,'f1']
    
    # df.loc[i,'max_brier_loss'] = 2*df.loc[i,'score']*pi_0*(1-0) + 2*(1-df.loc[i,'score'])*pi_1*1
    # df.loc[i,'max_NB_loss'] = 1*pi_0*(1-0) + ((1-df.loc[i,'score'])/(df.loc[i,'score']))*pi_1*1

    # df.loc[i,'NB_loss_treatall'] = 1*pi_0*(1-1) + ((1-df.loc[i,'score'])/(df.loc[i,'score']))*pi_1*1
    # df.loc[i,'NB_loss_treatnone'] = 1*pi_0*(1-0) + ((1-df.loc[i,'score'])/(df.loc[i,'score']))*pi_1*0
