In [None]:
###
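### This notebook shows what happens to the decision curves and the Brier/cost curves when the
### classes are flipped (i.e. positive becomes negative and negative becomes positive),
### and what happens when the costs for decision curve analysis (DCA) are rescaled to sum to 2.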


# my helper functions for this project
import importlib
import helper



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

# Conventions used throughout the notebook:
#   label 1 = positive class, label 0 = negative class,
#   and a higher score means "more likely to be positive".

# Seed NumPy's global generator so the simulated scores are reproducible.
np.random.seed(1234)

# Number of examples per class: four negatives for every positive.
nP = 2000
nN = 4 * 2000
n = nP + nN

# Class proportions.
piP, piN = nP / n, nN / n

# The two score distributions are normal with a shared spread and different means.
meanP, meanN = 0.6, 0.4
scalePN = 0.12

# Negatives are drawn first, then positives. The order of the draws fixes the
# values produced from the seeded generator, so it must stay as it is.
scoresN = np.random.normal(meanN, scalePN, nN)
scoresP = np.random.normal(meanP, scalePN, nP)

# plot the scores - one histogram for each class
def plotScoreDistributions(scores, labels):
    """Overlay one density histogram of the scores per class on the current axes.

    Parameters
    ----------
    scores : array-like that supports boolean-mask indexing
        Model scores, one per example.
    labels : array-like aligned with ``scores``
        Class labels; entries equal to 0 are drawn as the negative class and
        entries equal to 1 as the positive class.
    """
    # (label value, legend text, bar colour) -- drawn in this order
    classStyles = (
        (0, "Negative class", "skyblue"),
        (1, "Positive class", "green"),
    )
    for classValue, legendText, barColour in classStyles:
        plt.hist(scores[labels == classValue], density=True, bins=30,
                 alpha=0.5, label=legendText, color=barColour)
    plt.legend(loc='upper left')


# One label per example: 0.0 for each negative, 1.0 for each positive.
labelsN = np.zeros(nN)
labelsP = np.ones(nP)

# Stack negatives followed by positives so scores and labels stay aligned.
scores = np.concatenate([scoresN, scoresP])
labels = np.concatenate([labelsN, labelsP])

d = {'score': scores, 'label': labels}

# Frame with columns ('score', 'label'), ordered by ascending score and
# re-indexed 0..n-1.
df = pd.DataFrame(d).sort_values('score').reset_index(drop=True)

# Sanity checks on the assembled frame.
# A bare `df.size` in the middle of a cell is evaluated and thrown away, so the
# value is printed explicitly to make the check visible.
print('size:', df.size)
print(df['score'].min())
print(df['score'].max())

print('class distribution:', nP/(nN+nP))

# Histograms of the raw scores (drawn before any clipping is applied).
plotScoreDistributions(df.score, df.label)

# The normal draws are unbounded, so some scores can fall outside [0, 1].
# Clamp them to that interval and report the range before and after.
print('Min:', df['score'].min(), " Max: ", df['score'].max())

df['score'] = df['score'].clip(lower=0, upper=1)
print('Min:', df['score'].min(), " Max: ", df['score'].max())


In [None]:
# Re-import helper so edits to helper.py are picked up, then replace df with
# the frame returned by helper.calculateROCPoints.
# NOTE(review): later cells read 'tpr' and 'fpr' columns from df -- presumably
# this call is what adds them; confirm in helper.py.
importlib.reload(helper)
df = helper.calculateROCPoints(df)

# Show the first and the last ten rows of the result.
for preview in (df.head(10), df.tail(10)):
    print(preview)

In [None]:
# Brier loss at every score (used as the threshold), once for the original
# class labelling and once written from the flipped-class point of view.

threshold = df['score']
fnr = 1 - df['tpr']   # complement of the true positive rate

# Original labelling: the two terms are weighted by the class proportions.
df['brier_loss'] = 2*(1-threshold)*piP*fnr + 2*threshold*piN*df['fpr']

# Flipped labelling: the class proportions enter through their complements
# (1-piP and 1-piN).
df['brier_lossFlip'] = 2*threshold*(1-piP)*df['fpr'] + 2*(1-threshold)*(1-piN)*fnr



In [None]:
# Net benefit for decision curve analysis (DCA), evaluated with every score
# used as the decision threshold.

# Odds of the threshold: the weight applied to the false-positive term.
# Scores were clipped to [0, 1] earlier, so a score of exactly 1 makes this
# infinite at that row.
thresholdOdds = df['score']/(1-df['score'])

df['netben'] = df['tpr']*nP/n - (df['fpr']*nN/n)*thresholdOdds

# Reference strategies.
# "Treat none" has no true positives and no false positives, so its net benefit
# is zero at every threshold. It is assigned as a constant: evaluating
# 0 - 0*odds instead yields NaN wherever the odds are infinite.
df['netben_treatnone'] = 0.0
df['netben_treatall'] = nP/n - (nN/n)*thresholdOdds

# Flipped versions: the class roles are swapped (nN <-> nP, tpr -> 1-fpr,
# fpr -> 1-tpr) and the odds are inverted. A score of exactly 0 makes the
# inverted odds infinite at that row.
flippedOdds = (1-df['score'])/df['score']

df['netbenFlip'] = (1-df['fpr'])*nN/n - ((1-df['tpr'])*nP/n)*flippedOdds
df['netben_treatnoneFlip'] = 0.0
df['netben_treatallFlip'] = nN/n - (nP/n)*flippedOdds

df

In [None]:



# Re-import helper so the envelope functions reflect the current helper.py.
importlib.reload(helper)

# x-coordinates: the score itself for the original curves, its complement for
# the flipped curves.
costOrig = df['score']
costFlip = 1 - df['score']

###
### original versions

upperEnv = helper.upperEnvelopeNB(df)
plt.plot(upperEnv['cost'], upperEnv['nb'],
         label='Upper envelope decision curve',
         color='skyblue', linestyle=':', linewidth=2)
plt.plot(costOrig, df['netben'],
         label='Decision curve', color='C2', linestyle='-')
plt.plot(costOrig, df['netben_treatall'],
         label='Treat all', color='C2', linestyle='--')

###
### flipped classes versions

upperEnvFlip = helper.upperEnvelopeNBFlip(df)
plt.plot(upperEnvFlip['cost'], upperEnvFlip['nb'],
         label='Upper envelope decision curve (flip)',
         color='paleturquoise', linestyle=':', linewidth=2)
plt.plot(costFlip, df['netbenFlip'],
         label='Decision curve (flip)', color='limegreen', linestyle='-')
plt.plot(costFlip, df['netben_treatallFlip'],
         label='Treat all (flip)', color='limegreen', linestyle='--')

# "Treat none" is drawn once as a horizontal line through zero rather than
# from the netben_treatnone columns.
plt.axline((0, 0), (1, 0), color='red', markersize=1, linestyle='-', linewidth=1, label="Treat none")

# Axis labels and limits.
plt.xlabel('Cost')
plt.ylabel('Net benefit')
plt.xlim(0, 1)
plt.ylim(-1, 1)

plt.legend(loc='lower left')


In [None]:
###
### plot cost curves including the treat all and treat none lines

# Reference lines for the two trivial classifiers, obtained by fixing tpr and
# fpr in the 'brier_loss' expression:
#   treat all  -> everything predicted positive: tpr = 1, fpr = 1
#   treat none -> everything predicted negative: tpr = 0, fpr = 0
df['brier_loss_treatall'] = 2*(1-df['score'])*piP*(1-1) + 2*df['score']*piN*1
df['brier_loss_treatnone'] = 2*(1-df['score'])*piP*(1-0) + 2*df['score']*piN*0

# Flipped versions. 'brier_lossFlip' is written in terms of the ORIGINAL rates:
# its first term is weighted by fpr and its second by (1 - tpr).
# After the flip, "predict all as positive" means predicting everything as the
# original negative class, i.e. original fpr = 0 and tpr = 0; "predict all as
# negative" means original fpr = 1 and tpr = 1. These two assignments were
# previously swapped, which disagreed with netben_treatallFlip and mislabelled
# the reference lines in the plot.
df['brier_loss_treatallFlip'] = 2*(df['score'])*piN*(0) + 2*(1-df['score'])*piP*1
df['brier_loss_treatnoneFlip'] = 2*(df['score'])*piN*(1) + 2*(1-df['score'])*piP*0

##
## cost curve with cost proportion as the operating condition

# Reload helper BEFORE its first use in this cell, so that both envelope calls
# below run the same, current version of the module. (Reloading after
# lowerEnvelope had already been called let the two calls use different code.)
importlib.reload(helper)

# plot cost curve and brier curve (of uncalibrated scores)
dfLossCost = helper.lowerEnvelope(df)

plt.plot(dfLossCost['cost'], dfLossCost['loss'], color='skyblue', linestyle=':', label='Lower envelope cost curve')

# original class labelling: model curve plus the two trivial classifiers
plt.plot(df['score'], df['brier_loss'], color='C2', linestyle='-', label='Model Brier curve')
plt.plot(df['score'], df['brier_loss_treatall'], color='C2', linestyle='--', label='Predict all as positive')
plt.plot(df['score'], df['brier_loss_treatnone'], color='red', linestyle='-', label='Predict all as negative', linewidth=1)

# flip versions, plotted against the complement of the score
dfLossCostFlip = helper.lowerEnvelopeFlip(df)
plt.plot(dfLossCostFlip['cost'], dfLossCostFlip['loss'], color='paleturquoise', linestyle=':', label='Lower envelope cost curve (flip)')

plt.plot(1-df['score'], df['brier_lossFlip'], color='limegreen', linestyle='-', label='Model Brier curve (flip)')
plt.plot(1-df['score'], df['brier_loss_treatallFlip'], color='limegreen', linestyle='--', label='Predict all as positive (flip)')
plt.plot(1-df['score'], df['brier_loss_treatnoneFlip'], color='red', linestyle=':', label='Predict all as negative (flip)', linewidth=1)

plt.xlabel("Cost proportion")
plt.ylabel("Loss")

plt.xlim(0,1)
plt.ylim(0,1)

plt.legend()





In [None]:

## normalised version of DCA with costs summing to 2

def _netBenefitCostsSumTo2(tpr, fpr):
    """Rescaled net benefit at every score in df for the given tpr and fpr.

    ``tpr`` and ``fpr`` may be columns of df (the model) or plain numbers
    (a trivial classifier with fixed rates).
    """
    return 2*(piP*tpr - df['score']*piP*tpr - df['score']*piN*fpr)


# model curve, then the two trivial strategies (rates fixed at 0 and at 1)
df['netben_bfix'] = _netBenefitCostsSumTo2(df['tpr'], df['fpr'])
df['netben_treatnone_bfix'] = _netBenefitCostsSumTo2(0, 0)
df['netben_treatall_bfix'] = _netBenefitCostsSumTo2(1, 1)


importlib.reload(helper)

###
### original versions

# (column, line style, legend text): the model curve first, then treat all
rescaledCurves = (
    ('netben_bfix', '-', 'DC'),
    ('netben_treatall_bfix', '--', 'Treat all'),
)
for columnName, dashStyle, legendText in rescaledCurves:
    plt.plot(df['score'], df[columnName], color='green', linestyle=dashStyle, label=legendText)

# treat none: a horizontal line through zero
plt.axline((0, 0), (1, 0), color='red', markersize=1, linestyle='-', linewidth=1, label="Treat none")

# axis labels and limits
plt.xlabel('Cost')
plt.ylabel('Net benefit')
plt.xlim(0, 1)
plt.ylim(-1, 1)

plt.legend(loc='lower left')
