In [None]:

# my helper functions for this project
import importlib
import helper



In [None]:
# make the test data

import pandas as pd
import numpy as np

# Label convention: 1 = positive class, 0 = negative class.
# Score convention: a higher score means the example is more likely to be positive.

# Alternative 8-example data set (disabled):
#d = {'score': [0.05, 0.1 ,0.20, 0.70, 0.70, 0.90, 0.90, 0.95], 'label': [0,0, 1, 0, 1, 1, 0, 1]}

# Toy data: nine scored examples, three of them positive.
d = dict(
    score=[0.03, 0.05, 0.1, 0.20, 0.70, 0.70, 0.90, 0.90, 0.95],
    label=[0, 0, 0, 1, 0, 0, 1, 0, 1],
)
df = pd.DataFrame.from_dict(d)
df




In [None]:
# calculate TPR and FPR for each point on ROC space
# (later cells read the 'fpr' and 'tpr' columns of the resulting frame)

# re-import helper so that edits to the helper module are picked up without a kernel restart
importlib.reload(helper)
# NOTE(review): df is overwritten with the helper's output, so re-running this cell
# passes already-processed data back into calculateROCPoints — confirm that is safe.
df = helper.calculateROCPoints(df)
df


In [None]:

# plot roc curve with specific colours for each point to show correspondence with each cost line
# in the cost curve plotted below

import matplotlib.pyplot as plt

# set plot font size for all plots
plt.rcParams.update({'font.size': 16})  # applies to everything

# One colour per ROC point. Entry i is reused by the later cost-line and
# decision-line plots for ROC point i, so every entry has to be distinct
# (the list previously contained 'red' twice, making two points indistinguishable).
cols = ['blue', 'green', 'orange', 'red', 'pink', 'lightgreen', 'brown', 'lightgrey', "#fbadd8be", '#d37af6']

# plot ROC
plt.plot(df['fpr'], df['tpr'], clip_on=False)
plt.scatter(df['fpr'], df['tpr'], c=cols, s=100, clip_on=False)

# plot convex hull
# NOTE(review): the segment end points are hard-coded literals, presumably chosen
# for the toy data set above — they will not follow a change of data.
plt.plot([1/6,0.5], [2/3,1], linestyle='--', color='green')

plt.xlabel('FPR')
plt.ylabel('TPR')

# this is the set of points that will have a horizontal cost line in cost space:
# the cost-line gradient 2*(piN*fpr - piP*(1-tpr)) is zero exactly when
# tpr = 1 - (piN/piP)*fpr, i.e. on the line through (0, 1) and (piP/piN, 0)
nN = np.sum(df.label==0)
nP = np.sum(df.label==1)
piP = nP/(nP+nN)
piN = 1-piP
plt.axline((0, 1), (piP/piN, 0), color='lightgrey', markersize=1, linestyle=':', linewidth=1, label="DCA costs")

plt.xlim([0,1])
plt.ylim([0,1])

plt.show()



In [None]:
## plot cost curve with specific colours for each line to show correspondence with ROC curve above

def plotCostCurveWithCostLines(df):
    """Draw one cost line per ROC point plus the lower-envelope cost curve.

    Reads the 'label', 'fpr' and 'tpr' columns of ``df`` (rows are addressed by
    the labels 0 .. len(df)-1) and the notebook-level colour list ``cols``.
    A new figure is created on every call; the function neither shows the
    figure nor adds a legend, so the caller can keep drawing on it.
    """
    # class priors estimated from the labels
    negCount = np.sum(df.label==0)
    posCount = np.sum(df.label==1)
    piP = posCount/(posCount+negCount)
    piN = 1-piP

    fig, ax = plt.subplots()

    # every (fpr, tpr) pair in ROC space becomes a straight line in cost space
    for rowIdx in range(df.shape[0]):
        fpr = df.loc[rowIdx,'fpr']
        fnr = 1-df.loc[rowIdx,'tpr']

        # the line is anchored at x = 0 with value 2*piP*fnr and has slope
        # 2*(piN*fpr - piP*fnr)
        ax.axline((0, 2*piP * fnr), slope=2*(piN*fpr - piP*fnr), color=cols[rowIdx], linewidth=3)

    # lower envelope of the cost lines, as computed by the helper module
    # (reloaded first so that edits to the helper are picked up)
    importlib.reload(helper)
    envelope = helper.lowerEnvelope(df)
    plt.plot(envelope['cost'], envelope['loss'], color='black', linestyle=':', linewidth=4, label='Lower envelope cost curve')

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)

    plt.xlabel('Cost proportion')
    plt.ylabel('Loss')

plotCostCurveWithCostLines(df)


In [None]:
# plot cost curve (lower envelope) and cost lines, with skew as the operating condition

def plotSkewCurveWithCostLines(df):
    """Draw one cost line per ROC point plus the lower envelope, against skew.

    Reads the 'label', 'fpr' and 'tpr' columns of ``df`` (rows are addressed by
    the labels 0 .. len(df)-1) and the notebook-level colour list ``cols``.
    A new figure is created on every call; the figure is not shown and no
    legend is added, so the caller can keep drawing on it.
    """
    # class priors from the labels; they are computed here but do not enter
    # the skew-based line formulas below
    negCount = np.sum(df.label==0)
    posCount = np.sum(df.label==1)
    piP = posCount/(posCount+negCount)
    piN = 1-piP

    fig, ax = plt.subplots()

    # every (fpr, tpr) pair in ROC space becomes a straight line in skew space
    for rowIdx in range(df.shape[0]):
        slope = df.loc[rowIdx,'fpr'] + df.loc[rowIdx,'tpr'] - 1
        atZero = 1-df.loc[rowIdx,'tpr']

        ax.axline((0, atZero), slope=slope, color=cols[rowIdx], linewidth=3)

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)

    # lower envelope of the lines, as computed by the helper module
    # (reloaded first so that edits to the helper are picked up)
    importlib.reload(helper)
    envelope = helper.lowerEnvelopeSkew(df)
    plt.plot(envelope['skew'], envelope['loss'], color='black', linestyle=':', linewidth=4)

    plt.xlabel('Skew')
    plt.ylabel('Loss')


plotSkewCurveWithCostLines(df)

In [None]:
# plot cost curves (lower envelope) for different class distributions

# reload so that edits to the helper module are picked up
importlib.reload(helper)

# one line style per class distribution, indexed in step with `cols`
ls=['-', '--', ':', '-.', '-', '--']

# NOTE: the loop variable reuses the notebook-level name piP, which therefore
# holds the last grid value once this cell has run
for piP in np.arange(0,1.01, 0.2):
    # position of this prior on the 0.2-spaced grid; selects colour and line style
    styleIdx = int(piP*5)
    priorLabel = r"$\pi_{POS}$="+str(int(piP*10)/10)

    lowerEnv = helper.lowerEnvelope(df, piP)
    plt.plot(
        lowerEnv['cost'],
        lowerEnv['loss'],
        color=cols[styleIdx],
        linestyle=ls[styleIdx],
        label=priorLabel,
        linewidth=3,
    )

plt.xlim(0,1)
plt.ylim(0,1)

plt.legend()

plt.xlabel('Cost proportion')
plt.ylabel('Loss');


In [None]:
# plot cost curve with skew as the operating condition

# reload so that edits to the helper module are picked up, then fetch the lower envelope
importlib.reload(helper)
lowerEnv = helper.lowerEnvelopeSkew(df)

# draw on the current axes through the explicit Axes interface
skewAxes = plt.gca()
skewAxes.plot(lowerEnv['skew'], lowerEnv['loss'], color='blue', linestyle='-', linewidth=3)

skewAxes.set_xlim(0, 1)
skewAxes.set_ylim(0, 1)

skewAxes.set_xlabel('Skew')
skewAxes.set_ylabel('Loss');

In [None]:

### Plotting cost curve plot with brier curve, with cost as the operating condition

# Creates a new figure with the cost lines and the lower envelope; the pyplot calls
# further down this cell (Brier curve, labels, legend, limits) target pyplot's
# current axes.
plotCostCurveWithCostLines(df)


# this interpolates the scores but keeps the fpr and tpr as segments (i.e. it's empirical with only a few examples)
def interpolateScores(df):
    """Return a densely sampled step version of the ROC table.

    Steps, in order:
      1. The row labelled ``len(df)-1`` of the frame that was passed in is
         overwritten IN PLACE with score=1, fpr=0, tpr=0. The caller's frame is
         modified; no row is appended.
      2. A row (score=0, fpr=1, tpr=1) is placed in front of a re-indexed copy.
      3. For every pair of consecutive rows, 100 rows are emitted whose score
         moves linearly from the first row's score towards the second row's
         score, while fpr and tpr stay fixed at the first row's values.
      4. The last row is appended unchanged (with all of its columns).

    The result is a new DataFrame; the generated rows carry the columns
    'score', 'fpr' and 'tpr'.
    """
    import pandas as pd

    # step 1: pin the far end of the score range (mutates the caller's frame)
    for column, value in (('score', 1), ('fpr', 0), ('tpr', 0)):
        df.loc[len(df)-1, column] = value

    # step 2: pin the near end of the score range
    head = pd.DataFrame([{'score': 0, 'fpr': 1, 'tpr': 1}])
    padded = pd.concat([head, df], ignore_index=True)

    # step 3: fractions 0, 1/100, ..., 99/100 along each segment
    factor = 100  # number of steps between each original row
    fractions = [step / factor for step in range(factor)]

    records = []
    for pos in range(len(padded) - 1):
        left = padded.iloc[pos]
        right = padded.iloc[pos + 1]
        records.extend(
            {
                'score': left['score'] * (1 - t) + right['score'] * t,
                'fpr': left['fpr'],
                'tpr': left['tpr'],
            }
            for t in fractions
        )

    # step 4: close the range with the final row itself
    records.append(padded.iloc[-1].to_dict())

    return pd.DataFrame(records)
    
print(df)
# NOTE: interpolateScores overwrites the last row of df in place
df_interp = interpolateScores(df)

# class priors estimated from the labels
nN = np.sum(df.label==0)
nP = np.sum(df.label==1)
piP = nP/(nP+nN)
piN = 1-piP

# Brier-curve loss with the score used as the cost proportion:
# false-negative term plus false-positive term
_fnTerm = 2*(1-df_interp['score'])*piP*(1-df_interp['tpr'])
_fpTerm = 2*df_interp['score']*piN*df_interp['fpr']
df_interp['brier_loss'] = _fnTerm + _fpTerm

plt.plot(
    df_interp['score'],
    df_interp['brier_loss'],
    color="black",
    linestyle='--',
    label='Model Brier curve',
    linewidth=4,
)

plt.xlabel('Cost proportion')
plt.ylabel('Loss');

plt.legend()

plt.xlim(0,1)
plt.ylim(0,1)



In [None]:
### Plotting cost curve plot with brier curve, with skew as the operating condition

# Creates a new figure with the skew-based cost lines and the lower envelope; the
# pyplot calls further down this cell target pyplot's current axes.
plotSkewCurveWithCostLines(df)


# this interpolates the scores but keeps the fpr and tpr as segments (i.e. it's empirical with only a few examples)
# NOTE: a function with this same name is also defined in the previous cell; this
# definition replaces it once this cell has run.
def interpolateScores(df):
    """Return a densely sampled step version of the ROC table.

    Steps, in order:
      1. The row labelled ``len(df)-1`` of the frame that was passed in is
         overwritten IN PLACE with score=1, fpr=0, tpr=0. The caller's frame is
         modified; no row is appended.
      2. A row (score=0, fpr=1, tpr=1) is placed in front of a re-indexed copy.
      3. For every pair of consecutive rows, 100 rows are emitted whose score
         moves linearly from the first row's score towards the second row's
         score, while fpr and tpr stay fixed at the first row's values.
      4. The last row is appended unchanged (with all of its columns).

    The result is a new DataFrame; the generated rows carry the columns
    'score', 'fpr' and 'tpr'.
    """
    import pandas as pd

    # step 1: pin the far end of the score range (mutates the caller's frame)
    for column, value in (('score', 1), ('fpr', 0), ('tpr', 0)):
        df.loc[len(df)-1, column] = value

    # step 2: pin the near end of the score range
    head = pd.DataFrame([{'score': 0, 'fpr': 1, 'tpr': 1}])
    padded = pd.concat([head, df], ignore_index=True)

    # step 3: fractions 0, 1/100, ..., 99/100 along each segment
    factor = 100  # number of steps between each original row
    fractions = [step / factor for step in range(factor)]

    records = []
    for pos in range(len(padded) - 1):
        left = padded.iloc[pos]
        right = padded.iloc[pos + 1]
        records.extend(
            {
                'score': left['score'] * (1 - t) + right['score'] * t,
                'fpr': left['fpr'],
                'tpr': left['tpr'],
            }
            for t in fractions
        )

    # step 4: close the range with the final row itself
    records.append(padded.iloc[-1].to_dict())

    return pd.DataFrame(records)
    
print(df)
# NOTE: interpolateScores overwrites the last row of df in place
df_interp = interpolateScores(df)

# Brier-curve loss with the score used as the skew:
# false-negative term plus false-positive term (no class priors in this version)
_fnTerm = (1-df_interp['score'])*(1-df_interp['tpr'])
_fpTerm = df_interp['score']*df_interp['fpr']
df_interp['brier_loss'] = _fnTerm + _fpTerm

plt.plot(
    df_interp['score'],
    df_interp['brier_loss'],
    color="black",
    linestyle='--',
    label='Model Brier curve',
    linewidth=3,
)

plt.xlabel('Skew')
plt.ylabel('Loss');

plt.xlim(0,1)
plt.ylim(0,1)


In [None]:
##
## plot DCA plot for illustrative example


## plot NB lines
def plotDCAWithDecisionLines(brierCosts=False, zoom=False, nP=3, nN=6):
    """Plot the decision curve, its upper envelope and one net-benefit line per ROC point.

    Reads the notebook-level frames ``df`` (columns 'tpr' and 'fpr', rows
    addressed by the labels 0 .. len(df)-1) and ``df_interp`` (columns 'score',
    'tpr' and 'fpr'), plus the colour list ``cols``. As a side effect the
    columns 'netbenx', 'netbenx_treatnone' and 'netbenx_treatall' are written
    to ``df_interp``. The figure is shown with plt.show() before returning.

    Parameters
    ----------
    brierCosts : bool
        False: false positives are weighted by the threshold odds t/(1-t).
        True: true positives are weighted by 2*(1-t) and false positives by 2*t.
    zoom : bool
        False: x range [0, 1]. True: x range [0, 0.6] with the lower y limit
        taken from the first 500 rows of ``df_interp``.
    nP, nN : int
        Number of positive / negative examples. The defaults (3 and 6) are the
        values that used to be hard-coded in this function.

    Raises
    ------
    ValueError
        If nP + nN is not positive.
    """
    n = nP + nN
    if n <= 0:
        raise ValueError("nP + nN must be positive")

    # class priors; identical for every decision line, so computed once
    piP = nP/n
    piN = 1-piP

    useOddsCosts = not brierCosts
    score = df_interp['score']

    if useOddsCosts:
        # score == 1 gives a division by zero here (inf / NaN entries in the result)
        odds = score/(1-score)
        df_interp['netbenx'] = df_interp['tpr']*nP/n - (df_interp['fpr']*nN/n)*odds
        df_interp['netbenx_treatnone'] = 0/n - (0/n)*odds
        df_interp['netbenx_treatall'] = nP/n - (nN/n)*odds
    else:
        # versions with costs used for Brier curves
        df_interp['netbenx'] = 2*(1-score)*df_interp['tpr']*nP/n - 2*(df_interp['fpr']*nN/n)*score
        df_interp['netbenx_treatnone'] = 2*0/n - 2*(0/n)*score
        df_interp['netbenx_treatall'] = 2*(1-score)*nP/n - 2*(nN/n)*score

    # thresholds at which every decision line is evaluated (0.00 .. 0.99, so 1-t is never 0)
    thresholds = np.arange(0, 1, 0.01)

    # one decision line per ROC point, i.e. per (TPR, FPR) combination
    for i in range(0, df.shape[0]):
        thisTPR = df.loc[i,'tpr']
        thisFPR = df.loc[i,'fpr']

        if useOddsCosts:
            nb = thisTPR*piP - thisFPR*piN*(thresholds/(1-thresholds))
        else:
            nb = 2*(1-thresholds)*thisTPR*piP - 2*thisFPR*piN*thresholds

        plt.plot(thresholds, nb, color=cols[i], linestyle='-', linewidth=3)

    # decision curve
    plt.plot(df_interp['score'], df_interp['netbenx'], color='black', linestyle='--', label='Decision curve', linewidth=4)

    ## plot decision curve upper envelope
    # (helper reloaded first so that edits to the helper module are picked up)
    importlib.reload(helper)
    upperEnv = helper.upperEnvelopeNB(df, brierCosts=brierCosts)
    plt.plot(upperEnv['cost'], upperEnv['nb'], color='black', linestyle=':', linewidth=4, label='Upper envelope decision curve')

    # either plot whole plot or zoomed in to left-hand side to see what is happening
    if not zoom:
        plt.xlim([0,1])
        plt.ylim([min(df_interp['netbenx'])-0.1,1])
    else:
        # zoomed in version of plot
        plt.xlim([0,0.6])
        plt.ylim([min(df_interp['netbenx'].head(500)),0.5])

    plt.xlabel('Decision threshold')
    plt.ylabel('Net benefit')

    plt.legend()

    plt.show()


# whole plots
# (first argument is brierCosts; each call ends with plt.show(), so every call
# produces its own figure)
plotDCAWithDecisionLines(False)
plotDCAWithDecisionLines(True)

# zoomed in versions
# (x axis limited to [0, 0.6])
plotDCAWithDecisionLines(False, zoom=True)
plotDCAWithDecisionLines(True, zoom=True)


In [None]:

## plot misclassification costs of DCA versus Brier curves

import matplotlib.pyplot as plt 
import numpy as np

# Brier curve costs lie on the line c_P + c_N = 2; DCA costs lie on the line c_P = 1
plt.axline((0, 2), (2, 0), color='green', markersize=1, linestyle='-', linewidth=1, label="Brier curve costs")
plt.axline((1, 0), (1, 2), color='red', markersize=1, linestyle='--', linewidth=1, label="DCA costs")

plt.xlabel('Misclassification cost for positive class (c_P)')
plt.ylabel('Misclassification cost for negative class (c_N)')
plt.xlim(0,2)
plt.ylim(0,4.3)


# mark the cost pair used by each method at thresholds t = 0, 0.2, ..., 1
for t in np.arange(0,1.1, 0.2):

    thresholdText = "{:.2f}".format(t)

    # DCA: c_P = 1 and c_N = t/(1-t). At t = 1 the odds are a division by zero
    # (infinite cost), which cannot be drawn, so that marker is skipped.
    if (1 - t) > 1e-9:
        textXPos = 1
        textYPos = t/(1-t)
        plt.text(textXPos+0.02, textYPos+0.02, thresholdText, fontsize=8)
        plt.scatter(textXPos, textYPos, c='red', s=10)

    # Brier curve: c_P = 2*(1-t) and c_N = 2*t
    textXPosBC = 2*(1-t)
    textYPosBC = 2*t
    plt.text(textXPosBC+0.02, textYPosBC+0.02, thresholdText, fontsize=8)
    plt.scatter(textXPosBC, textYPosBC, c='green', s=10)

plt.legend()
