In [None]:

# my helper functions for this project
import importlib
import helper
# Reload right after importing so that edits made to helper.py while the kernel
# is running are picked up without a kernel restart.
importlib.reload(helper)


In [None]:
# Build the toy evaluation set used by the rest of the notebook.

import pandas as pd
import numpy as np

# Label convention: 1 = positive class, 0 = negative class.
# Scores are oriented so that a higher score means "more likely positive".
d = {
    'score': [0.05, 0.10, 0.20, 0.70, 0.70, 0.90, 0.90, 0.95],
    'label': [0, 0, 1, 0, 1, 1, 0, 1],
}
df = pd.DataFrame.from_dict(d)
df




In [None]:
# calculate TPR and FPR for each point on ROC space

# Reload first so the call below uses the current contents of helper.py.
importlib.reload(helper)
# NOTE(review): df is replaced by the helper's result, so re-running this cell feeds
# the already-processed frame back into calculateROCPoints. Confirm the helper
# tolerates that, or re-run the data cell first.
# The plotting cells below read the 'fpr' and 'tpr' columns of the returned frame.
df = helper.calculateROCPoints(df)

df


In [None]:

# ROC curve. Every operating point gets its own colour so that it can be matched to
# the corresponding line in the cost-curve plots further down.

import matplotlib.pyplot as plt

# one colour per ROC point, in row order (also reused by the cost / skew plots)
cols = [
    'blue', 'green', 'orange',
    'red', 'pink', 'lightgreen',
    'red', 'lightgrey', 'lightgreen',
]

# ROC polyline plus a coloured marker at every operating point
plt.plot(df['fpr'], df['tpr'])
plt.scatter(x=df['fpr'], y=df['tpr'], s=100, c=cols)

# convex hull, drawn as a hard-coded segment from (0, 0.25) to (0.5, 1)
plt.plot((0, 0.5), (0.25, 1), color='green', ls='--')

plt.gca().set(xlabel='FPR', ylabel='TPR')
plt.show()



In [None]:
## Cost curve: one line per ROC point, coloured to show the correspondence with the
## ROC curve above.

import matplotlib.pyplot as plt 

# class distribution estimated from the labels
nN = np.sum(df.label==0)
nP = np.sum(df.label==1)
piP = nP/(nP+nN)
piN = 1-piP

# NOTE(review): b, c_0 and c are not used anywhere in this cell; kept unchanged in
# case another cell relies on them.
b = 2
c_0 = 1
c = c_0 / 2

fig, ax = plt.subplots()

# Each ROC point (fpr, tpr) maps to a straight line in cost space:
#     loss(cost) = intercept + gradient * cost
#     intercept  = 2 * piP * (1 - tpr)
#     gradient   = 2 * (piN * fpr - piP * (1 - tpr))
# Rows and colours are paired by position (zip), which is the same pairing the ROC
# scatter plot uses, so the colours agree even if df does not have a 0..n-1 index.
for fpr, tpr, colour in zip(df['fpr'], df['tpr'], cols):
    intercept = 2 * piP * (1 - tpr)
    gradient = 2 * (piN * fpr - piP * (1 - tpr))
    ax.axline((0, intercept), slope=gradient, color=colour, linewidth=3)

# lower envelope of the cost lines (computed by the helper module)
importlib.reload(helper)
lowerEnv = helper.lowerEnvelope(df)
ax.plot(lowerEnv['cost'], lowerEnv['loss'], color='black', linestyle=':', linewidth=3)

# Limits and labels are identical for every line, so they are set once, on the same
# Axes object that everything above was drawn on.
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_xlabel('Cost')
ax.set_ylabel('Loss');


In [None]:
## Skew curve: one line per ROC point, using the same colours as the ROC plot.

import matplotlib.pyplot as plt 

# class distribution estimated from the labels
# (not used by the skew lines below, but this resets the notebook-level piP / piN)
nN = np.sum(df.label==0)
nP = np.sum(df.label==1)
piP = nP/(nP+nN)
piN = 1-piP

# NOTE(review): b, c_0 and c are not used anywhere in this cell; kept unchanged in
# case another cell relies on them.
b = 2
c_0 = 1
c = c_0 / 2

fig, ax = plt.subplots()

# Each ROC point (fpr, tpr) maps to a straight line in skew space:
#     loss(z) = intercept + gradient * z = (1 - z) * (1 - tpr) + z * fpr
#     intercept = 1 - tpr
#     gradient  = fpr + tpr - 1
# Rows and colours are paired by position (zip), which is the same pairing the ROC
# scatter plot uses, so the colours agree even if df does not have a 0..n-1 index.
for fpr, tpr, colour in zip(df['fpr'], df['tpr'], cols):
    intercept = 1 - tpr
    gradient = fpr + tpr - 1
    ax.axline((0, intercept), slope=gradient, color=colour, linewidth=3)

# lower envelope
# NOTE(review): this calls helper.lowerEnvelope, while a later cell uses
# helper.lowerEnvelopeSkew for the skew axis. Confirm which one is intended here.
importlib.reload(helper)
lowerEnv = helper.lowerEnvelope(df)
ax.plot(lowerEnv['cost'], lowerEnv['loss'], color='black', linestyle=':', linewidth=3)

# Limits and labels are identical for every line, so they are set once.
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_xlabel('Skew')
ax.set_ylabel('Loss');


In [None]:
# Cost curves (lower envelopes) for different class distributions.

importlib.reload(helper)

ls=['-', '--', ':', '-.', '-', '--']

# Create the figure explicitly so that labels, limits and legend apply to this plot
# and not to an Axes object left over from an earlier cell.
fig, ax = plt.subplots()

# Positive-class priors 0.0, 0.2, ..., 1.0.
# enumerate() supplies the colour / line-style index directly instead of recovering
# it from the float prior, and the dedicated loop name leaves the notebook-level
# piP (the prior estimated from the data) untouched.
for k, prior in enumerate(np.arange(0, 1.01, 0.2)):
    lowerEnv = helper.lowerEnvelope(df, prior)
    ax.plot(
        lowerEnv['cost'],
        lowerEnv['loss'],
        color=cols[k],
        linestyle=ls[k],
        label=r"$\pi_{POS}$=" + f"{prior:.1f}",
        linewidth=3,
    )

ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

ax.legend()

ax.set_xlabel('Cost')
ax.set_ylabel('Loss');


In [None]:
# Cost curve (lower envelope) with skew on the x axis.

importlib.reload(helper)

# Create the figure explicitly so that labels and limits apply to this plot and not
# to an Axes object left over from an earlier cell.
fig, ax = plt.subplots()

lowerEnv = helper.lowerEnvelopeSkew(df)
# NOTE(review): the x values are read from the 'cost' column of the helper's result
# even though the axis is labelled as skew. Confirm the column naming in helper.
ax.plot(lowerEnv['cost'], lowerEnv['loss'], color='blue', linestyle='-', linewidth=3)

ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

ax.set_xlabel('Skew')
ax.set_ylabel('Loss');

In [None]:


import matplotlib.pyplot as plt 

# class distribution estimated from the labels
nN = np.sum(df.label==0)
nP = np.sum(df.label==1)
piP = nP/(nP+nN)
piN = 1-piP

# NOTE(review): b, c_0 and c are not used anywhere in this cell; kept unchanged in
# case another cell relies on them.
b = 2
c_0 = 1
c = c_0 / 2

fig, ax = plt.subplots()

# Each ROC point (fpr, tpr) maps to a straight line in cost space:
#     loss(cost) = intercept + gradient * cost
#     intercept  = 2 * piP * (1 - tpr)
#     gradient   = 2 * (piN * fpr - piP * (1 - tpr))
# Rows and colours are paired by position (zip), which is the same pairing the ROC
# scatter plot uses, so the colours agree even if df does not have a 0..n-1 index.
for fpr, tpr, colour in zip(df['fpr'], df['tpr'], cols):
    intercept = 2 * piP * (1 - tpr)
    gradient = 2 * (piN * fpr - piP * (1 - tpr))
    ax.axline((0, intercept), slope=gradient, color=colour, linewidth=2)

# lower envelope of the cost lines (computed by the helper module)
importlib.reload(helper)
lowerEnv = helper.lowerEnvelope(df)
ax.plot(lowerEnv['cost'], lowerEnv['loss'], color='black', linestyle=':', linewidth=2)

# Limits and labels are identical for every line, so they are set once, on the same
# Axes object that everything above was drawn on.
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_xlabel('Cost')
ax.set_ylabel('Loss');

### Add brier curve with cost as the operating condition

# this interpolates the scores but keeps the fpr and tpr as segments (i.e. it's empirical with only a few examples)
def interpolateScores(df, factor=100):
    """Densify the score axis of a ROC table while keeping fpr/tpr piecewise constant.

    The scores are linearly interpolated between consecutive rows, but every
    interpolated row carries the ``fpr`` and ``tpr`` of the row that starts its
    segment, so the result is a step function of the score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'score'``, ``'fpr'`` and ``'tpr'``. The frame is
        not modified; all work is done on a copy.
    factor : int, default 100
        Number of interpolation steps generated for each pair of consecutive rows.

    Returns
    -------
    pandas.DataFrame
        ``factor`` rows per segment plus one final row. The interpolated rows hold
        only ``score``, ``fpr`` and ``tpr``; the final row is the last row of the
        extended table, so any other input columns are filled only there.

    Raises
    ------
    ValueError
        If ``factor`` is smaller than 1.
    """
    import pandas as pd

    if factor < 1:
        raise ValueError("factor must be a positive integer")

    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    # End point of the score range: score = 1 with fpr = tpr = 0.
    # NOTE(review): with a default 0..n-1 index this overwrites the last existing row
    # rather than appending a new one. Confirm that the last row of the input is
    # meant to be replaced.
    # The label is computed once so that all three assignments hit the same row.
    last = len(df) - 1
    df.loc[last, 'score'] = 1
    df.loc[last, 'fpr'] = 0
    df.loc[last, 'tpr'] = 0

    # Start point of the score range: score = 0 with fpr = tpr = 1.
    new_row = pd.DataFrame([{'score': 0, 'fpr': 1, 'tpr': 1}])
    df = pd.concat([new_row, df], ignore_index=True)

    new_rows = []

    for i in range(len(df) - 1):
        startx = df.iloc[i]
        endx = df.iloc[i + 1]

        for j in range(factor):
            t = j / factor  # fraction of the way from startx to endx, in [0, 1)

            new_rows.append({
                'score': startx['score'] * (1 - t) + endx['score'] * t,
                # fpr / tpr are deliberately not interpolated (empirical segments)
                'fpr': startx['fpr'],
                'tpr': startx['tpr'],
            })

    # Close the range with the last row itself (t would be 1 on the final segment).
    new_rows.append(df.iloc[-1].to_dict())

    return pd.DataFrame(new_rows)
    
# show the ROC table that the interpolation starts from
print(df)
df_interp = interpolateScores(df)

# Brier curve: the interpolated score is used as the cost value, and the loss is
# evaluated at the fpr / tpr of the segment the score falls in:
#     loss = 2 * (1 - score) * piP * (1 - tpr) + 2 * score * piN * fpr
df_interp['brier_loss'] = 2*(1-df_interp['score'])*piP*(1-df_interp['tpr']) + 2*df_interp['score']*piN*df_interp['fpr']
ax.plot(df_interp['score'], df_interp['brier_loss'], color="#d37af6", linestyle='--', label='Model Brier curve', linewidth=3)

# The curve above is given a label, so draw the legend to make it visible.
ax.legend()

ax.set_xlabel('Cost')
ax.set_ylabel('Loss')

# trailing ';' keeps the returned limits tuple out of the cell output
ax.set_xlim(0, 1)
ax.set_ylim(0, 1);

