In [None]:
import numpy as np
import pandas as pd
import scipy.spatial.distance

In [None]:
tmp = np.array([0,0,1,0,3,1,3,3,5,5])
np.bincount(tmp, weights=1/np.arange(1,len(tmp)+1))

In [None]:
def self_dist_matrix(matrix):
    dmatrix = scipy.spatial.distance.pdist(matrix, metric='euclidean')
    dmatrix = scipy.spatial.distance.squareform(dmatrix)
    np.fill_diagonal(dmatrix, np.inf)
    return dmatrix

def ab_dist_matrix(matrixA, matrixB):
    #calculate distance from matrixA vectors to MatrixB vectors
    dmatrix = scipy.spatial.distance.cdist(matrixA, matrixB, metric='euclidean')
    return dmatrix

def knn_predict(neighbors, immigrants, neighbor_labels, kmax, batch_memory_limit = 10**8, weighted=True):
    """
    NOTE: numpy arrays are passed by reference
    -- any changes made in this function will persist past scope
    """
    print('neighbors: {}\nimmigrants: {}'.format(neighbors.shape, immigrants.shape))
    def predict(vec, labels=neighbor_labels, kmax=kmax, weighted=weighted):
        """
        calc predictions for a single vector,
        k=1 -> k=kmax (returns a np.array, shape = (kmax,len(labels)))
        """
        #kmax_kclosest_indexes = np.argpartition(vec,kmax)[:kmax] #had issues with indexes being incorrect
        kmax_closest_indexes_ordered = np.argsort(vec)[:kmax]
        #sort the k-closest neighbors based on distance, and get their labels
        kmax_closest_labels  = neighbor_labels[kmax_closest_indexes_ordered]
        if weighted:
            np.bincount(tmp, weights=1/np.arange(1,len(tmp)+1))
            probs = np.cumsum(kmax_closest_labels / np.arange(1,kmax+1)) / np.cumsum(1/np.arange(1,kmax+1))
            return probs
        return np.cumsum(kmax_closest_labels)/np.arange(1,kmax+1)
    
    batch_size = int(batch_memory_limit / (8*neighbors.shape[0]))
    print('batch_size: {}'.format(batch_size))
    curindex   = 0
    preds      = []
    while curindex < immigrants.shape[0]:
        print('\rcurindex: {}'.format(curindex), end='')
        dmat      = ab_dist_matrix(immigrants[curindex:(curindex+batch_size)], neighbors)
        preds    += list(map(predict, dmat))
        curindex += batch_size    
    return np.array(preds)

def kappa(probs, actual, cutoff):
    #this is correct, and ~100x faster than scikit learn's cohen_kappa_score() function
    #compared against scikit learn's kappa function and answers were the same
    class_preds                      = np.zeros(len(pred_probs))
    class_preds[pred_probs > cutoff] = 1
    class_preds                      = class_preds.astype(bool)
    n     = len(actual_bools)
    
    preds_true  = class_preds.sum()
    actual_true = actual_bools.sum()
    acc   = (class_preds == actual_bools).sum() / n         #(TP+TN)/(P+N)
    po    = acc
    pe1   = (preds_true / n) * (actual_true / n)
    pe2   = ((n-preds_true) / n) * ((n-actual_true) / n)
    pe    = pe1 + pe2
    kappa = (po-pe)/(1-pe)

    return kappa

def explore_cutoff_knn(probs, actual, step=0.01, metric = 'kappa'):
    kmax          = len(probs[0])
    cutoffs       = np.array(np.arange(0.0,1.0,step).tolist())
    metric_matrix = np.zeros((kmax+1, len(cutoffs)))
    if metric == 'kappa':
        for k in range(kmax):
            for i,cutoff in enumerate(cutoffs):
                metric_matrix[k][i] = kappa(probs[:,k], actual, cutoff)
    else:
        for k in range(kmax):
            for i,cutoff in enumerate(cutoffs):
                class_preds                      = np.zeros(len(actual))
                class_preds[probs[:,k] > cutoff] = 1
                class_preds                      = class_preds.astype(bool)
                if metric == 'acc':
                    metric_matrix[k][i] = (class_preds == actual).sum() / (len(actual))
                elif metric == 'sens':
                    metric_matrix[k][i] = (class_preds & actual).sum() / actual.sum()
                elif metric == 'spec':
                    metric_matrix[k][i] = ((~class_preds) & (~actual)).sum() / (~actual).sum()
    
    fig = go.Figure(data=[go.Surface(z=metric_matrix, x=cutoffs, y=np.arange(1,kmax+1).tolist(),
                                    hovertemplate = '<b>Cutoff</b>: %{x:.3f}' + '<br><b>K</b>: %{y}<br>')])
    fig.update_layout(title='{} as a function of K and Cutoff point'.format(metric),
                      autosize=True,
                      scene={'xaxis': {'title': 'Cutoff'},'yaxis': {'title': 'K'}, 'zaxis': {'title': '{}'.format(metric)}})
    fig.show()

In [None]:
def explore_cutoff(pred_probs, actual_bool, step=0.001):
    
    length      = len(np.arange(0.0,1.0,step).tolist())
    accuracy    = np.zeros(length)
    sensitivity = np.zeros(length)
    specificity = np.zeros(length)
    kappa       = np.zeros(length)
    i = 0

    for cutoff in np.arange(0.0,1.0,step):
        class_preds                      = np.zeros(len(pred_probs))
        class_preds[pred_probs > cutoff] = 1
        class_preds                      = class_preds.astype(bool)
        #print('{} | pred_probs: {}\nclass_preds: {}\nactual_bool: {}'.format(cutoff, pred_probs, class_preds, actual_bool))
        
        acc  = (class_preds == actual_bool).sum() / len(actual_bool)          #(TP+TN)/(P+N)
        sens = (class_preds & actual_bool).sum() / actual_bool.sum()          #TP/P
        spec = ((~class_preds) & (~actual_bool)).sum() / (~actual_bool).sum() #TN/N
        
        po    = acc
        pe1   = (class_preds.sum() / len(actual_bool)) * (actual_bool.sum()/len(actual_bool))
        pe2   = ((~class_preds).sum() / len(actual_bool)) * ((~actual_bool).sum()/len(actual_bool))
        pe    = pe1 + pe2
        kppa  = (po-pe)/(1-pe)
        accuracy[i]    = acc
        sensitivity[i] = sens
        specificity[i] = spec
        kappa[i]       = kppa
        i += 1

    df = pd.DataFrame({'Cutoff':np.arange(0.0,1.0,step).tolist(), 'Accuracy':accuracy, 'Sensitivity':sensitivity, 'Specificity':specificity, 'Kappa':kappa})
    df = pd.melt(df, id_vars=['Cutoff'], value_vars=['Accuracy','Sensitivity','Specificity','Kappa'])
    return px.line(df, x = 'Cutoff', y = 'value', color='variable')
