In [7]:
import pandas as pd
import math

# Two-tailed standard-normal critical values, strongest level first, paired
# with the significance label printed when |z| exceeds them.
_Z_CRITICAL_LEVELS = (
    (3.891, 'p < 0.0001'),
    (3.291, 'p < 0.001'),
    (2.576, 'p < 0.01'),
    (1.96, 'p < 0.05'),
)


def _hanley_mcneil_se(auc, n_p, n_n):
    """Return the Hanley-McNeil standard error of one AUC estimate."""
    auc_sq = auc ** 2
    dp = (n_p - 1) * (auc / (2 - auc) - auc_sq)
    dn = (n_n - 1) * ((2 * auc_sq) / (1 + auc) - auc_sq)
    return math.sqrt((auc * (1 - auc) + dp + dn) / (n_p * n_n))


def auc_test(auc1, auc2, n_p, n_n):
    """Compare two AUC scores with a z-test and print the significance level.

    The standard error of each AUC is estimated from the AUC itself and the
    class counts (Hanley-McNeil formula); the two estimates are combined as
    if they were independent.

    Parameters
    ----------
    auc1, auc2 : float
        The two AUC values to compare.
    n_p, n_n : int
        Number of positive and negative samples; the same counts are used
        for both AUCs.

    Returns
    -------
    float
        The z statistic ``(auc1 - auc2) / sqrt(se1**2 + se2**2)``. When both
        standard errors are zero (e.g. both AUCs are exactly 1.0) the result
        is 0.0 for equal AUCs and +/-inf otherwise, instead of a division by
        zero.

    Side effects
    ------------
    Prints one line: either the z value with the strongest two-tailed
    significance level reached, or ``not significant``.
    """
    se1 = _hanley_mcneil_se(auc1, n_p, n_n)
    se2 = _hanley_mcneil_se(auc2, n_p, n_n)

    diff = auc1 - auc2
    pooled_se = math.sqrt(se1 ** 2 + se2 ** 2)
    if pooled_se == 0:
        # Degenerate case: both variance estimates vanish, so the ratio is
        # 0/0 (no difference) or x/0 (difference with no estimated noise).
        z = 0.0 if diff == 0 else math.copysign(math.inf, diff)
    else:
        z = diff / pooled_se

    for critical, label in _Z_CRITICAL_LEVELS:
        if abs(z) > critical:
            print('z = {:.3f}, '.format(z) + label)
            break
    else:
        print('not significant')

    return z

In [8]:
def pull_auc_scores(filename):
    """Load AUC scores from a CSV file into a nested dictionary.

    The CSV must provide the columns ``country``, ``clf``, ``feat_set``,
    ``N_p``, ``N_n`` and ``auc_score``.

    Returns
    -------
    dict
        ``result[country][clf][feat_set]`` is a three-item list
        ``[scores, N_p, N_n]`` where ``scores`` holds every ``auc_score`` of
        that group in file order, and ``N_p`` / ``N_n`` are taken from the
        first row encountered for the group.
    """
    table = pd.read_csv(filename)

    grouped = {}
    for _, record in table.iterrows():
        country, clf, feat_set = record['country'], record['clf'], record['feat_set']

        by_feat_set = grouped.setdefault(country, {}).setdefault(clf, {})
        if feat_set not in by_feat_set:
            # First row of this group fixes the class counts.
            by_feat_set[feat_set] = [[], record['N_p'], record['N_n']]

        by_feat_set[feat_set][0].append(record['auc_score'])

    return grouped

In [11]:
def compare_auc_scores(auc_scores):
    """Run ``auc_test`` on every pair of AUC scores within each group.

    Parameters
    ----------
    auc_scores : dict
        Nested mapping ``auc_scores[country][clf][feat_set]`` whose leaves
        are sequences ``[scores, N_p, N_n]``.

    For each leaf, every pair of positions ``(i, j)`` with ``i < j`` is
    compared; for a given ``i`` the partner ``j`` runs from the last score
    down to ``i + 1``. The number of scores is taken from the list itself, so
    groups with any number of scores are handled (fewer than two scores yield
    no comparisons). Results are reported by ``auc_test``; nothing is
    returned.
    """
    for by_clf in auc_scores.values():
        for by_feat_set in by_clf.values():
            for entry in by_feat_set.values():
                scores = entry[0]
                n_p = entry[1]
                n_n = entry[2]

                n_scores = len(scores)
                for i in range(n_scores - 1):
                    # Walk j downwards from the last score to i + 1.
                    for j in range(n_scores - 1, i, -1):
                        auc_test(scores[i], scores[j], n_p, n_n)

In [None]:
# Driver cell: read the metrics CSV into the nested
# country -> clf -> feat_set dictionary, then run the pairwise AUC
# comparisons, which print one result line per compared pair.
infile_path = '../input/inc-metrics/inc_metrics_us.csv'
auc_scores = pull_auc_scores(infile_path)
# compare_auc_scores returns nothing; its output is the printed lines only.
compare_auc_scores(auc_scores)