In [None]:
import pandas as pd
import math

# Two-sided standard-normal critical values paired with the significance level
# they correspond to, ordered from most to least stringent.
_Z_CRITICAL_LEVELS = (
    (3.891, 0.0001),
    (3.291, 0.001),
    (2.576, 0.01),
    (1.96, 0.05),
)


def _hanley_mcneil_se(auc, n_p, n_n):
    """Return the Hanley & McNeil (1982) standard error of a single AUC."""
    auc_sq = pow(auc, 2)
    dp = (n_p - 1) * (auc / (2 - auc) - auc_sq)
    dn = (n_n - 1) * ((2 * auc_sq) / (1 + auc) - auc_sq)
    return math.sqrt((auc * (1 - auc) + dp + dn) / (n_p * n_n))


def auc_test(auc1, auc2, n_p, n_n):
    """Z-test for the difference between two AUC values.

    Each AUC's standard error is estimated with the Hanley & McNeil
    approximation, and the two errors are pooled as if the AUCs were
    independent (no covariance term is subtracted).

    Args:
        auc1: First AUC value.
        auc2: Second AUC value.
        n_p: Number of positive samples (used for both AUCs).
        n_n: Number of negative samples (used for both AUCs).

    Returns:
        A tuple ``(z, level, is_sig)``:
        z is the z-score of ``auc1 - auc2``; level is the smallest of
        0.0001 / 0.001 / 0.01 / 0.05 whose two-sided critical value ``|z|``
        exceeds, or ``1`` when none is exceeded; is_sig is ``True`` when
        ``level < 1``.

    Raises:
        ZeroDivisionError: if ``n_p * n_n`` is zero.
    """
    se1 = _hanley_mcneil_se(auc1, n_p, n_n)
    se2 = _hanley_mcneil_se(auc2, n_p, n_n)

    diff = auc1 - auc2
    pooled_se = math.sqrt(pow(se1, 2) + pow(se2, 2))
    if pooled_se == 0:
        # Both standard errors vanish when each AUC is exactly 0 or 1.
        # Equal AUCs are simply "no difference"; unequal ones are infinitely
        # far apart in standard-error units.
        z = 0.0 if diff == 0 else math.copysign(math.inf, diff)
    else:
        z = diff / pooled_se

    level = 1
    abs_z = abs(z)
    for critical, alpha in _Z_CRITICAL_LEVELS:
        if abs_z > critical:
            level = alpha
            break

    is_sig = level < 1

    return z, level, is_sig

In [None]:
def pull_auc_scores(filename):
    """Read the metrics CSV and group its AUC scores into a nested dict.

    The CSV must provide the columns ``country``, ``clf``, ``feat_set``,
    ``N_p``, ``N_n`` and ``auc_score``.

    Returns:
        ``{country: {clf: {feat_set: [scores, N_p, N_n]}}}`` where ``scores``
        lists every ``auc_score`` of the group in file order, and ``N_p`` /
        ``N_n`` come from the first row seen for that group.
    """
    metrics = pd.read_csv(filename)

    grouped = {}
    for _, record in metrics.iterrows():
        country_key = record['country']
        clf_key = record['clf']
        feat_key = record['feat_set']

        per_clf = grouped.setdefault(country_key, {})
        per_feat = per_clf.setdefault(clf_key, {})
        # setdefault keeps the first-seen N_p / N_n for an existing group.
        entry = per_feat.setdefault(feat_key, [[], record['N_p'], record['N_n']])
        entry[0].append(record['auc_score'])

    return grouped

In [None]:
def compare_auc_scores(auc_scores, n_increments=8, out_path='./comparisons.csv'):
    """Pick, per country, the best increment that later increments do not beat.

    For every (country, clf, feat_set) the increments are scanned in order.
    Increment ``i`` is compared with the later increments, from the last one
    back towards ``i + 1``, using ``auc_test``. Every comparison performed is
    written as one row to ``out_path``. The first increment that shows no
    significant difference from any later increment is that feature set's
    candidate, and the candidate with the highest AUC wins for the country.

    Args:
        auc_scores: Nested dict
            ``{country: {clf: {feat_set: [scores, N_p, N_n]}}}``.
            NOTE(review): scores are presumably ordered by increment; confirm
            against how the input CSV is produced.
        n_increments: Maximum number of leading scores to consider per
            feature set (default 8). Clamped to the number of scores
            available; ``None`` means use all of them.
        out_path: Destination of the comparison CSV (overwritten).

    Returns:
        ``{country: [best_auc, [clf, feat_set, increment]]}`` with a 1-based
        increment. A country with no candidate above 0.0 keeps ``[0.0, []]``.
    """
    header = 'country,clf,feat_set,inc_i,inc_j,auc_i,auc_j,z-score,N,N_p,N_n\n'
    row_format = '{},{},{},{},{},{},{},{:.3f},{},{},{}\n'

    best_models = dict()
    # The context manager closes the file even when a comparison raises.
    with open(out_path, 'w') as outfile:
        outfile.write(header)

        for country, by_clf in auc_scores.items():
            best_models[country] = [0.0, list()]

            for clf, by_feat_set in by_clf.items():
                for feat_set, (scores, N_p, N_n) in by_feat_set.items():
                    if n_increments is None:
                        count = len(scores)
                    else:
                        # Clamp so short score lists no longer raise IndexError.
                        count = min(n_increments, len(scores))

                    for i in range(count):
                        auc_i = scores[i]

                        all_ns = True
                        # Walk from the last increment back down to i + 1.
                        for j in range(count - 1, i, -1):
                            auc_j = scores[j]

                            z, _level, is_sig = auc_test(auc_i, auc_j, N_p, N_n)

                            outfile.write(row_format.format(
                                country, clf, feat_set, i + 1, j + 1,
                                auc_i, auc_j, z, N_p + N_n, N_p, N_n))

                            if is_sig:
                                all_ns = False
                                break

                        if all_ns:
                            if auc_i > best_models[country][0]:
                                best_models[country] = [auc_i, [clf, feat_set, i + 1]]
                            # Only the earliest qualifying increment counts.
                            break

    return best_models
                        

In [None]:
# Driver cell: load the AUC metrics, then run the pairwise increment comparisons.
infile_path = '../input/country-metrics/inc_metrics.csv'
# Nested dict: country -> clf -> feat_set -> [auc_score list, N_p, N_n].
auc_scores = pull_auc_scores(infile_path)
# Besides returning the per-country winners, this call writes every
# comparison it performs to a CSV file.
best_models = compare_auc_scores(auc_scores)

In [None]:
# Show the per-country result: country -> [best_auc, [clf, feat_set, increment]].
best_models