In [36]:
import pandas as pd
import math

# Two-sided standard-normal critical values paired with the significance level
# they correspond to, ordered from strictest to loosest.
_Z_CRITICAL_LEVELS = (
    (3.891, 0.0001),
    (3.291, 0.001),
    (2.576, 0.01),
    (1.96, 0.05),
)


def _auc_std_err(auc, n_p, n_n):
    """Standard error of one AUC value for the given class counts."""
    auc_sq = auc ** 2
    dp = (n_p - 1) * (auc / (2 - auc) - auc_sq)
    dn = (n_n - 1) * ((2 * auc_sq) / (1 + auc) - auc_sq)
    return math.sqrt((auc * (1 - auc) + dp + dn) / (n_p * n_n))


def auc_test(auc1, auc2, n_p, n_n):
    """Two-sided z-test for the difference between two AUC scores.

    The standard error of each AUC is computed with the closed-form
    approximation of Hanley & McNeil (1982), using Q1 = A / (2 - A) and
    Q2 = 2A^2 / (1 + A). The two standard errors are combined as
    sqrt(se1^2 + se2^2), i.e. without a covariance term.

    Parameters
    ----------
    auc1, auc2 : float
        The two AUC values to compare.
    n_p, n_n : int
        Sample counts used in the standard-error formula (presumably the
        numbers of positive and negative examples -- confirm with the data).
        Both AUCs are evaluated with the same counts.

    Returns
    -------
    tuple
        ``(z, level, is_sig)`` where ``z`` is the test statistic, ``level`` is
        the strictest of 0.0001 / 0.001 / 0.01 / 0.05 whose critical value
        ``abs(z)`` exceeds (``1`` if none), and ``is_sig`` is ``level < 1``.

    Raises
    ------
    ZeroDivisionError
        If ``n_p * n_n`` is zero.
    """
    se1 = _auc_std_err(auc1, n_p, n_n)
    se2 = _auc_std_err(auc2, n_p, n_n)

    diff = auc1 - auc2
    pooled_se = math.sqrt(se1 ** 2 + se2 ** 2)
    if pooled_se == 0:
        # Both standard errors vanish (e.g. both AUCs are exactly 0 or 1).
        # Identical scores are "no difference"; otherwise the gap is
        # infinitely many standard errors wide.
        z = 0.0 if diff == 0 else math.copysign(math.inf, diff)
    else:
        z = diff / pooled_se

    level = 1
    for critical, p_value in _Z_CRITICAL_LEVELS:
        if abs(z) > critical:
            level = p_value
            break

    is_sig = level < 1

    return z, level, is_sig

In [37]:
def pull_auc_scores(filename):
    """Read a metrics CSV and group its AUC scores into a nested dict.

    The CSV must provide the columns ``country``, ``clf``, ``feat_set``,
    ``N_p``, ``N_n`` and ``auc_score``.

    Returns a mapping ``country -> clf -> feat_set -> [scores, N_p, N_n]``.
    ``scores`` lists the ``auc_score`` values in file order; ``N_p`` and
    ``N_n`` are taken from the first row seen for that combination.
    """
    table = pd.read_csv(filename)

    grouped = {}
    for _, record in table.iterrows():
        by_clf = grouped.setdefault(record['country'], {})
        by_feat_set = by_clf.setdefault(record['clf'], {})
        # setdefault only stores this fresh entry the first time a
        # combination appears; later rows reuse the existing one.
        entry = by_feat_set.setdefault(
            record['feat_set'], [[], record['N_p'], record['N_n']]
        )
        entry[0].append(record['auc_score'])

    return grouped

In [38]:
def compare_auc_scores(auc_scores, n_steps=8, verbose=True):
    """Pick, per country, the best model among statistically saturated steps.

    For every ``(clf, feat_set)`` combination the score list is scanned from
    its first entry. A step ``i`` is accepted when ``auc_test`` finds no
    significant difference between its AUC and the AUC of every later step
    (checked from the last step backwards). Only the earliest accepted step
    of each combination is considered. Across all combinations of a country,
    the accepted step with the highest AUC wins.

    Parameters
    ----------
    auc_scores : dict
        Nested mapping ``country -> clf -> feat_set -> [scores, N_p, N_n]``
        as produced by ``pull_auc_scores``.
    n_steps : int or None, optional
        Maximum number of leading scores to consider per combination. It is
        capped at the number of scores actually available, so shorter lists
        no longer raise ``IndexError``. ``None`` uses every score.
        Defaults to 8.
    verbose : bool, optional
        When true (the default), print one line per pairwise comparison.

    Returns
    -------
    dict
        ``country -> [best_auc, [clf, feat_set, step]]`` with ``step`` being
        the 1-based position in the score list. A country for which no
        accepted AUC exceeds 0.0 keeps the initial ``[0.0, []]``.
    """
    best_models = dict()
    for country, by_clf in auc_scores.items():
        best_models[country] = [0.0, list()]

        for clf, by_feat_set in by_clf.items():
            for feat_set, entry in by_feat_set.items():
                scores = entry[0]
                N_p = entry[1]
                N_n = entry[2]

                if n_steps is None:
                    limit = len(scores)
                else:
                    limit = min(n_steps, len(scores))

                for i in range(limit):
                    auc_i = scores[i]

                    # Compare step i with every later step, last one first.
                    all_ns = True
                    for j in range(limit - 1, i, -1):
                        auc_j = scores[j]

                        z, level, is_sig = auc_test(auc_i, auc_j, N_p, N_n)

                        if verbose:
                            print(i+1, j+1, auc_i, auc_j, 'z={:.3f}'.format(z), country, clf, feat_set)

                        if is_sig:
                            all_ns = False
                            break

                    if all_ns:
                        # Earliest step that no later step significantly
                        # differs from: record it if it beats the current
                        # best, then stop scanning this combination.
                        if auc_i > best_models[country][0]:
                            best_models[country] = [auc_i, [clf, feat_set, i + 1]]
                        break
    return best_models
                        

In [39]:
# Driver cell: read the metrics CSV, group its AUC scores, then select the
# best model per country. The path is relative to the notebook's working
# directory.
infile_path = '../input/inc-metrics/inc_metrics_us.csv'
auc_scores = pull_auc_scores(infile_path)
# Note: compare_auc_scores prints one line for every pairwise comparison.
best_models = compare_auc_scores(auc_scores)

1 8 0.686534723 0.8363365 z=-85.281 US CART inc-only
2 8 0.648681014 0.8363365 z=-105.903 US CART inc-only
3 8 0.696606627 0.8363365 z=-79.807 US CART inc-only
4 8 0.78235774 0.8363365 z=-32.285 US CART inc-only
5 8 0.767928959 0.8363365 z=-40.496 US CART inc-only
6 8 0.756453329 0.8363365 z=-46.941 US CART inc-only
7 8 0.813997624 0.8363365 z=-13.724 US CART inc-only
1 8 0.68527965 0.895047536 z=-127.311 US CART appended
2 8 0.758517972 0.895047536 z=-86.080 US CART appended
3 8 0.797880094 0.895047536 z=-63.331 US CART appended
4 8 0.837774705 0.895047536 z=-39.060 US CART appended
5 8 0.859308779 0.895047536 z=-25.128 US CART appended
6 8 0.873939209 0.895047536 z=-15.196 US CART appended
7 8 0.879216502 0.895047536 z=-11.502 US CART appended
1 8 0.936669116 0.982071079 z=-52.553 US RF inc-only
2 8 0.917554486 0.982071079 z=-68.059 US RF inc-only
3 8 0.933966083 0.982071079 z=-54.875 US RF inc-only
4 8 0.975598719 0.982071079 z=-10.152 US RF inc-only
5 8 0.970818895 0.982071079 z=-1

In [40]:
# Show the result: country -> [best AUC, [clf, feat_set, 1-based step]].
best_models

{'US': [0.994919347, ['XGB', 'appended', 8]]}