In [1]:
import pickle
import numpy as np
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from proglearn.forest import UncertaintyForest
import time
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve, plot_roc_curve, auc


In [7]:
#load clusters hayden generated
#clusterIdx to actual label name lookup
conditions = ['No Finding', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion']
# Every condition owns 5 consecutive fine cluster ids
# (0-4 -> 'No Finding', 5-9 -> 'Atelectasis', ...).
# Integer floor division is used instead of int(np.math.floor(i / 5)):
# the np.math alias was removed in NumPy 2.0 and raises AttributeError there.
classLabelToFindName = {i: conditions[i // 5] for i in range(len(conditions) * 5)}
# condition name -> column index used for the per-condition scores
categoryNamesIdxLookup = {c : i for i, c in enumerate(conditions)}

#position indexed cluster idx
# NOTE: pickle can execute arbitrary code on load; only point these at trusted files.
# Context managers close the file handles that pickle.load(open(...)) used to leak.
with open('/home/weiwya/mini_classes.pkl', 'rb') as fh:
    # indexed by clustering run; each entry is consumed as a {Path: fine label} mapping by get_data
    all_class_labels = pickle.load(fh)
with open('/home/weiwya/class_conditional_clusters.pkl', 'rb') as fh:
    # indexed by clustering run; each entry assigns a coarse cluster id to every fine label position
    hierarchy = pickle.load(fh)


n_classes = len(classLabelToFindName)
n_actual_classes = len(conditions)

print(n_classes, n_actual_classes)

print(classLabelToFindName)
print(categoryNamesIdxLookup)

30 6
{0: 'No Finding', 1: 'No Finding', 2: 'No Finding', 3: 'No Finding', 4: 'No Finding', 5: 'Atelectasis', 6: 'Atelectasis', 7: 'Atelectasis', 8: 'Atelectasis', 9: 'Atelectasis', 10: 'Cardiomegaly', 11: 'Cardiomegaly', 12: 'Cardiomegaly', 13: 'Cardiomegaly', 14: 'Cardiomegaly', 15: 'Consolidation', 16: 'Consolidation', 17: 'Consolidation', 18: 'Consolidation', 19: 'Consolidation', 20: 'Edema', 21: 'Edema', 22: 'Edema', 23: 'Edema', 24: 'Edema', 25: 'Pleural Effusion', 26: 'Pleural Effusion', 27: 'Pleural Effusion', 28: 'Pleural Effusion', 29: 'Pleural Effusion'}
{'No Finding': 0, 'Atelectasis': 1, 'Cardiomegaly': 2, 'Consolidation': 3, 'Edema': 4, 'Pleural Effusion': 5}


In [3]:
def get_data(idx, all_class_labels,
             data_path='/home/weiwya/teamdrive_bak/weiwei_temp_data/CheXpert-v1.0-small/train_frontal_Bit_m-r101x1_with_labels.p'):
    """Build the feature matrix and fine-label vector for one clustering run.

    Parameters
    ----------
    idx : index into ``all_class_labels`` selecting the clustering run.
    all_class_labels : indexable whose ``idx``-th entry is a mapping
        ``{Path: fine class label}``.
    data_path : path of the pickled DataFrame holding the ``Path``, ``AP/PA``,
        ``vector`` and per-condition columns. Defaults to the location the
        notebook originally hard-coded.

    Returns
    -------
    (X, Y) : ``X`` is the array built from the ``vector`` column and ``Y`` the
        matching ``class_label`` values, restricted to AP views that have an
        entry in ``all_class_labels[idx]``.
    """
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    # The context manager closes the handle that pickle.load(open(...)) leaked.
    with open(data_path, 'rb') as fh:
        org_data = pickle.load(fh)

    # Keep AP views only (PA views are deliberately dropped).
    org_data = org_data.loc[org_data['AP/PA'] == 'AP']

    class_labels = pd.DataFrame(
        data=[[path, int(label)] for path, label in all_class_labels[idx].items()],
        columns=["Path", "class_label"],
    ).astype({"Path": str, "class_label": int})

    # Inner join: rows without a cluster label for this run are discarded.
    org_data = org_data.merge(class_labels, on='Path')

    cols = ['vector', 'class_label', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion',  ]

    #select only cared cols
    # -1 entries are mapped to 0; rows with several positive findings are NOT
    # filtered out. Only 'vector' and 'class_label' are consumed below.
    data = org_data[cols].replace(-1, 0)

    X = np.array([vec for vec in data['vector']])
    Y = data['class_label'].to_numpy()

    print(X.shape, Y.shape)
    return X, Y



In [4]:
def flat_clf (X, Y, n_estimators, max_depth):
    """Fit one UncertaintyForest directly on the fine labels.

    The labels are first passed through a LabelEncoder; the fitted forest and
    that encoder are returned together as ``(clf, le)``.
    """
    encoder = LabelEncoder()
    encoder.fit(Y)
    encoded_labels = encoder.transform(Y)

    forest = UncertaintyForest(n_estimators=n_estimators, max_depth=max_depth)
    forest.fit(X, encoded_labels)
    return forest, encoder

def gen_fine_level_data(X, fine_label, coarse_label, wanted_idx):
    """Slice out the samples of one coarse cluster and re-encode their fine labels.

    Returns ``(X_selected, Y_fine_selected, le)`` where ``X_selected`` holds the
    rows whose ``coarse_label`` equals ``wanted_idx``, ``Y_fine_selected`` their
    fine labels after LabelEncoder transformation, and ``le`` the encoder fitted
    on those fine labels.
    """
    rows = np.nonzero(coarse_label == wanted_idx)[0]
    fine_subset = fine_label[rows]

    encoder = LabelEncoder()
    encoder.fit(fine_subset)

    return X[rows], encoder.transform(fine_subset), encoder


def gen_fine_clf (X, Y_coarse, Y_fine,  wanted_idx, n_estimators=100, max_depth=10, min_sample = 3):
    """Train the fine-level forest for a single coarse cluster.

    The samples of cluster ``wanted_idx`` are extracted with
    ``gen_fine_level_data``. If fewer than ``min_sample`` rows remain, a message
    is printed and ``None`` is returned; otherwise the result is the pair
    ``(clf_fine, le)`` of the fitted forest and the fine-label encoder.
    """
    X_subset, Y_subset, encoder = gen_fine_level_data(X, Y_fine, Y_coarse, wanted_idx)

    too_few = X_subset.shape[0] < min_sample
    if too_few:
        print ('not enough sample to even try, lets not bother')
        return None

    forest = UncertaintyForest(
        n_estimators=n_estimators,
        max_depth=max_depth,
        tree_construction_proportion=0.5,
    )
    forest.fit(X_subset, Y_subset)
    return forest, encoder


def hieracy_cluster(X_train, Y_train, X_test, Y_test, used_hierarchy, n_estimators=100, max_depth=10 ):
    """Train a two-level classifier: one coarse forest plus one fine forest per coarse cluster.

    Parameters
    ----------
    X_train, Y_train : training features and fine labels.
    X_test, Y_test : accepted for interface compatibility; not used here.
    used_hierarchy : sequence whose position ``i`` holds the coarse cluster id
        of fine label ``i``.
    n_estimators, max_depth : forwarded to every UncertaintyForest.

    Returns
    -------
    ((clf_coarse, coarseLE), fine_clfs, coarseToFineClusters)
        ``fine_clfs`` and ``coarseToFineClusters`` are keyed by the ORIGINAL
        coarse cluster id (not the encoded one). A consumer that needs the
        matching column of ``clf_coarse.predict_proba`` must map the key
        through ``coarseLE``. ``fine_clfs[k]`` is whatever ``gen_fine_clf``
        returned, i.e. ``None`` when the cluster had too few samples.
    """
    #relabel classes to hierarchy labels
    hierarchy_relabel = dict(enumerate(used_hierarchy))

    coarseToFineClusters = defaultdict(list)
    for i, cidx in enumerate(used_hierarchy):
        coarseToFineClusters[cidx].append(i)

    #change fine label to coarse label (original cluster ids)
    Y_train_coarse_raw = np.array([hierarchy_relabel[c] for c in Y_train])

    coarseLE = LabelEncoder().fit(Y_train_coarse_raw)
    Y_train_hierarchy = coarseLE.transform(Y_train_coarse_raw)

    clf_coarse = UncertaintyForest (n_estimators= n_estimators, max_depth= max_depth)
    clf_coarse.fit(X_train, Y_train_hierarchy)

    print('done train corase labels')

    fine_clfs= {}
    for wanted_idx, memebers in coarseToFineClusters.items():
        print('processing %s: memeber_size: %s' %(wanted_idx, len(memebers)))
        # wanted_idx is an original cluster id, so the selection must run against
        # the un-encoded coarse labels. Comparing it with the encoded labels only
        # works when the encoder happens to be the identity mapping.
        fine_clfs[wanted_idx] = gen_fine_clf(X_train, Y_train_coarse_raw, Y_train, wanted_idx, n_estimators=n_estimators, max_depth=max_depth )

    return  (clf_coarse, coarseLE), fine_clfs, coarseToFineClusters

In [29]:
#doing just 1 flat with all fine labels
master_seed = 42
np.random.seed(master_seed)
n_iter = 10
seeds = np.random.randint(10000, size=n_iter)
clfs_flat = []
n_trees = 300
max_depth = 20
scores_flat = defaultdict(list)

#hard code cluster 4
used_hierarchy = hierarchy[4]
X, Y = get_data(4, all_class_labels)


for i in range(n_iter):
    seed =  seeds[i]
    print(i)
    # NOTE(review): only 10% of the data is used for training here, whereas the
    # hierarchical run below trains on 50% -- confirm this asymmetry is intended.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.90, random_state=seed)
    clfs_flat.append(flat_clf(X_train, Y_train, n_estimators=n_trees, max_depth=max_depth))

    #evaluate flat clf
    clf, le = clfs_flat[i]
    pp = clf.predict_proba(X_test)
    # The forest was fitted on encoded labels, so column k of pp belongs to the
    # fine label le.classes_[k] (assumes predict_proba orders its columns by
    # encoded label -- TODO confirm). Without this mapping the columns are
    # misaligned whenever a fine class is absent from the small training split.
    fine_labels = le.classes_
    pt = fine_labels[pp.argmax(axis=1)]
    print(accuracy_score(Y_test, pt))
    n_samples = len(X_test)
    # Sum fine-cluster probabilities into their parent condition.
    res = np.zeros((n_samples, n_actual_classes))
    for ii in range(pp.shape[1]):
        idx = categoryNamesIdxLookup[classLabelToFindName[fine_labels[ii]]]
        res[:, idx] += pp[:, ii]

    yy_test = [categoryNamesIdxLookup[classLabelToFindName[y]] for y in Y_test]

    # One-vs-rest ROC AUC per condition.
    for name, idx in categoryNamesIdxLookup.items():
        truth = [1 if curr == idx else 0 for curr in yy_test]

        predict = res[:, idx]
        predict = np.clip(predict, a_min=0.0, a_max=1.0)
        fpr, tpr, thresholds = roc_curve(truth, predict)
        auc_score = auc(fpr, tpr)
        scores_flat[name].append(auc_score)
        print(name, auc_score)
    

(7285, 2048) (7285,)
0
0.33216409943571756
No Finding 0.7528083712773124
Atelectasis 0.5381793170197591
Cardiomegaly 0.6086546865059186
Consolidation 0.4913530905941682
Edema 0.5921288376382465
Pleural Effusion 0.6506021060290548
1
0.33551929235931066
No Finding 0.7498744247788838
Atelectasis 0.5528341181324619
Cardiomegaly 0.5684341967493332
Consolidation 0.5450318567805923
Edema 0.5714101246837187
Pleural Effusion 0.6488226682685625
2
0.3188958365105994
No Finding 0.7463517821931367
Atelectasis 0.5980163546065624
Cardiomegaly 0.5739588353280485
Consolidation 0.493930828732373
Edema 0.6019006830263545
Pleural Effusion 0.6532186901051247
3
0.3333841695897514
No Finding 0.7353947904051887
Atelectasis 0.5558352874761161
Cardiomegaly 0.610967094688357
Consolidation 0.5272791077340901
Edema 0.5666402466525201
Pleural Effusion 0.6512896166559462
4
0.3437547658990392
No Finding 0.7568727160151791
Atelectasis 0.5590841959431201
Cardiomegaly 0.5982396229137608
Consolidation 0.5294367908618163


In [34]:
# Train the two-level (coarse -> fine) model on clustering run 4, once per seed.
master_seed = 42
np.random.seed(master_seed)
n_iter, n_trees, max_depth = 10, 200, 15
seeds = np.random.randint(10000, size=n_iter)
clfs = list()

used_hierarchy = hierarchy[4]
print(4)
X, Y = get_data(4, all_class_labels)

for i, seed in enumerate(seeds):
    start = time.time()
    # 50/50 split; the same seed is reused later to rebuild the held-out half.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, random_state=seed, test_size=0.50)
    print(X_train.shape, X_test.shape)

    clf_coarse, clf_fines, coarseToFine = hieracy_cluster(
        X_train, Y_train, X_test, Y_test, used_hierarchy,
        n_estimators=n_trees, max_depth=max_depth)
    clfs.append((clf_coarse, clf_fines, coarseToFine))

    print(f'took {time.time() - start}')
    print()

4
(7285, 2048) (7285,)
(3642, 2048) (3643, 2048)
done train corase labels
processing 2: memeber_size: 9
processing 3: memeber_size: 6
processing 4: memeber_size: 6
processing 1: memeber_size: 6
processing 0: memeber_size: 3
took 1092.3584337234497

(3642, 2048) (3643, 2048)
done train corase labels
processing 2: memeber_size: 9
processing 3: memeber_size: 6
processing 4: memeber_size: 6
processing 1: memeber_size: 6
processing 0: memeber_size: 3
took 1098.3097336292267

(3642, 2048) (3643, 2048)
done train corase labels
processing 2: memeber_size: 9
processing 3: memeber_size: 6
processing 4: memeber_size: 6
processing 1: memeber_size: 6
processing 0: memeber_size: 3
took 1095.8715987205505

(3642, 2048) (3643, 2048)
done train corase labels
processing 2: memeber_size: 9
processing 3: memeber_size: 6
processing 4: memeber_size: 6
processing 1: memeber_size: 6
processing 0: memeber_size: 3
took 1108.5863289833069

(3642, 2048) (3643, 2048)
done train corase labels
processing 2: memeber_

In [21]:
def calculate_prob(stuff, data, n_classe, n_actual_classes):
    """Turn a trained coarse/fine hierarchy into per-condition probabilities.

    Parameters
    ----------
    stuff : ``((coarse_clf, coarseLE), fine_clfs, coarseToFine)`` where
        ``fine_clfs`` maps an original coarse cluster id to ``(clf, le)`` or
        ``None``, and ``coarseToFine`` maps the same id to its fine labels.
    data : feature matrix with one row per sample.
    n_classe : number of fine classes (width of the intermediate matrix).
    n_actual_classes : number of conditions (width of the result).

    Returns
    -------
    ndarray of shape ``(n_samples, n_actual_classes)``: for every sample the
    fine-class probabilities summed into their parent condition via the
    module-level ``classLabelToFindName`` / ``categoryNamesIdxLookup`` tables.

    Notes
    -----
    Assumes each ``predict_proba`` orders its columns like the matching
    encoder's ``classes_`` -- TODO confirm against the forest implementation.
    """
    n_samples = data.shape[0]
    (coarse, coarseLE), fine, coarseToFine = stuff

    coarse_prob = coarse.predict_proba(data)
    # original coarse cluster id -> column of coarse_prob
    coarse_column = {label: col for col, label in enumerate(coarseLE.classes_)}

    clf_probas = np.zeros((n_samples, n_classe))
    for coarseIdx, fineitems in fine.items():
        col = coarse_column.get(coarseIdx)
        if col is None:
            # Cluster never appeared in the coarse training labels, so the
            # coarse model has no probability column for it: leave zeros.
            continue
        prob = coarse_prob[:, col].reshape(-1, 1)

        if fineitems is not None:
            clf, le = fineitems
            # P(fine) = P(coarse) * P(fine | coarse)
            fine_prob = prob * clf.predict_proba(data)
            for i, fine_label in enumerate(le.classes_):
                clf_probas[:, fine_label] = fine_prob[:, i]
        else:
            # No fine model was trained (too few samples). Spread the coarse
            # probability evenly over the cluster's fine classes.
            # NOTE(review): uniform split is a modelling choice; the alternative
            # is to leave these columns at zero.
            members = list(coarseToFine[coarseIdx])
            if members:
                clf_probas[:, members] = prob / len(members)

    # Collapse fine classes into their parent condition, one column at a time.
    res = np.zeros((n_samples, n_actual_classes))
    for j in range(n_classe):
        res[:, categoryNamesIdxLookup[classLabelToFindName[j]]] += clf_probas[:, j]

    return res
    


In [36]:
# Score each run's held-out half with the hierarchy trained for that run.
run_res = list()

for i in range(n_iter):
    seed = seeds[i]
    # Same seed and test_size as at training time, which reproduces that run's split.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, random_state=seed, test_size=0.50)
    stuff = clfs[i]
    probs = calculate_prob(stuff, X_test, n_classes, n_actual_classes)
    run_res.append(probs)
    print('done %i' % i)
    

done 0
done 1
done 2
done 3
done 4
done 5
done 6
done 7
done 8
done 9


In [37]:
# Accumulators for the hierarchical AUC evaluation below.
auc_agg = list()
auc_pertype = defaultdict(list)
# condition index -> condition name (reverse of categoryNamesIdxLookup)
inversLooup = dict((v, k) for k, v in categoryNamesIdxLookup.items())

def relabel(fine_labels, return_idx =False):
    """Map fine cluster labels to their condition.

    Returns a list of condition names, or of condition indices when
    ``return_idx`` is true.
    """
    names = [classLabelToFindName[label] for label in fine_labels]
    if return_idx:
        return [categoryNamesIdxLookup[name] for name in names]
    return names
    

# One-vs-rest ROC AUC per condition for every run; prints the running mean and max.
for i, res in enumerate(run_res):
    print(i)
    seed = seeds[i]
    # Rebuild the run's split to recover the held-out labels.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, random_state=seed, test_size=0.50)
    y_test_labels = relabel(Y_test)
    for l, idx in categoryNamesIdxLookup.items():
        truth = [int(curr == l) for curr in y_test_labels]

        predict = np.clip(res[:, idx], a_min=0.0, a_max=1.0)
        fpr, tpr, thresholds = roc_curve(truth, predict)
        auc_score = auc(fpr, tpr)
        auc_pertype[l].append(auc_score)
        history = auc_pertype[l]
        print(l, np.average(history), np.max(history))
    print()
    

0
No Finding 0.7703964728280838 0.7703964728280838
Atelectasis 0.6008547508499981 0.6008547508499981
Cardiomegaly 0.6605065023956194 0.6605065023956194
Consolidation 0.5637286574354794 0.5637286574354794
Edema 0.6092483851196578 0.6092483851196578
Pleural Effusion 0.6797769435660992 0.6797769435660992

1
No Finding 0.7666807632123056 0.7703964728280838
Atelectasis 0.5898313960051241 0.6008547508499981
Cardiomegaly 0.6579885886572852 0.6605065023956194
Consolidation 0.5525055211243629 0.5637286574354794
Edema 0.6140456555055003 0.6188429258913427
Pleural Effusion 0.6714451322658902 0.6797769435660992

2
No Finding 0.7666687834249312 0.7703964728280838
Atelectasis 0.5870653308394157 0.6008547508499981
Cardiomegaly 0.6581658805469379 0.6605065023956194
Consolidation 0.5531645068421777 0.5637286574354794
Edema 0.6108244775672321 0.6188429258913427
Pleural Effusion 0.6742866182101155 0.6799695900985663

3
No Finding 0.7679498842057889 0.7717931865483624
Atelectasis 0.5952327843536567 0.6197

In [33]:
# Per condition: hierarchical mean / max AUC, then flat mean / max AUC.
wanted = ['No Finding','Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion']
for w in wanted:
    hier_scores = auc_pertype[w]
    flat_scores = scores_flat[w]
    print (w,
           np.average(hier_scores), np.max(hier_scores),
           np.average(flat_scores), np.max(flat_scores))

No Finding 0.7474910530471719 0.756811546407752 0.746799873275362 0.7627334726457643
Atelectasis 0.564874461711647 0.5828899109274028 0.5595979652988315 0.5980163546065624
Cardiomegaly 0.5989646603928906 0.620189209088252 0.5873151898771503 0.6132357194901819
Consolidation 0.5359068628313313 0.5653862212516347 0.5250712990387163 0.5508247557003257
Edema 0.5743510602647112 0.5983242475776659 0.5904297757359579 0.6139027810854436
Pleural Effusion 0.6478140363215972 0.6691570078666852 0.6538675374511016 0.6683146936794374


In [None]:
No Finding 0.7012805307416208
Atelectasis 0.5897324335406501
Cardiomegaly 0.6171985902736001
Consolidation 0.621100518265423
Edema 0.6805668404440721
Pleural Effusion 0.7158244847060644

In [None]:
# Scratch reload of the two cluster pickles for inspection.
# NOTE: pickle can execute arbitrary code on load; only use trusted files.
# Context managers close the handles that pickle.load(open(...)) leaked.
with open('/home/weiwya/class_conditional_clusters.pkl', 'rb') as fh:
    t0 = pickle.load(fh)
with open('/home/weiwya/mini_classes.pkl', 'rb') as fh:
    t1 = pickle.load(fh)

# 

In [None]:
No Finding 0.7627334726457643
Atelectasis 0.5543869329425779
Cardiomegaly 0.575708120749727
Consolidation 0.5112101701510382
Edema 0.5826424661181725
Pleural Effusion 0.6683146936794374

In [None]:
# Display the fine-label -> coarse-cluster assignment currently in use (hierarchy[4]).
used_hierarchy

In [None]:
conditions = ['No Finding', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion']

# i // 6 replaces int(np.math.floor(i / 6)): the np.math alias was removed in NumPy 2.0.
# NOTE(review): with 30 labels and a divisor of 6, 'Pleural Effusion' is never assigned;
# the lookup built at the top of the notebook divides by 5 -- confirm which is intended.
dic = {i: conditions[i // 6] for i in range(len(conditions) * 5)}

In [None]:
# Number of entries in the class_conditional_clusters pickle loaded as t0.
len(t0)

In [None]:
# NOTE(review): `t` is not defined in any visible cell, so this raises NameError
# on a fresh kernel; possibly t0 or t1 was meant -- confirm.
len(t[1])