In [17]:
# Re-import edited modules automatically before each cell runs, so code changes
# on disk are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures directly in the notebook output.
%matplotlib inline

In [18]:
from fastai.vision import *
from torchvision.models import *
import yaml
import pandas as pd
import datetime

from sklearn.metrics import roc_auc_score

In [19]:
# Directory that holds the 'models' sub-folder; '/models/' is appended when the
# weights are loaded further down, so it must not be part of this value.
Your_path_to_models_folder = '/media/trevor/main-storag/master_classes/big_data_health/BDH_project' # exclude '/models'

# Dataset directory itself: train.csv and valid.csv are read directly from it.
Your_path_to_dataset = '/media/trevor/main-storag/CheXpert-v1.0-small' # '......./CheXpert-v1.0-small' includes '/CheXpert-v1.0-small'
# Parent directory of the dataset folder; used below as the root against which
# the 'Path' column of the CSV files is resolved.
Your_main_path = '/media/trevor/main-storag' # dir contains CheXpert-v1.0-small

In [20]:
# Selects both the labelling strategy and the name of the weights file that is
# loaded from the models folder. 'u_zeros' / 'u_ones' use the binary pipeline,
# 'u_multiclass' the three-way (_u/_p/_n) pipeline, and any other value falls
# through to the u_ignore labelling.
# NOTE(review): 'u_sefltrained' looks like a typo of 'u_selftrained' - confirm
# against the actual name of the saved weights before changing it.
MODEL = 'u_zeros' # change to one of u_ignore, u_zeros, u_ones, u_multiclass, u_sefltrained

In [21]:
# Root directory for image lookup: get_src() passes data_path to
# ImageList.from_df together with the 'Path' column of the CSV files.
chestxrays_root = Path(Your_main_path)
data_path = chestxrays_root

In [22]:
# Load the official train / validation splits.
full_train_df = pd.read_csv(Your_path_to_dataset + '/train.csv')
full_valid_df = pd.read_csv(Your_path_to_dataset + '/valid.csv')

# 'train_valid' is the flag consumed by split_from_df: True marks validation rows.
full_train_df['train_valid'] = False
full_valid_df['train_valid'] = True
full_df = pd.concat([full_train_df, full_valid_df])
full_df = full_df.reset_index(drop=True)

# Extract the patient and study components from the '/'-separated image path.
# `n` and `expand` are passed by keyword: passing them positionally is
# deprecated in pandas 1.x and raises a TypeError in pandas 2.x.
full_train_df['patient'] = full_train_df.Path.str.split('/', n=3, expand=True)[2]
full_train_df['study'] = full_train_df.Path.str.split('/', n=4, expand=True)[3]

full_valid_df['patient'] = full_valid_df.Path.str.split('/', n=3, expand=True)[2]
full_valid_df['study'] = full_valid_df.Path.str.split('/', n=4, expand=True)[3]

In [23]:
# All fourteen observation columns listed for the dataset.
chexnet_targets = [
    'No Finding',
    'Enlarged Cardiomediastinum',
    'Cardiomegaly',
    'Lung Opacity',
    'Lung Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
    'Support Devices',
]

# The five observations that the label builders and evaluation below work on.
chexpert_targets = [
    'Atelectasis',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Pleural Effusion',
]

In [24]:
def feature_string_u_multiclass(row):
    """Encode one row's targets as a ';'-joined string of three-way labels.

    For every name in `chexpert_targets` the row value is mapped to
    `<name>_p` (value 1), `<name>_u` (value -1) or `<name>_n` (value 0).
    Any other value (e.g. NaN) contributes no label.
    """
    suffix_by_value = ((1, '_p'), (-1, '_u'), (0, '_n'))
    labels = []
    for target in chexpert_targets:
        value = row[target]
        for expected, suffix in suffix_by_value:
            if value == expected:
                labels.append(target + suffix)
                break
    return ';'.join(labels)

def feature_string_u_binary(row):
    """Encode one row's positive findings as a ';'-joined string.

    Names in `u_one_features` count as present when the row value is -1 or 1;
    names in `u_zero_features` count as present only when the value is 1.
    Labels from `u_one_features` come first, each group in list order.
    """
    uncertain_as_positive = [name for name in u_one_features if row[name] in [-1, 1]]
    uncertain_as_negative = [name for name in u_zero_features if row[name] == 1]
    return ';'.join(uncertain_as_positive + uncertain_as_negative)

# Build the label string of a sample, leaving out anything that is not 0 or 1
def feature_string_u_ignore(row):
    """Encode one row's targets as a ';'-joined string of `_p` / `_n` labels.

    A value of 1 yields `<name>_p`, a value of 0 yields `<name>_n`; every
    other value (including -1 and NaN) yields no label for that target.
    """
    def label_for(target):
        value = row[target]
        if value == 1:
            return target + '_p'
        if value == 0:
            return target + '_n'
        return None

    candidates = (label_for(target) for target in chexpert_targets)
    return ';'.join(label for label in candidates if label is not None)

In [25]:
def get_src(df = full_df):
    """Build the labelled, split image list for `df`.

    Images are located through the 'Path' column relative to `data_path`,
    the train/validation split is taken from the 'train_valid' column, and the
    multi-label targets are parsed from the ';'-delimited 'feature_string' column.
    """
    images = ImageList.from_df(df, data_path, 'Path')
    split_images = images.split_from_df('train_valid')
    return split_images.label_from_df('feature_string', label_delim=';')
def get_data(size, src, bs=32):
    """Turn a labelled image list into a normalised DataBunch.

    Applies `get_transforms(do_flip=False)` at the requested `size` with
    zero padding, batches with batch size `bs`, and normalises with
    `imagenet_stats`.
    """
    tfms = get_transforms(do_flip=False)
    transformed = src.transform(tfms, size=size, padding_mode='zeros')
    bunch = transformed.databunch(bs=bs)
    return bunch.normalize(imagenet_stats)
def get_preds_per_study():
    """Return validation predictions aggregated to one row per study.

    Uses the module-level `learn` to predict on the validation set, attaches
    one column per model class to a copy of `full_valid_df`, and takes the
    maximum prediction over all images that share a (patient, study) pair.

    Returns:
        DataFrame with columns 'patient', 'study' and one column per class in
        `learn.data.classes`.
    """
    valid_preds = learn.get_preds(ds_type=DatasetType.Valid)[0]
    classes = learn.data.classes

    # Work on a copy: writing the predictions into full_valid_df itself would
    # overwrite the ground-truth label columns that the evaluation functions
    # read from the same frame.
    preds_df = full_valid_df.copy()
    for i, c in enumerate(classes):
        preds_df[c] = valid_preds[:, i]

    return preds_df.groupby(['patient', 'study'])[classes].max().reset_index()

In [26]:
def get_df_onehot(df):
    """Add one-hot `_u` / `_p` / `_n` indicator columns for every target.

    For each name in `chexpert_targets` three integer columns are written:
    `<name>_u` is 1 where the label is -1, `<name>_p` is 1 where it is 1 and
    `<name>_n` is 1 where it is 0. Rows whose label is anything else
    (e.g. NaN) get 0 in all three columns.

    The frame is modified in place and also returned, so callers may use
    either the return value or the frame they passed in.

    Args:
        df: DataFrame containing one column per name in `chexpert_targets`.

    Returns:
        The same DataFrame object with the indicator columns added.
    """
    # Vectorised comparisons replace the former row-by-row .loc assignment;
    # they also remove the dependency on the global `u_targets`, which is not
    # defined for every MODEL setting.
    for target in chexpert_targets:
        values = df[target]
        df[target + '_u'] = (values == -1).astype(int)
        df[target + '_p'] = (values == 1).astype(int)
        df[target + '_n'] = (values == 0).astype(int)
    return df

In [27]:
# binary cases
def validation_eval_binary(learn):
    """Print and return the study-level validation AUC for a binary-label model.

    Ground truth and predictions are both reduced to one row per
    (patient, study) by taking the maximum over the images of that study. An
    AUC is computed for each name in `chexpert_targets` and printed next to
    the reference value hard-coded below.

    Args:
        learn: learner whose `data.classes` name the model outputs and whose
            `get_preds` yields validation predictions aligned with the rows
            of `full_valid_df`.

    Returns:
        The mean AUC over `chexpert_targets`.

    Raises:
        KeyError: if a name in `chexpert_targets` is not among the model classes.
    """
    # Take the class names from the learner that was passed in, not from the
    # notebook-level `data` object, so scores are always labelled with the
    # classes of the model actually being evaluated.
    classes = learn.data.classes
    study_keys = ['patient', 'study']

    acts = full_valid_df.groupby(study_keys)[classes].max().values

    preds = learn.get_preds(ds_type=DatasetType.Valid)[0]
    preds_df = full_valid_df.copy()  # keep the ground-truth frame untouched
    for i, c in enumerate(classes):
        preds_df[c] = preds[:, i]
    preds = preds_df.groupby(study_keys)[classes].max().values

    # Match score to class by name instead of assuming the first
    # len(chexpert_targets) model outputs are the targets.
    auc_scores = {c: roc_auc_score(acts[:, i], preds[:, i])
                  for i, c in enumerate(classes) if c in chexpert_targets}

    #average results reported in the associated paper
    chexpert_auc_scores = {'Atelectasis':      0.858,
                           'Cardiomegaly':     0.854,
                           'Consolidation':    0.939,
                           'Edema':            0.941,
                           'Pleural Effusion': 0.936}

    max_feat_len = max(map(len, chexpert_targets))

    avg_chexpert_auc = sum(chexpert_auc_scores.values()) / len(chexpert_auc_scores)
    avg_auc = sum(auc_scores[k] for k in chexpert_targets) / len(chexpert_targets)

    for k in chexpert_targets:
        print(f'{k: <{max_feat_len}}\t auc: {auc_scores[k]:.3}\t chexpert auc: {chexpert_auc_scores[k]:.3}\t difference:    {(chexpert_auc_scores[k]-auc_scores[k]):.3}')

    print(f'\nAverage auc: {avg_auc:.3} \t CheXpert average auc {avg_chexpert_auc:.3}\t Difference {(avg_chexpert_auc-avg_auc):.3}')

    return avg_auc

In [28]:
# multiclass case
def validation_eval_multi(learn):
    """Print and return the study-level validation AUC for a `_u/_p/_n` model.

    Unlike the binary case, the model classes are suffixed labels, so the
    ground truth is first expanded into matching one-hot columns with
    `get_df_onehot`. Ground truth and predictions are then reduced to one row
    per (patient, study) by taking the maximum over the images of a study, and
    one AUC is computed per model class.

    Args:
        learn: learner whose `data.classes` name the model outputs and whose
            `get_preds` yields validation predictions aligned with the rows
            of `full_valid_df`.

    Returns:
        The mean AUC over all model classes.
    """
    classes = learn.data.classes
    study_keys = ['patient', 'study']

    # get_df_onehot adds the indicator columns to full_valid_df in place; only
    # that side effect is needed here, the return value is the same frame.
    get_df_onehot(full_valid_df)
    acts = full_valid_df.groupby(study_keys)[classes].max().values

    preds = learn.get_preds(ds_type=DatasetType.Valid)[0]
    preds_df = full_valid_df.copy()  # keep the ground-truth frame untouched
    for i, c in enumerate(classes):
        preds_df[c] = preds[:, i]
    preds = preds_df.groupby(study_keys)[classes].max().values

    auc_scores = {}
    for i, c in enumerate(classes):
        try:
            score = roc_auc_score(acts[:, i], preds[:, i])
        except ValueError:
            # AUC is undefined when only one class occurs in the ground truth.
            # Pad with one synthetic negative (prediction 0) and one synthetic
            # positive (prediction 1) so a score can still be computed.
            # NOTE(review): the two synthetic samples are perfectly ranked and
            # therefore bias this score upwards - confirm this is intended.
            padded_acts = np.append(acts[:, i], [0, 1])
            padded_preds = np.append(preds[:, i], [0, 1])
            score = roc_auc_score(padded_acts, padded_preds)
        auc_scores[c] = score
    print(auc_scores)

    avg_auc = sum(auc_scores.values()) / len(auc_scores)

    # Report every class the model actually has, labelled by name, instead of
    # indexing with the global `u_multi_targets` list.
    name_width = max(map(len, classes))
    for c in classes:
        print(f'{c: <{name_width}}\t auc: {auc_scores[c]:.3}\t')

    print(f'\nAverage auc: {avg_auc:.3} \t')

    return avg_auc

In [29]:
# Build the per-row label string ('feature_string') for the selected uncertainty policy.
if MODEL == 'u_multiclass':
    # Three labels per target: uncertain (_u), positive (_p) and negative (_n).
    u_targets = [target + '_u' for target in chexpert_targets]
    p_targets = [target + '_p' for target in chexpert_targets]
    n_targets = [target + '_n' for target in chexpert_targets]
    u_multi_targets = [*u_targets, *p_targets, *n_targets]
    full_df['feature_string'] = full_df.apply(feature_string_u_multiclass, axis=1).fillna('')
elif MODEL == 'u_zeros':
    # Uncertain labels count as negative for every target.
    u_zero_features = ['Atelectasis', 'Edema', 'Cardiomegaly', 'Consolidation', 'Pleural Effusion']
    u_one_features = []
    full_df['feature_string'] = full_df.apply(feature_string_u_binary, axis=1).fillna('')
elif MODEL == 'u_ones':
    # Uncertain labels count as positive for every target.
    u_zero_features = []
    u_one_features = ['Atelectasis', 'Edema', 'Cardiomegaly', 'Consolidation', 'Pleural Effusion']
    full_df['feature_string'] = full_df.apply(feature_string_u_binary, axis=1).fillna('')
else:
    # Any other MODEL value: uncertain labels are dropped, leaving only _p / _n.
    u_targets = []
    p_targets = [target + '_p' for target in chexpert_targets]
    n_targets = [target + '_n' for target in chexpert_targets]
    u_multi_targets = [*u_targets, *p_targets, *n_targets]
    full_df['feature_string'] = full_df.apply(feature_string_u_ignore, axis=1).fillna('')

In [30]:
class CheXpertEvalCallback(LearnerCallback):
    """Runs the study-level validation AUC evaluation at the end of every epoch.

    The latest average AUC is kept in `avg_auc`; setting `skip` to True
    disables the evaluation.
    """
    def __init__(self, learn):
        super().__init__(learn)
        self.skip = False
        self.avg_auc = 0
    def on_epoch_end(self,**kwargs):
        if self.skip: return
        # No `validation_eval` function exists in this notebook, so pick the
        # evaluator the same way the final evaluation cell does: binary
        # scoring for the u_zeros / u_ones models, multi-label scoring otherwise.
        if MODEL in ('u_zeros', 'u_ones'):
            self.avg_auc = validation_eval_binary(self.learn)
        else:
            self.avg_auc = validation_eval_multi(self.learn)
class SaveCallback(LearnerCallback):
    """Counts completed epochs in `epoch`.

    NOTE(review): despite the name, nothing is saved by the code visible here;
    `skip` is initialised but not read in this class.
    """
    _order = 99

    def __init__(self, learn):
        super().__init__(learn)
        self.skip = False
        self.epoch = 0

    def on_epoch_end(self, **kwargs):
        # One more epoch has finished.
        self.epoch = self.epoch + 1
# Callback classes handed to the learner via `callback_fns`; lr_find_no_cbs
# temporarily removes exactly these while it runs the learning-rate finder.
cbfs = [CheXpertEvalCallback, SaveCallback]
def lr_find_no_cbs(learn):
    """Run the learning-rate finder with the notebook callbacks disabled.

    The callbacks listed in `cbfs` are removed from `learn.callback_fns` for
    the duration of the search and appended again afterwards, so the search
    does not trigger the per-epoch evaluation.

    Args:
        learn: learner to run `lr_find` on; its `callback_fns` is modified.
    """
    learn.callback_fns = [cbf for cbf in learn.callback_fns if cbf not in cbfs]
    try:
        lr_find(learn)
        learn.recorder.plot(suggestion=True)
    finally:
        # Re-attach the callbacks even if the search or the plot fails, so the
        # learner is never left without them.
        learn.callback_fns += cbfs

In [31]:
# Size passed to the image transforms in get_data.
img_size = 224
# DataBunch over the combined train + validation frame, using get_data's
# default batch size.
data = get_data(img_size, get_src(full_df))

In [32]:
# Alternative: restore a fully exported learner instead of rebuilding it.
# learn = load_learner(Path(Your_path_to_models_folder)/'models',MODEL+'.pkl')

# Rebuild the DenseNet-121 learner with the notebook callbacks, then load the
# weights saved under '<models folder>/models/<MODEL>'.
learn = cnn_learner(data, densenet121, callback_fns=cbfs)
learn.load(f'{Your_path_to_models_folder}/models/{MODEL}')

FileNotFoundError: [Errno 2] No such file or directory: '/media/trevor/main-storag/master_classes/big_data_health/BDH_project/u_zeros.pth'

In [None]:
# The binary-label models are scored against the plain target columns; every
# other MODEL value uses the one-hot (_u/_p/_n) evaluation.
if MODEL in ('u_zeros', 'u_ones'):
    print(validation_eval_binary(learn))
else:
    print(validation_eval_multi(learn))