In [1]:
#this notebook establishes baseline auc for CheXpert -to cheXphoto transfer rate
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
from PIL import Image
from matplotlib import pyplot as plt
import torchvision
import torch
import os
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, roc_curve, plot_roc_curve, auc

In [2]:
acorn = 1234
torch.manual_seed(acorn)
np.random.seed(acorn)

torch.cuda.is_available()

if torch.cuda.is_available():  
    dev = "cuda:0" 
else:  
    dev = "cpu"  
    
device = torch.device(dev)  

In [3]:
bad_files = 'CheXphoto-v1.0/train/natural/iphone/patient01595/study8/view1_frontal.jpg'

#process data, filter out only frontal, ap, fillter out uncertainty in classes we care and fill in rest data
def process_data(df):
    
    print('starting size %s' %len(df))
    data = df
    #only use frontal/AP data
    data = data.loc[data['Frontal/Lateral'] == 'Frontal']
    data = data.loc[data['AP/PA'] == 'AP']

    
    category_names = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion']
    
    #filter out all uncertainty labels in classes we care about
    data = data[category_names]
    #tread all empty values in these selected cols as 0
    data = data.fillna(0)
    #filter out -1 (uncertain labels)
    data = data.loc[(data.iloc[:, :] !=-1).all(axis=1)]
    #row-idx of the data we care to keep
    fly_list = data.index
    #reselect from orginal of kept rows
    data = df.iloc[fly_list]

    #select the cols we care about
    wanted_cols = ["Path", 'No Finding'] + category_names
    data = data[wanted_cols]
    
    #filter out rows with no label values
    data['sum']  = data.iloc[:, 1:].sum(axis=1)
    fly_list = data.loc[data['sum']>0].index
    data = df[wanted_cols].iloc[fly_list]
    
    print(len(data))
    #filer out bad_files
    fly_list = data.loc[data['Path'] != bad_files].index
    data = df[wanted_cols].iloc[fly_list]
    
    
    # fill all NA and uncertainty as 0     
    data = data.fillna(0)
    data = data.replace(-1,0)

    print("final size %s" %len(data))
    return data


In [4]:
data_root = '/home/data/CheXphoto/'
train_df = pd.read_csv('%s/CheXphoto-v1.0/train.csv'%data_root)
test_df = pd.read_csv('%s/CheXphoto-v1.0/valid.csv' %data_root)

train_df = process_data(train_df)
test_df = process_data(test_df)
test_df.head()

starting size 32521
13734
final size 13733
starting size 702
396
final size 396


Unnamed: 0,Path,No Finding,Atelectasis,Cardiomegaly,Consolidation,Edema,Pleural Effusion
0,CheXphoto-v1.0/valid/synthetic/digital/patient...,0.0,0.0,1.0,0.0,0.0,0.0
3,CheXphoto-v1.0/valid/synthetic/digital/patient...,0.0,0.0,0.0,0.0,1.0,0.0
4,CheXphoto-v1.0/valid/synthetic/digital/patient...,1.0,0.0,0.0,0.0,0.0,0.0
5,CheXphoto-v1.0/valid/synthetic/digital/patient...,0.0,1.0,0.0,0.0,0.0,1.0
6,CheXphoto-v1.0/valid/synthetic/digital/patient...,0.0,1.0,1.0,0.0,0.0,0.0


In [5]:
class ChestXRayDataset(torch.utils.data.Dataset):
    def __init__(self, df, transform, data_root):
        #TODO::put something here that perserves aspect ratio
        self.class_names = ['No Finding', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion']
        self.image_dir = data_root
        self.transform = transform
        self.total = len(df)
        self.image_names = df['Path'].to_list()
        self.labels = df[self.class_names].to_numpy()
                    
    def __len__(self):
        return self.total
    
    def __getitem__(self, idx):
        try:
            image_name = self.image_names[idx]
            image_path = os.path.join(self.image_dir, image_name)
            image = self.transform(Image.open(image_path).convert('RGB'))
            label = self.labels[idx]
            return image, label
        except:
            print (self.image_names[idx])


In [6]:
image_size = (320, 320)
resnet_mean = [0.485, 0.456, 0.406]
resnet_std = [0.229, 0.224, 0.225]

#Creating a Transformation Object
train_transform = torchvision.transforms.Compose([
    #Converting images to the size that the model expects
    torchvision.transforms.Resize(size=image_size),
    torchvision.transforms.RandomHorizontalFlip(), #A RandomHorizontalFlip to augment our data
    torchvision.transforms.ToTensor(), #Converting to tensor
    #Normalizing the data to the data that the ResNet18 was trained on
    torchvision.transforms.Normalize(mean = resnet_mean ,
                                    std = resnet_std) 
    
])


#Creating a Transformation Object
test_transform = torchvision.transforms.Compose([
    #Converting images to the size that the model expects
    torchvision.transforms.Resize(size=image_size),
    # We don't do data augmentation in the test/val set    
    torchvision.transforms.ToTensor(), #Converting to tensor
    torchvision.transforms.Normalize(mean = resnet_mean,
                                    std = resnet_std) 
    
])

In [7]:
train_dataset = ChestXRayDataset(train_df, train_transform, data_root)
test_dataset = ChestXRayDataset(test_df, test_transform, data_root)

batch_size = 16

dl_train = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

dl_test = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

print('smaples: %s, batches: %s,  classes: %s' %( len(train_dataset), len(dl_train), len(train_dataset.class_names) ))
print('smaples: %s, batches: %s,  classes: %s' %( len(test_dataset), len(dl_test), len(test_dataset.class_names) ))



smaples: 13733, batches: 859,  classes: 6
smaples: 396, batches: 25,  classes: 6


In [8]:
class Resnext50(torch.nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        resnet = torchvision.models.resnet18(pretrained=True)
        resnet.fc = torch.nn.Sequential(
            torch.nn.Dropout(p=0.2),
            torch.nn.Linear(in_features=resnet.fc.in_features, out_features=n_classes)
        )
        self.base_model = resnet
        self.sigm = torch.nn.Sigmoid()

    def forward(self, x):
        return self.sigm(self.base_model(x))

# Initialize the model
chexpert_model = Resnext50(6)
optimizer = torch.optim.Adam(chexpert_model.parameters(), lr=5e-5)

checkpt = torch.load('CheXpert_0_resnet50')
chexpert_model.load_state_dict(checkpt['model_state_dict'])
optimizer.load_state_dict(checkpt['optimizer_state_dict'])

chexpert_model.to(device)
loss_fn = torch.nn.BCELoss().to(device)


In [9]:
def eval_model (model, dl, verbose=True):
    model.eval()
    predicts = []
    targets = []
    total_loss = 0
    class_lookup = dl.dataset.class_names
    n_class = len(class_lookup)

    for val_step, (images, labels) in enumerate(dl):

        imagesGPU, labelsGPU = images.to(device), labels.to(device)        
        outputs = model(imagesGPU)
        loss = loss_fn(outputs, labelsGPU.type(torch.float))       
        total_loss += loss.item()               
        outputs = torch.Tensor.cpu(outputs)
        predicts.append(outputs.detach().numpy())
        targets.append(labels)

    predicts = np.vstack(predicts)
    targets = np.vstack(targets)
    loss = total_loss/len(dl)
    
    res = {}
    total_auc = 0
    total_counts = 0
    for idx in range(n_class):
        truth  = targets[:, idx]
        pp = predicts[:,idx]
        fpr, tpr, thresholds = roc_curve(truth, pp)
        auc_score = auc(fpr, tpr)
        res[class_lookup[idx]] = auc_score
        counts = np.sum(truth)
        #A hack to skip no-funding in auc caclualtion
        if idx != 0 :
            total_auc += auc_score * counts
            total_counts += counts
    avg_auc = total_auc / total_counts
    
    if verbose:
        print()
        for k, v in res.items():
            print(k, v)
        print()
        print('loss:%s, avg_auc:%s' %(loss, avg_auc))
        
    return avg_auc, loss

In [10]:
eval_model(chexpert_model, dl_test)


No Finding 0.942283950617284
Atelectasis 0.7592549834526283
Cardiomegaly 0.7602245184334736
Consolidation 0.8103819444444444
Edema 0.8681020876142826
Pleural Effusion 0.8776007593832578

loss:0.6042515885829925, avg_auc:0.8088138950374034


(0.8088138950374034, 0.6042515885829925)

In [11]:
eval_model(chexpert_model, dl_train)


No Finding 0.8696857470404071
Atelectasis 0.6367356250139212
Cardiomegaly 0.8134791446105784
Consolidation 0.6859232952856652
Edema 0.7662576161983433
Pleural Effusion 0.8477130800962663

loss:0.40971196079420685, avg_auc:0.7779290823063768


(0.7779290823063768, 0.40971196079420685)

In [12]:
full_df = pd.read_csv('%s/CheXphoto-v1.0/train.csv'%data_root)
full_df.head()

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,CheXphoto-v1.0/train/synthetic/digital/patient...,Female,20,Frontal,PA,1.0,0.0,,,,,0.0,,,,0.0,,,
1,CheXphoto-v1.0/train/synthetic/digital/patient...,Female,20,Lateral,,1.0,0.0,,,,,0.0,,,,0.0,,,
2,CheXphoto-v1.0/train/synthetic/digital/patient...,Female,46,Frontal,PA,,,,,1.0,,,,,0.0,,,,
3,CheXphoto-v1.0/train/synthetic/digital/patient...,Female,46,Lateral,,,,,,1.0,,,,,0.0,,,,
4,CheXphoto-v1.0/train/synthetic/digital/patient...,Female,50,Frontal,AP,,,1.0,1.0,1.0,,,,1.0,1.0,1.0,,,0.0
