In [1]:
#this notebook establishes a baseline AUC for CheXpert, using ResNet18 as the backbone and 100% of the training data
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
from PIL import Image
from matplotlib import pyplot as plt
import torchvision
import torch
import os
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, roc_curve, plot_roc_curve, auc
from sklearn.model_selection import train_test_split

In [2]:
# Seed the NumPy and torch random number generators with one shared value.
acorn = 1234
np.random.seed(acorn)
torch.manual_seed(acorn)

# Use the first CUDA device when torch reports one, otherwise fall back to the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

In [3]:
#process data: keep only frontal AP images, drop rows with uncertain labels in the classes we care about, and fill the remaining missing values with 0
def process_data(df):
    """Select the usable frontal AP rows from a CheXpert label table.

    A row is kept when all of the following hold:

    * ``Frontal/Lateral`` is ``'Frontal'`` and ``AP/PA`` is ``'AP'``;
    * none of the five target pathology columns is ``-1`` (uncertain);
      missing values count as ``0`` for this check;
    * the sum of ``No Finding`` and the five pathology columns is positive.

    Rows are selected with boolean masks and ``.loc`` so the result is correct
    for any index, not only a default ``RangeIndex``.

    Args:
        df: DataFrame containing ``Path``, ``Frontal/Lateral``, ``AP/PA``,
            ``No Finding`` and the five pathology columns. It is not modified.

    Returns:
        A new DataFrame with the columns ``Path``, ``No Finding`` and the five
        pathology columns for the kept rows (original index labels preserved),
        with missing values and ``-1`` replaced by ``0``.
    """
    category_names = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion']
    wanted_cols = ["Path", 'No Finding'] + category_names
    label_cols = wanted_cols[1:]

    print('starting size %s' %len(df))

    # only use frontal/AP data
    is_frontal_ap = (df['Frontal/Lateral'] == 'Frontal') & (df['AP/PA'] == 'AP')

    # an uncertain (-1) label in any target class disqualifies the row;
    # empty cells are treated as 0 for this check
    is_certain = (df[category_names].fillna(0) != -1).all(axis=1)

    # drop rows that carry no label at all (NaN is skipped by the sum)
    has_label = df[label_cols].sum(axis=1) > 0

    keep = is_frontal_ap & is_certain & has_label

    # fill all NA and remaining uncertainty (-1) as 0
    data = df.loc[keep, wanted_cols].fillna(0).replace(-1, 0)

    print("final size %s" %len(data))
    return data



In [4]:
# Directory that contains the CheXpert-v1.0-small folder.
data_root = '/home/data/'

# train.csv feeds the training/validation pool; valid.csv is used as the test set.
# Both tables go through the same filtering step.
train = process_data(pd.read_csv('%s/CheXpert-v1.0-small/train.csv'%data_root))
test = process_data(pd.read_csv('%s/CheXpert-v1.0-small/valid.csv' %data_root))

# Hold out 5% of the filtered training rows as a validation set (fixed split seed).
train, valid = train_test_split(train, shuffle=True, random_state=42, test_size=0.05)
print('Train_size %s, valid_size %s ' %(len(train), len(valid)))
valid.head()

starting size 223414
final size 92771
starting size 234
final size 132
Train_size 88132, valid_size 4639 


Unnamed: 0,Path,No Finding,Atelectasis,Cardiomegaly,Consolidation,Edema,Pleural Effusion
61712,CheXpert-v1.0-small/train/patient14892/study18...,0.0,1.0,0.0,1.0,0.0,0.0
91899,CheXpert-v1.0-small/train/patient22063/study1/...,1.0,0.0,0.0,0.0,0.0,0.0
140062,CheXpert-v1.0-small/train/patient33631/study1/...,0.0,0.0,1.0,0.0,0.0,0.0
216267,CheXpert-v1.0-small/train/patient58974/study1/...,0.0,0.0,0.0,0.0,0.0,1.0
81089,CheXpert-v1.0-small/train/patient19481/study1/...,0.0,1.0,1.0,0.0,1.0,0.0


In [5]:
class ChestXRayDataset(torch.utils.data.Dataset):
    """Dataset that pairs image files listed in a DataFrame with their label rows.

    Indexing returns ``(image, label)``: the file at ``data_root`` joined with the
    row's ``Path`` is opened, converted to RGB and passed through ``transform``;
    ``label`` is the row of ``class_names`` columns as a NumPy array.
    """

    def __init__(self, df, transform, data_root):
        #TODO::put something here that preserves aspect ratio
        # Label columns, in the order they appear inside each label vector.
        self.class_names = ['No Finding', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion']
        self.transform = transform
        self.image_dir = data_root
        # Extract paths and labels once so item access is a plain positional lookup.
        self.labels = df[self.class_names].to_numpy()
        self.image_names = df['Path'].to_list()
        self.total = len(df)

    def __len__(self):
        """Number of rows in the DataFrame the dataset was built from."""
        return self.total

    def __getitem__(self, idx):
        """Load, convert and transform the image at position ``idx``; return it with its labels."""
        image_path = os.path.join(self.image_dir, self.image_names[idx])
        rgb_image = Image.open(image_path).convert('RGB')
        return self.transform(rgb_image), self.labels[idx]

In [6]:
# Target resolution and the per-channel statistics used for normalisation.
image_size = (320, 320)
resnet_mean = [0.485, 0.456, 0.406]
resnet_std = [0.229, 0.224, 0.225]


def _build_transform(augment):
    """Return the preprocessing pipeline; ``augment`` adds a random horizontal flip."""
    # Resize to the size that the model expects
    steps = [torchvision.transforms.Resize(size=image_size)]
    if augment:
        # Random horizontal flip to augment the training data
        steps.append(torchvision.transforms.RandomHorizontalFlip())
    # Convert to tensor, then normalise with the mean/std defined above
    steps.append(torchvision.transforms.ToTensor())
    steps.append(torchvision.transforms.Normalize(mean = resnet_mean, std = resnet_std))
    return torchvision.transforms.Compose(steps)


# Training pipeline includes augmentation
train_transform = _build_transform(augment=True)

# We don't do data augmentation in the test/val set
test_transform = _build_transform(augment=False)

In [7]:
# One dataset per split; only the training split uses the augmenting transform.
train_dataset = ChestXRayDataset(train, train_transform, data_root)
test_dataset = ChestXRayDataset(test, test_transform, data_root)
valid_dataset = ChestXRayDataset(valid, test_transform, data_root)

batch_size = 16

# Shuffle the training batches each epoch.
dl_train = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Evaluation loaders keep a fixed order so that repeated evaluations see
# identical batches and do not consume random numbers.
dl_test = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

dl_valid = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# Report sample count, batch count and class count for each split (train, test, valid).
for _dataset, _loader in ((train_dataset, dl_train), (test_dataset, dl_test), (valid_dataset, dl_valid)):
    print('smaples: %s, batches: %s,  classes: %s' %( len(_dataset), len(_loader), len(_dataset.class_names) ))



smaples: 88132, batches: 5509,  classes: 6
smaples: 132, batches: 9,  classes: 6
smaples: 4639, batches: 290,  classes: 6


In [8]:
class Resnext50(torch.nn.Module):
    """Pretrained torchvision ResNet-18 with a dropout + linear head and sigmoid outputs.

    NOTE(review): despite the class name, the backbone constructed here is
    ``torchvision.models.resnet18``.
    """

    def __init__(self, n_classes):
        """Build the backbone and replace its ``fc`` layer with an ``n_classes``-wide head."""
        super().__init__()
        backbone = torchvision.models.resnet18(pretrained=True)
        # Read the feature width before the original fc layer is replaced.
        n_features = backbone.fc.in_features
        backbone.fc = torch.nn.Sequential(
            torch.nn.Dropout(p=0.2),
            torch.nn.Linear(in_features=n_features, out_features=n_classes),
        )
        self.base_model = backbone
        self.sigm = torch.nn.Sigmoid()

    def forward(self, x):
        """Run the backbone on ``x`` and squash its outputs through the sigmoid."""
        raw_scores = self.base_model(x)
        return self.sigm(raw_scores)

# Build the network with one output per class and place it on the selected device
model = Resnext50(len(train_dataset.class_names)).to(device)
# Adam over every model parameter; binary cross-entropy applied to the model outputs
optimizer = torch.optim.Adam(params=model.parameters(), lr=5e-5)
loss_fn = torch.nn.BCELoss().to(device)

In [9]:
def eval_model(model, dl, verbose=True):
    """Evaluate `model` on every batch of `dl` and report loss and per-class AUC.

    Uses the module-level `device` and `loss_fn`. The model is switched to
    eval mode and is left in that mode when the function returns.

    Args:
        model: callable mapping an image batch to per-class scores.
        dl: data loader yielding ``(images, labels)`` batches; its ``dataset``
            must expose ``class_names``, one name per label column.
        verbose: when true, print each class's AUC followed by the loss and
            the averaged AUC.

    Returns:
        ``(avg_auc, loss)``. ``loss`` is the mean of the per-batch losses.
        ``avg_auc`` is the mean of the per-class AUCs weighted by each class's
        number of positive targets; the class at index 0 is excluded, as are
        classes whose AUC is undefined (NaN). It is NaN when no class
        contributes any weight.

    Raises:
        ValueError: if `dl` yields no batches.
    """
    model.eval()
    class_lookup = dl.dataset.class_names
    n_class = len(class_lookup)

    predicts = []
    targets = []
    total_loss = 0

    # Inference only: skip building autograd graphs for the forward passes.
    with torch.no_grad():
        for images, labels in dl:
            imagesGPU, labelsGPU = images.to(device), labels.to(device)
            outputs = model(imagesGPU)
            loss = loss_fn(outputs, labelsGPU.type(torch.float))
            total_loss += loss.item()
            predicts.append(outputs.cpu().numpy())
            targets.append(labels.cpu().numpy())

    if not predicts:
        raise ValueError('eval_model received a data loader with no batches')

    predicts = np.vstack(predicts)
    targets = np.vstack(targets)
    loss = total_loss/len(dl)

    res = {}
    total_auc = 0
    total_counts = 0
    for idx in range(n_class):
        truth = targets[:, idx]
        pp = predicts[:, idx]
        fpr, tpr, thresholds = roc_curve(truth, pp)
        auc_score = auc(fpr, tpr)
        res[class_lookup[idx]] = auc_score
        counts = np.sum(truth)
        # Skip index 0 (the 'No Finding' column in this notebook's datasets) and
        # classes with an undefined AUC so they cannot turn the average into NaN.
        if idx != 0 and not np.isnan(auc_score):
            total_auc += auc_score * counts
            total_counts += counts
    # Guard against zero total weight (e.g. no positive targets at all).
    avg_auc = total_auc / total_counts if total_counts > 0 else float('nan')

    if verbose:
        print()
        for k, v in res.items():
            print(k, v)
        print()
        print('loss:%s, avg_auc:%s' %(loss, avg_auc))

    return avg_auc, loss


def train_model(epochs, model, dl_train, dl_test, check_pt = './chexpert_full_checkpt/', valid_loader=None):
    """Train `model` for `epochs` passes over `dl_train`, evaluating after each pass.

    Uses the module-level `optimizer`, `loss_fn` and `device`. After every
    epoch the model is evaluated with `eval_model` on `dl_test` and on the
    validation loader, then saved with ``torch.save``.

    Args:
        epochs: number of passes over `dl_train`.
        model: network to optimise; the whole object is saved after each epoch.
        dl_train: loader yielding ``(images, labels)`` training batches.
        dl_test: loader used for the per-epoch "test" evaluation.
        check_pt: NOTE(review): accepted but not used. Checkpoints are written
            relative to the current working directory as
            ``'CheXpert_full_<epoch>_resnet50'``. Confirm the intended location
            before wiring this in, since other code may load from there.
        valid_loader: loader used for the per-epoch "valid" evaluation. When
            omitted, the module-level `dl_valid` is used, which was the only
            behaviour before this parameter existed.
    """
    if valid_loader is None:
        # Backward-compatible default: the loader the notebook defines globally.
        valid_loader = dl_valid

    print('Starting training..')
    # Best test AUC seen so far; tracked only, it does not gate checkpointing.
    prev_auc = 0.
    for e in range(0, epochs):
        train_loss = 0.
        model.train() # set model to training phase
        with tqdm(dl_train, unit="batch") as tepoch:
            for images, labels in tepoch:
                images, targets = images.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(images)
                loss = loss_fn(outputs, targets.type(torch.float))

                # Back-propagate, then let the optimizer update the parameters.
                loss.backward()
                optimizer.step()
                # .item() turns the loss tensor into a plain float for accumulation.
                train_loss += loss.item()
                tepoch.set_postfix(loss=loss.item())

        print('ave_train_loss %s ' %(train_loss / len(dl_train)))
        print('test...')
        test_auc, test_loss = eval_model(model, dl_test)
        print('valid...')
        valid_auc, valid_loss = eval_model(model, valid_loader)

        torch.save(model, 'CheXpert_full_%s_resnet50' %(e) )
        if test_auc > prev_auc:
            prev_auc = test_auc

        print('done %s' %e)
    print('Training complete..')



In [None]:
%%time
# Train for ten epochs; dl_test is evaluated after each one.
train_model(epochs=10, model=model, dl_train=dl_train, dl_test=dl_test)

  0%|          | 0/5509 [00:00<?, ?batch/s]

Starting training..


 37%|███▋      | 2013/5509 [08:01<12:46,  4.56batch/s, loss=0.339]