In [1]:
import torch
import os
import pandas as pd
import cxr_dataset as CXR
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import matplotlib.pyplot as plt
import sklearn
import sklearn.metrics as sklm
from torch.autograd import Variable
import numpy as np
import warnings
import datetime
# Date on which the notebook is executed (not referenced again in the visible cells).
date_object = datetime.date.today()

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from torch/pandas.
warnings.filterwarnings("ignore")

In [3]:
# Per-channel normalisation constants applied after ToTensor()
# (the values commonly used with ImageNet-pretrained torchvision models).
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

# Preprocessing applied to every evaluated image. Only a 'val' pipeline is
# defined because this notebook performs inference only.
data_transforms = {
        'val': transforms.Compose([
            # Resize is the current name of the deprecated Scale transform,
            # which is no longer available in recent torchvision releases.
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ])
    }

# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints that come from a trusted source.
# The checkpoint is read as a mapping whose 'model' entry is the model object.
checkpoint_best = torch.load('checkpoint')
model = checkpoint_best['model']

# Pathology names iterated by the evaluation loop further down; each name is
# also used there as a sub-directory name. The order below is significant
# only for the order in which the folders are processed.
PRED_LABEL = [
    'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration',
    'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax',
    'Consolidation', 'Edema', 'Emphysema', 'Fibrosis',
    'Pleural_Thickening', 'Hernia',
]

In [4]:
def make_pred_multilabel(data_transforms, model, PATH_TO_IMAGES):
    """
    Run a previously trained model over the "test" fold and collect its outputs.

    Nothing is written to disk and no AUCs are computed here; the predictions
    are only returned to the caller.

    Args:
        data_transforms: dict of torchvision transforms; its 'val' entry is
            applied to every image
        model: trained network; inputs are moved to the GPU with .cuda(), so
            the model is expected to live there as well
        PATH_TO_IMAGES: directory handed to CXR.CXRDataset as path_to_images
    Returns:
        pred_df: dataframe with one row per test image, holding an
            "Image Index" column followed by one "prob_<label>" column per
            entry of dataset.PRED_LABEL (prob columns sorted alphabetically)
    """

    # number of images scored per forward pass; reduce if the GPU runs out of RAM
    BATCH_SIZE = 4

    # set model to eval mode; required for proper predictions given use of batchnorm
    model.eval()

    # create dataloader
    dataset = CXR.CXRDataset(
        path_to_images=PATH_TO_IMAGES,
        fold="test",
        transform=data_transforms['val'])

    # shuffle=False is essential: image ids are recovered below from the
    # position of each batch within the dataset.
    dataloader = torch.utils.data.DataLoader(
        dataset, BATCH_SIZE, shuffle=False, num_workers=16)

    prob_columns = ["prob_" + label for label in dataset.PRED_LABEL]
    # Historical layout of this function's output (and of files derived from
    # it): the id column first, then the probability columns in sorted order.
    # Downstream cells rely on that order, so it is reproduced explicitly.
    ordered_columns = ["Image Index"] + sorted(prob_columns)

    # One small frame per batch, concatenated once at the end; this avoids
    # growing a dataframe row by row (quadratic, and DataFrame.append no
    # longer exists in pandas >= 2.0).
    batch_frames = []

    # Inference only: skip autograd bookkeeping to save GPU memory.
    with torch.no_grad():
        for i, data in enumerate(dataloader):
            # each item is (inputs, labels, <third element>); only inputs are needed
            inputs, _, _ = data
            probs = model(inputs.cuda()).cpu().numpy()

            first_row = BATCH_SIZE * i
            image_ids = list(dataset.df.index[first_row:first_row + len(probs)])

            # each output column corresponds to one entry of dataset.PRED_LABEL
            batch_df = pd.DataFrame(
                probs[:, :len(prob_columns)], columns=prob_columns)
            batch_df.insert(0, "Image Index", image_ids)
            batch_frames.append(batch_df)

    # An empty test fold yields an empty, correctly labelled frame.
    if not batch_frames:
        return pd.DataFrame(columns=ordered_columns)

    pred_df = pd.concat(batch_frames, ignore_index=True)
    return pred_df[ordered_columns]

In [5]:
# NOTE: absolute, machine-specific path; adjust when running elsewhere.
pred_pd = pd.read_csv('C:/Users/peter/2080Ti/Jupyter/Fabian_experiment/preds.csv')

# Column-wise mean of the predictions. numeric_only=True skips the
# non-numeric image-id column explicitly (pandas >= 2.0 raises otherwise).
pred_pd.mean(axis=0, numeric_only=True)

# The first column is skipped; the remaining ones are named "prob_<pathology>".
row_name = pred_pd.columns[1:]

# Split on the first underscore only so that names which themselves contain
# an underscore (e.g. "Pleural_Thickening") survive intact, instead of being
# truncated and patched afterwards by position.
new_row_name = [name.split('_', 1)[1] for name in row_name]
new_row_name

prob_Atelectasis           0.142571
prob_Cardiomegaly          0.025346
prob_Consolidation         0.033273
prob_Edema                 0.016827
prob_Effusion              0.094957
prob_Emphysema             0.020726
prob_Fibrosis              0.015991
prob_Hernia                0.002277
prob_Infiltration          0.191990
prob_Mass                  0.041064
prob_Nodule                0.041099
prob_Pleural_Thickening    0.018387
prob_Pneumonia             0.015402
prob_Pneumothorax          0.052554
dtype: float64

['Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Effusion',
 'Emphysema',
 'Fibrosis',
 'Hernia',
 'Infiltration',
 'Mass',
 'Nodule',
 'Pleural_Thickening',
 'Pneumonia',
 'Pneumothorax']

In [6]:
# Evaluate the model on one image folder per pathology and store, for each
# folder, the mean prediction, its standard error and the raw predictions.
# NOTE: both locations are absolute, machine-specific paths.
base_dir = 'C:/NIH/co_occurrence_5000/'
result_basedir = 'C:/Users/peter/2080Ti/Jupyter/Fabian_experiment/result_correlation'

for label in PRED_LABEL:
    path_base = os.path.join(base_dir, label)
    path_to_images = path_base

    pred_df = make_pred_multilabel(data_transforms, model, path_to_images)
    # Independent copy of the raw per-image predictions for this pathology.
    Pathology_result_df = pred_df.copy()
    print(len(Pathology_result_df))

    # Work on the probability columns only, as floats, so the reductions do
    # not depend on how pandas treats the string-valued id column.
    prob_df = pred_df.drop("Image Index", axis=1).astype(float)

    # Take the pathology names from the very columns being averaged, so that
    # names and values cannot get out of step with each other.
    pathology_names = [
        col[len('prob_'):] if col.startswith('prob_') else col
        for col in prob_df.columns
    ]

    # Distinct names avoid overwriting the `mean`/`std` normalisation
    # constants defined in the transforms cell.
    mean_values = prob_df.mean(axis=0)
    # NOTE(review): despite the "std" naming, this is the standard error of
    # the mean (.sem()), not the standard deviation; kept as in the original.
    sem_values = prob_df.sem(axis=0)

    mean_name = 'mean'
    std_name = 'std'
    result_mean = pd.DataFrame(
        {'pathology': pathology_names, mean_name: mean_values.values},
        columns=['pathology', mean_name])
    result_std = pd.DataFrame(
        {'pathology': pathology_names, std_name: sem_values.values},
        columns=['pathology', std_name])

    ############### save classification mean and std as .csv files ################
    # Transposed layout: one row of names, one row of values.
    result_mean = result_mean.T
    result_std = result_std.T

    result_dir = os.path.join(result_basedir, label)
    os.makedirs(result_dir, exist_ok=True)

    mean_path = os.path.join(result_dir, 'mean.csv')
    std_path = os.path.join(result_dir, 'std.csv')
    result_path = os.path.join(result_dir, label+'_result.csv')
    result_mean.to_csv(mean_path, index=False)
    result_std.to_csv(std_path, index=False)
    Pathology_result_df.to_csv(result_path, index=False)

5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
5000
