In [26]:
import os
import pandas as pd
from tqdm import tqdm

In [27]:
###########################################################
############ CARGAMOS ESTUDIO 
###########################################################

In [28]:
ROOT_DIR   = 'C:/Users/Usuario/Datasets/MURA-v1.1/' # directorio raíz
study_type = 'XR_WRIST' # tipo de estudio

DATA_CAT = ['train', 'valid'] # división de datos

In [29]:
def get_study_level_data(study_type):
    """
    Returns a dict, with keys 'train' and 'valid' and respective values as study level dataframes, 
    these dataframes contain three columns 'Path', 'Count', 'Label'
    Args:
        study_type (string): one of the seven study type folder names in 'train/valid/test' dataset 
    """
    study_data = {}
    study_label = {'positive': 1, 'negative': 0}
    for phase in DATA_CAT:
        BASE_DIR = ROOT_DIR + '%s/%s/' % (phase, study_type)
        print(BASE_DIR, '\n')
        #patients = list(os.walk(BASE_DIR))[0][1] # list of patient folder names
        patients = os.listdir(BASE_DIR)
        study_data[phase] = pd.DataFrame(columns=['Path', 'Count', 'Label'])
        i = 0
        for patient in tqdm(patients): # for each patient folder
            for study in os.listdir(BASE_DIR + patient): # for each study in that patient folder
                label = study_label[study.split('_')[1]] # get label 0 or 1
                path = BASE_DIR + patient + '/' + study + '/' # path to this study
                study_data[phase].loc[i] = [path, len(os.listdir(path)), label] # add new row
                i+=1
    return study_data

In [30]:
# #### load study level dict data
study_data = get_study_level_data(study_type='XR_WRIST')

C:/Users/Usuario/Datasets/MURA-v1.1/train/XR_WRIST/ 



  0%|          | 0/3267 [00:00<?, ?it/s]

100%|██████████| 3267/3267 [00:07<00:00, 462.50it/s]


C:/Users/Usuario/Datasets/MURA-v1.1/valid/XR_WRIST/ 



100%|██████████| 207/207 [00:00<00:00, 479.34it/s]


In [31]:
study_data['train'].head()

Unnamed: 0,Path,Count,Label
0,C:/Users/Usuario/Datasets/MURA-v1.1/train/XR_W...,3,1
1,C:/Users/Usuario/Datasets/MURA-v1.1/train/XR_W...,4,0
2,C:/Users/Usuario/Datasets/MURA-v1.1/train/XR_W...,3,1
3,C:/Users/Usuario/Datasets/MURA-v1.1/train/XR_W...,3,1
4,C:/Users/Usuario/Datasets/MURA-v1.1/train/XR_W...,2,0


In [32]:
study_data['valid'].head()

Unnamed: 0,Path,Count,Label
0,C:/Users/Usuario/Datasets/MURA-v1.1/valid/XR_W...,4,1
1,C:/Users/Usuario/Datasets/MURA-v1.1/valid/XR_W...,2,1
2,C:/Users/Usuario/Datasets/MURA-v1.1/valid/XR_W...,3,1
3,C:/Users/Usuario/Datasets/MURA-v1.1/valid/XR_W...,3,1
4,C:/Users/Usuario/Datasets/MURA-v1.1/valid/XR_W...,1,1


In [33]:
###########################################################
######## CREAMOS DATA LOADER Y CLASE DATASET ASOCIADA
###########################################################

In [34]:
from torch.utils.data import Dataset

In [35]:
class ImageDataset(Dataset):
    """training dataset."""

    def __init__(self, df, transform=None):
        """
        Args:
            df (pd.DataFrame): a pandas DataFrame with image path and labels.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        study_path = self.df.iloc[idx, 0]
        count = self.df.iloc[idx, 1]
        images = []
        for i in range(count):
            image = pil_loader(study_path + 'image%s.png' % (i+1))
            images.append(self.transform(image))
        images = torch.stack(images)
        label = self.df.iloc[idx, 2]
        sample = {'images': images, 'label': label}
        
        return sample

In [36]:
from torchvision import transforms
from torch.utils.data import DataLoader

In [37]:
def get_dataloaders(data, batch_size=8, study_level=False):
    '''
    Returns dataloader pipeline with data augmentation
    '''
    data_transforms = {
        'train': transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.RandomHorizontalFlip(),
                transforms.RandomRotation(10),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 
        ]),
        'valid': transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }

    image_datasets = {x: ImageDataset(data[x], transform=data_transforms[x]) 
                                      for x in DATA_CAT}
    dataloaders = {x: DataLoader(image_datasets[x], batch_size=batch_size, 
                                 shuffle=True, num_workers=4) 
                                 for x in DATA_CAT}
    
    return dataloaders

In [38]:
dataloaders = get_dataloaders(study_data, batch_size=1)

In [39]:
print(dataloaders['train'])

<torch.utils.data.dataloader.DataLoader object at 0x000002A8C6BABA70>


In [40]:
print(dataloaders['valid'])

<torch.utils.data.dataloader.DataLoader object at 0x000002A8C6BAAB10>


In [41]:
dataset_sizes = {x: len(study_data[x]) for x in DATA_CAT}

In [42]:
print(dataset_sizes)

{'train': 3460, 'valid': 237}


In [43]:
####################################################################
############### SE CONSTRUYE EL MODELO
####################################################################

In [45]:
def get_count(df, cat):
    '''
    Returns number of images in a study type dataframe which are of abnormal or normal
    Args:
    df -- dataframe
    cat -- category, "positive" for abnormal and "negative" for normal
    '''
    return df[df['Path'].str.contains(cat)]['Count'].sum()

In [46]:
# tai = total abnormal images, tni = total normal images
tai = {x: get_count(study_data[x], 'positive') for x in DATA_CAT}
tni = {x: get_count(study_data[x], 'negative') for x in DATA_CAT}

In [53]:
print('tai:', tai)
print('tni:', tni, '\n')

tai: {'train': 3987, 'valid': 295}
tni: {'train': 5769, 'valid': 364} 



In [49]:
import torch
from torch.autograd import Variable

In [50]:
def n_p(x):
    '''convert numpy float to Variable tensor float'''    
    return Variable(torch.cuda.FloatTensor([x]), requires_grad=False)

In [51]:
Wt1 = {x: n_p(tni[x] / (tni[x] + tai[x])) for x in DATA_CAT}
Wt0 = {x: n_p(tai[x] / (tni[x] + tai[x])) for x in DATA_CAT}

  return Variable(torch.cuda.FloatTensor([x]), requires_grad=False)


In [54]:
print('Wt0 train:', Wt0['train'])
print('Wt0 valid:', Wt0['valid'])
print('Wt1 train:', Wt1['train'])
print('Wt1 valid:', Wt1['valid'])


Wt0 train: tensor([0.4087], device='cuda:0')
Wt0 valid: tensor([0.4476], device='cuda:0')
Wt1 train: tensor([0.5913], device='cuda:0')
Wt1 valid: tensor([0.5524], device='cuda:0')


In [None]:
#############################################################################################
############### ENTRENAMIENTO DEL MODELO
#############################################################################################

In [55]:
# Definición de pérdida 
class Loss(torch.nn.modules.Module):
    def __init__(self, Wt1, Wt0):
        super(Loss, self).__init__()
        self.Wt1 = Wt1
        self.Wt0 = Wt0
        
    def forward(self, inputs, targets, phase):
        loss = - (self.Wt1[phase] * targets * inputs.log() + self.Wt0[phase] * (1 - targets) * (1 - inputs).log())
        return loss

In [56]:
# importación desde el fichero densenet.py del repositorio
from densenet import densenet169

In [57]:
model = densenet169(pretrained=True)
model = model.cuda()

  nn.init.kaiming_normal(m.weight.data)
Downloading: "https://download.pytorch.org/models/densenet169-b2777c0a.pth" to C:\Users\Usuario/.cache\torch\hub\checkpoints\densenet169-b2777c0a.pth
100%|██████████| 54.7M/54.7M [00:08<00:00, 6.93MB/s]


In [58]:
criterion = Loss(Wt1, Wt0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=1, verbose=True)



In [59]:
def plot_training(costs, accs):
    '''
    Plots curve of Cost vs epochs and Accuracy vs epochs for 'train' and 'valid' sets during training
    '''
    train_acc = accs['train']
    valid_acc = accs['valid']
    train_cost = costs['train']
    valid_cost = costs['valid']
    epochs = range(len(train_acc))

    plt.figure(figsize=(10, 5))
    
    plt.subplot(1, 2, 1,)
    plt.plot(epochs, train_acc)
    plt.plot(epochs, valid_acc)
    plt.legend(['train', 'valid'], loc='upper left')
    plt.title('Accuracy')
    
    plt.subplot(1, 2, 2)
    plt.plot(epochs, train_cost)
    plt.plot(epochs, valid_cost)
    plt.legend(['train', 'valid'], loc='upper left')
    plt.title('Cost')
    
    plt.show()

In [62]:
import copy, time

In [63]:
def train_model(model, criterion, optimizer, dataloaders, scheduler, 
                dataset_sizes, num_epochs):
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    costs = {x:[] for x in DATA_CAT} # for storing costs per epoch
    accs = {x:[] for x in DATA_CAT} # for storing accuracies per epoch
    print('Train batches:', len(dataloaders['train']))
    print('Valid batches:', len(dataloaders['valid']), '\n')
    
    for epoch in range(num_epochs):
        #confusion_matrix = {x: meter.ConfusionMeter(2, normalized=True) 
        #                    for x in DATA_CAT}
        
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in DATA_CAT:
            model.train(phase=='train')
            running_loss = 0.0
            running_corrects = 0
           
            # Iterate over data.
            for i, data in enumerate(dataloaders[phase]):
                # get the inputs
                print(i, end='\r')
                inputs = data['images'][0]
                labels = data['label'].type(torch.FloatTensor)
                # wrap them in Variable
                inputs = Variable(inputs.cuda())
                labels = Variable(labels.cuda())
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                outputs = model(inputs)
                outputs = torch.mean(outputs)
                loss = criterion(outputs, labels, phase)
                running_loss += loss.data[0]
                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()
                # statistics
                preds = (outputs.data > 0.5).type(torch.cuda.FloatTensor)
                running_corrects += torch.sum(preds == labels.data)
                #confusion_matrix[phase].add(preds, labels.data)
            
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]
            costs[phase].append(epoch_loss)
            accs[phase].append(epoch_acc)
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            
            #print('Confusion Meter:\n', confusion_matrix[phase].value())
            
            # deep copy the model
            if phase == 'valid':
                scheduler.step(epoch_loss)
                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())
        
        time_elapsed = time.time() - since
        print('Time elapsed: {:.0f}m {:.0f}s'.format(
                time_elapsed // 60, time_elapsed % 60))
        print()
    
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best valid Acc: {:4f}'.format(best_acc))
    plot_training(costs, accs)
    
    # load best model weights
    model.load_state_dict(best_model_wts)
    
    return model


In [64]:
model = train_model(model, criterion, optimizer, dataloaders, scheduler, dataset_sizes, num_epochs=5)

Train batches: 3460
Valid batches: 237 

Epoch 1/5
----------


In [None]:
torch.save(model.state_dict(), 'models/model.pth')

In [None]:
#############################################################################################
############### MÉTRICAS
#############################################################################################

In [None]:
def get_metrics(model, criterion, dataloaders, dataset_sizes, phase='valid'):
    '''
    Loops over phase (train or valid) set to determine acc, loss and 
    confusion meter of the model.
    '''
    #confusion_matrix = meter.ConfusionMeter(2, normalized=True)
    running_loss = 0.0
    running_corrects = 0
    for i, data in enumerate(dataloaders[phase]):
        print(i, end='\r')
        labels = data['label'].type(torch.FloatTensor)
        inputs = data['images'][0]
        # wrap them in Variable
        inputs = Variable(inputs.cuda())
        labels = Variable(labels.cuda())
        # forward
        outputs = model(inputs)
        outputs = torch.mean(outputs)
        loss = criterion(outputs, labels, phase)
        # statistics
        running_loss += loss.data[0] * inputs.size(0)
        preds = (outputs.data > 0.5).type(torch.cuda.FloatTensor)
        running_corrects += torch.sum(preds == labels.data)
        #confusion_matrix.add(preds, labels.data)

    loss = running_loss / dataset_sizes[phase]
    acc = running_corrects / dataset_sizes[phase]
    print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, loss, acc))
    #print('Confusion Meter:\n', confusion_matrix.value())

In [None]:
get_metrics(model, criterion, dataloaders, dataset_sizes)