In [1]:
import numpy as np
import torch
import sparseconvnet as scn
from next_sparseconvnet.utils.data_loaders import DataGen, collatefn, LabelType
from next_sparseconvnet.networks.architectures import UNet
from next_sparseconvnet.utils.train_utils import *

In [3]:
from torchsummary import summary

Hacer una funcion que me calcule el IoU para un numero n de clases...

In [5]:
true = np.random.randint(0, 3, 50000)
pred = np.random.randint(0, 3, 50000)

In [13]:
def IoU(true, pred, nclass = 3):
    """
        Intersection over union is a metric for semantic segmentation. 
        It returns a IoU value for each class of our input tensors/arrays.
    """
    confusion_matrix = np.zeros((nclass, nclass))

    for i in range(len(true)):
        confusion_matrix[true[i]][pred[i]] += 1
    
    IoU = []
    for i in range(nclass): 
        IoU.append(confusion_matrix[i, i]/(sum(confusion_matrix[:, i]) + sum(confusion_matrix[i, :]) - confusion_matrix[i, i]))
    return np.array(IoU)

In [17]:
a = IoU(true, pred) #lo hace bastante rapido para arrays grandes
b = np.array([1, 1, 1])
print(b / a)

[5.08964253 4.98856122 5.06654512]


In [16]:
print(np.zeros(3)+ a)

[0.19647745 0.2004586  0.19737316]


Parece que IoU me hace bien la confusion matrix con tensores de entrada en lugar de arrays, bien

Defino una funcion que entrena un epoch en el que le tengo que decir que numero de epoch es, el tamaño del batch que se pasa, la red, loss, optimizer y el loader con los datos

Creo que tengo q quitar batch_size como parametro y poner simplemente batch_size como la len de event

In [6]:
def train_one_epoch(epoch_id, net, criterion, optimizer, loader): 
    """       
        Trains the net for all the train data one time
    """
    net.train()
    loss_epoch, iou_epoch = [], []
    for batchid, (coord, ener, label, event) in enumerate(loader):
        label = label.type(torch.LongTensor) #quitar esto una vez se corrija en el collate
        batch_size = len(event)
        ener, label = ener.cuda(), label.cuda()
        
        optimizer.zero_grad()
            
        output = net.forward((coord, ener, batch_size))
            
        loss = criterion(output, label) 
        loss.backward()
            
        optimizer.step()
            
        loss_epoch.append(loss.item())
            
        #IoU
        softmax = torch.nn.Softmax(dim = 1)
        prediction = torch.argmax(softmax(output), 1) 
        iou_epoch.append(IoU(label.cpu(), prediction.cpu()))
        
        if batchid%2==0:
            progress = f"Train Epoch: {epoch_id} [{batchid*batch_size:5}/{len(loader.dataset)}" +\
            f" ({int(100*batchid/len(loader)):2}%)]"
            loss_ = f"\t Loss: {loss.item():.6f}"
            print(progress + loss_)
                
    return loss_epoch, iou_epoch

In [31]:
def valid_one_epoch(net, loader):
    """
        Computes loss and IoU for all the validation data
    """
    net.eval()
    loss_epoch, iou_epoch = [], []
    with torch.autograd.no_grad():
        for batchid, (coord, ener, label, event) in enumerate(loader):
            batch_size = len(event)
            ener, label = ener.cuda(), label.cuda()
                    
            output = net.forward((coord, ener, batch_size))
            
            loss = criterion(output, label) 
            
            loss_epoch.append(loss.item())
            
            #IoU
            softmax = torch.nn.Softmax(dim = 1)
            prediction = torch.argmax(softmax(output), 1) 
            iou_epoch.append(IoU(label.cpu(), prediction.cpu()))
    return loss_epoch, iou_epoch

Creo los loaders con datos de train y validation. En principio unos pocos para probar (entendi sobre 100 de train...)

In [2]:
train_path = "/home/mmkekic/MC_dataset/new_data/train_dataset_200.h5"
valid_path = "/home/mmkekic/MC_dataset/new_data/valid_dataset_10.h5"

In [3]:
nevents_train = 100 #numero de eventos que pillo de cada dataset
nevents_valid = 10

In [4]:
datagen_train = DataGen(train_path, LabelType.Segmentation, nevents = nevents_train)
datagen_valid = DataGen(valid_path, LabelType.Segmentation, nevents = nevents_valid)

In [5]:
batch_size_train = 10
batch_size_valid = 2

In [6]:
loader_train = torch.utils.data.DataLoader(datagen_train, batch_size = batch_size_train, shuffle = True, num_workers=1, collate_fn=collatefn, drop_last=True, pin_memory=False)
loader_valid = torch.utils.data.DataLoader(datagen_valid, batch_size = batch_size_valid, shuffle = True, num_workers=1, collate_fn=collatefn, drop_last=True, pin_memory=False)

In [24]:
def valid_one_epoch_segmentation(net, criterion, loader, nclass = 3):
    """
        Computes loss and IoU for all the validation data
    """
    net.eval()
    loss_epoch, iou_epoch = 0, np.zeros(nclass)
    with torch.autograd.no_grad():
        for batchid, (coord, ener, label, event) in enumerate(loader):
            batch_size = len(event)
            ener, label = ener.cuda(), label.cuda()

            output = net.forward((coord, ener, batch_size))

            loss = criterion(output, label)
            
            loss_epoch += loss.item()
            print(loss_epoch, len(loader))
            #IoU
            softmax = torch.nn.Softmax(dim = 1)
            prediction = torch.argmax(softmax(output), 1)
            iou_epoch += IoU(label.cpu(), prediction.cpu())

        loss_epoch = loss_epoch / len(loader)
        iou_epoch = iou_epoch / len(loader)
        loss_ = f"\t Validation Loss: {loss_epoch:.6f}"
        print(loss_)

    return loss_epoch, iou_epoch

In [9]:
net = UNet((561, 561, 561), 6, 3, [9, 9, 3], [2, 2], 3) 
net = net.cuda()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-2, betas=(0.9, 0.999), eps=1e-6, weight_decay=0)

train_one_epoch_segmentation(0, net, criterion, optimizer, loader_train)
valid_one_epoch_segmentation(net, criterion, loader_valid)

Train Epoch: 0	 Loss: 0.835661
	 Validation Loss: 1662949792.000000


(1662949792.0, array([0.0164881 , 0.82769225, 0.01143791]))

Probamos a sacar las clases que me predice la net y pruebo tambien a ver como funciona IoU 

In [65]:
batch = [datagen_train[0]]
coord, ener, label, event = collatefn(batch)

In [66]:
net = UNet((561, 561, 561), 6, 3, [9, 9, 7, 5, 3, 3, 3], 2, 3) 
output = net.forward((coord, ener))
del(net)

In [67]:
softmax = torch.nn.Softmax(dim = 1)

t = torch.argmax(softmax(output), 1) 

In [68]:
IoU(label, t)

[0.008264462809917356, 0.502092050209205, 0.08333333333333333]

Parece que va bien. Aun asi, cuando lo haga tendre que PASAR TENSORES A CPU EN FUNCION

In [69]:
mem_params = sum([param.nelement()*param.element_size() for param in net.parameters()])
mem_bufs = sum([buf.nelement()*buf.element_size() for buf in net.buffers()])
print((mem_params+mem_bufs)/1024**3)

NameError: name 'net' is not defined

Asigno la red que voy a usar. Se procuran parámetros con spatial size mayor que el del detector, que es 400x400x530 (tenia esto apuntado pero creo que esta mal pq mirando ahora pone que son 441x441x550...), de forma que en la bottom layer la imagen llegue a tener dimension 7x7x7.

In [70]:
#net = UNet((561, 561, 561), 6, 3, [9, 9, 7, 5, 3, 3, 3], 2, 3) 

Loss y optimizer...

In [71]:
#criterion = torch.nn.CrossEntropyLoss() 
#optimizer = torch.optim.Adam(net.parameters(), lr=1e-2, betas=(0.9, 0.999), eps=1e-6, weight_decay=0)

# Entreno a ver

In [7]:
from next_sparseconvnet.utils.train_utils import *
from torch.utils.tensorboard import SummaryWriter

In [8]:
def save_checkpoint(state, filename='checkpoint.pth.tar'):
    torch.save(state, filename)

In [6]:
#net = UNet((561, 561, 561), 6, 3, [9, 9, 7, 5, 3, 3, 3], [2, 2, 2, 2, 2, 2], 3) 
#save_checkpoint(net.state_dict(), 'checkpoint_try')
#del(net)
#red = net.load_state_dict(torch.load('checkpoint_try'))
#vuelve a ser la red entiendo, da igual el nombre q le ponga al archivo...

In [33]:
def train_segmentation(*, nepoch, train_data_path, valid_data_path, train_batch_size, valid_batch_size, net, criterion, optimizer, checkpoint_dir, tensorboard_dir):
    train_gen = DataGen(train_data_path, LabelType.Segmentation, nevents = 100)
    valid_gen = DataGen(valid_data_path, LabelType.Segmentation, nevents = 10)
    
    loader_train = torch.utils.data.DataLoader(train_gen, batch_size = train_batch_size, shuffle = True, num_workers=1, collate_fn=collatefn, drop_last=True, pin_memory=False)
    loader_valid = torch.utils.data.DataLoader(valid_gen, batch_size = valid_batch_size, shuffle = True, num_workers=1, collate_fn=collatefn, drop_last=True, pin_memory=False)
    
    start_loss = np.inf
    writer = SummaryWriter(tensorboard_dir)
    for i in range(nepoch):
        train_loss, train_iou = train_one_epoch_segmentation(i, net, criterion, optimizer, loader_train)
        valid_loss, valid_iou = valid_one_epoch_segmentation(net, criterion, loader_valid)
        
        if valid_loss < start_loss:
            save_checkpoint({'state_dict': net.state_dict(),
                             'optimizer': optimizer.state_dict()}, f'{checkpoint_dir}/net_checkpoint_{i}.pth.tar') 
            start_loss = valid_loss
        
        writer.add_scalar('loss/train', train_loss, i)
        for k, iou in enumerate(train_iou):
            writer.add_scalar(f'iou/train_{k}class', iou, i)
        
        writer.add_scalar('loss/valid', valid_loss, i)
        for k, iou in enumerate(valid_iou):
            writer.add_scalar(f'iou/valid_{k}class', iou, i)

In [10]:
#Train params
nepoch = 2
train_file = '/home/mmkekic/MC_dataset/new_data/train_dataset_all.h5'
valid_file = '/home/mmkekic/MC_dataset/new_data/valid_dataset_10.h5'
train_batch = 10
valid_batch = 2
checkpoint_dir = 'home/mperez/NEXT_SPARSECONVNET/scripts/save_progress'
tensorboard_dir = 'home/mperez/NEXT_SPARSECONVNET/scripts/save_progress'
num_workers = 1
nevents_train = 100
nevents_valid = 10

#UNet params
spatial_size      = (543, 543, 543)
init_conv_nplanes = 8
init_conv_kernel  = 7
kernel_sizes      = [7, 7, 5, 3, 3, 3]
stride_sizes      = [4, 2, 2, 2, 2]
basic_num         = 2

#Optimizer parameters
lr = 1e-2
betas = (0.9, 0.999)
eps = 1e-6
weight_decay = 0

In [30]:
valid_one_epoch_segmentation(net, criterion, loader_valid)

76885.7109375 5
96662.001953125 5
113494.34375 5
172266.40625 5
232724.421875 5
	 Validation Loss: 46544.884375


(46544.884375, array([7.14285714e-03, 8.81238272e-01, 1.16107996e-17]))

In [40]:
net = UNet((543, 543, 543), 8, 7, [7, 7, 5, 3, 3, 3], [4, 2, 2, 2, 2], 2) 
net = net.cuda()
criterion = torch.nn.CrossEntropyLoss() 
optimizer = torch.optim.Adam(net.parameters(), lr=1e-2, betas=(0.9, 0.999), eps=1e-6, weight_decay=0)

train_segmentation(nepoch = 2, 
                   train_data_path = train_path, 
                   valid_data_path = valid_path, 
                   train_batch_size = 10,
                   valid_batch_size = 2,
                   net = net, 
                   criterion = criterion, 
                   optimizer = optimizer, 
                   checkpoint_dir = 'checkpoint_example', 
                   tensorboard_dir = 'tensorboard_example',
                   num_workers = 1,
                   nevents_train = 100,
                   nevents_valid = 10)

Train Epoch: 0	 Loss: 0.708779
	 Validation Loss: 80155773.600000
Train Epoch: 1	 Loss: 0.515038
	 Validation Loss: 4.461229
