In [1]:
import numpy as np
import pandas as pd
import torch
import copy
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision
from torchvision import datasets
import spconv.pytorch as spconv
import matplotlib.pyplot as plt
import mytools
import mymodels

# Prepare Data

In [2]:
# Read pandas dataframe with all information about sparse training tensors
file_loc = '/mnt/scratch/lustre_01/scratch/majd/sparse_training_tensors/'
st_info = pd.read_pickle(file_loc+'sparse_tensor_info.pk')
st_info

Unnamed: 0,dir,offset,diff,energy,true_index
0,"[0.5923457337920527, -0.5369941830475861, -0.6...","[-0.851898273495669, 2.1253245532459824, 0.445...",0.046168,40,0
1,"[-0.6164927192719855, 0.5695943083433039, -0.5...","[-1.017085182270888, -1.6805460012244295, 1.10...",0.028843,40,1
2,"[0.6322337566233259, -0.16773581669128113, -0....","[-0.10613203070195368, 0.22289410895907838, 1....",0.025293,40,2
3,"[0.2908139608694231, -0.8484810341097399, -0.4...","[-1.0096727220437194, 1.2613684348817842, 1.42...",0.034410,40,3
4,"[0.7738521869833273, -0.07925597736546798, -0....","[0.7752193984015442, 0.8404383794565299, 0.902...",0.033654,40,4
...,...,...,...,...,...
2766793,"[-0.2069418024409927, 0.2672370217202399, -0.9...","[-0.5891835692649702, -0.5092523892090935, 1.8...",0.043683,50,2766793
2766794,"[0.5942730241053608, 0.6175260630673811, -0.51...","[-0.723736545709404, -0.10798660967928463, 0.6...",0.030250,50,2766794
2766795,"[0.3312106913072638, 0.5407131844563555, 0.773...","[-0.3615579024644222, 1.934812461239543, -2.26...",0.030298,50,2766795
2766796,"[-0.0071324298603245555, 0.623790473641556, 0....","[0.7469683683355023, -2.603944946514045, -0.74...",0.039175,50,2766796


In [3]:
# Make custom dataset
MyDataset = mytools.CustomDataset(dir_loc = file_loc, st_info = st_info)

# Split datat into training, validation, and testing sets
train_dataset, val_dataset = torch.utils.data.random_split(MyDataset,[0.8, 0.2], generator=torch.Generator().manual_seed(42))

print("Training samples: ", len(train_dataset))
print("Validation samples: ", len(val_dataset))

# Create training and validation DataLoaders
batch_size = 256
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

Training samples:  2213439
Validation samples:  553359


In [4]:
# Print tensor shapes
for X_plot, y_plot, offset_plot in train_dataloader:
    print(f"Shape of X [N, C, H, W, D]: {X_plot.shape}")
    print(f"Shape of y: {y_plot.shape} {y_plot.dtype}")
    print("Offsets: ", offset_plot.shape)
    break

Shape of X [N, C, H, W, D]: torch.Size([256, 120, 120, 120, 1])
Shape of y: torch.Size([256, 3]) torch.float32
Offsets:  torch.Size([256, 3])


In [5]:
#Record shape of voxel grid
grid_shape = X_plot.shape[1:4]
print("Voxel grid shape: " , grid_shape)

Voxel grid shape:  torch.Size([120, 120, 120])


# Load Convnet

In [6]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")

model = torch.load('../3D_Heteroscedastic_Convnet_models/3D_HSCDC_CNN_subM-256.pt').to(device)
print(model)

Using cuda device
spConvnet_HSCDC_subM(
  (net): SparseSequential(
    (0): SubMConv3d(1, 32, kernel_size=[7, 7, 7], stride=[1, 1, 1], padding=[0, 0, 0], dilation=[1, 1, 1], output_padding=[0, 0, 0], algo=ConvAlgo.Native)
    (1): ReLU()
    (2): SubMConv3d(32, 40, kernel_size=[5, 5, 5], stride=[1, 1, 1], padding=[0, 0, 0], dilation=[1, 1, 1], output_padding=[0, 0, 0], algo=ConvAlgo.Native)
    (3): ReLU()
    (4): SparseConv3d(40, 50, kernel_size=[6, 6, 6], stride=[2, 2, 2], padding=[0, 0, 0], dilation=[1, 1, 1], output_padding=[0, 0, 0], algo=ConvAlgo.Native)
    (5): ReLU()
    (6): SparseMaxPool3d(kernel_size=[2, 2, 2], stride=[2, 2, 2], padding=[0, 0, 0], dilation=[1, 1, 1], algo=ConvAlgo.MaskImplicitGemm)
    (7): SparseConv3d(50, 30, kernel_size=[3, 3, 3], stride=[2, 2, 2], padding=[0, 0, 0], dilation=[1, 1, 1], output_padding=[0, 0, 0], algo=ConvAlgo.MaskImplicitGemm)
    (8): ReLU()
    (9): SparseConv3d(30, 10, kernel_size=[3, 3, 3], stride=[1, 1, 1], padding=[0, 0, 0], dilat

# Train the model

In [None]:
# Specify optimizer
optimizer = torch.optim.Adam(params = model.parameters(), lr=0.0001, betas=(0.94, 0.999), eps=1e-07)



In [8]:
# Negative Log Likelihood Loss for HSCDC convnet
def NLLloss(output, target):
    
    # target us the x parameters in the Kent distribution
    G = output[0] # \gamma_1 parameters in Kent distribution
    K = output[1] # \kappa parameter in Kent distribution
    
    #print(K.flatten() )
    #print(-1.0*torch.log(torch.div(K,4*torch.pi*torch.sinh(K))).flatten() [K.flatten() < 70])
    
    loss1 = -1.0 * torch.log(torch.div(K,4*torch.pi*torch.sinh(K))).flatten()
    loss2 = -1.0 * ( torch.log(torch.div(K,2*torch.pi)) - K ).flatten()
    
    # Compute negative log likelihood using Kent distribution
    loss = torch.mean( torch.minimum(loss1,loss2) - ( K.flatten() * torch.sum(G*target,dim=1) ) )
    
    return loss

In [9]:
def train(dataloader, model, loss_fn, optimizer, device):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.train()
    train_loss = 0
    for batch, (X, y, offset) in enumerate(dataloader):
        
        X, y = X.type(torch.FloatTensor).to(device), y.to(device)
        
        X = X.coalesce()
        indices = X.indices().permute(1, 0).contiguous().int()
        features = X.values()
            
        # Compute prediction error
        pred = model(features,indices,X.shape[0])
        loss = loss_fn(pred, y)
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        
        # Gradient Norm Clipping
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 10, norm_type=2)
        
        for name, param in model.named_parameters():
            if torch.isnan(param.grad).any():
                print("nan gradient found")
                print("The loss is :", loss.item())
                raise SystemExit
                
        optimizer.step()
            
        train_loss += loss.item()
            
        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"Current batch training loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            
    train_loss /= num_batches
    print(f"Training loss: {train_loss:>7f}")
    return(train_loss)

In [10]:
Training_losses = np.array([])
Training_losses = np.append(Training_losses, train(train_dataloader, model, NLLloss, optimizer, device))

Current batch training loss: 0.140218  [    0/2213439]
Current batch training loss: 0.060658  [25600/2213439]
Current batch training loss: 0.143541  [51200/2213439]
Current batch training loss: 0.205980  [76800/2213439]
Current batch training loss: 0.069807  [102400/2213439]
Current batch training loss: -0.032565  [128000/2213439]
Current batch training loss: -0.011085  [153600/2213439]
Current batch training loss: 0.175005  [179200/2213439]
Current batch training loss: 0.058386  [204800/2213439]
Current batch training loss: 0.113592  [230400/2213439]
Current batch training loss: 0.017840  [256000/2213439]
Current batch training loss: -0.011780  [281600/2213439]
Current batch training loss: 0.119567  [307200/2213439]
Current batch training loss: 0.494554  [332800/2213439]
Current batch training loss: 0.187047  [358400/2213439]
Current batch training loss: -0.144897  [384000/2213439]
Current batch training loss: 0.157405  [409600/2213439]
Current batch training loss: 0.304793  [435200/2

In [11]:
Training_losses = np.append(Training_losses, train(train_dataloader, model, NLLloss, optimizer, device))

Current batch training loss: 0.151439  [    0/2213439]
Current batch training loss: 0.273674  [25600/2213439]
Current batch training loss: -0.088438  [51200/2213439]
Current batch training loss: 0.060766  [76800/2213439]
Current batch training loss: -0.163592  [102400/2213439]
Current batch training loss: -0.058581  [128000/2213439]
Current batch training loss: 0.105056  [153600/2213439]
Current batch training loss: 0.034851  [179200/2213439]
Current batch training loss: 0.073247  [204800/2213439]
Current batch training loss: 0.191679  [230400/2213439]
Current batch training loss: -0.004271  [256000/2213439]
Current batch training loss: 0.005362  [281600/2213439]
Current batch training loss: 0.127751  [307200/2213439]
Current batch training loss: 0.155060  [332800/2213439]
Current batch training loss: 0.036145  [358400/2213439]
Current batch training loss: 0.108818  [384000/2213439]
Current batch training loss: 0.223151  [409600/2213439]
Current batch training loss: 0.517959  [435200/2

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
Training_losses = np.append(Training_losses, train(train_dataloader, model, NLLloss, optimizer, device))

In [None]:
Training_losses = np.append(Training_losses, train(train_dataloader, model, NLLloss, optimizer, device))

In [None]:
Training_losses = np.append(Training_losses, train(train_dataloader, model, NLLloss, optimizer, device))

# Save the Model

In [11]:
torch.save(final_model, "../3D_Heteroscedastic_Convnet_models/3D_HSCDC_CNN_subM-256.pt")
print("Saved PyTorch Model State to ../3D_Heteroscedastic_Convnet_models/3D_HSCDC_CNN_subM-256.pt")

Saved PyTorch Model State to ../3D_Heteroscedastic_Convnet_models/3D_HSCDC_CNN_subM-256.pt
