In [1]:
import numpy as np
import pandas as pd
import torch
import copy
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision
from torchvision import datasets
import spconv.pytorch as spconv
import matplotlib.pyplot as plt
import mytools
import mymodels

# Loss Function

The first note is that the 1st term of our loss function explodes, while an almost equivalent high K approximation does not. Idea: use approximation (loss2) when the original (loss1) explodes.

In [5]:
K = torch.tensor(2.65)

# Try below case to see why Taylor Series method eventually fails for arrows case
# K = torch.tensor(538.0)
# K = torch.tensor(2.65)

# The 1st term of our loss function
loss1 = -1.0 * torch.log(torch.div(K,4*torch.pi*torch.sinh(K)))

# A high K approximation of the 1st term of our loss function
loss2 = -1.0 * ( torch.log(torch.div(K,2*torch.pi)) - K )

# 15th order Taylor series of 1st term of our loss function about K=0
loss3 = K**2/6 - K**4/180 + K**6/2835 - K**8/37800 + K**10/467775 - (691* (K**12) )/ 3831077250 + (2 * (K**14))/127702575 + torch.log(torch.tensor(4)*torch.pi)



print("kappa: ", K)
print("Original 1st term: ", loss1)
print("High K approx.: ", loss2)
print("O(15) TS about 0: ", loss3)
print("--------------------------------------------")
print("loss1-loss2 is 0: ", (loss1-loss2)==0, loss1-loss2, "frac error: ", (loss1-loss2)/loss1 )

print("loss1-loss3 is 0: ", (loss1-loss3)==0, loss1-loss3, "frac error: ", (loss1-loss3)/loss1)




kappa:  tensor(2.6500)
Original 1st term:  tensor(3.5083)
High K approx.:  tensor(3.5133)
O(15) TS about 0:  tensor(3.5134)
--------------------------------------------
loss1-loss2 is 0:  tensor(False) tensor(-0.0050) frac error:  tensor(-0.0014)
loss1-loss3 is 0:  tensor(False) tensor(-0.0050) frac error:  tensor(-0.0014)


# Gradients

Pytorch seems to have issues computing the gradient of 1st term of our loss function, even before it explodes. Idea: use approximation (loss2) when Kappa > 30, otherwise use original (loss1).

In [12]:
val = 9.0

# Analytic grad of 1st term of our loss function
x = torch.tensor(val, requires_grad = False)
grad = (1/torch.tanh(x))-(1/x)
print("Analytic grad: ", grad)

# Pytorch grad of 1st term of our loss function
x1 = torch.tensor(val, requires_grad = True)
y1 = -1.0 * torch.log(torch.div(x1,4*torch.pi*torch.sinh(x1)))
y1.backward()
print("Torch grad: ",x1.grad)

# Pytorch grad of high K approximation of the 1st term of our loss function
x2 = torch.tensor(val, requires_grad = True)
y2 = -1.0 * ( torch.log(torch.div(x2,2*torch.pi)) - x2 )
y2.backward()
print("Torch grad approx: ",x2.grad)

# Pytorch grad of 15th order Taylor series of 1st term of our loss function about K=0
x3 = torch.tensor(val, requires_grad = True)
y3 = x3**2/6 - x3**4/180 + x3**6/2835 - x3**8/37800 + x3**10/467775 - (691* (x3**12) )/ 3831077250 + (2 * (x3**14))/127702575 + torch.log(torch.tensor(4)*torch.pi)
y3.backward()
print("Torch grad TS: ",x3.grad)

print("----------------------------------------------")
print("error grad2: ", grad-x2.grad, ", fractional error: ", (x1.grad-x2.grad)/x1.grad)
print("error grad3: ", grad-x3.grad, ", fractional error: ",  (x1.grad-x3.grad)/x1.grad)



Analytic grad:  tensor(0.8889)
Torch grad:  tensor(0.8889)
Torch grad approx:  tensor(0.8889)
Torch grad TS:  tensor(496788.5625)
----------------------------------------------
error grad2:  tensor(1.1921e-07) , fractional error:  tensor(0.)
error grad3:  tensor(-496787.6875) , fractional error:  tensor(-558886.1250)


# Issues with torch.where and torch.minimum

If one of the arguments in torch.minimum or torch.where is inf, then the gradient will always be nan, even when the condition chooses the differentiable argument.

For example: below, b exists and is differentiable but pytorch says the gradient is nan.

In [17]:
a = torch.tensor(100., requires_grad=True)
b = torch.where(a < 0, torch.exp(a), 1 + a)
b.backward()
print(torch.exp(a))
print(b)
print(a.grad)


tensor(inf, grad_fn=<ExpBackward0>)
tensor(101., grad_fn=<WhereBackward0>)
tensor(nan)


In [18]:
a = torch.tensor(100., requires_grad=True)
b = torch.minimum(torch.exp(a), 1 + a)
b.backward()
print(torch.exp(a))
print(b)
print(a.grad)


tensor(inf, grad_fn=<ExpBackward0>)
tensor(101., grad_fn=<MinimumBackward0>)
tensor(nan)


Below, we see what the gradient should be.

In [64]:
a = torch.tensor(100., requires_grad=True)
b = 1 + a
b.backward()
a.grad

tensor(1.)

# Fixing the issue

## Attempt 1: Using torch.nan_to_num

In [22]:
a = torch.tensor(100., requires_grad=True)
b = torch.where(a < 0, torch.nan_to_num(torch.exp(a)), 1 + a)
b.backward()
a.grad

tensor(nan)

This attempt clearly did not work.

## Attempt 2: using masked tensors

This bug is discussed here: https://github.com/pytorch/pytorch/issues/10729. Pytorch created masked tensors to adress this issue.

In [55]:
from torch.masked import masked_tensor, as_masked_tensor

x = torch.tensor([-10., -5, 0, 5, 10, 50, 60, 70, 80, 90, 100], requires_grad=True)
mask = x < 0
mx = masked_tensor(x, mask, requires_grad=True)
my = masked_tensor(torch.ones_like(x), ~mask, requires_grad=True)
y = torch.where(mask, torch.exp(mx), my)
s = y.sum()
s.backward()

print("mx.grad: ", mx.grad)
print("my.grad: ", my.grad)


mx.grad:  MaskedTensor(
  [  0.0000,   0.0067,       --,       --,       --,       --,       --,       --,       --,       --,       --]
)
my.grad:  MaskedTensor(
  [      --,       --,   1.0000,   1.0000,   1.0000,   1.0000,   1.0000,   1.0000,   1.0000,   1.0000,   1.0000]
)




However, I'm not totaly sure how to implement this / whether it supports the operations that I am using. See the error below when I try to implement this.

In [56]:
# Read pandas dataframe with all information about sparse training tensors
file_loc = '/home/majd/sparse_training_arrows/'
st_info = pd.read_pickle(file_loc+'sparse_tensor_info.pk')

# Make custom dataset
MyDataset = mytools.CustomDataset(dir_loc = file_loc, st_info = st_info)

# Split datat into training, validation, and testing sets
train_dataset, val_dataset = torch.utils.data.random_split(MyDataset,[0.8, 0.2], generator=torch.Generator().manual_seed(42))

print("Training samples: ", len(train_dataset))
print("Validation samples: ", len(val_dataset))

# Create training and validation DataLoaders
batch_size = 256
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

# Print tensor shapes
for X_plot, y_plot, offset_plot in train_dataloader:
    print(f"Shape of X [N, C, H, W, D]: {X_plot.shape}")
    print(f"Shape of y: {y_plot.shape} {y_plot.dtype}")
    print("Offsets: ", offset_plot.shape)
    break
    
#Record shape of voxel grid
grid_shape = X_plot.shape[1:4]
print("Voxel grid shape: " , grid_shape)

# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")

model = mymodels.spConvnet_HSCDC_subM(shape = grid_shape ).to(device)
print(model)


# Specify optimizer
optimizer = torch.optim.Adam(params = model.parameters(), lr=0.0001, betas=(0.94, 0.999), eps=1e-08)



Training samples:  4000
Validation samples:  1000
Shape of X [N, C, H, W, D]: torch.Size([256, 120, 120, 120, 1])
Shape of y: torch.Size([256, 3]) torch.float32
Offsets:  torch.Size([256, 3])
Voxel grid shape:  torch.Size([120, 120, 120])
Using cuda device
spConvnet_HSCDC_subM(
  (net): SparseSequential(
    (0): SubMConv3d(1, 32, kernel_size=[7, 7, 7], stride=[1, 1, 1], padding=[0, 0, 0], dilation=[1, 1, 1], output_padding=[0, 0, 0], algo=ConvAlgo.Native)
    (1): ReLU()
    (2): SubMConv3d(32, 40, kernel_size=[5, 5, 5], stride=[1, 1, 1], padding=[0, 0, 0], dilation=[1, 1, 1], output_padding=[0, 0, 0], algo=ConvAlgo.Native)
    (3): ReLU()
    (4): SparseConv3d(40, 50, kernel_size=[6, 6, 6], stride=[2, 2, 2], padding=[0, 0, 0], dilation=[1, 1, 1], output_padding=[0, 0, 0], algo=ConvAlgo.Native)
    (5): ReLU()
    (6): SparseMaxPool3d(kernel_size=[2, 2, 2], stride=[2, 2, 2], padding=[0, 0, 0], dilation=[1, 1, 1], algo=ConvAlgo.MaskImplicitGemm)
    (7): SparseConv3d(50, 30, kernel_siz

In [57]:
# Negative Log Likelihood Loss for HSCDC convnet
def NLLloss_masked(output, target):
    
    # target us the x parameters in the Kent distribution
    G = output[0] # \gamma_1 parameters in Kent distribution
    K = output[1].flatten() # \kappa parameter in Kent distribution
    
    loss1 = -1.0 * torch.log(torch.div(K,4*torch.pi*torch.sinh(K)))
    loss2 = -1.0 * ( torch.log(torch.div(K,2*torch.pi)) - K )
    
    mask = K<30.0
    
    mx = masked_tensor(loss1.clone().detach(), mask)
    my = masked_tensor(loss2.clone().detach(), ~mask)
    
    loss_K = torch.where(mask, mx, my)
    
        
    # Compute negative log likelihood using Kent distribution
    loss = torch.mean( loss_K - ( K * torch.sum(G*target,dim=1) ))
    
    
    return loss

In [58]:
# Implement early stopping in training loop
# Stop if validation loss has not decreased for the last [patience] epochs
# The model with the lowest loss is stored
patience = 2

Training_losses = np.array([])
Validation_losses = np.array([])

epochs = 1000
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    
    Training_losses = np.append(Training_losses, mytools.train(train_dataloader, model, NLLloss_masked, optimizer, device))
    Validation_losses = np.append(Validation_losses, mytools.validate(val_dataloader, model, NLLloss_masked, device))
    
    # Keep a running copy of the model with the lowest loss
    # Do not copy the model if loss in nan
    if (Validation_losses[-1] == np.min(Validation_losses)) and (~np.isnan(Validation_losses[-1])):
        final_model = copy.deepcopy(model)
    
    if len(Validation_losses) > patience:
        if np.sum((Validation_losses[-1*np.arange(patience)-1] - Validation_losses[-1*np.arange(patience)-2]) < 0) == 0:
            print("Stopping early!")
            break
            
print("Done!")

Epoch 1
-------------------------------


If you would like this operator to be supported, please file an issue for a feature request at https://github.com/pytorch/maskedtensor/issues with a minimal reproducible code snippet.
In the case that the semantics for the operator are not trivial, it would be appreciated to also include a proposal for the semantics.


TypeError: no implementation found for 'torch._ops.aten.softplus_backward.default' on types that implement __torch_dispatch__: [<class 'torch.masked.maskedtensor.core.MaskedTensor'>]

## Attempt 3: using a Taylor series to replace the function that causes the inf values

The 1st term of our loss function causes the inf values which lead to nans. The high K approximation of the 1st term of our loss function is extremely accurate even to low Ks. Therefore, we replace our first time completely. For low values of K (0-2.6), we use a 15th order taylor series about k=0/ Fot higher values (K>2.6), we use the high K approximation. With this treatment there are no more inf values given to torch.where and hence no more nan gradients.

In [62]:
# Read pandas dataframe with all information about sparse training tensors
file_loc = '/home/majd/sparse_training_arrows/'
st_info = pd.read_pickle(file_loc+'sparse_tensor_info.pk')

# Make custom dataset
MyDataset = mytools.CustomDataset(dir_loc = file_loc, st_info = st_info)

# Split datat into training, validation, and testing sets
train_dataset, val_dataset = torch.utils.data.random_split(MyDataset,[0.8, 0.2], generator=torch.Generator().manual_seed(42))

print("Training samples: ", len(train_dataset))
print("Validation samples: ", len(val_dataset))

# Create training and validation DataLoaders
batch_size = 256
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

# Print tensor shapes
for X_plot, y_plot, offset_plot in train_dataloader:
    print(f"Shape of X [N, C, H, W, D]: {X_plot.shape}")
    print(f"Shape of y: {y_plot.shape} {y_plot.dtype}")
    print("Offsets: ", offset_plot.shape)
    break
    
#Record shape of voxel grid
grid_shape = X_plot.shape[1:4]
print("Voxel grid shape: " , grid_shape)

# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")

model = mymodels.spConvnet_HSCDC_subM(shape = grid_shape ).to(device)
print(model)


# Specify optimizer
optimizer = torch.optim.Adam(params = model.parameters(), lr=0.0001, betas=(0.94, 0.999), eps=1e-08)



Training samples:  4000
Validation samples:  1000
Shape of X [N, C, H, W, D]: torch.Size([256, 120, 120, 120, 1])
Shape of y: torch.Size([256, 3]) torch.float32
Offsets:  torch.Size([256, 3])
Voxel grid shape:  torch.Size([120, 120, 120])
Using cuda device
spConvnet_HSCDC_subM(
  (net): SparseSequential(
    (0): SubMConv3d(1, 32, kernel_size=[7, 7, 7], stride=[1, 1, 1], padding=[0, 0, 0], dilation=[1, 1, 1], output_padding=[0, 0, 0], algo=ConvAlgo.Native)
    (1): ReLU()
    (2): SubMConv3d(32, 40, kernel_size=[5, 5, 5], stride=[1, 1, 1], padding=[0, 0, 0], dilation=[1, 1, 1], output_padding=[0, 0, 0], algo=ConvAlgo.Native)
    (3): ReLU()
    (4): SparseConv3d(40, 50, kernel_size=[6, 6, 6], stride=[2, 2, 2], padding=[0, 0, 0], dilation=[1, 1, 1], output_padding=[0, 0, 0], algo=ConvAlgo.Native)
    (5): ReLU()
    (6): SparseMaxPool3d(kernel_size=[2, 2, 2], stride=[2, 2, 2], padding=[0, 0, 0], dilation=[1, 1, 1], algo=ConvAlgo.MaskImplicitGemm)
    (7): SparseConv3d(50, 30, kernel_siz

In [63]:
# Negative Log Likelihood Loss for HSCDC convnet
def NLLloss_TS(output, target):
    
    # target us the x parameters in the Kent distribution
    G = output[0] # \gamma_1 parameters in Kent distribution
    K = output[1].flatten() # \kappa parameter in Kent distribution
    
    # 15th order taylor series about 0
    loss1 = K**2/6 - K**4/180 + K**6/2835 - K**8/37800 + K**10/467775 - (691* (K**12) )/ 3831077250 + (2 * (K**14))/127702575 + torch.log(torch.tensor(4)*torch.pi)
    # high K approx
    loss2 = -1.0 * ( torch.log(torch.div(K,2*torch.pi)) - K )
    
    loss_K = torch.where(K<2.56, loss1, loss2)
    
        
    # Compute negative log likelihood using Kent distribution
    loss = torch.mean( loss_K  - ( K * torch.sum(G*target,dim=1) ))
    
    
    return loss

In [64]:
# Implement early stopping in training loop
# Stop if validation loss has not decreased for the last [patience] epochs
# The model with the lowest loss is stored
patience = 2

Training_losses = np.array([])
Validation_losses = np.array([])

epochs = 1000
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    
    Training_losses = np.append(Training_losses, mytools.train(train_dataloader, model, NLLloss_TS, optimizer, device))
    Validation_losses = np.append(Validation_losses, mytools.validate(val_dataloader, model, NLLloss_TS, device))
    
    # Keep a running copy of the model with the lowest loss
    # Do not copy the model if loss in nan
    if (Validation_losses[-1] == np.min(Validation_losses)) and (~np.isnan(Validation_losses[-1])):
        final_model = copy.deepcopy(model)
    
    if len(Validation_losses) > patience:
        if np.sum((Validation_losses[-1*np.arange(patience)-1] - Validation_losses[-1*np.arange(patience)-2]) < 0) == 0:
            print("Stopping early!")
            break
            
print("Done!")

Epoch 1
-------------------------------
Current batch training loss: 2.586734  [    0/ 4000]
Training loss: 2.533583
Validation loss: 2.128556 

Epoch 2
-------------------------------
Current batch training loss: 2.131131  [    0/ 4000]
Training loss: 1.970529
Validation loss: 1.886137 

Epoch 3
-------------------------------
Current batch training loss: 1.886433  [    0/ 4000]
Training loss: 1.811873
Validation loss: 1.603441 

Epoch 4
-------------------------------
Current batch training loss: 1.602957  [    0/ 4000]
Training loss: 0.331600
Validation loss: -1.465047 

Epoch 5
-------------------------------
Current batch training loss: -1.467933  [    0/ 4000]
Training loss: -2.036310
Validation loss: -2.295343 

Epoch 6
-------------------------------
Current batch training loss: -2.296365  [    0/ 4000]
Training loss: -2.447041
Validation loss: -2.573034 

Epoch 7
-------------------------------
Current batch training loss: -2.584703  [    0/ 4000]
Training loss: -2.705175
Vali

Interestingly, only using the high K approximation seems to work best for the simple arrows case. 