In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import RandomRotation

torch.multiprocessing.set_start_method('spawn')
torch.set_default_tensor_type('torch.cuda.FloatTensor')

from pyquaternion import Quaternion
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go

from tqdm import tqdm

from src.preprocess import sample_to_rangeview, pcl_to_rangeview
from src.utils import rotation_matrix
from src.settings import DATASET_PATH, LABEL_NUMBER, RV_WIDTH, RV_HEIGHT, NUSCENES
from src.datasets import NuscenesRangeViewDataset
from src.models.lasernet import LaserNet
from src.losses import LaserNetLoss

Loading NuScenes tables for version v1.0-trainval...
Loading nuScenes-lidarseg...
32 category,
8 attribute,
4 visibility,
64386 instance,
12 sensor,
10200 calibrated_sensor,
2631083 ego_pose,
68 log,
850 scene,
34149 sample,
2631083 sample_data,
1166187 sample_annotation,
4 map,
34149 lidarseg,
Done loading in 23.177 seconds.
Reverse indexing ...
Done reverse indexing in 6.6 seconds.


---

## Training data 

- ___Classification___ task includes semantic segmentation. We predict class labels for each point (cell) in the Range View. If a cell in RV gets a class C, we extrapolate that all points (which fell into that cell during the transformation to RV) get the same label.
- ___Regression___ task includes BB regression and mixture parameter tuning

so a single training example consists of: 

- __$X$__: range_view image | __5 x W x H__


- __$Y_{image}$:__ | __C x W x H__, where C - number of classes


- __$Y_{bb}$:__ $\{\{b_{m,1}, b_{m,2}, b_{m,3}, b_{m,4}\}, ..., \}_m^M$ | __M x 4 x 2 x W x H__ | where M is the number of bounding boxes in the image, $b_{m, j} \in R ^2$ is the absolute coordinate of $m$-th bounding box's $j$-th corner

- __$Y_{logstd}$:__ $\log(\sigma)$ of the predicted bb coordinates| __scalar__
    

### DataSets, DataLoaders and Transforms

In [2]:
%%time
train_dataset = NuscenesRangeViewDataset(data_root=DATASET_PATH, n=(0, 4))
val_dataset = NuscenesRangeViewDataset(data_root=DATASET_PATH, n=(0, 4))

CPU times: user 4.26 ms, sys: 44 µs, total: 4.31 ms
Wall time: 3.33 ms


  self.point_clouds_features = np.array(self.point_clouds_features)
  self.point_clouds_labels = np.array(self.point_clouds_labels)


In [3]:
# %%time
# train_dataset = NuscenesRangeViewDataset(data_root=DATASET_PATH, n=(0, 8064))
# val_dataset = NuscenesRangeViewDataset(data_root=DATASET_PATH, n=(8064, 9152))

In [4]:
train_dataloader = DataLoader(train_dataset, batch_size=4, num_workers=0)
val_dataloader = DataLoader(val_dataset, batch_size=4, num_workers=0)

---

## Training the model

1. For each point in the image, we use the focal loss $L_{prob}$ to learn the class probabilities $\{p_c\}_{c=1}^C$. The classification loss for the entire image is defined as follows 
$$
L_{cls} = {1 \over P} \sum_i{L_{prob, i}}
$$ 
where P is the number of points in the image

2. For each point on an object, we learn the parameters of the object’s mixture model by first identifying which component best matches the ground truth
$$
k^* = \arg \min_k || \hat b_k − b^{gt} ||
$$
where $\hat b_k$ is the k-th mean component of the mixture model
and $b^{gt}$ is the corresponding ground truth bounding box.

3. Afterwards, we update the parameters of the $k^{*}$ component
$$
L_{box} = \sum_n {1 \over \hat \sigma_{k^*}} | \hat b_{n, k} − b^{gt}_n | + \log{\hat \sigma_{k^*}}
$$


4. Next, we update the mixture weights $\{α_k\}^K_{k=1}$ again using the multi-class cross entropy loss $L_{mix}$, where the positive label corresponds to the $k^*$ component

5. The regression loss for the entire image is defined as follows:
$$
L_{reg} = {{1 \over N} \sum_i{L_{box, i} + \lambda L_{mix, i} \over n_i}}
$$
where $L_{box, i}$ and $L_{mix, i}$ are the losses for the $i$-th point in the image which is on an object, $n_i$ is the total number of points that lie on the same object as $i$, $N$ is the total instances of objects in the image, and $\lambda$ is the relative weighting of the two losses.

6. Final loss is 

$$
L = L_{reg} + L_{cls}
$$

___!NOTE! In this experiment we do not model a distribution of BB, which is equivalent to having a single mixture component or  K=1. Meaning: we do not have $L_{mix}$, we skip step 2___

---

In [34]:
import torch
import torch.nn.functional as F

class FocalLoss(torch.nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        n_samples, n_classes, _, _ = inputs.shape
        
        targets = torch.argmax(targets, axis=1)
        ce_loss = F.cross_entropy(inputs, targets.type(torch.long), reduction='none')  # == -log(pt)

        pt = torch.exp(-ce_loss)
        
        f_loss = torch.mean(self.alpha * (1 - pt) ** self.gamma * ce_loss, dim=(1, 2))
        return f_loss

class BoundingBoxRegressionLoss(torch.nn.Module):
    def __init__(self):
        super(BoundingBoxRegressionLoss, self).__init__()

    def forward(self, inputs, targets, log_std_preds, bb_loss_mask):
        """
        inputs.shape == targets.shape == (N, 8, RV_WIDTH, RV_HEIGHT)

        std_preds - predicted log standart deviations of bounding box corners
        """
        
        if len(inputs.shape) == 0:
            return torch.tensor(0)

#         log_std_preds = log_std_preds.unsqueeze(1)
#         one_over_sigma = torch.exp(-log_std_preds)

#         box_losses = one_over_sigma * torch.abs(inputs - targets) + log_std_preds  # N x C x W x H
        
        box_losses = torch.nn.MSELoss(reduction='none')(inputs, targets)
        
        print("|0|", torch.mean(box_losses,axis=1).shape)
        print("|1|", bb_loss_mask.shape)
        print("|2|", torch.mean(box_losses,axis=1)[bb_loss_mask].shape)
        print("|3|", (targets.permute(0, 2, 3, 1)[bb_loss_mask, :] == 0).any())
        return torch.mean(torch.mean(box_losses,axis=1)[bb_loss_mask])

class LaserNetLoss(torch.nn.Module):

    def __init__(self, f_alpha=1, f_gamma=2, focal_loss_reduction='mean'):
        super(LaserNetLoss, self).__init__()

        self.focal_loss = FocalLoss(alpha=f_alpha, gamma=f_gamma, reduction=focal_loss_reduction)
        self.bb_reg_loss = BoundingBoxRegressionLoss()
        self.non_object_labels = [0, 24, 25, 26, 27, 28, 29, 30, 31]

    def forward(self,
                y_pointclass_preds, y_bb_preds, y_logstd_preds, 
                y_pointclass_target, y_bb_targets):
        
        point_target_labels = torch.argmax(y_pointclass_target, axis=1)
        
        L_point_cls = self.focal_loss(inputs=y_pointclass_preds,
                                      targets=y_pointclass_target)
        
        # cell mask for points that have bb targets
        bb_mask = torch.sum(y_bb_targets, axis=1) != 0

        L_box_corners = self.bb_reg_loss(y_bb_preds,
                                         y_bb_targets,
                                         y_logstd_preds,
                                         bb_mask)
        
        print('|point_classification_loss|', L_point_cls.mean().item(), '|bounding_box_loss|', L_box_corners.mean().item())

        return torch.mean(L_point_cls + L_box_corners)

In [35]:
EPOCHS = 2000

lasernet = torch.nn.DataParallel(LaserNet(), device_ids=[0, 1])
loss = LaserNetLoss()
optimizer = torch.optim.Adam(lasernet.parameters(), lr=0.005)
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.95)


lasernet.zero_grad()
loss.zero_grad()
optimizer.zero_grad()

train_losses = []
train_accs = []

val_losses = []
val_accs = []
for epoch in range(EPOCHS):
    
    for batch_rv, batch_labels, batch_target_bbs in train_dataloader:
        
        batch_pointclass_preds, batch_bb_param_preds, batch_log_stds = lasernet(x=batch_rv)
        
#         print(batch_pointclass_preds.shape, batch_bb_param_preds.shape, batch_log_stds.shape)
        
        L_train = loss(batch_pointclass_preds, batch_bb_param_preds, batch_log_stds, 
                       batch_labels, batch_target_bbs)
        
        lasernet.zero_grad()
        
        if torch.isnan(L_train):
            break
        L_train.backward()
        optimizer.step()
    
#     with torch.no_grad():
#         for batch_rv, batch_labels, batch_target_bbs in val_dataloader:
            
#             batch_pointclass_preds, batch_bb_param_preds = lasernet(x=batch_rv)

#             L_val = loss(batch_pointclass_preds, batch_bb_param_preds, batch_labels, batch_target_bbs)
        
#             val_losses.append(L_val.item())

#     torch.save(lasernet, f'lasernet-d{len(train_dataset)}-b64-e{epoch}-adam-lr002-sch095e1')

    if torch.isnan(L_train):
        break
            
    if epoch % 10 == 0:
        batch_pointclass_pred_labels = torch.argmax(batch_pointclass_preds, axis=1)
        batch_pointclass_labels = torch.argmax(batch_labels, axis=1)
        
        correct_preds = torch.sum(batch_pointclass_pred_labels == batch_pointclass_labels)
        all_points = torch.sum(batch_pointclass_labels == batch_pointclass_labels)
        accuracy = (correct_preds / all_points).item() * 100
        
        train_losses.append(L_train.item())
        train_accs.append(accuracy)
        
        print(epoch, "train_loss", L_train.item(), "accuracy",  accuracy) #, "val_loss", L_val.item())
    if epoch % 500 == 0:
        lr_scheduler.step()

|0| torch.Size([4, 128, 32])
|1| torch.Size([4, 128, 32])
|2| torch.Size([699])
|3| tensor(False)
|point_classification_loss| 3.8292064666748047 |bounding_box_loss| 11.120442390441895
0 train_loss 14.9496488571167 accuracy 0.750732421875
|0| torch.Size([4, 128, 32])
|1| torch.Size([4, 128, 32])
|2| torch.Size([699])
|3| tensor(False)
|point_classification_loss| 3.048405885696411 |bounding_box_loss| 11.560037612915039
|0| torch.Size([4, 128, 32])
|1| torch.Size([4, 128, 32])
|2| torch.Size([699])
|3| tensor(False)
|point_classification_loss| 2.2995004653930664 |bounding_box_loss| 10.606685638427734
|0| torch.Size([4, 128, 32])
|1| torch.Size([4, 128, 32])
|2| torch.Size([699])
|3| tensor(False)
|point_classification_loss| 1.8630638122558594 |bounding_box_loss| 10.485335350036621


KeyboardInterrupt: 

In [None]:
plt.plot(train_losses)
plt.show()

In [None]:
plt.plot(train_accs)
plt.show()

__[3] after transform fix | cross entropy and MSE | no log std | lr=0.001| gamma 097 every 100 epochs__

|point_classification_loss| 0.25498661398887634 |bounding_box_loss| 85.12548828125

1980 train_loss 85.60669708251953 val_loss 85.47611236572266


bad boxes (even lines, not boxes) in the right places!

---
__[4] after transform fix | cross entropy and MSE | no log std | only non-zero boxes | lr=0.001| gamma 097 every 100 epochs____

|point_classification_loss| 0.8824732303619385 |bounding_box_loss| 19.866052627563477

870 train_loss 20.781124114990234 val_loss 20.748525619506836


looks more like boxes, but in the wrong places as well, may fals positives near ego

the further the object is, the thinner is it's bounding box

---

__[5] after transform fix | cross entropy and MSE | no log std | only non-zero boxes | lr=0.01| gamma 097 every 500 epochs__

|point_classification_loss| 0.3059435784816742 |bounding_box_loss| 20.469608306884766

870 train_loss 20.82895851135254 accuracy 89.617919921875

---
__[6] after transform fix| heads no elu | focal loss and log_std error | all boxes | lr=0.00005| gamma 095 every 200 epochs| std for each rv cell| mean by object__

too slow, could ve made the lr bigger or the decrease slower

|point_classification_loss| 0.2606876790523529 |bounding_box_loss| 1.9241503477096558


1990 train_loss 2.184837818145752 accuracy 83.06884765625

---

__[7] after transform fix| heads no elu | focal loss and log_std error | all boxes | lr=0.001| gamma 095 every 500 epochs| std for each rv cell| mean by object__

|point_classification_loss| 0.08178392797708511 |bounding_box_loss| 1.3698492050170898

1990 train_loss 1.4516332149505615 accuracy 93.096923828125

---

__[8] after transform fix| heads no elu | focal loss and mse | selected boxes | lr=0.01| gamma 095 every 500 epochs| std for each rv cell| mean by object | 2k epochs__


|point_classification_loss| 0.15576070547103882 |bounding_box_loss| 1.5972236394882202

1380 train_loss 1.7529842853546143 accuracy 88.897705078125

---

__[9] fixed bb gradient mask| focal loss and mse| selected boxes| lr=0.01| gamma 095 every 500 epochs| std for each rv cell| 2k epochs__

|point_classification_loss| 0.028342999517917633 |bounding_box_loss| 7.2635931968688965

1130 train_loss 7.312100887298584 accuracy 97.137451171875



---

In [11]:
torch.save(lasernet, 'lasernet9')

In [12]:
# lasernet = torch.load('lasernet8')

In [13]:
non_object_labels = [0, 24, 25, 26, 27, 28, 29, 30, 31]

with torch.no_grad():
    pointclass_preds, bb_param_preds, _ = lasernet(x=batch_rv)

pointclass_preds = pointclass_preds.detach().cpu().numpy()
bb_corner_preds = bb_param_preds.detach().cpu().numpy()

range_view_ex = batch_rv[1]
class_predictions_ex = pointclass_preds[1]
bb_corner_preds_ex = bb_corner_preds[1]

range_view_ex.shape, class_predictions_ex.shape, bb_corner_preds_ex.shape

(torch.Size([7, 128, 32]), (32, 128, 32), (8, 128, 32))

In [105]:
# Width x Height, max class labels of each cell
class_pred_labels_ex = np.argmax(class_predictions_ex, axis=0)

# sum([class_pred_labels_ex == nol for nol in non_object_labels]) == 0 128x32

indices = np.array(np.nonzero(sum([class_pred_labels_ex == nol for nol in non_object_labels]) == 0)).T
# unique widths and heights of cells wich are classified as objects

# Classes x Width x Height, softmax probabilities of classes in each cell
bb_class_probs_ex = torch.softmax(torch.Tensor(class_predictions_ex), dim=0).cpu().numpy()
print(bb_class_probs_ex.shape)
fig = go.Figure(data=[go.Scatter3d(x=range_view_ex[0].cpu().flatten(),
                                   y=range_view_ex[1].cpu().flatten(),
                                   z=np.zeros_like(range_view_ex[2].cpu().flatten()),
                                   mode='markers',
                                   marker=dict(size=2))])

for w, h in indices:
        # check model's certainty
        class_prob = max(bb_class_probs_ex[:, w, h])
        class_label = np.argmax(bb_class_probs_ex[:, w, h])

        if class_prob > 0.5:
            x = bb_corner_preds_ex[0::2, w, h]
            y = bb_corner_preds_ex[1::2, w, h]
            
            fig.add_mesh3d(x=list(x), 
                           y=list(y),
                           z=np.zeros_like(x))

fig.write_html("kek9.html")

(32, 128, 32)


---

### Adding image information to the LaserNet

In [None]:
class LaserNetPP(nn.Module):
    pass

### Non-maximum supression

- first we discard all boxes with it's class probability less than 0.6
- then we select the most confident point and get rid of all boxes, that overlap with IoU >= 0.5