In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import RandomRotation

torch.multiprocessing.set_start_method('spawn')
torch.set_default_tensor_type('torch.cuda.FloatTensor')

from pyquaternion import Quaternion
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go

from tqdm import tqdm

from src.preprocess import sample_to_rangeview, pcl_to_rangeview
from src.utils import rotation_matrix
from src.settings import DATASET_PATH, LABEL_NUMBER, RV_WIDTH, RV_HEIGHT, NUSCENES
from src.datasets import NuscenesRangeViewDataset
from src.models.lasernet import LaserNet
from src.losses import LaserNetLoss

Loading NuScenes tables for version v1.0-trainval...
Loading nuScenes-lidarseg...
32 category,
8 attribute,
4 visibility,
64386 instance,
12 sensor,
10200 calibrated_sensor,
2631083 ego_pose,
68 log,
850 scene,
34149 sample,
2631083 sample_data,
1166187 sample_annotation,
4 map,
34149 lidarseg,
Done loading in 24.237 seconds.
Reverse indexing ...
Done reverse indexing in 6.8 seconds.


---

## Training data 

- ___Classification___ task includes semantic segmentation. We predict class labels for each point (cell) in the Range View. If a cell in RV gets a class C, we extrapolate that all points (which fell into that cell during the transformation to RV) get the same label.
- ___Regression___ task includes BB regression and mixture parameter tuning

so a single training example consists of: 

- __$X$__: range_view image | __5 x W x H__


- __$Y_{image}$:__ | __C x W x H__, where C - number of classes


- __$Y_{bb}$:__ $\{\{b_{m,1}, b_{m,2}, b_{m,3}, b_{m,4}\}, ..., \}_m^M$ | __M x 4 x 2 x W x H__ | where M is the number of bounding boxes in the image, $b_{m, j} \in R ^2$ is the absolute coordinate of $m$-th bounding box's $j$-th corner

- __$Y_{logstd}$:__ $\log(\sigma)$ of the predicted bb coordinates| __scalar__
    

### DataSets, DataLoaders and Transforms

In [4]:
%%time
train_dataset = NuscenesRangeViewDataset(data_root=DATASET_PATH, n=(0, 128))
val_dataset = NuscenesRangeViewDataset(data_root=DATASET_PATH, n=(128, 256))

CPU times: user 14 ms, sys: 88.6 ms, total: 103 ms
Wall time: 100 ms


  self.point_clouds_features = np.array(self.point_clouds_features)
  self.point_clouds_labels = np.array(self.point_clouds_labels)


In [5]:
# %%time
# train_dataset = NuscenesRangeViewDataset(data_root=DATASET_PATH, n=(0, 8064))
# val_dataset = NuscenesRangeViewDataset(data_root=DATASET_PATH, n=(8064, 9152))

In [6]:
train_dataloader = DataLoader(train_dataset, batch_size=64, num_workers=0)
val_dataloader = DataLoader(val_dataset, batch_size=64, num_workers=0)

---

## Training the model

1. For each point in the image, we use the focal loss $L_{prob}$ to learn the class probabilities $\{p_c\}_{c=1}^C$. The classification loss for the entire image is defined as follows 
$$
L_{cls} = {1 \over P} \sum_i{L_{prob, i}}
$$ 
where P is the number of points in the image

2. For each point on an object, we learn the parameters of the object’s mixture model by first identifying which component best matches the ground truth
$$
k^* = \arg \min_k || \hat b_k − b^{gt} ||
$$
where $\hat b_k$ is the k-th mean component of the mixture model
and $b^{gt}$ is the corresponding ground truth bounding box.

3. Afterwards, we update the parameters of the $k^{*}$ component
$$
L_{box} = \sum_n {1 \over \hat \sigma_{k^*}} | \hat b_{n, k} − b^{gt}_n | + \log{\hat \sigma_{k^*}}
$$


4. Next, we update the mixture weights $\{α_k\}^K_{k=1}$ again using the multi-class cross entropy loss $L_{mix}$, where the positive label corresponds to the $k^*$ component

5. The regression loss for the entire image is defined as follows:
$$
L_{reg} = {{1 \over N} \sum_i{L_{box, i} + \lambda L_{mix, i} \over n_i}}
$$
where $L_{box, i}$ and $L_{mix, i}$ are the losses for the $i$-th point in the image which is on an object, $n_i$ is the total number of points that lie on the same object as $i$, $N$ is the total instances of objects in the image, and $\lambda$ is the relative weighting of the two losses.

6. Final loss is 

$$
L = L_{reg} + L_{cls}
$$

___!NOTE! In this experiment we do not model a distribution of BB, which is equivalent to having a single mixture component or  K=1. Meaning: we do not have $L_{mix}$, we skip step 2___

---

In [7]:
EPOCHS = 2000

lasernet = torch.nn.DataParallel(LaserNet(), device_ids=[0, 1])
loss = LaserNetLoss(focal_loss_reduction='mean')
optimizer = torch.optim.Adam(lasernet.parameters(), lr=0.002)
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.95)


lasernet.zero_grad()
loss.zero_grad()
optimizer.zero_grad()

train_losses = []
train_accs = []

val_losses = []
val_accs = []
for epoch in tqdm(range(EPOCHS), position=0, leave=True):
    for batch_rv, batch_labels, batch_target_bbs in tqdm(train_dataloader, leave=True):
        
        batch_pointclass_preds, batch_bb_param_preds, batch_log_std_preds = lasernet(x=batch_rv)
                
        L_train = loss(batch_pointclass_preds, batch_bb_param_preds, batch_log_std_preds,
                 batch_labels,           batch_target_bbs)
        
        train_losses.append(L_train.item())
        
        lasernet.zero_grad()
        L_train.backward()
        optimizer.step()

    with torch.no_grad():
        for batch_rv, batch_labels, batch_target_bbs in tqdm(val_dataloader):
            
            batch_pointclass_preds, batch_bb_param_preds, batch_log_std_preds = lasernet(x=batch_rv)
            
            L_val = loss(batch_pointclass_preds, batch_bb_param_preds, batch_log_std_preds,
                         batch_labels,           batch_target_bbs)
        
            val_losses.append(L_val.item())

    torch.save(lasernet, f'lasernet-d{len(train_dataset)}-b64-e{epoch}-adam-lr002-sch095e1')
    print(epoch, "train_loss", L_train.item(), "val_loss", L_val.item())
    lr_scheduler.step()

  0%|          | 0/2000 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:14<00:14, 14.07s/it][A
100%|██████████| 2/2 [00:25<00:00, 12.95s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:12<00:12, 12.17s/it][A
100%|██████████| 2/2 [00:23<00:00, 11.93s/it][A
  0%|          | 1/2000 [00:49<27:38:46, 49.79s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A

0 train_loss 3.3455796241760254 val_loss 2.9250404834747314



 50%|█████     | 1/2 [00:11<00:11, 11.43s/it][A
100%|██████████| 2/2 [00:23<00:00, 11.69s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:12<00:12, 12.12s/it][A
100%|██████████| 2/2 [00:23<00:00, 11.89s/it][A
  0%|          | 2/2000 [01:36<26:46:40, 48.25s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A

1 train_loss 2.2696895599365234 val_loss 2.122434616088867



 50%|█████     | 1/2 [00:11<00:11, 11.39s/it][A
100%|██████████| 2/2 [00:23<00:00, 11.67s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:12<00:12, 12.18s/it][A
100%|██████████| 2/2 [00:23<00:00, 11.94s/it][A
  0%|          | 3/2000 [02:24<26:30:21, 47.78s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A

2 train_loss 1.6173593997955322 val_loss 1.7101939916610718



 50%|█████     | 1/2 [00:13<00:13, 13.11s/it][A
100%|██████████| 2/2 [00:25<00:00, 12.57s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:12<00:12, 12.12s/it][A
100%|██████████| 2/2 [00:23<00:00, 11.89s/it][A
  0%|          | 4/2000 [03:13<26:44:50, 48.24s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A

3 train_loss 1.25167715549469 val_loss 1.8185899257659912



 50%|█████     | 1/2 [00:11<00:11, 11.38s/it][A
100%|██████████| 2/2 [00:23<00:00, 11.67s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:12<00:12, 12.17s/it][A
100%|██████████| 2/2 [00:23<00:00, 11.90s/it][A
  0%|          | 5/2000 [04:00<26:31:09, 47.85s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A

4 train_loss 1.1063421964645386 val_loss 1.3443888425827026



 50%|█████     | 1/2 [00:11<00:11, 11.37s/it][A
100%|██████████| 2/2 [00:23<00:00, 11.66s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:12<00:12, 12.07s/it][A
100%|██████████| 2/2 [00:23<00:00, 11.86s/it][A
  0%|          | 6/2000 [04:47<26:21:28, 47.59s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A

5 train_loss 1.0421099662780762 val_loss 1.339877963066101



 50%|█████     | 1/2 [00:11<00:11, 11.41s/it][A
100%|██████████| 2/2 [00:23<00:00, 11.67s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:12<00:12, 12.10s/it][A
100%|██████████| 2/2 [00:23<00:00, 11.87s/it][A
  0%|          | 7/2000 [05:34<26:15:22, 47.43s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A

6 train_loss 0.9853835701942444 val_loss 1.2162928581237793


  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 7/2000 [05:35<26:31:29, 47.91s/it]


KeyboardInterrupt: 

In [21]:
len(torch.tensor(0).shape)

0

---

### Adding image information to the LaserNet

In [3]:
class LaserNetPP(nn.Module):
    pass

### Non-maximum supression

- first we discard all boxes with it's class probability less than 0.6
- then we select the most confident point and get rid of all boxes, that overlap with IoU >= 0.5