In [1]:
import math
import os
import random
import sys
import struct
import warnings

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K

import torch

# Importing all the layers, even though some are redundant.
from library import pc, quantize, sample_points, augment_pc_with_offsets
from library import VFE_FCN, ElementwiseMaxpool, PointwiseConcat, VFE, VFE_out
from library import ConvMiddleLayer, RPNConvBlock



## Let's get down to business

Let's try to use `tianweiy`'s VoxelNet backbone for CenterPOint

In [6]:
cwd = os.getcwd()
folder = './pretrained/tianweiy/'
fn = folder + 'epoch_20.pth'

In [8]:
checkpoint = torch.load(fn, map_location=torch.device('cpu'))

In [10]:
checkpoint.keys()

dict_keys(['meta', 'state_dict', 'optimizer'])

In [11]:
checkpoint['state_dict'].keys()

odict_keys(['backbone.conv_input.0.weight', 'backbone.conv_input.1.weight', 'backbone.conv_input.1.bias', 'backbone.conv_input.1.running_mean', 'backbone.conv_input.1.running_var', 'backbone.conv_input.1.num_batches_tracked', 'backbone.conv1.0.conv1.weight', 'backbone.conv1.0.conv1.bias', 'backbone.conv1.0.bn1.weight', 'backbone.conv1.0.bn1.bias', 'backbone.conv1.0.bn1.running_mean', 'backbone.conv1.0.bn1.running_var', 'backbone.conv1.0.bn1.num_batches_tracked', 'backbone.conv1.0.conv2.weight', 'backbone.conv1.0.conv2.bias', 'backbone.conv1.0.bn2.weight', 'backbone.conv1.0.bn2.bias', 'backbone.conv1.0.bn2.running_mean', 'backbone.conv1.0.bn2.running_var', 'backbone.conv1.0.bn2.num_batches_tracked', 'backbone.conv1.1.conv1.weight', 'backbone.conv1.1.conv1.bias', 'backbone.conv1.1.bn1.weight', 'backbone.conv1.1.bn1.bias', 'backbone.conv1.1.bn1.running_mean', 'backbone.conv1.1.bn1.running_var', 'backbone.conv1.1.bn1.num_batches_tracked', 'backbone.conv1.1.conv2.weight', 'backbone.conv1.1.

In [13]:
checkpoint['state_dict']['bbox_head.tasks.5.hm.1.num_batches_tracked'].shape

torch.Size([])

In [14]:
type(checkpoint['state_dict'])

collections.OrderedDict

In [31]:
list(checkpoint['state_dict'].items())[0][1]

tensor([[[[[ 1.3147e-02,  8.8868e-04,  6.1531e-02,  ...,  6.1120e-02,
             3.1954e-02, -3.1665e-03],
           [-1.7851e-02,  2.7073e-03, -2.1664e-02,  ..., -4.3554e-02,
            -2.7546e-02,  2.7744e-03],
           [-2.7556e-01,  4.0173e-04, -3.9653e-02,  ...,  1.4062e-01,
            -9.6160e-02,  9.7630e-03],
           [ 3.4876e-02,  5.3687e-03, -7.8558e-04,  ..., -4.4527e-02,
            -5.5418e-02, -2.3668e-03],
           [-3.6068e-01,  1.0762e-01,  6.5200e-01,  ...,  2.0742e-01,
             3.3824e-01, -6.5442e-02]],

          [[ 3.8680e-03, -3.4087e-03,  1.5677e-02,  ...,  1.0207e-02,
             2.7462e-02, -5.1879e-03],
           [ 3.3284e-02,  2.6664e-03,  3.9889e-03,  ..., -1.1412e-02,
            -1.5951e-02,  1.2873e-02],
           [-1.7044e-01, -1.5014e-03, -1.2220e-01,  ..., -9.8058e-02,
            -7.4972e-03,  1.5385e-02],
           [ 2.9827e-02,  4.3306e-03, -9.8829e-03,  ..., -1.7712e-02,
            -9.5279e-02, -1.6502e-02],
           [-3.29

In [42]:
# okay let's inspect...

for ii, item in enumerate(checkpoint['state_dict'].items()):
    shape = "none"
    try:
        shape = item[1].shape
    except:
        pass
    print(f"{ii:>4}\t{str(shape):<32}\t{item[0]}")



   0	torch.Size([3, 3, 3, 5, 16])    	backbone.conv_input.0.weight
   1	torch.Size([16])                	backbone.conv_input.1.weight
   2	torch.Size([16])                	backbone.conv_input.1.bias
   3	torch.Size([16])                	backbone.conv_input.1.running_mean
   4	torch.Size([16])                	backbone.conv_input.1.running_var
   5	torch.Size([])                  	backbone.conv_input.1.num_batches_tracked
   6	torch.Size([3, 3, 3, 16, 16])   	backbone.conv1.0.conv1.weight
   7	torch.Size([16])                	backbone.conv1.0.conv1.bias
   8	torch.Size([16])                	backbone.conv1.0.bn1.weight
   9	torch.Size([16])                	backbone.conv1.0.bn1.bias
  10	torch.Size([16])                	backbone.conv1.0.bn1.running_mean
  11	torch.Size([16])                	backbone.conv1.0.bn1.running_var
  12	torch.Size([])                  	backbone.conv1.0.bn1.num_batches_tracked
  13	torch.Size([3, 3, 3, 16, 16])   	backbone.conv1.0.conv2.weight
  14	torch.Size([16]) 

In [67]:
# Let's ignore bias / batchnorm / misc weights.

# okay let's inspect...

for ii, item in enumerate(checkpoint['state_dict'].items()):
    shape = "none"
    try:
        shape = item[1].shape
    except:
        pass
    if len(shape) > 1:
        print(f"{ii:>4}\t{str(shape):<32}\t{item[0]}")



   0	torch.Size([3, 3, 3, 5, 16])    	backbone.conv_input.0.weight
   6	torch.Size([3, 3, 3, 16, 16])   	backbone.conv1.0.conv1.weight
  13	torch.Size([3, 3, 3, 16, 16])   	backbone.conv1.0.conv2.weight
  20	torch.Size([3, 3, 3, 16, 16])   	backbone.conv1.1.conv1.weight
  27	torch.Size([3, 3, 3, 16, 16])   	backbone.conv1.1.conv2.weight
  34	torch.Size([3, 3, 3, 16, 32])   	backbone.conv2.0.weight
  40	torch.Size([3, 3, 3, 32, 32])   	backbone.conv2.3.conv1.weight
  47	torch.Size([3, 3, 3, 32, 32])   	backbone.conv2.3.conv2.weight
  54	torch.Size([3, 3, 3, 32, 32])   	backbone.conv2.4.conv1.weight
  61	torch.Size([3, 3, 3, 32, 32])   	backbone.conv2.4.conv2.weight
  68	torch.Size([3, 3, 3, 32, 64])   	backbone.conv3.0.weight
  74	torch.Size([3, 3, 3, 64, 64])   	backbone.conv3.3.conv1.weight
  81	torch.Size([3, 3, 3, 64, 64])   	backbone.conv3.3.conv2.weight
  88	torch.Size([3, 3, 3, 64, 64])   	backbone.conv3.4.conv1.weight
  95	torch.Size([3, 3, 3, 64, 64])   	backbone.conv3.4.conv2.

What we want are the **backbones.** That is, the first 142 weights, ending with `...num_batches_tracked`.

## 2. Let's get down to details

Let's recall, roughly, the architecture of VoxelNet:

1. The preprocessing with live data augmentation to get everything into a `(D, H, W, T, 7)` shape, where `T = 35`.
2. The VFE layers, which do point-wise transforms.
    1. The VFE(7, 32) layer has `(7, 16)` matrices in the FCN unit with BN and relu. Combines with elementwise maxpool and concats.
    2. The VFE(32, 128) layer likewise has `(32, 64)` matrices in the FCN unit.
    3. These output on a final FCN unit, with a `(128, 128)` matrix.
3. The convolutional middle layers. The notation is (filters in, filters out, kernel size, stride, padding).
    1. Conv3D$(128, 64, 3, (2,1,1), (1,1,1)$
    2. Conv3D$(64,  64, 3, (1,1,1), (0,1,1)$
    3. Conv3D$(64,  64, 3, (2,1,1), (1,1,1)$
    4. This outputs to a tensor with size $(64, 2, 400, 352)$.
4. The RPN blocks, made from convolution layers with kernel size 3 and filter size 128. The first in each block has stride 2 and the rest 1.
    1. Block 1 has 4 convolution layers
    2. Block 2 has 6 convolution layers
    3. Block 3 has 6 convolution layers
5. The RPN deconv blocks.
    1. filters = 256, kernel_size = 3, strides = 1
    2. filters = 256, kernel_size = 2, strides = 2
    3. filters = 256, kernel_size = 4, strides = 4
6. The RPN output 
    1. Probability map, filter size 2 and kernel size 1
    2. Regression  map, filter size 14 and kernel size 1

In [53]:
voxelnet_weights = list(checkpoint['state_dict'].items())[:142]

for ii, item in enumerate(voxelnet_weights):
    print(f"{ii:>4}  {str(item[1].shape):<32}  {item[0]}")

   0  torch.Size([3, 3, 3, 5, 16])      backbone.conv_input.0.weight
   1  torch.Size([16])                  backbone.conv_input.1.weight
   2  torch.Size([16])                  backbone.conv_input.1.bias
   3  torch.Size([16])                  backbone.conv_input.1.running_mean
   4  torch.Size([16])                  backbone.conv_input.1.running_var
   5  torch.Size([])                    backbone.conv_input.1.num_batches_tracked
   6  torch.Size([3, 3, 3, 16, 16])     backbone.conv1.0.conv1.weight
   7  torch.Size([16])                  backbone.conv1.0.conv1.bias
   8  torch.Size([16])                  backbone.conv1.0.bn1.weight
   9  torch.Size([16])                  backbone.conv1.0.bn1.bias
  10  torch.Size([16])                  backbone.conv1.0.bn1.running_mean
  11  torch.Size([16])                  backbone.conv1.0.bn1.running_var
  12  torch.Size([])                    backbone.conv1.0.bn1.num_batches_tracked
  13  torch.Size([3, 3, 3, 16, 16])     backbone.conv1.0.conv2.w

In [59]:

# Let's make index splices for this
# the names for these layers are weird...
conv_input = voxelnet_weights[0:6]
conv_1_0_1 = voxelnet_weights[6:13]
conv_1_0_2 = voxelnet_weights[13:20]
conv_1_1_1 = voxelnet_weights[20:27]
conv_1_1_2 = voxelnet_weights[27:34]
conv_2     = voxelnet_weights[34:40]
conv_2_3_1 = voxelnet_weights[40:47]
conv_2_3_2 = voxelnet_weights[47:54]
conv_2_4_1 = voxelnet_weights[54:61]
conv_2_4_2 = voxelnet_weights[61:68]
conv_3     = voxelnet_weights[68:74]
conv_3_3_1 = voxelnet_weights[74:81]
conv_3_3_2 = voxelnet_weights[81:88]
conv_3_3_1 = voxelnet_weights[88:95]
conv_3_4_2 = voxelnet_weights[95:102]
conv_4     = voxelnet_weights[102:108]
conv_4_3_1 = voxelnet_weights[108:115]
conv_4_3_2 = voxelnet_weights[115:122]
conv_4_4_1 = voxelnet_weights[122:129]
conv_4_4_2 = voxelnet_weights[129:136]
conv_extra = voxelnet_weights[136:]

voxelnet_blocks = [
    conv_input,
    conv_1_0_1, conv_1_0_2, conv_1_1_1, conv_1_1_2, conv_2,
    conv_2_3_1, conv_2_3_2, conv_2_4_1, conv_2_4_2, conv_3,
    conv_3_3_1, conv_3_3_2, conv_3_3_1, conv_3_4_2, conv_4,
    conv_4_3_1, conv_4_3_2, conv_4_4_1, conv_4_4_2,
    conv_extra
]


In [66]:
for ii, block in enumerate(voxelnet_blocks):
    conv_layer = block[0]
    name = conv_layer[0]
    shape = conv_layer[1].shape
    print(f"{ii:>3}  {str(shape):<32}  {name}")


  0  torch.Size([3, 3, 3, 5, 16])      backbone.conv_input.0.weight
  1  torch.Size([3, 3, 3, 16, 16])     backbone.conv1.0.conv1.weight
  2  torch.Size([3, 3, 3, 16, 16])     backbone.conv1.0.conv2.weight
  3  torch.Size([3, 3, 3, 16, 16])     backbone.conv1.1.conv1.weight
  4  torch.Size([3, 3, 3, 16, 16])     backbone.conv1.1.conv2.weight
  5  torch.Size([3, 3, 3, 16, 32])     backbone.conv2.0.weight
  6  torch.Size([3, 3, 3, 32, 32])     backbone.conv2.3.conv1.weight
  7  torch.Size([3, 3, 3, 32, 32])     backbone.conv2.3.conv2.weight
  8  torch.Size([3, 3, 3, 32, 32])     backbone.conv2.4.conv1.weight
  9  torch.Size([3, 3, 3, 32, 32])     backbone.conv2.4.conv2.weight
 10  torch.Size([3, 3, 3, 32, 64])     backbone.conv3.0.weight
 11  torch.Size([3, 3, 3, 64, 64])     backbone.conv3.4.conv1.weight
 12  torch.Size([3, 3, 3, 64, 64])     backbone.conv3.3.conv2.weight
 13  torch.Size([3, 3, 3, 64, 64])     backbone.conv3.4.conv1.weight
 14  torch.Size([3, 3, 3, 64, 64])     backbone

This is extremely strange.

For starters, the lack of FC layers show that this is certainly not the entire VoxelNet used in the backbone.

Second, it's not evident where these layers come from. The `extra_conv` layer with kernel `(3, 1, 1)` seems not to be from VoxelNet.

Moreso, recall the number of layers:

1. 3 conv middle layers
2. 16 (4 + 6 + 6) RPN conv blocks
3. 3 deconv blocks
4. 2 conv blocks mapping to probability score and regression map, respectively.

This is a total of 24, while we only have 21. 

Maybe I need to understand CenterPoint a little more if I want to add these weights?