In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import torchvision
import numpy as np
from torch.autograd import Variable
import pickle
import re

In [2]:
# Public names of this cell. Only names that are actually defined are listed:
# the former entries 'resnet101' and 'resnet152' have no definition visible in
# this notebook, so a star-import of the exported code would fail on them.
__all__ = ['I3DResNet', 'resnet50']

class Bottleneck(nn.Module):
    """Residual bottleneck unit made of three 3D convolutions.

    The main branch is a (time_kernel x 1 x 1) convolution, a (1 x 3 x 3)
    convolution and a (1 x 1 x 1) convolution, each followed by batch
    normalisation (ReLU after the first two). The shortcut branch is added,
    a final ReLU is applied and, when ``addnon`` is true, the result goes
    through a ``NonLocalBlock3D``.

    Args:
        inplanes: number of input channels.
        planes: channel width of the first two convolutions; the unit
            outputs ``planes * 4`` channels.
        time_kernel: temporal kernel size of the first convolution.
        space_stride: spatial stride of the middle convolution.
        downsample: optional module applied to the input to build the
            shortcut; when ``None`` the input itself is the shortcut.
        addnon: append a non-local block after the final ReLU.
    """

    expansion = 4

    def __init__(self, inplanes, planes, time_kernel=1, space_stride=1, downsample=None, addnon=False):
        super(Bottleneck, self).__init__()
        widened = planes * 4
        # Temporal padding chosen so the time dimension is not reduced.
        time_pad = int((time_kernel - 1) / 2)

        self.conv1 = nn.Conv3d(inplanes, planes,
                               kernel_size=(time_kernel, 1, 1),
                               padding=(time_pad, 0, 0),
                               bias=False)
        self.bn1 = nn.BatchNorm3d(planes)

        self.conv2 = nn.Conv3d(planes, planes,
                               kernel_size=(1, 3, 3),
                               stride=(1, space_stride, space_stride),
                               padding=(0, 1, 1),
                               bias=False)
        self.bn2 = nn.BatchNorm3d(planes)

        self.conv3 = nn.Conv3d(planes, widened, kernel_size=(1, 1, 1), bias=False)
        self.bn3 = nn.BatchNorm3d(widened)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.addnon = addnon
        if addnon:
            self.nonlocal_block = NonLocalBlock3D(in_channels=widened, mode='embedded_gaussian')

    def forward(self, x):
        """Run the unit on ``x`` and return the activated residual sum."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Shortcut: projected input when a downsample module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        out = self.relu(out)
        if not self.addnon:
            return out
        return self.nonlocal_block(out)



class I3DResNet(nn.Module):
    """ResNet-style video classifier built from 3D convolutions.

    ``forward`` applies: stem convolution, batch norm, ReLU, max pooling,
    four residual stages (with an extra temporal max pooling after the first
    stage), average pooling, dropout and a linear classifier.

    Args:
        block: residual unit class. It is instantiated as
            ``block(inplanes, planes, time_kernel, space_stride, downsample,
            addnon=...)`` and must expose an ``expansion`` attribute.
        layers: number of units in each of the four stages.
        frame_num: only used to size the temporal extent of the final
            average pooling, ``int(frame_num / 8)``.
        num_classes: output size of the final linear layer.
    """

    def __init__(self, block, layers, frame_num=32, num_classes=400):
        # NOTE: process-wide side effect. When CUDA is present, every tensor
        # created afterwards through the default tensor type lives on the GPU.
        if torch.cuda.is_available():
            torch.set_default_tensor_type('torch.cuda.FloatTensor')
        super(I3DResNet, self).__init__()

        # Running channel count consumed and updated by _make_layer_inflat.
        self.inplanes = 64

        # Stem
        self.conv1 = nn.Conv3d(3, 64,
                               kernel_size=(5, 7, 7),
                               stride=(2, 2, 2),
                               padding=(2, 3, 3),
                               bias=False)
        self.bn1 = nn.BatchNorm3d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1))

        # Residual stages; the temporal pooling sits between stage 1 and 2.
        depth1, depth2, depth3, depth4 = layers[0], layers[1], layers[2], layers[3]
        self.layer1 = self._make_layer_inflat(block, 64, depth1, first_block=True)
        self.temporalpool = nn.MaxPool3d(kernel_size=(3, 1, 1), stride=(2, 1, 1), padding=(1, 0, 0))
        self.layer2 = self._make_layer_inflat(block, 128, depth2, space_stride=2)
        self.layer3 = self._make_layer_inflat(block, 256, depth3, space_stride=2)
        self.layer4 = self._make_layer_inflat(block, 512, depth4, space_stride=2)

        # Head
        pooled_frames = int(frame_num / 8)
        self.avgpool = nn.AvgPool3d((pooled_frames, 7, 7))
        self.avgdrop = nn.Dropout(0.5)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer_inflat(self, block, planes, blocks, space_stride=1, first_block=False):
        """Build one residual stage as an ``nn.Sequential`` of ``blocks`` units.

        Temporal kernel / non-local schedule:
          * first stage (``first_block``): every unit uses temporal kernel 3;
          * even ``blocks``: head unit uses kernel 3, the remaining units
            alternate kernel 1 with a non-local block and kernel 3 without;
          * odd ``blocks`` (not first stage): head unit uses kernel 1, the
            remaining units alternate kernel 3, 1, 3, ...

        Only the head unit receives ``space_stride`` and the projection
        shortcut. ``self.inplanes`` is updated to the stage's output width.
        """
        stage_width = planes * block.expansion

        shortcut = None
        if not (space_stride == 1 and self.inplanes == stage_width):
            # Projection shortcut: needed when resolution or width changes.
            shortcut = nn.Sequential(
                nn.Conv3d(self.inplanes, stage_width,
                          kernel_size=(1, 1, 1),
                          stride=(1, space_stride, space_stride),
                          bias=False),
                nn.BatchNorm3d(stage_width),
            )

        even_depth = blocks % 2 == 0
        head_kernel = 3 if (even_depth or first_block) else 1

        units = [block(self.inplanes, planes, head_kernel, space_stride, shortcut, addnon=False)]
        self.inplanes = stage_width

        for position in range(blocks - 1):
            first_of_pair = position % 2 == 0
            if first_block:
                unit = block(self.inplanes, planes, head_kernel)
            elif even_depth:
                unit = block(self.inplanes, planes, 1 if first_of_pair else 3, addnon=first_of_pair)
            else:
                unit = block(self.inplanes, planes, 3 if first_of_pair else 1)
            units.append(unit)

        return nn.Sequential(*units)

    def forward(self, x):
        """Return the classifier output for the input batch ``x``."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        x = self.layer1(x)
        x = self.temporalpool(x)
        for stage in (self.layer2, self.layer3, self.layer4):
            x = stage(x)

        x = self.avgpool(x)
        # Swap axes 1 and 2 before flattening each sample to a vector.
        x = x.permute(0, 2, 1, 3, 4).contiguous()
        x = x.view(x.size(0), -1)

        return self.fc(self.avgdrop(x))


def resnet50(pretrained=False, **kwargs):
    """Constructs an I3D ResNet-50 model.

    Args:
        pretrained (bool): accepted for signature compatibility only. This
            function never loads weights; a warning is emitted when the flag
            is true so the caller does not assume pre-trained weights.
        **kwargs: forwarded unchanged to ``I3DResNet`` (for example
            ``frame_num`` or ``num_classes``).

    Returns:
        I3DResNet: a freshly constructed model made of ``Bottleneck`` units
        with stage depths ``[3, 4, 6, 3]``.
    """
    if pretrained:
        # Local import keeps this cell self-contained.
        import warnings
        warnings.warn("resnet50(pretrained=True) does not load any weights; "
                      "the returned model is freshly initialised.")

    model = I3DResNet(Bottleneck, [3, 4, 6, 3], **kwargs)

    return model



class _NonLocalBlockND(nn.Module):
    def __init__(self, in_channels, inter_channels=None, dimension=3, mode='embedded_gaussian',
                 sub_sample=True, bn_layer=True):
        super(_NonLocalBlockND, self).__init__()

        assert dimension in [1, 2, 3]
        assert mode in ['embedded_gaussian', 'gaussian', 'dot_product', 'concatenation']

        # print('Dimension: %d, mode: %s' % (dimension, mode))

        self.mode = mode
        self.dimension = dimension
        self.sub_sample = sub_sample

        self.in_channels = in_channels
        self.inter_channels = inter_channels

        if self.inter_channels is None:
            self.inter_channels = in_channels // 2
            if self.inter_channels == 0:
                self.inter_channels = 1

        if dimension == 3:
            conv_nd = nn.Conv3d
            max_pool = nn.MaxPool3d
            bn = nn.BatchNorm3d
        elif dimension == 2:
            conv_nd = nn.Conv2d
            max_pool = nn.MaxPool2d
            bn = nn.BatchNorm2d
        else:
            conv_nd = nn.Conv1d
            max_pool = nn.MaxPool1d
            bn = nn.BatchNorm1d

        self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
                         kernel_size=1, stride=1, padding=0)       
        nn.init.kaiming_normal_(self.g.weight)
        nn.init.constant_(self.g.bias,0)
        
        if bn_layer:
            self.W = nn.Sequential(
                conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
                        kernel_size=1, stride=1, padding=0),
                bn(self.in_channels)
            )
            nn.init.kaiming_normal_(self.W[0].weight)
            nn.init.constant_(self.W[0].bias, 0)
            nn.init.constant_(self.W[1].weight, 0)
            nn.init.constant_(self.W[1].bias, 0)

            
        else:
            self.W = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
                             kernel_size=1, stride=1, padding=0)
            nn.init.kaiming_normal(self.W.weight)
            nn.init.constant(self.W.bias, 0)

        self.theta = None
        self.phi = None

        self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
                             kernel_size=1, stride=1, padding=0)
        self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
                           kernel_size=1, stride=1, padding=0)
        
        if self.mode == "embedded_gaussian":
            self.operation_function = self._embedded_gaussian

        if sub_sample:
            self.g = nn.Sequential(self.g, max_pool(kernel_size=2))
            if self.phi is None:
                self.phi = max_pool(kernel_size=2)
            else:
                self.phi = nn.Sequential(self.phi, max_pool(kernel_size=2))

    def forward(self, x):
        '''
        :param x: (b, c, t, h, w)
        :return:
        '''

        output = self.operation_function(x)
        return output

    def _embedded_gaussian(self, x):
        batch_size = x.size(0)

        # g=>(b, c, t, h, w)->(b, 0.5c, t, h, w)->(b, thw, 0.5c)
        g_x = self.g(x).view(batch_size, self.inter_channels, -1)
        g_x = g_x.permute(0, 2, 1)

        # theta=>(b, c, t, h, w)[->(b, 0.5c, t, h, w)]->(b, thw, 0.5c)
        # phi  =>(b, c, t, h, w)[->(b, 0.5c, t, h, w)]->(b, 0.5c, thw)
        # f=>(b, thw, 0.5c)dot(b, 0.5c, twh) = (b, thw, thw)
        theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
        theta_x = theta_x.permute(0, 2, 1)
        phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
        f = torch.matmul(theta_x, phi_x)
        f_div_C = F.softmax(f, dim=-1)

        # (b, thw, thw)dot(b, thw, 0.5c) = (b, thw, 0.5c)->(b, 0.5c, t, h, w)->(b, c, t, h, w)
        y = torch.matmul(f_div_C, g_x)
        y = y.permute(0, 2, 1).contiguous()
        y = y.view(batch_size, self.inter_channels, *x.size()[2:])
        W_y = self.W(y)
        z = W_y + x

        return z

class NonLocalBlock3D(_NonLocalBlockND):
    """``_NonLocalBlockND`` fixed to ``dimension=3``; all other arguments are passed through."""

    def __init__(self, in_channels, inter_channels=None, mode='embedded_gaussian', sub_sample=True, bn_layer=True):
        parent_options = dict(
            inter_channels=inter_channels,
            dimension=3,
            mode=mode,
            sub_sample=sub_sample,
            bn_layer=bn_layer,
        )
        super(NonLocalBlock3D, self).__init__(in_channels, **parent_options)

# def nonlocal_block(input_layer,input_channel):
#     net = NonLocalBlock3D(in_channels=input_channel,mode='embedded_gaussian')
#     out = net(input_layer)
#     return out

In [3]:
# Build the I3D ResNet-50 with the constructor defaults of I3DResNet
# (frame_num=32, num_classes=400).
resnet_i3d = resnet50()

In [7]:
# Move the model to GPU 0 when CUDA is present and keep it on the CPU
# otherwise, so the cell no longer crashes on hosts without a GPU.
# The call returns the module, so the cell still displays its repr.
resnet_i3d.to(torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'))

I3DResNet(
  (conv1): Conv3d(3, 64, kernel_size=(5, 7, 7), stride=(2, 2, 2), padding=(2, 3, 3), bias=False)
  (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace)
  (maxpool): MaxPool3d(kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1), dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv3d(64, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
      (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
      (bn2): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv3d(64, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
      (bn3): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (downsample): Sequen

In [4]:
# Fixture for inspecting a stand-alone non-local block: an all-ones tensor
# with 256 channels (axis 1), passed through a ReLU.
data = np.ones((1, 256, 8, 56, 56), dtype=np.float32)
tensor = torch.from_numpy(data)
# The deprecated `Variable` wrapper is dropped; plain tensors are used.
# The name `inputs` is kept for any later cell that refers to it.
inputs = tensor
# Use the GPU only when one is available instead of an unconditional .cuda().
inputs1 = inputs.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

relu = nn.ReLU(inplace=True)
out = relu(inputs1)


# The block is sized from the channel axis of the activation (256 here).
nl = NonLocalBlock3D(in_channels=out.size(1), mode='embedded_gaussian')

In [5]:
# Show the name and shape of every learnable tensor in the non-local block.
for param_name, param in nl.named_parameters():
    print(param_name, param.shape)

g.0.weight torch.Size([128, 256, 1, 1, 1])
g.0.bias torch.Size([128])
W.0.weight torch.Size([256, 128, 1, 1, 1])
W.0.bias torch.Size([256])
W.1.weight torch.Size([256])
W.1.bias torch.Size([256])
theta.weight torch.Size([128, 256, 1, 1, 1])
theta.bias torch.Size([128])
phi.0.weight torch.Size([128, 256, 1, 1, 1])
phi.0.bias torch.Size([128])


In [6]:
# Display the module tree of the model (the cell's last expression is its repr).
resnet_i3d

I3DResNet(
  (conv1): Conv3d(3, 64, kernel_size=(5, 7, 7), stride=(2, 2, 2), padding=(2, 3, 3), bias=False)
  (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace)
  (maxpool): MaxPool3d(kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1), dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv3d(64, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0), bias=False)
      (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=False)
      (bn2): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv3d(64, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
      (bn3): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (downsample): Sequen

In [7]:
# Smoke test: one all-ones clip with 3 channels, 32 frames and 224x224
# pixels, matching the model defaults (frame_num=32, 7x7 final pooling).
data2 = np.ones((1, 3, 32, 224, 224), dtype=np.float32)
tensor2 = torch.from_numpy(data2)
# The deprecated `Variable` wrapper is dropped; the name is kept.
inputs2 = tensor2
# Send the clip to wherever the model's weights live instead of assuming CUDA.
inputs22 = inputs2.to(next(resnet_i3d.parameters()).device)
out2 = resnet_i3d(inputs22)
print(out2)

tensor([[-4.2431e-01, -6.0598e-01,  2.7394e-01,  6.7827e-01,  4.5890e-01,
          2.2402e-01, -5.8076e-01, -3.3391e-01,  3.0159e-01,  9.0302e-01,
         -6.5314e-01,  5.8609e-01, -1.7506e-01, -6.4449e-01,  1.5612e-01,
          9.6105e-01, -1.7071e-01,  4.9209e-01,  1.5183e-01,  3.7344e-01,
         -6.4171e-01,  1.5618e-01,  6.1644e-01,  2.0064e-01, -4.2201e-02,
          3.1081e-01, -4.5532e-01, -8.3999e-01, -4.9580e-01, -4.4943e-02,
          4.2768e-01,  4.6239e-02, -2.2576e-01, -6.4847e-02,  7.5747e-01,
          2.1065e-02, -2.4400e-01, -2.6327e-01,  6.0633e-01,  1.7263e-01,
         -2.9549e-01, -4.3230e-01, -4.0394e-02,  6.9829e-01, -2.2613e-01,
          7.2546e-03,  6.4750e-01,  4.3742e-01,  2.2268e-01, -2.8759e-01,
          4.5785e-01, -6.3462e-01, -6.5380e-01, -2.1650e-01,  7.4622e-01,
         -8.0658e-01,  1.4614e+00, -2.7274e-01, -5.1095e-01, -5.5215e-01,
         -3.9809e-01, -7.5295e-01,  1.3879e-01, -3.7343e-01,  4.4159e-01,
          3.6519e-01, -3.5435e-01, -6.

In [8]:
# Location of the pickled pre-trained checkpoint.
# NOTE(review): absolute, machine-specific path -- the cell only runs where
# this file exists; consider a configurable data directory.
pre_trained_model_path = "/media/v-pakova/New Volume/OnlineActionRecognition/models/pre-trained/non-local/i3d_nonlocal_32x2_IN_pretrain_400k.pkl"
# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# open checkpoint files obtained from a trusted source.
# encoding='latin1' is passed to pickle.load -- presumably because the file was
# written by Python 2; confirm against the checkpoint's origin.
with open(pre_trained_model_path, 'rb') as model_file:
    pretrained_data = pickle.load(model_file, encoding='latin1')

# The entries converted in the following cells are read from the 'blobs' key.
pretrained_data1 = pretrained_data['blobs']

In [9]:
# Keep only the blobs that will be copied into the model: every entry whose
# name contains one of these markers is dropped.
# NOTE(review): 'bn_rm' / 'bn_riv' look like BatchNorm running statistics --
# if so they are needed for eval-mode inference and are not transferred by
# this notebook; confirm against the checkpoint format.
_SKIPPED_MARKERS = ('momentum', 'bn_rm', 'bn_riv', 'lr', 'model_iter')

pretrained_data2 = {}
for blob_name, blob in pretrained_data1.items():
    if any(marker in blob_name for marker in _SKIPPED_MARKERS):
        continue
    pretrained_data2[blob_name] = blob

# (name, blob) pairs ordered by name.
pretrained_data_list = sorted(pretrained_data2.items())

for blob_name, blob in pretrained_data_list:
    print(blob_name, np.array(blob).shape)

conv1_w (64, 3, 5, 7, 7)
nonlocal_conv3_1_bn_b (512,)
nonlocal_conv3_1_bn_s (512,)
nonlocal_conv3_1_g_b (256,)
nonlocal_conv3_1_g_w (256, 512, 1, 1, 1)
nonlocal_conv3_1_out_b (512,)
nonlocal_conv3_1_out_w (512, 256, 1, 1, 1)
nonlocal_conv3_1_phi_b (256,)
nonlocal_conv3_1_phi_w (256, 512, 1, 1, 1)
nonlocal_conv3_1_theta_b (256,)
nonlocal_conv3_1_theta_w (256, 512, 1, 1, 1)
nonlocal_conv3_3_bn_b (512,)
nonlocal_conv3_3_bn_s (512,)
nonlocal_conv3_3_g_b (256,)
nonlocal_conv3_3_g_w (256, 512, 1, 1, 1)
nonlocal_conv3_3_out_b (512,)
nonlocal_conv3_3_out_w (512, 256, 1, 1, 1)
nonlocal_conv3_3_phi_b (256,)
nonlocal_conv3_3_phi_w (256, 512, 1, 1, 1)
nonlocal_conv3_3_theta_b (256,)
nonlocal_conv3_3_theta_w (256, 512, 1, 1, 1)
nonlocal_conv4_1_bn_b (1024,)
nonlocal_conv4_1_bn_s (1024,)
nonlocal_conv4_1_g_b (512,)
nonlocal_conv4_1_g_w (512, 1024, 1, 1, 1)
nonlocal_conv4_1_out_b (1024,)
nonlocal_conv4_1_out_w (1024, 512, 1, 1, 1)
nonlocal_conv4_1_phi_b (512,)
nonlocal_conv4_1_phi_w (512, 1024, 1, 1,

In [245]:
# Earlier attempt at translating checkpoint blob names to model parameter
# names, handling only names that match the 'res?_?_branch' prefix.
# NOTE(review): the next cell rebinds `pretrained_data3` to a fresh dict, so
# when the cells run top to bottom everything built here is discarded. This
# cell looks like a superseded draft -- confirm and remove if so.
pretrained_data3 = {}

r = re.compile('res._._branch*')
for k, v in pretrained_data_list:
    # Map the two-character suffix to the parameter kind.
    if k[-2:] == '_b':
        d = '.bias'
    elif k[-2:] == '_s' or k[-2:] == '_w':
        d = '.weight'
    else:
        d = ''
    # The last two characters are stripped even when no suffix matched.
    a = k[:-2]
    if r.match(a) is not None:
        # a[3] is the stage digit (shifted down by one), a[5] the unit index
        # (single characters only).
        b = a.replace(a, 'layer'+str(int(a[3])-1)+'.'+a[5])
        # Remainder after the 13-character 'res?_?_branch' prefix.
        c = a[13:]
        if '_' in c:
            # Last two characters followed by the branch digit.
            c = c[-2:]+c[0]
        else:
            # 'conv' followed by the branch digit; the letter after the digit
            # is not used.
            c = 'conv'+c[0]
        pretrained_data3[k] = {'data': v, 'new_name':b+'.'+c+d}

In [10]:
# Compiled once; matches names that start with 'res' plus one more character.
_RES_STAGE_PREFIX = re.compile('res._*')


def _to_model_key(blob_name):
    """Translate one checkpoint blob name into the model's parameter key."""
    key = blob_name

    # Suffix: '_b' -> '.bias', '_s' / '_w' -> '.weight'.
    suffix = key[-2:]
    if suffix == '_b':
        key = key[:-2] + '.bias'
    elif suffix in ('_s', '_w'):
        key = key[:-2] + '.weight'

    # Prefix: stem batch norm, classifier, then 'res<stage>_' -> 'layer<stage-1>.'.
    key = key.replace('res_conv1_bn', 'bn1')
    key = key.replace('pred', 'fc')
    if _RES_STAGE_PREFIX.match(key) is not None:
        key = 'layer{}.{}'.format(int(key[3]) - 1, key[5:])

    # Middle: shortcut branch, then the three convolutions of the main branch.
    # The '_bn' variants must be replaced before the plain ones.
    key = key.replace('_branch1_bn', '.downsample.1')
    key = key.replace('_branch1', '.downsample.0')
    for position, letter in enumerate(('a', 'b', 'c'), 1):
        key = key.replace('_branch2{}_bn'.format(letter), '.bn{}'.format(position))
        key = key.replace('_branch2{}'.format(letter), '.conv{}'.format(position))

    # Non-local blocks: 'nonlocal_conv<stage>_<unit>_<part>...'.
    if 'nonlocal' in key:
        stage = int(key[13]) - 1
        unit = key[15]
        key = 'layer{}.{}.nonlocal_block.'.format(stage, unit) + key[17:]
        for old, new in (('out', 'W.0'), ('bn', 'W.1'), ('phi', 'phi.0'), ('.g.', '.g.0.')):
            key = key.replace(old, new)

    return key


# model key -> original blob name and its data
pretrained_data3 = {}
for blob_name, blob in pretrained_data_list:
    pretrained_data3[_to_model_key(blob_name)] = {'old_name': blob_name, 'data': blob}

In [25]:
# Check that every converted blob has a same-named, same-shaped entry in the
# model. state_dict() is built once instead of on every loop iteration.
model_state = resnet_i3d.state_dict()
param_i3d_keys = model_state.keys()
count = 0
for k, v in pretrained_data3.items():
    if k in param_i3d_keys:
        count += 1
        assert model_state[k].shape == v['data'].shape, \
            'Shape mismatch for {}: model {} vs checkpoint {}'.format(
                k, tuple(model_state[k].shape), tuple(v['data'].shape))
    else:
        print('Not a match for: {}!!'.format(k))

# Matched blobs versus the number of learnable tensors in the model.
print('{} matches of {}.'.format(count, len(list(resnet_i3d.named_parameters()))))

211 matches of 211.


In [23]:
# Number of learnable parameter tensors in the model (the total printed by
# the key-matching check).
len(list(resnet_i3d.named_parameters()))

211

In [13]:
# Copy every converted blob into the matching model parameter, in place.
# The value is created with the parameter's own dtype and device, so the
# result no longer depends on the process-wide default tensor type, and a
# shape mismatch is reported instead of silently replacing the tensor.
with torch.no_grad():
    for k, v in resnet_i3d.named_parameters():
        new_data = torch.as_tensor(pretrained_data3[k]['data'], dtype=v.dtype, device=v.device)
        if tuple(new_data.shape) != tuple(v.shape):
            raise ValueError('Shape mismatch for {}: model {} vs checkpoint {}'.format(
                k, tuple(v.shape), tuple(new_data.shape)))
        v.copy_(new_data)

In [14]:
# Verify that every converted blob is now stored in the model's state_dict.
# state_dict() is built once; the expected value is created on the stored
# tensor's device so the comparison does not depend on the default tensor type.
loaded_state = resnet_i3d.state_dict()
for k, v in pretrained_data3.items():
    stored = loaded_state[k]
    expected = torch.as_tensor(v['data'], dtype=stored.dtype, device=stored.device)
    assert torch.equal(stored, expected), 'Value mismatch for {}'.format(k)

In [15]:
# Reverse check: every model parameter equals the blob stored under its name.
for param_name, param in resnet_i3d.named_parameters():
    reference = torch.Tensor(pretrained_data3[param_name]['data'])
    assert torch.equal(param.data, reference)

In [307]:
# Serialise the whole module object (not only its state_dict) to the working
# directory; the following cell reads this file back with torch.load.
# NOTE(review): a whole-module pickle presumably needs the class definitions
# from this notebook to be available when loading -- confirm before sharing.
torch.save(resnet_i3d, 'resnet50_i3d_pre_trained.pt')

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [300]:
# Read back the module saved above. The dangling `model2 =` statement was a
# SyntaxError that prevented the cell from running; `model2` is not used
# anywhere else, so it is removed.
loaded_model = torch.load('resnet50_i3d_pre_trained.pt')

In [304]:
# Test loaded model: print True only when both models hold the same number of
# parameter tensors and every pair is element-wise equal. Previously True was
# printed unconditionally, even after a mismatch had printed False.
original_params = list(resnet_i3d.parameters())
reloaded_params = list(loaded_model.parameters())
params_match = len(original_params) == len(reloaded_params) and all(
    torch.equal(p1.data, p2.data) for p1, p2 in zip(original_params, reloaded_params)
)
print(params_match)

True
