In [22]:
import torch
import torch.nn as nn
import numpy as np
import torch.utils.data as data
from torchvision import transforms
import torch
from PIL import Image
import os

In [2]:
# Names exported by `from <module> import *`.
# NOTE(review): 'resnet101' and 'resnet152' are listed here, but no definitions
# with those names are visible in this file — confirm they exist elsewhere,
# otherwise a star-import of this module will fail on them.
__all__ = ['I3DResNet', 'resnet50', 'resnet101', 'resnet152']

class Bottleneck(nn.Module):
    """Three-convolution residual bottleneck built from 3D convolutions.

    The block applies a (time_kernel, 1, 1) convolution, a (1, 3, 3) spatial
    convolution (optionally strided in space) and a (1, 1, 1) convolution that
    widens the features to ``planes * 4`` channels.  The (optionally
    projected) input is then added back, a ReLU is applied and, when
    ``addnon`` is set, the result is passed through a ``NonLocalBlock3D``.

    Args:
        inplanes: Number of input channels.
        planes: Number of channels of the two inner convolutions.
        time_kernel: Temporal extent of the first convolution.
        space_stride: Spatial stride of the middle convolution.
        downsample: Optional module applied to the input before the residual
            sum; needed whenever the main path changes the tensor shape.
        addnon: Append a non-local block after the final ReLU.
    """
    expansion = 4

    def __init__(self, inplanes, planes, time_kernel=1, space_stride=1, downsample=None, addnon=False):
        super(Bottleneck, self).__init__()
        widened = planes * 4
        # Pad the time axis so an odd temporal kernel does not shorten the clip.
        time_pad = int((time_kernel - 1) / 2)

        self.conv1 = nn.Conv3d(
            inplanes, planes,
            kernel_size=(time_kernel, 1, 1),
            padding=(time_pad, 0, 0),
            bias=False,
        )
        self.bn1 = nn.BatchNorm3d(planes)

        self.conv2 = nn.Conv3d(
            planes, planes,
            kernel_size=(1, 3, 3),
            stride=(1, space_stride, space_stride),
            padding=(0, 1, 1),
            bias=False,
        )
        self.bn2 = nn.BatchNorm3d(planes)

        self.conv3 = nn.Conv3d(planes, widened, kernel_size=(1, 1, 1), bias=False)
        self.bn3 = nn.BatchNorm3d(widened)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.addnon = addnon
        if addnon:
            self.nonlocal_block = NonLocalBlock3D(in_channels=widened, mode='embedded_gaussian')

    def forward(self, x):
        """Run the bottleneck on ``x`` and return the activated residual sum."""
        y = self.relu(self.bn1(self.conv1(x)))
        y = self.relu(self.bn2(self.conv2(y)))
        y = self.bn3(self.conv3(y))

        # Identity shortcut unless a projection was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        y += shortcut
        y = self.relu(y)
        return self.nonlocal_block(y) if self.addnon else y



class I3DResNet(nn.Module):
    """ResNet-style video classifier assembled from 3D convolutions.

    The network is a strided 5x7x7 stem followed by four residual stages, a
    temporal max-pool between stage 1 and stage 2, global average pooling,
    dropout and a linear classifier.

    Args:
        block: Residual block class; must expose an ``expansion`` attribute
            and accept ``(inplanes, planes, time_kernel, space_stride,
            downsample, addnon=...)``.
        layers: Number of blocks in each of the four stages.
        frame_num: Clip length used to size the final average pool
            (temporal kernel is ``int(frame_num / 8)``).
        num_classes: Size of the classifier output.
    """

    def __init__(self, block, layers, frame_num=32, num_classes=400):
        # Process-wide side effect: when CUDA is present, tensors created from
        # here on (including this model's parameters) default to the GPU.
        if torch.cuda.is_available():
            torch.set_default_tensor_type('torch.cuda.FloatTensor')
        super(I3DResNet, self).__init__()
        # Running channel count; updated by _make_layer_inflat after each stage.
        self.inplanes = 64

        # Stem.
        self.conv1 = nn.Conv3d(
            3, 64,
            kernel_size=(5, 7, 7),
            stride=(2, 2, 2),
            padding=(2, 3, 3),
            bias=False,
        )
        self.bn1 = nn.BatchNorm3d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1))

        # Residual stages; only stage 1 keeps the spatial resolution.
        self.layer1 = self._make_layer_inflat(block, 64, layers[0], first_block=True)
        self.temporalpool = nn.MaxPool3d(kernel_size=(3, 1, 1), stride=(2, 1, 1), padding=(1, 0, 0))
        for idx, width in ((1, 128), (2, 256), (3, 512)):
            stage = self._make_layer_inflat(block, width, layers[idx], space_stride=2)
            setattr(self, 'layer%d' % (idx + 1), stage)

        # Head.
        self.avgpool = nn.AvgPool3d((int(frame_num / 8), 7, 7))
        self.avgdrop = nn.Dropout(0.5)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer_inflat(self, block, planes, blocks, space_stride=1, first_block=False):
        """Build one residual stage of ``blocks`` blocks and advance ``self.inplanes``.

        Temporal kernel schedule (first block, then the rest):
          * ``first_block``: 3 for every block.
          * even ``blocks``: 3, then 1, 3, 1, ... with a non-local block
            requested on every block that uses temporal kernel 1.
          * odd ``blocks``: 1, then 3, 1, 3, ...
        """
        out_channels = planes * block.expansion

        # A projection shortcut is needed when the stage changes resolution or width.
        downsample = None
        if space_stride != 1 or self.inplanes != out_channels:
            downsample = nn.Sequential(
                nn.Conv3d(self.inplanes, out_channels,
                          kernel_size=(1, 1, 1), stride=(1, space_stride, space_stride), bias=False),
                nn.BatchNorm3d(out_channels),
            )

        even_depth = blocks % 2 == 0
        head_kernel = 3 if (even_depth or first_block) else 1

        stage = [block(self.inplanes, planes, head_kernel, space_stride, downsample, addnon=False)]
        self.inplanes = out_channels

        for position in range(1, blocks):
            odd_position = position % 2 == 1
            if first_block:
                stage.append(block(self.inplanes, planes, head_kernel))
            elif even_depth:
                stage.append(block(self.inplanes, planes, 1 if odd_position else 3, addnon=odd_position))
            else:
                stage.append(block(self.inplanes, planes, 3 if odd_position else 1))

        return nn.Sequential(*stage)

    def forward(self, x):
        """Classify a clip batch; ``x`` is fed straight into the Conv3d stem."""
        pipeline = (
            self.conv1, self.bn1, self.relu, self.maxpool,
            self.layer1, self.temporalpool,
            self.layer2, self.layer3, self.layer4,
            self.avgpool,
        )
        for stage in pipeline:
            x = stage(x)

        # Swap the channel and time axes, then flatten everything but the batch axis.
        x = x.permute(0, 2, 1, 3, 4).contiguous()
        x = x.view(x.size(0), -1)
        return self.fc(self.avgdrop(x))


def resnet50(pretrained=False, **kwargs):
    """Build an I3D ResNet-50 (Bottleneck blocks, stage depths 3-4-6-3).

    Args:
        pretrained (bool): Accepted for signature compatibility only; this
            function does not load any weights, so the flag has no effect.
        **kwargs: Forwarded unchanged to ``I3DResNet``.

    Returns:
        A freshly constructed ``I3DResNet`` instance.
    """
    stage_depths = [3, 4, 6, 3]
    return I3DResNet(Bottleneck, stage_depths, **kwargs)



class _NonLocalBlockND(nn.Module):
    """Non-local block for 1D/2D/3D feature maps with a residual connection.

    Every position attends to every (optionally max-pooled) position of the
    same feature map; the aggregated features are projected back to
    ``in_channels`` and added to the input.

    Args:
        in_channels: Channels of the input (and output) feature map.
        inter_channels: Channels of the internal embeddings; defaults to
            ``in_channels // 2`` (at least 1).
        dimension: Number of non-channel axes of the input (1, 2 or 3).
        mode: Pairwise function. Only ``'embedded_gaussian'`` is implemented;
            the other accepted names construct fine but ``forward`` raises
            ``NotImplementedError``.
        sub_sample: Max-pool (kernel 2) the ``g`` and ``phi`` branches to
            reduce the number of attended positions.
        bn_layer: Follow the output projection with batch norm whose weight
            and bias start at zero, so the block initially returns its input.
    """

    def __init__(self, in_channels, inter_channels=None, dimension=3, mode='embedded_gaussian',
                 sub_sample=True, bn_layer=True):
        super(_NonLocalBlockND, self).__init__()

        assert dimension in [1, 2, 3]
        assert mode in ['embedded_gaussian', 'gaussian', 'dot_product', 'concatenation']

        self.mode = mode
        self.dimension = dimension
        self.sub_sample = sub_sample

        self.in_channels = in_channels
        self.inter_channels = inter_channels

        if self.inter_channels is None:
            self.inter_channels = in_channels // 2
            if self.inter_channels == 0:
                self.inter_channels = 1

        if dimension == 3:
            conv_nd = nn.Conv3d
            max_pool = nn.MaxPool3d
            bn = nn.BatchNorm3d
        elif dimension == 2:
            conv_nd = nn.Conv2d
            max_pool = nn.MaxPool2d
            bn = nn.BatchNorm2d
        else:
            conv_nd = nn.Conv1d
            max_pool = nn.MaxPool1d
            bn = nn.BatchNorm1d

        # NOTE: submodules are created in the order g, W, theta, phi so that
        # parameter names/order match existing checkpoints.
        self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
                         kernel_size=1, stride=1, padding=0)
        nn.init.kaiming_normal_(self.g.weight)
        nn.init.constant_(self.g.bias, 0)

        if bn_layer:
            self.W = nn.Sequential(
                conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
                        kernel_size=1, stride=1, padding=0),
                bn(self.in_channels)
            )
            nn.init.kaiming_normal_(self.W[0].weight)
            nn.init.constant_(self.W[0].bias, 0)
            # Zero scale and shift: the residual branch contributes nothing at start.
            nn.init.constant_(self.W[1].weight, 0)
            nn.init.constant_(self.W[1].bias, 0)
        else:
            self.W = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
                             kernel_size=1, stride=1, padding=0)
            # In-place initialisers; the un-suffixed variants are deprecated.
            nn.init.kaiming_normal_(self.W.weight)
            nn.init.constant_(self.W.bias, 0)

        self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
                             kernel_size=1, stride=1, padding=0)
        self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
                           kernel_size=1, stride=1, padding=0)

        # Only the embedded-Gaussian pairwise function is implemented; for the
        # other modes no operation_function is set and forward() reports it.
        if self.mode == "embedded_gaussian":
            self.operation_function = self._embedded_gaussian

        if sub_sample:
            self.g = nn.Sequential(self.g, max_pool(kernel_size=2))
            self.phi = nn.Sequential(self.phi, max_pool(kernel_size=2))

    def forward(self, x):
        '''
        :param x: (b, c, t, h, w) for dimension=3; (b, c, h, w) / (b, c, l) otherwise
        :return: tensor of the same shape as x
        :raises NotImplementedError: if the block was built with an unimplemented mode
        '''
        operation = getattr(self, 'operation_function', None)
        if operation is None:
            raise NotImplementedError(
                "non-local mode %r is not implemented; use 'embedded_gaussian'" % (self.mode,))
        return operation(x)

    def _embedded_gaussian(self, x):
        """Softmax attention over embedded features, plus the residual input."""
        batch_size = x.size(0)

        # g: (b, c, *spatial) -> (b, inter, n_kv) -> (b, n_kv, inter)
        g_x = self.g(x).view(batch_size, self.inter_channels, -1)
        g_x = g_x.permute(0, 2, 1)

        # theta: (b, c, *spatial) -> (b, n_q, inter)
        # phi:   (b, c, *spatial) -> (b, inter, n_kv)
        # f = theta . phi -> (b, n_q, n_kv)
        theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
        theta_x = theta_x.permute(0, 2, 1)
        phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
        f = torch.matmul(theta_x, phi_x)
        # torch.softmax avoids depending on an `F` alias that this file never imports.
        f_div_C = torch.softmax(f, dim=-1)

        # (b, n_q, n_kv) . (b, n_kv, inter) -> (b, n_q, inter) -> (b, inter, *spatial) -> (b, c, *spatial)
        y = torch.matmul(f_div_C, g_x)
        y = y.permute(0, 2, 1).contiguous()
        y = y.view(batch_size, self.inter_channels, *x.size()[2:])
        W_y = self.W(y)
        z = W_y + x

        return z


class NonLocalBlock3D(_NonLocalBlockND):
    """Non-local block fixed to ``dimension=3``, i.e. (b, c, t, h, w) inputs."""

    def __init__(self, in_channels, inter_channels=None, mode='embedded_gaussian', sub_sample=True, bn_layer=True):
        super(NonLocalBlock3D, self).__init__(in_channels,
                                              inter_channels=inter_channels,
                                              dimension=3, mode=mode,
                                              sub_sample=sub_sample,
                                              bn_layer=bn_layer)


In [3]:
# Load the pre-trained I3D ResNet-50 checkpoint from the working directory.
# NOTE(review): the result is called directly as a model further down, so the
# file presumably holds a whole pickled module rather than a state_dict — confirm.
# torch.load unpickles arbitrary objects: only load checkpoints from a trusted source.
loaded_model = torch.load('resnet50_i3d_pre_trained.pt')

In [45]:
# Notebook display: show the stem convolution of the loaded model.
loaded_model.conv1

Conv3d(3, 64, kernel_size=(5, 7, 7), stride=(2, 2, 2), padding=(2, 3, 3), bias=False)

In [67]:
class Kinetics400Dataset(data.Dataset):
    """Image dataset driven by a text list of ``<label> <relative path>`` lines."""

    def __init__(self, root_dir, list_file, transform=None):
        """
        Args:
            root_dir (string): Directory that the image paths in the list
                file are relative to.
            list_file (string): Path to the text file with one sample per
                line; fields are separated by single spaces, field 0 is the
                integer label and field 1 the image path.
            transform (callable, optional): Optional transform applied to the
                PIL image before it is converted to an array.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.landmarks_frame = self.read_list_file(list_file)

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        """Return ``(image, label)`` with the image as a channel-first array."""
        entry = self.landmarks_frame[idx]
        img_path = os.path.join(self.root_dir, entry[1])
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        # Move the channel axis to the front: (H, W, C) -> (C, H, W).
        image = np.rollaxis(np.asarray(image), 2, 0)
        label = int(entry[0])

        return image, label

    def read_list_file(self, file_path):
        """Parse the list file into a list of space-split field lists.

        Blank lines (e.g. a trailing empty line) are skipped so they do not
        become unusable samples.
        """
        entries = []
        with open(file_path, 'r') as file:
            for line in file:
                line = line.rstrip('\n')
                if not line.strip():
                    continue
                entries.append(line.split(' '))
        return entries

In [69]:
# Location of the Kinetics validation images and of the list file describing them.
root_dir = '/media/v-pakova/New Volume/OnlineActionRecognition/datasets/Kinetics-NonLocal/val'
list_file = '/media/v-pakova/New Volume/OnlineActionRecognition/datasets/Kinetics-NonLocal/val_list.txt'

# PIL-level preprocessing: resize, then a random 224x224 crop and a random horizontal flip.
# NOTE(review): random crop/flip on a validation split makes the evaluation
# non-deterministic — confirm this is intended (a centre crop is the usual choice).
data_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip()
    ])

dataset = Kinetics400Dataset(root_dir, list_file, data_transform)

In [72]:
# Batch the dataset in list-file order (no shuffling), 64 samples per batch, 4 loader workers.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=False, num_workers=4)

In [73]:
# Smoke-test evaluation: run the loaded model over the first few validation
# batches and accumulate per-batch loss, correct predictions and sample count.

# NOTE(review): no loss function is defined in the visible cells; cross-entropy
# over the class logits is assumed here — confirm.
loss_func = nn.CrossEntropyLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

total_loss = []
total_acc = 0
total_sample = 0

# Evaluation mode: disables dropout and uses the stored batch-norm statistics.
loaded_model.eval()
with torch.no_grad():  # no gradients are needed for evaluation
    for test_batch_index, (img_batch, label_batch) in enumerate(dataloader):
        print(test_batch_index, img_batch.shape, label_batch)
        # The dataset yields uint8 arrays; the convolutions need floating-point
        # input on the same device as the labels used for the loss.
        # NOTE(review): no mean/std normalisation is applied — confirm what the
        # checkpoint expects.
        img_batch = img_batch.to(device).float()
        label_batch = label_batch.to(device)

        # NOTE(review): loaded_model.conv1 is a Conv3d and needs a 5-D
        # (batch, 3, frames, H, W) input, while this loader produces 4-D
        # single-image batches — this is the RuntimeError recorded in the cell
        # output. Grouping frames into clips has to happen in the dataset.
        predict = loaded_model(img_batch)
        loss = loss_func(predict, label_batch)

        predict = predict.argmax(dim=1)
        acc = (predict == label_batch).sum().item()
        print(predict, acc)
        # Store plain Python numbers so no tensors are kept alive per batch.
        total_loss.append(loss.item())
        total_acc += acc
        total_sample += img_batch.size(0)
        print(total_sample)

        # Debug limit: stop after the first batch that brings the count above 10.
        if total_sample > 10:
            break

(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
(3, 224, 224) 0
0 torch.Size([5, 3, 224, 224]) tensor([0, 0, 0, 0, 0])
(3, 224, 224) 0
(3, 224, 224) 0


RuntimeError: Expected 5-dimensional input for 5-dimensional weight [64, 3, 5, 7, 7], but got 4-dimensional input of size [5, 3, 224, 224] instead