# I3D implementation based on

https://arxiv.org/pdf/1705.07750.pdf

https://github.com/piergiaj/pytorch-i3d


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# !nvidia-smi

# Parameters

In [None]:
class Parameters():
    """Shared notebook settings.

    Collects in one place:
    - the data directories (images, car masks, velocity ground truth)
    - the names of the video folders that make up the data set
    - the directory used to save model checkpoints
    - the sequence length and the frame rate
    """

    def __init__(self):
        # Everything is stored on the mounted Google Drive.
        root = 'drive/My Drive/Drive/speedchallenge/'
        self.data_dir = root

        # Training data
        self.image_dir = root + 'data/KITTI_COMMA_resized_10fps/'
        self.mask_dir = root + 'data/KITTI_COMMA_resized_masks_10fps/'
        self.vel_dir = root + 'data/KITTI_COMMA_vel_GT_10fps/'

        # Test data (swap in for the two training directories above)
        # self.image_dir = root + 'data/COMMA_TEST_cropped_resized_10fps/'
        # self.mask_dir = root + 'data/COMMA_TEST_cropped_resized_10fps_masks/'

        # KITTI folders are named 1000..1163; folder 1142 (position 142 of
        # that range) is left out.
        self.kitti_folder_list = [str(n) for n in range(1000, 1164) if n != 1142]
        # Comma folders are named 100..184.
        self.comma_folder_list = [str(n) for n in range(100, 185)]
        # Folders of the author's own data set, named 0..33.
        self.my_folders = [str(n) for n in range(34)]

        self.video_folders = (
            self.kitti_folder_list + self.comma_folder_list + self.my_folders
        )

        self.save_model_path = 'drive/My Drive/Drive/speedchallenge/i3d_checkpoints/comma_kitti__myData(with_car_removal)/'

        self.seq_len = 10  # number of frames per training/inference sequence
        self.fps = 10      # frame rate of the extracted frames


# Single shared instance read by the dataset, training and inference cells.
par = Parameters()

# Dataset

In [None]:
import torch
import torch.utils.data as data_utl
from torch.utils.data.dataloader import default_collate

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable

import torchvision
from torchvision import datasets, transforms

import albumentations as albu
import numpy as np
import json
import csv
import h5py
import random
import os
import os.path
import cv2

def video_to_tensor(pic):
    """Turn a video array into a channels-first tensor.

    The axes of the input are reordered from (T x H x W x C) to
    (C x T x H x W) and the result is wrapped with ``torch.from_numpy``,
    so the tensor keeps the dtype of the array.

    Args:
         pic (numpy.ndarray): Video laid out as (T x H x W x C).
    Returns:
         Tensor: The same video laid out as (C x T x H x W).
    """
    channels_first = pic.transpose(3, 0, 1, 2)
    return torch.from_numpy(channels_first)

def load_rgb_frames(vid, start, num):
    """Load `num` consecutive frames of one video as a single RGB array.

    Frame ``i`` is read from ``<par.image_dir>/<vid>/<i zero-padded to 10>.png``
    for every ``i`` in ``[start, start + num)``.

    Args:
        vid (str): Name of the video folder inside ``par.image_dir``.
        start (int): Index of the first frame to load.
        num (int): Number of frames to load.

    Returns:
        numpy.ndarray: The loaded frames stacked along a new first axis,
        with the colour channels in RGB order.

    Raises:
        FileNotFoundError: If OpenCV cannot read one of the frames. An
            exception is raised instead of calling ``sys.exit`` so the
            failure propagates cleanly out of DataLoader workers (and does
            not depend on ``sys`` being imported in this cell).
    """
    frames = []
    for i in range(start, start + num):
        img_path = os.path.join(par.image_dir, vid, str(i).zfill(10) + '.png')
        img = cv2.imread(img_path)
        if img is None:  # cv2.imread signals failure by returning None
            raise FileNotFoundError(
                "Can't load image " + str(i) + " from " + vid
                + ", please check the img path " + img_path
            )
        # OpenCV decodes to BGR; reorder the last axis to RGB.
        frames.append(img[:, :, [2, 1, 0]])
    return np.asarray(frames)


def make_dataset(vid_folder_list):
    """Gather the per-video metadata for every folder in `vid_folder_list`.

    Args:
        vid_folder_list (list[str]): Names of the video folders to include.

    Returns:
        list[tuple]: One ``(vid, labels, duration, num_frames)`` entry per
        folder, where ``labels`` is the array loaded from
        ``<par.vel_dir><vid>.npy``, ``num_frames`` is the number of frames of
        the video and ``duration`` is ``num_frames / par.fps`` in seconds
        (the duration is not used elsewhere in this notebook).
    """
    def count_frames(folder):
        # os.listdir sometimes returns a wrong file list from Google Drive,
        # so the frame count of the comma folders is hard-coded.
        if folder in par.comma_folder_list:
            return 120
        return len(os.listdir(os.path.join(par.image_dir, folder)))

    entries = []
    for vid in vid_folder_list:  # every training or testing folder
        num_frames = count_frames(vid)
        labels = np.load(f'{par.vel_dir}{vid}.npy')
        entries.append((vid, labels, num_frames / par.fps, num_frames))
    return entries


class Charades(data_utl.Dataset):
    """Dataset of short frame sequences with per-frame velocity labels.

    Every item is a randomly positioned window of ``par.seq_len`` consecutive
    frames taken from one video folder:

    - mode ``'train'`` samples windows that start at frame 24 or later;
    - mode ``'test'`` samples windows inside the first 24 frames.

    Args:
        vid_folder_list (list[str]): Video folders to draw sequences from.
        mode (str): ``'train'`` or ``'test'``.
        transforms: Stored on the instance; not used by this class.
        augmentation (bool): Apply the albumentations pipeline, with the same
            random parameters for every frame of a sequence.
        remove_cars (bool): Zero out every pixel covered by the mask of any
            frame in the sequence.

    Raises:
        ValueError: If `mode` is not ``'train'`` or ``'test'``.
    """

    VALID_MODES = ('train', 'test')

    def __init__(self, vid_folder_list, mode, transforms=None, augmentation=False, remove_cars=False):
        # Fail early: an unknown mode used to surface only inside __getitem__
        # as an unbound-variable error.
        if mode not in self.VALID_MODES:
            raise ValueError(
                'mode must be one of {}, got {!r}'.format(self.VALID_MODES, mode))

        self.data = make_dataset(vid_folder_list)
        self.vid_folder_list = vid_folder_list
        self.transforms = transforms
        self.augmentation = augmentation
        self.mode = mode
        self.remove_cars = remove_cars

        # NOTE(review): the IAA* transforms, RandomBrightness and
        # RandomContrast are deprecated/removed in recent albumentations
        # releases - confirm against the installed version.
        self.transformations = albu.Compose(
            [
                albu.HorizontalFlip(p=0.5),
                albu.OneOf([
                    albu.IAAAdditiveGaussianNoise(),
                    albu.GaussNoise(),
                ], p=0.5),
                albu.OneOf([
                    albu.CLAHE(clip_limit=2),
                    albu.IAASharpen(),
                    albu.IAAEmboss(),
                    albu.RandomGamma(p=1),
                    albu.RGBShift(),
                    albu.HueSaturationValue(p=0.3),
                ], p=0.8),
                albu.ToGray(),
                albu.RandomBrightness(),
                albu.RandomContrast(),
            ], p=1
        )

    def __getitem__(self, index):
        """Sample one random sequence from the video at `index`.

        Args:
            index (int): Index into the list of video folders.

        Returns:
            tuple: ``(video, label)`` where ``video`` is a float32 tensor laid
            out as (C x T x H x W), scaled with ``(pixel / 255) * 2 - 1``, and
            ``label`` is the slice of the video's label array covering the
            sampled frames.

        Raises:
            ValueError: If the mode is unknown or a window of
                ``par.seq_len`` frames does not fit into the frame range
                available for the current mode.
        """
        vid, label, dur, nf = self.data[index]

        if self.mode == 'train':    # frames 24 and later
            first_frame = 24
        elif self.mode == 'test':   # the first 24 frames
            first_frame = 1
            nf = 24
        else:
            raise ValueError('unknown mode {!r}'.format(self.mode))

        last_start = nf - (par.seq_len + 1)
        if last_start < first_frame:
            raise ValueError(
                'a sequence of {} frames does not fit into video {!r} in mode '
                '{!r} (valid start frames: {}..{})'.format(
                    par.seq_len, vid, self.mode, first_frame, last_start))
        start_f = random.randint(first_frame, last_start)
        imgs = load_rgb_frames(vid, start_f, par.seq_len)

        # Segment cars out: blank every pixel that is masked in at least one
        # frame of the sequence, identically in all frames.
        if self.remove_cars:
            mask_for_seq = np.zeros(imgs.shape[1:3])
            for i in range(start_f, start_f + par.seq_len):
                mask_for_seq += np.load(
                    os.path.join(par.mask_dir, vid, str(i).zfill(10) + '.npy'))
            imgs[:, mask_for_seq >= 1] = 0

        if self.augmentation:
            # Register frames 1..T-1 as additional 'image' targets so that the
            # whole sequence receives the same random transformation.
            extra_keys = ['image' + str(i) for i in range(par.seq_len - 1)]
            transform = albu.Compose(
                [
                    self.transformations
                ],
                additional_targets={key: 'image' for key in extra_keys}
            )

            input_dict = {'image': imgs[0]}
            for key, frame in zip(extra_keys, imgs[1:]):
                input_dict[key] = frame

            transformed = transform(**input_dict)
            # Read the frames back by key so the temporal order does not
            # depend on the iteration order of the returned dict.
            imgs = np.asarray([transformed[key] for key in ['image'] + extra_keys])

        # Normalize to [-1, 1] and convert to float32.
        imgs = ((imgs / 255.) * 2 - 1).astype(np.float32)

        label = label[start_f: start_f + par.seq_len]

        return video_to_tensor(imgs), torch.from_numpy(label)

    def __len__(self):
        """Return the number of video folders in the dataset."""
        return len(self.data)

In [None]:
# i3d = InceptionI3d(400, in_channels=3)
# i3d.eval()
# i3d(torch.rand(1,3,64,224,224))

# Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

import numpy as np

import os
import sys
from collections import OrderedDict


class MaxPool3dSamePadding(nn.MaxPool3d):
    """3D max pooling with 'same' padding computed from the input size.

    The input is padded in ``forward`` so that every output dimension equals
    ``ceil(input_size / stride)``. ``kernel_size`` and ``stride`` must be
    given as 3-element sequences ordered (time, height, width) because they
    are indexed per dimension. The padding is done with ``F.pad`` and
    therefore filled with zeros.
    """

    def compute_pad(self, dim, s):
        """Return the total padding needed along `dim` for an input of size `s`."""
        remainder = s % self.stride[dim]
        if remainder == 0:
            return max(self.kernel_size[dim] - self.stride[dim], 0)
        return max(self.kernel_size[dim] - remainder, 0)

    def forward(self, x):
        """Pad `x` (batch, channel, t, h, w) and apply the max pooling."""
        (batch, channel, t, h, w) = x.size()

        # F.pad expects the padding of the last dimension first:
        # (w_front, w_back, h_front, h_back, t_front, t_back).
        pad = []
        for dim, size in ((2, w), (1, h), (0, t)):
            total = self.compute_pad(dim, size)
            front = total // 2          # the extra element, if any, goes to the back
            pad.extend((front, total - front))

        x = F.pad(x, tuple(pad))
        return super(MaxPool3dSamePadding, self).forward(x)
    

class Unit3D(nn.Module):
    """3D convolution with 'same' padding, optional batch norm and activation.

    The padding is computed in ``forward`` from the actual input size so that
    every output dimension equals ``ceil(input_size / stride)``; the wrapped
    ``nn.Conv3d`` itself is always created with ``padding=0``.

    Args:
        in_channels (int): Number of input channels.
        output_channels (int): Number of output channels.
        kernel_shape: 3-element sequence (time, height, width).
        stride: 3-element sequence (time, height, width).
        padding: Stored as ``self.padding`` only; it does not influence the
            computation.
        activation_fn: Callable applied to the output, or None for no
            activation.
        use_batch_norm (bool): Apply ``nn.BatchNorm3d`` after the convolution.
        use_bias (bool): Give the convolution a bias term.
        name (str): Name of the unit, stored as ``self.name``.
    """

    def __init__(self, in_channels,
                 output_channels,
                 kernel_shape=(1, 1, 1),
                 stride=(1, 1, 1),
                 padding=0,
                 activation_fn=F.relu,
                 use_batch_norm=True,
                 use_bias=False,
                 name='unit_3d'):

        """Initializes Unit3D module."""
        super(Unit3D, self).__init__()

        self._output_channels = output_channels
        self._kernel_shape = kernel_shape
        self._stride = stride
        self._use_batch_norm = use_batch_norm
        self._activation_fn = activation_fn
        self._use_bias = use_bias
        self.name = name
        self.padding = padding

        self.conv3d = nn.Conv3d(in_channels=in_channels,
                                out_channels=self._output_channels,
                                kernel_size=self._kernel_shape,
                                stride=self._stride,
                                padding=0,  # always 0 here: the input is padded dynamically in forward()
                                bias=self._use_bias)

        if self._use_batch_norm:
            self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01)

    def compute_pad(self, dim, s):
        """Return the total padding needed along `dim` for an input of size `s`."""
        remainder = s % self._stride[dim]
        if remainder == 0:
            return max(self._kernel_shape[dim] - self._stride[dim], 0)
        return max(self._kernel_shape[dim] - remainder, 0)

    def forward(self, x):
        """Pad `x` (batch, channel, t, h, w), convolve, normalize and activate."""
        (batch, channel, t, h, w) = x.size()

        # F.pad expects the padding of the last dimension first:
        # (w_front, w_back, h_front, h_back, t_front, t_back).
        pad = []
        for dim, size in ((2, w), (1, h), (0, t)):
            total = self.compute_pad(dim, size)
            front = total // 2          # the extra element, if any, goes to the back
            pad.extend((front, total - front))
        x = F.pad(x, tuple(pad))

        x = self.conv3d(x)
        if self._use_batch_norm:
            x = self.bn(x)
        if self._activation_fn is not None:
            x = self._activation_fn(x)
        return x



class InceptionModule(nn.Module):
    """Inception block: four parallel branches concatenated along channels.

    Args:
        in_channels (int): Number of channels of the input.
        out_channels (sequence of int): Six widths, in order: branch 0 (1x1),
            branch 1 reduction (1x1), branch 1 output (3x3), branch 2
            reduction (1x1), branch 2 output (3x3), branch 3 (1x1 after the
            max pooling).
        name (str): Prefix used for the names of the contained units.
    """

    def __init__(self, in_channels, out_channels, name):
        super(InceptionModule, self).__init__()

        def pointwise(width, suffix):
            # 1x1x1 convolution fed directly by the block input.
            return Unit3D(in_channels=in_channels, output_channels=width,
                          kernel_shape=[1, 1, 1], padding=0, name=name + suffix)

        def spatial(channels_in, width, suffix):
            # 3x3x3 convolution fed by a reduction layer.
            return Unit3D(in_channels=channels_in, output_channels=width,
                          kernel_shape=[3, 3, 3], name=name + suffix)

        # The attribute names and their creation order define the state-dict
        # layout, so they must stay exactly as they are.
        self.b0 = pointwise(out_channels[0], '/Branch_0/Conv3d_0a_1x1')
        self.b1a = pointwise(out_channels[1], '/Branch_1/Conv3d_0a_1x1')
        self.b1b = spatial(out_channels[1], out_channels[2], '/Branch_1/Conv3d_0b_3x3')
        self.b2a = pointwise(out_channels[3], '/Branch_2/Conv3d_0a_1x1')
        self.b2b = spatial(out_channels[3], out_channels[4], '/Branch_2/Conv3d_0b_3x3')
        self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3],
                                        stride=(1, 1, 1), padding=0)
        self.b3b = pointwise(out_channels[5], '/Branch_3/Conv3d_0b_1x1')
        self.name = name

    def forward(self, x):
        """Run the four branches on `x` and concatenate them on dim 1."""
        branches = (
            self.b0(x),
            self.b1b(self.b1a(x)),
            self.b2b(self.b2a(x)),
            self.b3b(self.b3a(x)),
        )
        return torch.cat(branches, dim=1)


class InceptionI3d(nn.Module):
    """Inception-v1 I3D architecture.
    The model is introduced in:
        Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
        Joao Carreira, Andrew Zisserman
        https://arxiv.org/pdf/1705.07750v1.pdf.
    See also the Inception architecture, introduced in:
        Going deeper with convolutions
        Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
        Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
        http://arxiv.org/pdf/1409.4842v1.pdf.

    In this notebook the output of the logits head is additionally passed
    through a sigmoid and multiplied by 40.0 (the highest velocity), so the
    model regresses a bounded velocity instead of returning raw logits.
    """

    # Endpoints of the model in order. The backbone is built up to (and
    # including) the designated `final_endpoint`.
    VALID_ENDPOINTS = (
        'Conv3d_1a_7x7',
        'MaxPool3d_2a_3x3',
        'Conv3d_2b_1x1',
        'Conv3d_2c_3x3',
        'MaxPool3d_3a_3x3',
        'Mixed_3b',
        'Mixed_3c',
        'MaxPool3d_4a_3x3',
        'Mixed_4b',
        'Mixed_4c',
        'Mixed_4d',
        'Mixed_4e',
        'Mixed_4f',
        'MaxPool3d_5a_2x2',
        'Mixed_5b',
        'Mixed_5c',
        'Logits',
        'Predictions',
    )

    def __init__(self, num_classes=400, spatial_squeeze=True,
                 final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5):
        """Initializes I3D model instance.
        Args:
          num_classes: The number of outputs in the logit layer (default 400, which
              matches the Kinetics dataset).
          spatial_squeeze: Whether to squeeze the spatial dimensions for the logits
              before returning (default True).
          final_endpoint: The last endpoint the model is built up to; must be
              one of InceptionI3d.VALID_ENDPOINTS (default 'Logits'). The
              pooling/dropout/logits head is only created for 'Logits' and
              'Predictions'.
          name: A string (optional). The name of this module.
          in_channels: Number of channels of the input video (default 3).
          dropout_keep_prob: Probability of KEEPING an activation in the
              dropout layer in front of the logits (default 0.5).
        Raises:
          ValueError: if `final_endpoint` is not recognized.
        """

        if final_endpoint not in self.VALID_ENDPOINTS:
            raise ValueError('Unknown final endpoint %s' % final_endpoint)

        super(InceptionI3d, self).__init__()
        self._num_classes = num_classes
        self._spatial_squeeze = spatial_squeeze
        self._final_endpoint = final_endpoint
        self.logits = None

        # Create the backbone layers in order and stop after the requested
        # endpoint. They are registered as submodules by build() below.
        self.end_points = {}
        for end_point, make_layer in self._backbone_spec(name, in_channels):
            self.end_points[end_point] = make_layer(end_point)
            if end_point == self._final_endpoint:
                break

        # 'Logits' and 'Predictions' are not backbone layers, so for them the
        # loop above runs to completion and the head is added.
        if self._final_endpoint not in self.end_points:
            self.avg_pool = nn.AvgPool3d(kernel_size=[1, 7, 7],  # [2, 7, 7]
                                         stride=(1, 1, 1))
            # nn.Dropout expects the probability of DROPPING an element, so
            # convert from the keep probability (identical for the 0.5 default).
            self.dropout = nn.Dropout(1.0 - dropout_keep_prob)
            self.logits = self._make_logits(self._num_classes)

        # Always register the backbone, also for truncated models (an early
        # return used to skip this, leaving them without submodules). Calling
        # it after the head keeps the original parameter order.
        self.build()

    @staticmethod
    def _backbone_spec(name, in_channels):
        """Return ``(end_point, factory)`` pairs describing the backbone in order.

        Each factory takes the endpoint name and creates the layer, so layers
        behind the final endpoint are never instantiated.
        """
        def conv(channels_in, channels_out, kernel, **kwargs):
            return lambda end_point: Unit3D(in_channels=channels_in, output_channels=channels_out,
                                            kernel_shape=kernel, name=name + end_point, **kwargs)

        def pool(kernel, stride):
            return lambda end_point: MaxPool3dSamePadding(kernel_size=kernel, stride=stride,
                                                          padding=0)

        def mixed(channels_in, widths):
            return lambda end_point: InceptionModule(channels_in, widths, name + end_point)

        return (
            ('Conv3d_1a_7x7', conv(in_channels, 64, [7, 7, 7], stride=(2, 2, 2), padding=(3, 3, 3))),
            ('MaxPool3d_2a_3x3', pool([1, 3, 3], (1, 2, 2))),
            ('Conv3d_2b_1x1', conv(64, 64, [1, 1, 1], padding=0)),
            ('Conv3d_2c_3x3', conv(64, 192, [3, 3, 3], padding=1)),
            ('MaxPool3d_3a_3x3', pool([1, 3, 3], (1, 2, 2))),
            ('Mixed_3b', mixed(192, [64, 96, 128, 16, 32, 32])),
            ('Mixed_3c', mixed(256, [128, 128, 192, 32, 96, 64])),
            ('MaxPool3d_4a_3x3', pool([3, 3, 3], (2, 2, 2))),
            ('Mixed_4b', mixed(128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64])),
            ('Mixed_4c', mixed(192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64])),
            ('Mixed_4d', mixed(160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64])),
            ('Mixed_4e', mixed(128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64])),
            ('Mixed_4f', mixed(112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128])),
            ('MaxPool3d_5a_2x2', pool([2, 2, 2], (2, 2, 2))),
            ('Mixed_5b', mixed(256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128])),
            ('Mixed_5c', mixed(256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128])),
        )

    @staticmethod
    def _make_logits(num_classes):
        """Create the 1x1x1 convolution mapping Mixed_5c features to `num_classes`."""
        return Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=num_classes,
                      kernel_shape=[1, 1, 1],
                      padding=0,
                      activation_fn=None,
                      use_batch_norm=False,
                      use_bias=True,
                      name='logits')

    def replace_logits(self, num_classes):
        """Replace the logits layer with a freshly initialized one of `num_classes` outputs."""
        self._num_classes = num_classes
        self.logits = self._make_logits(self._num_classes)

    def build(self):
        """Register every backbone layer in `end_points` as a submodule."""
        for k in self.end_points.keys():
            self.add_module(k, self.end_points[k])

    def _run_backbone(self, x):
        """Apply all built backbone layers to `x` in endpoint order."""
        for end_point in self.VALID_ENDPOINTS:
            if end_point in self.end_points:
                x = self._modules[end_point](x)  # use _modules to work with dataparallel
        return x

    def _require_head(self):
        """Raise if the model was truncated before the pooling/logits head."""
        if 'avg_pool' not in self._modules or self.logits is None:
            raise RuntimeError(
                'the model was built only up to %s and has no pooling/logits head'
                % self._final_endpoint)

    def forward(self, x):
        """Predict bounded per-time-step outputs for a batch of videos.

        Args:
            x: Input of shape (batch, channels, time, height, width).

        Returns:
            ``sigmoid(logits) * 40.0``. With `spatial_squeeze` the two spatial
            dimensions are squeezed away, giving (batch, classes, time);
            otherwise the 5-D output of the logits layer is kept.

        Raises:
            RuntimeError: If the model has no pooling/logits head.
        """
        self._require_head()
        x = self._run_backbone(x)
        x = self.logits(self.dropout(self.avg_pool(x)))
        # Previously `logits` was left undefined when spatial_squeeze was False.
        logits = x.squeeze(3).squeeze(3) if self._spatial_squeeze else x

        # add sigmoid and multiply by the highest velocity
        return torch.sigmoid(logits) * 40.0

    def extract_features(self, x):
        """Return the average-pooled backbone features for `x`.

        Raises:
            RuntimeError: If the model has no pooling/logits head.
        """
        self._require_head()
        return self.avg_pool(self._run_backbone(x))

In [None]:
# i3d = InceptionI3d(num_classes=400, in_channels=3)
# i3d.replace_logits(1)
# i3d.eval()

# input = torch.rand(2, 3, 20, 224, 224)

# output = i3d(input)

# t = input.size(2)
# output_upsampled = F.upsample(output, t, mode='linear')

# output = output.squeeze()
# output.shape, output_upsampled.shape

# Train

### Main train

In [None]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
import sys
import argparse


import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable

import torchvision
from torchvision import datasets, transforms

from IPython.display import clear_output

import numpy as np

# some params
init_lr = 6.0e-06
max_steps = 800
batch_size = 30
num_steps_per_update = 2  # batches whose gradients are accumulated before each optimizer step

resume_train = False
if resume_train:
    checkpoint_name = '000700_val_loss_1.25_lr_6e-06'

# dataloaders
# 'train' samples sequences from frame 24 onwards, 'test' from the first 24
# frames of the same folders (see Charades.__getitem__).
dataset = Charades(par.video_folders, 'train', augmentation=True, remove_cars=True)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=30, pin_memory=True)

val_dataset = Charades(par.video_folders, 'test', remove_cars=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=30, pin_memory=True)

dataloaders = {'train': dataloader, 'val': val_dataloader}
datasets = {'train': dataset, 'val': val_dataset}  # NOTE: rebinds the name of the torchvision `datasets` import

# model: start from the 400-class pretrained weights, then swap in a
# single-output head for velocity regression
i3d = InceptionI3d(400, in_channels=3)
i3d.load_state_dict(torch.load(par.data_dir + 'i3d_models/rgb_imagenet.pt'))
i3d.replace_logits(1)
if resume_train:
    i3d.load_state_dict(torch.load(par.save_model_path + checkpoint_name + '.pt'))
i3d.cuda()
i3d = nn.DataParallel(i3d)

# optimizers
# optimizer = optim.SGD(i3d.parameters(), lr=init_lr, momentum=0.9, weight_decay=0.0000001)
optimizer = torch.optim.Adam(i3d.parameters(), lr=init_lr)
if resume_train:
    optimizer.load_state_dict(torch.load(par.save_model_path + checkpoint_name + '.optimizer'))
lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [400, 700])


def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group."""
    for param_group in optimizer.param_groups:
        return param_group['lr']


steps = 0
if resume_train:
    # the checkpoint name starts with the zero-padded step counter
    steps = int(checkpoint_name[:6]) + 1

train_loss = {'loss': []}
val_loss = {'loss': []}

os.makedirs(par.save_model_path, exist_ok=True)

while steps < max_steps:

    for phase in ['train', 'val']:
        print('phase', phase)

        is_train = phase == 'train'
        i3d.train(is_train)  # training mode, or evaluate mode for 'val'

        loss_sum = 0.0
        num_iter = 0
        optimizer.zero_grad()

        # Iterate over data.
        for inputs, labels in dataloaders[phase]:
            num_iter += 1

            inputs = inputs.cuda()
            labels = labels.cuda()
            t = inputs.size(2)

            # No autograd graph is built during validation.
            with torch.set_grad_enabled(is_train):
                per_frame_logits = i3d(inputs)
                # upsample to input size (same as the deprecated F.upsample)
                per_frame_logits = F.interpolate(per_frame_logits, t, mode='linear', align_corners=False)
                # squeeze only the channel dim so a batch of one keeps its shape
                loss = F.mse_loss(per_frame_logits.squeeze(1), labels.float())

            # Accumulate a plain float: summing the loss tensors would keep
            # the graph of every batch alive until the end of the epoch.
            loss_sum += loss.item()

            if is_train:
                loss.backward()
                if num_iter == num_steps_per_update:
                    print('make an optimizer.step()')
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()

        loss_mean = loss_sum / len(dataloaders[phase])

        if is_train:
            train_loss['loss'].append(loss_mean)
        else:
            if steps % 10 == 0:  # save the model when the step counter is a multiple of 10
                checkpoint_prefix = (par.save_model_path + str(steps).zfill(6)
                                     + f'_val_loss_{loss_mean:.2f}_lr_{get_lr(optimizer)}')
                torch.save(i3d.module.state_dict(), checkpoint_prefix + '.pt')
                torch.save(optimizer.state_dict(), checkpoint_prefix + '.optimizer')
            val_loss['loss'].append(loss_mean)

    clear_output(True)
    print('Step {}/{}'.format(steps, max_steps))
    print('LR:', get_lr(optimizer))
    print('-' * 10)

    plt.subplot(2, 1, 1)
    plt.title('Training loss')
    plt.plot(train_loss['loss'], 'o', label='train loss')
    plt.xlabel('Steps')
    plt.gcf().set_size_inches(15, 12)
    plt.grid(True)

    plt.subplot(2, 1, 2)
    plt.title('Validation loss')
    plt.plot(val_loss['loss'], 'o', label='val loss')
    plt.xlabel('Steps')
    plt.gcf().set_size_inches(15, 12)
    plt.grid(True)

    if len(val_loss['loss']) > 10:
        print('mean val loss over last 10 steps: {:.2f}'.format(sum(val_loss['loss'][-10:]) / 10.))

    plt.show()

### Cross-validation train

In [None]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
import sys
import argparse


import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable

import torchvision
from torchvision import datasets, transforms

from IPython.display import clear_output

import numpy as np
import itertools
import time


# params to tune:
seq_lens = [5, 10, 20, 30]
init_lrs = [0.006, 0.0010, 0.0002]
optimizers = ['SGD', 'Adam']
batch_sizes = [20, 30, 40, 50]
imagenet_usage = [True, False]
num_steps_per_update_l = [4, 3, 2]  # not part of the search below

number_of_steps_to_try = 30

video_folders = par.comma_folder_list  # [str(item) for item in list(range(100,185))]

results = {}
best_val = float('inf')

# NOTE(review): Charades in 'test' mode samples inside the first 24 frames, so
# seq_len=30 cannot fit there and the validation phase will raise - confirm
# whether 30 should stay in `seq_lens`.
for (seq_len, optimizer_name, init_lr, batch_size, init_with_imagenet) in itertools.product(
        seq_lens, optimizers, init_lrs, batch_sizes, imagenet_usage):

    # Fix some of the params if needed
    # optimizer_name = 'Adam'
    # init_lr = 6.0e-04
    # seq_len = 20
    # init_with_imagenet = True
    # batch_size = 26
    num_steps_per_update = 2  # accumulate gradient

    par.seq_len = seq_len
    start = time.time()

    print('\nseq_len %d optimizer %s init_lr %e batch_size %d init_with_imagenet: %s' % (
        seq_len, optimizer_name, init_lr, batch_size, init_with_imagenet))

    dataset = Charades(video_folders, 'train')  # , augmentation=True, remove_cars=True
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=30, pin_memory=True)

    val_dataset = Charades(video_folders, 'test')  # , remove_cars=True
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=30, pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}  # NOTE: rebinds the name of the torchvision `datasets` import

    i3d = InceptionI3d(400, in_channels=3)
    if init_with_imagenet:
        i3d.load_state_dict(torch.load(par.data_dir + 'i3d_models/rgb_imagenet.pt'))
    i3d.replace_logits(1)
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    # The optimizer object gets its own name; the loop variable keeps the
    # string so it can be printed and used as part of the results key.
    if optimizer_name == 'SGD':
        optimizer = optim.SGD(i3d.parameters(), lr=init_lr, momentum=0.9, weight_decay=0.0000001)
    elif optimizer_name == 'Adam':
        optimizer = torch.optim.Adam(i3d.parameters(), lr=init_lr)

    steps = 0
    val_loss = []
    while steps < number_of_steps_to_try:

        for phase in ['train', 'val']:

            is_train = phase == 'train'
            i3d.train(is_train)  # training mode, or evaluate mode for 'val'

            loss_sum = 0.0
            optimizer.zero_grad()

            num_iter = 0
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                num_iter += 1

                inputs = inputs.cuda()
                labels = labels.cuda()
                t = inputs.size(2)

                # No autograd graph is built during validation.
                with torch.set_grad_enabled(is_train):
                    per_frame_logits = i3d(inputs)
                    # upsample to input size (same as the deprecated F.upsample)
                    per_frame_logits = F.interpolate(per_frame_logits, t, mode='linear', align_corners=False)
                    # squeeze only the channel dim so a batch of one keeps its shape
                    loss = F.mse_loss(per_frame_logits.squeeze(1), labels.float())

                # Accumulate a plain float so the per-batch graphs are freed.
                loss_sum += loss.item()

                if is_train:
                    loss.backward()
                    if num_iter == num_steps_per_update:
                        steps += 1
                        # Restart the accumulation window, as the main training
                        # loop does; without the reset only one optimizer step
                        # was made per epoch.
                        num_iter = 0
                        optimizer.step()
                        optimizer.zero_grad()

            loss_mean = loss_sum / len(dataloaders[phase])

            if phase == 'val':
                val_loss.append(loss_mean)

    # Average over the values that actually exist (there may be fewer than 10).
    recent_val_loss = val_loss[-10:]
    mean_val_loss = sum(recent_val_loss) / len(recent_val_loss)
    print('took time', time.time() - start)
    print('mean val loss over last 10 steps: ', mean_val_loss)

    results[(seq_len, optimizer_name, init_lr, batch_size, init_with_imagenet)] = mean_val_loss

    if mean_val_loss < best_val:
        print('best val loss updated with', mean_val_loss)
        best_val = mean_val_loss

# Print out results.
for (seq_len, optimizer_name, init_lr, batch_size, init_with_imagenet), mean_val_loss in results.items():
    print('seq_len %d optimizer %s init_lr %e batch_size %d init_with_imagenet: %s mean val loss: %f' % (
        seq_len, optimizer_name, init_lr, batch_size, init_with_imagenet, mean_val_loss))

print('best validation loss achieved during cross-validation: %f' % best_val)

# Go through the whole validation set

to check the global performance and look for typical errors

In [None]:
# with sliding window (sw), with overlap of predictions
import matplotlib.pyplot as plt

i3d = InceptionI3d(1, in_channels=3)
i3d.load_state_dict(torch.load(par.data_dir + 'i3d_checkpoints/train_final/000700_val_loss_1.25_lr_6e-06.pt'))
i3d.cuda()
i3d.eval()

# video_folders=[str(item) for item in list(range(100,185))]
video_folders = par.video_folders

num_frames = 24  # only the first 24 frames of every folder are evaluated

vel_pred_all_sw = []
vel_gt_all_sw = []
mse_loss_vel_tot = 0
for video_folder in video_folders:
    print('video folder: ', video_folder)
    vel_gt = np.load('{}{}.npy'.format(par.vel_dir, video_folder))
    vel_gt = vel_gt[:num_frames]

    # One list of predictions per frame; every window that covers a frame
    # contributes one value to that frame's list.
    sliding_window_output = [[] for _ in range(num_frames)]
    # NOTE(review): windows start at frame 0 here, whereas Charades 'test'
    # mode samples from frame 1 - confirm the frame numbering on disk.
    for start_f in range(num_frames - par.seq_len + 1):
        # load
        imgs = load_rgb_frames(video_folder, start_f, par.seq_len)

        # mask out cars: blank every pixel masked in at least one frame
        mask_for_seq = np.zeros(imgs.shape[1:3])
        for i in range(start_f, start_f + par.seq_len):
            mask_for_seq += np.load(os.path.join(par.mask_dir, video_folder, str(i).zfill(10) + '.npy'))
        imgs[:, mask_for_seq >= 1] = 0

        # final preprocess: normalize to [-1, 1] and convert to float32
        imgs = ((imgs / 255.) * 2 - 1).astype(np.float32)

        # predict; inference only, so no autograd graph is needed
        inputs = video_to_tensor(imgs).unsqueeze(0).cuda()
        t = inputs.size(2)
        with torch.no_grad():
            per_frame_logits = i3d(inputs)
            per_frame_logits = F.interpolate(per_frame_logits, t, mode='linear', align_corners=False)
        vel_pred_seq = per_frame_logits[0, 0].cpu().numpy()

        # save the prediction of every frame covered by this window
        for offset, value in enumerate(vel_pred_seq):
            sliding_window_output[start_f + offset].append(value)

    # average over sliding window (sw) predictions
    vel_pred = np.array([sum(item) / len(item) for item in sliding_window_output])
    vel_pred_all_sw.append(vel_pred)
    vel_gt_all_sw.append(vel_gt)

    mse_loss_vel = np.sum((vel_pred - vel_gt) ** 2)
    mse_loss_vel /= len(vel_gt)
    print('mse_loss_vel', mse_loss_vel)  # loss within folder
    mse_loss_vel_tot += mse_loss_vel

mse_loss_vel_tot /= len(video_folders)
print('\nTotal loss: ', mse_loss_vel_tot)

print('\nPlots of ', video_folder)  # the last video folder, just to visualize
plt.plot(vel_gt)
plt.plot(vel_pred)

# Test

In [None]:
# with sliding window, with overlap of predictions

import matplotlib.pyplot as plt

i3d = InceptionI3d(1, in_channels=3)
i3d.load_state_dict(torch.load(par.data_dir + 'i3d_checkpoints/train_for_test_video/000900.pt'))
i3d.cuda()
i3d.eval()

# num_frames = 10798
num_frames = 5399  # 10 fps
print('num_frames', num_frames)

# One list of predictions per frame; every window that covers a frame
# contributes one value to that frame's list.
sliding_window_output = [[] for _ in range(num_frames)]
for start_f in range(num_frames - par.seq_len + 1):
    print('start_f: ', start_f)

    imgs = load_rgb_frames('000', start_f, par.seq_len)  # the test set contains only one folder, 000

    # mask out cars: blank every pixel masked in at least one frame
    mask_for_seq = np.zeros(imgs.shape[1:3])
    for i in range(start_f, start_f + par.seq_len):
        mask_for_seq += np.load(os.path.join(par.mask_dir, '000', str(i).zfill(10) + '.npy'))
    imgs[:, mask_for_seq >= 1] = 0

    # normalize to [-1, 1] and convert to float32
    imgs = ((imgs / 255.) * 2 - 1).astype(np.float32)

    # predict; inference only, so no autograd graph is needed
    inputs = video_to_tensor(imgs).unsqueeze(0).cuda()
    t = inputs.size(2)
    with torch.no_grad():
        per_frame_logits = i3d(inputs)
        per_frame_logits = F.interpolate(per_frame_logits, t, mode='linear', align_corners=False)
    vel_pred_seq = per_frame_logits[0, 0].cpu().numpy()

    # save the prediction of every frame covered by this window
    for offset, value in enumerate(vel_pred_seq):
        sliding_window_output[start_f + offset].append(value)

# average the overlapping predictions of every frame
vel_pred_sw = np.array([sum(item) / len(item) for item in sliding_window_output])

plt.plot(vel_pred_sw)
np.save('vel_pred_test_sw.npy', vel_pred_sw)