In [1]:
import numpy as np
from torchvision import models, datasets, transforms
import torch
from torch import nn, optim
import torch.nn.functional as F
import video_dataset
import transforms_v2
import matplotlib.pyplot as plt
import transforms as vid_transforms
import time

In [2]:
# Pick the compute device: CUDA when a GPU is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'Computing on device : {device}')

Computing on device : cuda


In [3]:
class Network(nn.Module):
    """Fully connected classification head mapping 512 features to 2 log-probabilities.

    Four ReLU-activated hidden layers (5000, 2500, 1250 and 200 units) are
    followed by a 2-unit output layer whose activations are passed through
    ``log_softmax``.
    """

    def __init__(self):
        super().__init__()

        # Layers are created in forward order; the attribute names are part of
        # the module's state_dict keys.
        self.layer1 = nn.Linear(512, 5000)
        self.layer2 = nn.Linear(5000, 2500)
        self.layer3 = nn.Linear(2500, 1250)
        self.layer4 = nn.Linear(1250, 200)
        self.layer5 = nn.Linear(200, 2)

    def forward(self, x):
        """Return per-class log-probabilities for a batch.

        Args:
            x: Tensor whose first dimension is the batch. All remaining
                dimensions are flattened and must total 512 values per sample.

        Returns:
            Tensor of shape ``(batch, 2)`` containing log-probabilities.
        """
        batch_size = x.size(0)
        features = x.view(batch_size, -1)

        # Hidden stack: linear layer followed by ReLU, four times.
        for hidden in (self.layer1, self.layer2, self.layer3, self.layer4):
            features = F.relu(hidden(features))

        logits = self.layer5(features)
        return F.log_softmax(logits, dim=1)

In [4]:
# Instantiate the pretrained 3D ResNet-18 video model purely so the notebook
# displays its layer structure; the instance is not bound to a variable.
models.video.r3d_18(pretrained=True)


VideoResNet(
  (stem): BasicStem(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1):

In [5]:
class EnsembleNetwork(nn.Module):
    """Five-branch ensemble of pretrained ``r3d_18`` video backbones.

    Each branch (``model_1`` .. ``model_5``) is a frozen pretrained backbone
    whose ``fc`` layer is replaced by a trainable ``Network`` head that emits
    2 log-probabilities. The five branch outputs are concatenated into a
    10-value vector and combined by a final linear layer plus ``log_softmax``.

    NOTE(review): the backbones contain BatchNorm3d layers (see the printed
    architecture). Freezing ``requires_grad`` does not stop their running
    statistics from updating while the module is in ``train()`` mode — confirm
    that is intended.
    """

    def __init__(self):
        super().__init__()

        branch_names = [f"model_{idx}" for idx in range(1, 6)]

        # Stage 1: one pretrained backbone per branch.
        for name in branch_names:
            setattr(self, name, models.video.r3d_18(pretrained=True))

        # Stage 2: drop the original classification layer of every backbone.
        for name in branch_names:
            getattr(self, name).fc = nn.Identity()

        # Stage 3: freeze everything that is left (the feature extractors).
        for name in branch_names:
            for weight in getattr(self, name).parameters():
                weight.requires_grad = False

        # Stage 4: attach a fresh head to each branch. These are created after
        # the freeze, so they remain trainable.
        for name in branch_names:
            getattr(self, name).fc = Network()

        # Combines the 5 x 2 branch outputs into the final 2-class prediction.
        self.layer1 = nn.Linear(10, 2)

    def forward(self, x):
        """Run every branch on its own input and fuse the results.

        Args:
            x: Indexable collection with at least five entries; entry ``i`` is
                the input batch for branch ``i + 1``.

        Returns:
            Tensor of shape ``(batch, 2)`` containing log-probabilities.
        """
        branches = (self.model_1, self.model_2, self.model_3,
                    self.model_4, self.model_5)
        branch_outputs = [branch(x[idx]) for idx, branch in enumerate(branches)]

        fused = torch.cat(branch_outputs, dim=1)
        return F.log_softmax(self.layer1(fused), dim=1)

In [6]:
# Preprocessing pipeline handed to the dataset: the project-local FaceCropper
# followed by Normalise, both taken from transforms_v2.
# NOTE(review): presumably this crops each clip to the detected face and then
# rescales pixel values — confirm against transforms_v2.
data_transforms = transforms.Compose([transforms_v2.FaceCropper(),
                                      transforms_v2.Normalise()])

In [7]:
# Root folder that is scanned for video files.
# NOTE(review): the path points at a "test" folder while the dataset below is
# called train_data — confirm which split is intended.
data_dir = './videos/test'

# Clip dataset over every .mp4 under data_dir; the second positional argument
# (10) is the number of frames per clip.
# `extensions` has to be a tuple: ('.mp4') without the trailing comma is just
# the string '.mp4'.
train_data = datasets.Kinetics400(data_dir, 10,
                                  extensions=('.mp4',),
                                  num_workers=2,
                                  transform=data_transforms)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

  "The pts_unit 'pts' gives wrong results and will be removed in a "
  "The pts_unit 'pts' gives wrong results and will be removed in a "





In [8]:
# Batches of 10 clips, served in dataset order and loaded in the main process
# (num_workers=0).
# NOTE(review): shuffle=False is unusual for a loader that feeds a training
# loop — confirm the fixed ordering is intentional.
trainloader = torch.utils.data.DataLoader(train_data,
                                          batch_size=10,
                                          shuffle=False,
                                          num_workers=0)

In [9]:
def image_split(x):

    '''Return the full input together with its four spatial quarters.

    The last two dimensions of ``x`` are treated as (height, width); any
    leading dimensions are kept untouched. The split point is the midpoint of
    each spatial dimension, so a 224x224 input yields four 112x112 quarters.
    For odd sizes the top/left parts get the smaller half.

    Args:
        x: Array or tensor with at least two dimensions.

    Returns:
        Tuple ``(whole, top_left, top_right, bottom_left, bottom_right)``.
        ``whole`` is ``x`` itself; the quarters are slices of ``x``.
    '''

    half_h = x.shape[-2] // 2
    half_w = x.shape[-1] // 2

    top_left = x[..., :half_h, :half_w]
    top_right = x[..., :half_h, half_w:]
    bottom_left = x[..., half_h:, :half_w]
    bottom_right = x[..., half_h:, half_w:]

    return x, top_left, top_right, bottom_left, bottom_right

In [10]:
def impurity(x):
    """Return the Gini impurity of the values in ``x``.

    The impurity is ``1 - sum(p_i ** 2)`` where ``p_i`` is the fraction of
    elements equal to the i-th distinct value. It is 0 when every element is
    identical and approaches 1 as the values spread over more classes.

    Args:
        x: Tensor (any device, with or without gradient tracking), NumPy
            array, or other array-like of class labels. It is flattened
            before counting.

    Returns:
        The impurity as a Python float. An empty input yields 1.0 because no
        class proportion is subtracted.
    """
    # Tensor-like inputs are detached and copied to host memory first;
    # .numpy() refuses tensors that still track gradients.
    if hasattr(x, "detach"):
        x = x.detach().cpu().numpy()
    values = np.asarray(x).ravel()

    if values.size == 0:
        return 1.0

    # One pass to count every distinct value instead of one scan per class.
    _, counts = np.unique(values, return_counts=True)
    proportions = counts / values.size

    return float(1.0 - np.sum(proportions ** 2))

In [11]:
# Build the five-branch ensemble and move it to the selected device.
# The .to(device) call is the cell's last expression, so its return value is
# what the notebook displays as the module structure.
ensemble_network = EnsembleNetwork()
ensemble_network.to(device)

EnsembleNetwork(
  (model_1): VideoResNet(
    (stem): BasicStem(
      (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
      (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Sequential(
          (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
          (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
        )
        (conv2): Sequential(
          (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
          (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (relu): ReLU(inplace=True)
      )
      (1): BasicBlock(
        (conv1): Sequential(
          (0): Conv3DSimple(64, 64, kernel_size=

In [12]:
# NLLLoss expects log-probabilities, which is what the ensemble's final
# log_softmax produces.
criterion = nn.NLLLoss()

# Register only the parameters that are still trainable. EnsembleNetwork sets
# requires_grad=False on its backbones, so they never receive gradients and
# do not belong in the optimiser.
optimizer = optim.Adam(
    [param for param in ensemble_network.parameters() if param.requires_grad],
    lr=1e-4,
)

In [13]:
# Transform pipeline that converts its input to a PIL image and then applies
# Resize(224).
# NOTE(review): no other cell visible here references test_transforms —
# confirm it is still needed.
test_transforms = transforms.Compose([transforms.ToPILImage(), transforms.Resize(224)])

In [14]:
# Number of full passes over trainloader.
epoch = 1

# Mean training loss of each completed epoch (one entry per epoch). Kept
# outside the epoch loop so the history survives across epochs.
train_losses = []

start = time.time()
for e in range(epoch):

    accuracy_list = []       # accuracy of every batch in this epoch
    epoch_predictions = []   # predicted class of every sample in this epoch
    running_loss = 0         # sum of the batch losses in this epoch

    for batch in trainloader:

        # A dataset item may be (video, label) or (video, audio, label); the
        # clip is always first and the label always last.
        images, labels = batch[0], batch[-1]

        # NOTE(review): train() also puts the frozen backbones' BatchNorm
        # layers into training mode, so their running statistics keep
        # changing — confirm this is intended.
        ensemble_network.train()

        # NOTE(review): reshape only reinterprets the existing memory order.
        # If the loader yields clips as (batch, frames, channels, H, W) or
        # (batch, frames, H, W, channels), this mixes up frames and channels
        # and a permute is needed instead — confirm the layout produced by
        # data_transforms.
        images = images.reshape(images.shape[0], 3, 10, 224, 224).to(device, dtype=torch.float)
        labels = labels.to(device)
        splits = image_split(images)
        log_output = ensemble_network(splits)

        loss = criterion(log_output, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Re-score the same batch after the weight update, in eval mode and
        # without building a graph, to track accuracy.
        ensemble_network.eval()
        with torch.no_grad():
            preds = torch.exp(ensemble_network(splits))

            top_p, top_class = preds.topk(1, dim=1)
            equals = top_class == labels.view(*top_class.shape)

            accuracy_list.append(np.mean(equals.cpu().numpy()))
            epoch_predictions.append(top_class.cpu())

    # Epoch summary. The impurity is taken over every prediction made during
    # the epoch so it describes the same samples as the accuracy next to it.
    # An empty loader reports NaN instead of failing on undefined variables.
    if accuracy_list:
        train_losses.append(running_loss / len(accuracy_list))
        epoch_accuracy = np.mean(accuracy_list)
        pred_impurity = impurity(torch.cat(epoch_predictions))
    else:
        epoch_accuracy = float('nan')
        pred_impurity = float('nan')

    print(f"Training loss: {running_loss} | Epoch Accuracy : {epoch_accuracy} | pred_impurity : {pred_impurity}")

end = time.time()

print(f'Time taken for {epoch} epochs : {end - start}')

  "The pts_unit 'pts' gives wrong results and will be removed in a "


10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920


  batch_boxes, batch_points = np.array(batch_boxes), np.array(batch_points)
  boxes = np.array(boxes)
  probs = np.array(probs)
  points = np.array(points)


10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920
10 1080 1920

ValueError: setting an array element with a sequence.