In [1]:
import os
# Set before torch is imported in the next cell: only device index 0 is
# advertised to CUDA through this environment variable.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [2]:
import numpy as np
import torch
from torchvision import models, transforms, datasets
import torch.nn.functional as F
from torch import nn, optim
import face_detector
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import cv2
import time
from PIL import Image

In [3]:
# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
class Network(nn.Module):
    '''Fully connected classifier head: 512 -> 5000 -> 2500 -> 1250 -> 200 -> 2.

    The input is flattened to (batch, features) before the first layer, ReLU
    follows each of the four hidden layers, and the two output scores are
    returned as log-probabilities.
    '''

    def __init__(self):
        super().__init__()

        # Layers are created in forward order so the attribute names and the
        # parameter registration order stay layer1 ... layer5.
        self.layer1 = nn.Linear(512, 5000)
        self.layer2 = nn.Linear(5000, 2500)
        self.layer3 = nn.Linear(2500, 1250)
        self.layer4 = nn.Linear(1250, 200)
        self.layer5 = nn.Linear(200, 2)

    def forward(self, x):
        '''Return log-probabilities of shape (batch, 2) for the input batch.

        x: tensor whose first dimension is the batch and whose remaining
           dimensions hold 512 values per sample once flattened.
        '''
        batch_size = x.shape[0]
        hidden = x.view(batch_size, -1)

        # Four hidden layers, each followed by a ReLU.
        for dense in (self.layer1, self.layer2, self.layer3, self.layer4):
            hidden = F.relu(dense(hidden))

        logits = self.layer5(hidden)
        return F.log_softmax(logits, dim=1)



In [5]:
class EnsembleNetwork(nn.Module):
    '''Ensemble of five ResNet-18 branches fused by a single linear layer.

    Each branch (``model_1`` ... ``model_5``) is a pretrained ResNet-18 whose
    existing parameters are frozen and whose ``fc`` layer is replaced by a
    fresh ``Network`` head producing two log-probabilities. The five
    two-value outputs are concatenated (10 values per sample) and mapped to
    two log-probabilities by ``layer1``.

    NOTE(review): ``requires_grad = False`` only freezes weights; BatchNorm
    running statistics inside the backbones still update whenever the module
    is in train() mode -- confirm that is intended.
    '''

    def __init__(self):
        super().__init__()

        branch_names = [f'model_{idx}' for idx in range(1, 6)]

        # Pretrained backbones, registered as model_1 ... model_5.
        for name in branch_names:
            setattr(self, name, models.resnet18(pretrained=True))

        # Drop the stock classifier, then freeze everything that is left so
        # only the heads attached below receive gradients.
        for name in branch_names:
            backbone = getattr(self, name)
            backbone.fc = nn.Identity()
            for weight in backbone.parameters():
                weight.requires_grad = False

        # Heads are attached after freezing, so their parameters stay trainable.
        for name in branch_names:
            getattr(self, name).fc = Network()

        # Fuses the 5 x 2 branch outputs into the final two-class score.
        self.layer1 = nn.Linear(10, 2)

    def forward(self, x):
        '''Run branch k on ``x[k]`` for k = 0..4 and fuse the results.

        x: indexable collection holding at least five image batches.
        Returns a tensor of log-probabilities with two columns per sample.
        '''
        branches = (self.model_1, self.model_2, self.model_3,
                    self.model_4, self.model_5)
        branch_outputs = [branch(x[idx]) for idx, branch in enumerate(branches)]

        fused = torch.cat(branch_outputs, dim=1)
        return F.log_softmax(self.layer1(fused), dim=1)

In [6]:
# Per-channel normalisation constants.
# NOTE(review): presumably statistics of the training frames -- confirm the source.
NORMALIZE_MEAN = [0.4041, 0.3321, 0.2849]
NORMALIZE_STD = [0.1585, 0.1532, 0.1337]

# Pre-processing applied to every loaded frame: face crop, 224x224 centre
# crop, conversion to a tensor, then channel-wise normalisation.
data_transforms = transforms.Compose(
    [
        face_detector.FaceCropper(),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
    ]
)

In [7]:
# File extension passed to the dataset as the set of accepted sample files.
extensions = '.mp4'

# Root folder of the video data and its train / test sub-folders.
data_dir = './videos'
train_dir = f'{data_dir}/train_sample_videos'
test_dir = f'{data_dir}/test_videos'

In [8]:
# Folder-based dataset over train_dir: files matching `extensions` are read
# with face_detector.random_frame_selector and passed through data_transforms.
train_data = datasets.DatasetFolder(
    root=train_dir,
    loader=face_detector.random_frame_selector,
    extensions=extensions,
    transform=data_transforms,
)

In [9]:
# Shuffled mini-batches of 16 samples; num_workers=0 keeps loading in the
# main process.
trainloader = torch.utils.data.DataLoader(
    dataset=train_data,
    batch_size=16,
    shuffle=True,
    num_workers=0,
)

In [10]:
# Build the ensemble and move its parameters to the selected device.
# The bare `.to(device)` call is deliberately the last expression of the
# cell: its return value is what the notebook renders as the module summary.
ensemble_network = EnsembleNetwork()
ensemble_network.to(device)

EnsembleNetwork(
  (model_1): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, t

In [11]:
# Negative log-likelihood loss: the ensemble's forward pass ends in
# log_softmax, so its outputs are already log-probabilities.
criterion = nn.NLLLoss()

# Adam over all parameters of the ensemble with a learning rate of 1e-4.
optimizer = optim.Adam(params=ensemble_network.parameters(), lr=1e-4)

In [12]:
def image_split(x):

    '''Splits a batch of images into the whole image plus its 4 quarters.

    The cut is made at the midpoint of dimensions 2 and 3, so a 224x224
    input gives four 112x112 quarters. When a size is odd, the extra
    row/column ends up in the bottom/right quarters.

    Args:
    x: 4-D tensor (or array) indexed as [batch, channel, row, column].

    Returns:
    Tuple of (whole image, top left corner, top right corner,
    bottom left corner, bottom right corner).'''

    # Derive the split point from the input instead of assuming 224x224.
    mid_row = x.shape[2] // 2
    mid_col = x.shape[3] // 2

    return x, x[:, :, :mid_row, :mid_col], x[:, :, :mid_row, mid_col:],\
           x[:, :, mid_row:, :mid_col], x[:, :, mid_row:, mid_col:]

In [13]:
def impurity(x):
    '''Gini impurity of the values in a tensor.

    The tensor is moved to the CPU, converted to a NumPy array and flattened;
    the result is 1 minus the sum of squared relative frequencies of each
    distinct value (0 when every element is identical). An empty tensor
    yields 1.
    '''
    flat = x.cpu().numpy().ravel()

    remaining = 1
    for value in np.unique(flat):
        # Fraction of elements equal to this distinct value.
        share = np.mean(flat == value)
        remaining -= share ** 2

    return remaining

In [14]:
# Make sure the model sits on the target device before training starts.
ensemble_network.to(device)

# Number of passes over the training data: the training loop iterates
# range(epoch) and the timing message reports this value.
epoch = 1

In [15]:
# Training loop: runs `epoch` passes over the training videos, updating the
# ensemble once per mini-batch and recording a per-batch accuracy. The wall
# clock time of the whole run is printed at the end.
start = time.time()
for e in range(epoch):
    
    # NOTE(review): these three are re-created at the start of every epoch,
    # so after the loop they only describe the last epoch.
    accuracy_list = []

    train_losses = []
    running_loss = 0
    
    # The dataset and loader are rebuilt for each epoch.
    # NOTE(review): batch_size is 32 here, while the loader created in the
    # earlier cell uses 16 -- confirm which is intended.
    train_data = datasets.DatasetFolder(train_dir,
                                    loader=face_detector.random_frame_selector,
                                    extensions=extensions,
                                    transform=data_transforms)

    trainloader = torch.utils.data.DataLoader(train_data,
                                          batch_size=32,
                                          shuffle=True,
                                          num_workers=0)

    for images, labels in trainloader:
        
        # Back to training mode: the accuracy check below switches to eval().
        ensemble_network.train()
        images = images.to(device)
        labels = labels.to(device)
        # Whole image plus four quarters, one input per ensemble branch.
        splits = image_split(images)
        log_output = ensemble_network(splits)
        
        loss = criterion(log_output, labels)
        optimizer.zero_grad()
        loss.backward()
        
        optimizer.step()

        running_loss += loss.item()
        # Appended once per batch: cumulative loss so far divided by the total
        # number of batches. Only the last entry of an epoch equals that
        # epoch's mean batch loss.
        train_losses.append(running_loss/len(trainloader))
        
    
        # Second forward pass on the same batch, after the weight update, in
        # eval mode and without gradient tracking, to measure batch accuracy.
        with torch.no_grad():
            ensemble_network.eval()
            preds = ensemble_network(splits)
            
            # The model returns log-probabilities; exp() turns them into probabilities.
            preds = torch.exp(preds)
        
            # Most probable class per sample.
            top_p, top_class = preds.topk(1, dim=1)
            
            equals = top_class == labels.view(*top_class.shape)

            # Fraction of correct predictions in this batch.
            accuracy_list.append(np.mean(equals.cpu().numpy()))

    #print(f"Training loss: {running_loss} | Epoch Accuracy : {np.mean(accuracy_list)} | pred_impurity : {impurity(top_class)}")

end = time.time()

print(f'Time taken for {epoch} epochs : {end - start}')

Time taken for 1 epochs : 125.29698181152344
