In [1]:
#Input Features = 14
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset, DataLoader
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor, ToPILImage
import cv2
import os

import imageio_ffmpeg

In [2]:
from torchvision.models.inception import Inception3
from torch.hub import load_state_dict_from_url
from torch.utils.model_zoo import load_url as load_state_dict_from_url
import torch.nn.functional as F
import imageio

# Download locations for pretrained checkpoints, keyed by model name.
model_urls = dict(
    # Inception v3 weights ported from TensorFlow
    inception_v3_google='https://download.pytorch.org/models/inception_v3_google-1a9a5a14.pth',
)

def inception_v3_sliced(pretrained=False, progress=True, stop_layer=3, **kwargs):
    """Build an Inception v3 whose feature pass stops after ``stop_layer`` stages.

    Architecture from `"Rethinking the Inception Architecture for Computer Vision"
    <http://arxiv.org/abs/1512.00567>`_.

    .. note::
        Inception v3 expects tensors of size N x 3 x 299 x 299, so ensure your
        images are sized accordingly.

    The stages, in order, are: Conv2d_1a_3x3, Conv2d_2a_3x3, Conv2d_2b_3x3,
    max-pool, Conv2d_3b_1x1, Conv2d_4a_3x3, max-pool, Mixed_5b, Mixed_5c,
    Mixed_5d, Mixed_6a ... Mixed_6e, Mixed_7a, Mixed_7b, Mixed_7c (18 entries).

    Args:
        pretrained (bool): If True, loads the ImageNet weights listed in
            ``model_urls['inception_v3_google']``.
        progress (bool): If True, displays a progress bar of the download to stderr.
        stop_layer (int): Number of leading stages to run; the output of the
            last executed stage is returned. Values above 18 raise ``IndexError``
            during the forward pass.
        **kwargs: Forwarded to ``Inception3`` (e.g. ``aux_logits``,
            ``transform_input``). When ``pretrained`` is True,
            ``transform_input`` defaults to True and ``init_weights`` is forced
            to False.

    Returns:
        An ``Inception3`` subclass instance whose ``_forward`` returns
        ``(features, None)``; the classifier head and auxiliary branch are
        never evaluated.
    """

    # Defined before the ``pretrained`` branch so that both code paths can use
    # it; previously the non-pretrained path referenced an undefined name.
    class Inception3Mod(Inception3):
        """Inception3 variant that evaluates only a prefix of its stages."""

        def __init__(self, stop_layer, **kwargs):
            super(Inception3Mod, self).__init__(**kwargs)
            self.stop_layer = stop_layer

        def _forward(self, x):
            # 'maxpool' entries stand for the parameter-free pooling steps
            # that sit between the convolutional stages.
            layers = [
                self.Conv2d_1a_3x3,
                self.Conv2d_2a_3x3,
                self.Conv2d_2b_3x3,
                'maxpool',
                self.Conv2d_3b_1x1,
                self.Conv2d_4a_3x3,
                'maxpool',
                self.Mixed_5b,
                self.Mixed_5c,
                self.Mixed_5d,
                self.Mixed_6a,
                self.Mixed_6b,
                self.Mixed_6c,
                self.Mixed_6d,
                self.Mixed_6e,
                self.Mixed_7a,
                self.Mixed_7b,
                self.Mixed_7c,
            ]

            for idx in range(self.stop_layer):
                layer = layers[idx]
                if layer == 'maxpool':
                    x = F.max_pool2d(x, kernel_size=3, stride=2)
                else:
                    x = layer(x)
            # Second element is the (unused) auxiliary output slot.
            return x, None

    if not pretrained:
        # Randomly initialised network; stop_layer must be forwarded here too.
        return Inception3Mod(stop_layer=stop_layer, **kwargs)

    if 'transform_input' not in kwargs:
        kwargs['transform_input'] = True
    if 'aux_logits' in kwargs:
        original_aux_logits = kwargs['aux_logits']
        # The checkpoint contains AuxLogits weights, so the branch must exist
        # while loading; it is removed again below if the caller disabled it.
        kwargs['aux_logits'] = True
    else:
        original_aux_logits = True
    kwargs['init_weights'] = False  # we are loading weights from a pretrained model

    model = Inception3Mod(stop_layer=stop_layer, **kwargs)
    state_dict = load_state_dict_from_url(model_urls['inception_v3_google'],
                                          progress=progress)
    model.load_state_dict(state_dict)
    if not original_aux_logits:
        model.aux_logits = False
        del model.AuxLogits
    return model

In [3]:
from math import e
class positionalEncoder(nn.Module):
  """Appends a learned position vector to the feature vector of every frame.

  The table ``pe`` has shape ``(frame_length, encoding_length)``; row ``t`` is
  concatenated onto the features of frame ``t``.

  Args:
    frame_length: maximum number of frames (rows in the position table).
    encoding_length: number of positional features appended per frame.
  """

  def __init__(self, frame_length, encoding_length):
    super().__init__()

    # Registered as a Parameter so it is trained, saved in state_dict and
    # moved by .to(device). Initialised from a fresh nn.Embedding so the
    # starting values follow the same distribution as before. Keeping the
    # table as a leaf (instead of the output of an embedding lookup done once
    # here) also means backward() can be called on every training step.
    table = nn.Embedding(frame_length, encoding_length).weight
    self.pe = nn.Parameter(table.detach().clone())

  def forward(self, x):
    """Concatenate positional features onto ``x`` along the last dimension.

    Args:
      x: ``(frames, features)`` or ``(batch, frames, features)`` tensor with
        at most ``frame_length`` frames.

    Returns:
      Tensor shaped like ``x`` with ``encoding_length`` extra trailing features.
    """
    if len(x.shape) == 3:
      # Broadcast the table over the batch; stays on the autograd graph and
      # on x's device (no numpy round-trip).
      pe = self.pe[0:x.shape[1]].unsqueeze(0).expand(x.shape[0], -1, -1)
      return torch.cat((x, pe), 2)

    return torch.cat((x, self.pe[0:x.shape[0]]), 1)

class encoderTransformer(nn.Module):
  """Video -> per-frame feature regressor.

  Each frame is resized to 299x299, passed through a sliced pretrained
  Inception v3, max-pooled to one vector, tagged with a positional encoding,
  run through a transformer encoder and finally through a stack of linear
  layers.

  Args:
    num_T_layers: number of transformer encoder layers.
    num_fc_layers: number of linear layers in the head; every layer but the
      last halves the width, the last maps to ``outFeatCount``.
    outFeatCount: number of output features per frame.
    device: stored on the instance; not otherwise used by this class.
    num_frames_max: number of rows in the positional-encoding table.
    stopLayer: ``stop_layer`` handed to ``inception_v3_sliced``.
    testLayer: in test mode, how many head layers to apply (see ``train``).
    train: if True the whole head is applied; if False only the first
      ``outputLayer`` head layers are applied.
    pos_encode_size: accepted for compatibility but unused; the positional
      width is fixed at 10.
    n_hidden: feed-forward width of each transformer layer.
    n_heads: number of attention heads.
    dropout: dropout probability of each transformer layer.
  """

  def __init__(self, num_T_layers, num_fc_layers, outFeatCount, device, num_frames_max = 1000, stopLayer = 16, testLayer = None, train = True, pos_encode_size = 3, n_hidden = 2048, n_heads = 86, dropout = 0.3):
    super().__init__()

    self.posEncoder = positionalEncoder(num_frames_max, 10)

    self.visionEncoder = inception_v3_sliced(pretrained=True,
                                             transform_input = True,
                                             stop_layer=stopLayer)

    self.transform = Compose([ToPILImage(),Resize((299, 299)), ToTensor()])

    # 1280 pooled image features + 10 positional features.
    # NOTE(review): 1280 presumably matches the channel count at the default
    # stopLayer=16 - confirm before using a different stopLayer.
    inFeatCount = 1280 + 10

    encoder_layer = nn.TransformerEncoderLayer(inFeatCount, n_heads, n_hidden, dropout)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_T_layers)

    # ModuleList (not a plain list) so the head's weights are registered:
    # visible to parameters()/optimizers, state_dict() and .to(device).
    self.fcLayers = nn.ModuleList()
    currInput = inFeatCount

    for i in range(num_fc_layers):
      if i == num_fc_layers - 1:
        self.fcLayers.append(nn.Linear(currInput, outFeatCount))
      else:
        self.fcLayers.append(nn.Linear(currInput, currInput // 2))
        currInput = currInput // 2

    self.device = device
    # Kept under its own name: assigning a bool to ``self.train`` would shadow
    # nn.Module.train() and make model.train()/model.eval() raise TypeError.
    self.use_full_head = train

    self.outputLayer = testLayer if (testLayer and testLayer < len(self.fcLayers) - 1) else len(self.fcLayers) - 1

    self.init_weights()

  def init_weights(self):
    """Zero the head biases and draw head weights from U(-0.1, 0.1)."""
    initrange = 0.1

    for layer in self.fcLayers:
      layer.bias.data.zero_()
      layer.weight.data.uniform_(-initrange, initrange)

  def forward(self, vidLink):
    """Encode the video at ``vidLink`` (anything ``imageio.get_reader`` accepts).

    Returns:
      Tensor with one row per frame; ``outFeatCount`` columns when the full
      head is applied.

    Raises:
      ValueError: if the video yields no frames.
    """
    pool = nn.MaxPool2d((8, 8))
    frame_feats = []

    vid = imageio.get_reader(vidLink)
    try:
      # The image features were never part of the autograd graph (they used
      # to be copied through Python lists), so skip building one at all.
      with torch.no_grad():
        for im in vid:
          img = self.transform(im)[None]
          out = self.visionEncoder(img)
          # A tuple-like output carries the feature map first; a bare tensor
          # is the feature map itself.
          fmap = out[0] if isinstance(out, tuple) else out
          frame_feats.append(pool(fmap).squeeze())
    finally:
      vid.close()  # release the underlying file / decoder process

    if not frame_feats:
      raise ValueError("no frames could be read from %r" % (vidLink,))

    encoded = self.posEncoder(torch.stack(frame_feats))

    result = self.encoder(encoded)

    if self.use_full_head:
      for layer in self.fcLayers:
        result = layer(result)
    else:
      for idx in range(self.outputLayer):
        result = self.fcLayers[idx](result)

    # The old `rm ./*.jpg` shell call was removed: forward() writes no image
    # files, so it could only delete unrelated files in the working directory.
    return result

class classifierTransformer(nn.Module):
  """Classifies a fixed-length sequence of per-frame feature vectors.

  Frames are tagged with a positional encoding, passed through a transformer
  encoder, flattened into a single row and mapped to ``outFeatCount`` values
  by a stack of linear layers.

  Args:
    inFeatCount: number of features per frame in the input.
    num_T_layers: number of transformer encoder layers.
    num_fc_layers: number of linear layers in the head; every layer but the
      last halves the width, the last maps to ``outFeatCount``.
    num_frames: number of frames per sequence; fixes the head's input width.
    device: stored on the instance; not otherwise used by this class.
    pos_encode_size: number of positional features appended per frame.
    n_heads: number of attention heads.
    n_hidden: feed-forward width; raised to at least twice the per-frame width.
    dropout: dropout probability of each transformer layer.
    outFeatCount: number of output values.
  """

  def __init__(self, inFeatCount, num_T_layers, num_fc_layers, num_frames, device, pos_encode_size = 5, n_heads = 4, n_hidden = 2048, dropout = 0.3, outFeatCount = 2):
    super().__init__()

    self.posEncoder = positionalEncoder(num_frames, pos_encode_size)

    num_features = inFeatCount + pos_encode_size

    n_hidden = max(n_hidden, 2*num_features)

    encoder_layer = nn.TransformerEncoderLayer(num_features, n_heads, n_hidden, dropout)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_T_layers)

    # The head consumes the whole flattened sequence at once.
    currInput = num_frames * num_features

    # ModuleList (not a plain list) so the head's weights are registered:
    # visible to parameters()/optimizers, state_dict() and .to(device).
    self.fcLayers = nn.ModuleList()

    for i in range(num_fc_layers):
      if i == num_fc_layers - 1:
        self.fcLayers.append(nn.Linear(currInput, outFeatCount))
      else:
        self.fcLayers.append(nn.Linear(currInput, currInput // 2))
        currInput = currInput // 2

    self.device = device

    self.init_weights()

  def init_weights(self):
    """Zero the head biases and draw head weights from U(-0.1, 0.1)."""
    initrange = 0.1

    for layer in self.fcLayers:
      layer.bias.data.zero_()
      layer.weight.data.uniform_(-initrange, initrange)

  def forward(self, x):
    """Map ``x`` of shape ``[num_frames, feat_count]`` to ``[1, outFeatCount]``."""
    #x.shape = [num_frames, feat_count]
    encoded = self.posEncoder(x)
    #encoded.shape = [num_frames, feat_count + pos_encoding_count]
    data = self.encoder(encoded)
    #data.shape = [num_frames, feat_count + pos_encoding_count]
    data = torch.reshape(data, (1,-1))
    #data.shape = [1, num_frames * (feat_count + pos_encoding_count)]

    for layer in self.fcLayers:
      data = layer(data)

    # Return the live tensor: rebuilding it from a Python list detached it
    # from the autograd graph, so no loss on the output could back-propagate.
    return data.float()

In [4]:
# Check that a sample clip is present in the working directory.
# NOTE(review): this is a different file from "trial_lie_001.mp4", the clip the
# model is run on below - confirm which one is meant.
os.path.exists("User_0_run_0.mp4")

True

In [7]:
# Build the video encoder (one transformer layer, a single linear head layer,
# 15 output features per frame) and run it on one clip.
model = encoderTransformer(num_T_layers=1, num_fc_layers=1, outFeatCount=15, device='cpu')

res = model("trial_lie_001.mp4")

res.shape  # -> (n_frames, n_features); torch.Size([510, 15]) for this clip

rm: ./*.jpg: No such file or directory


torch.Size([510, 15])

In [11]:
import pandas as pd
import numpy as np

# The 15 CSV columns used as per-frame regression targets for the encoder.
target_columns = [
    'gaze_0_x', 'gaze_0_y', 'gaze_0_z',
    'gaze_angle_x', 'gaze_angle_y',
    'AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r',
    'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r',
]

# Load the targets and convert them to a float32 tensor (rows = frames).
openface = torch.tensor(
    np.array(pd.read_csv("processed_lie/trial_lie_001.csv")[target_columns])
).float()

enoder_loss_fuction = nn.MSELoss()
print(openface.shape)

# Mean-squared error between the encoder output and the targets, then
# back-propagate it.
loss = enoder_loss_fuction(res, openface)
loss.backward()

torch.Size([510, 15])
