In [1]:
import torch
import torch.nn as nn
from torchvision.transforms import Compose, Resize, ToTensor, ToPILImage

from torchvision.models.inception import Inception3
from torch.utils.model_zoo import load_url as load_state_dict_from_url

import numpy as np
import pandas as pd

import os

import cv2
import imageio
import os

import warnings
warnings.filterwarnings("ignore")

In [2]:
class positionalEncoder(nn.Module):

  def __init__(self, frame_length, encoding_length):
    super().__init__()

    self.embedding = nn.Embedding(frame_length, encoding_length)
    
    self.frame_length = frame_length

  def forward(self, x):

    pe = self.embedding(torch.tensor([i for i in range(self.frame_length)]))

    pe = pe[:x.shape[0]]

    if len(x.shape) == 3:
      self.pe = pe.unsqueeze(0).repeat(x.shape[0], 1, 1)
      x = torch.cat((x, pe), 2)
    else:
      x = torch.cat((x, pe), 1)


    return x

In [3]:
inc_v3_url = 'https://download.pytorch.org/models/inception_v3_google-1a9a5a14.pth'

def inception_v3_sliced(stop_layer=3, **kwargs):

    kwargs['transform_input'] = True
    kwargs['init_weights'] = False

    class Inception3Mod(Inception3):

        def __init__(self, stop_layer, **kwargs):
            super(Inception3Mod, self).__init__(**kwargs)
            self.stop_layer = stop_layer
            
        def _forward(self, x):
            layers = [
            self.Conv2d_1a_3x3, self.Conv2d_2a_3x3, self.Conv2d_2b_3x3,
            'maxpool',
            self.Conv2d_3b_1x1, self.Conv2d_4a_3x3,
            'maxpool',
            self.Mixed_5b, self.Mixed_5c, self.Mixed_5d, self.Mixed_6a,
            self.Mixed_6b, self.Mixed_6c, self.Mixed_6d, self.Mixed_6e,
            self.Mixed_7a, self.Mixed_7b, self.Mixed_7c]

            for idx in range(self.stop_layer):
                layer = layers[idx]
                if layer == 'maxpool':
                    x = nn.functional.max_pool2d(x, kernel_size=3, stride=2)
                else:
                    x = layer(x)
            return x, None
        
    model = Inception3Mod(**kwargs, stop_layer=stop_layer)
    state_dict = load_state_dict_from_url(inc_v3_url, progress=True)
    model.load_state_dict(state_dict)
    model.aux_logits = False
    del model.AuxLogits
    return model

In [4]:
class encoderTransformer(nn.Module):

  def __init__(self, num_T_layers, num_fc_layers, outFeatCount, device, testLayer = 1, num_frames_max = 1000, stopLayer = 16, train = True, pos_encode_size = 10, n_hidden = 2048, n_heads = 86, dropout = 0.3):
    super().__init__()

    self.posEncoder = positionalEncoder(num_frames_max, pos_encode_size)
    
    self.visionEncoder = inception_v3_sliced(stop_layer=stopLayer)
    
    self.transform = Compose([ToPILImage(), Resize((299, 299)), ToTensor()])

    inFeatCount = 1280 + pos_encode_size
    
    encoder_layer = nn.TransformerEncoderLayer(inFeatCount, n_heads, n_hidden, dropout)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_T_layers)

    self.fcLayers = []
    currInput = inFeatCount

    for i in range(num_fc_layers):
      if i == num_fc_layers - 1:
        self.fcLayers.append(nn.Linear(currInput, outFeatCount))
      else:
        self.fcLayers.append(nn.Linear(currInput, currInput // 2))
        currInput = currInput // 2

    print(f"{len(self.fcLayers)} fc layers")

    self.device = device
    self.train = train
    
    self.outputfc = min(testLayer, len(self.fcLayers))

    print(f'output layer count: {self.outputfc}')
    
    self.init_weights()

  def init_weights(self):
      initrange = 0.1

      for i in self.fcLayers:
        i.bias.data.zero_()
        i.weight.data.uniform_(-initrange, initrange)

  def forward(self, data, vid = False, toCSV = False, toCSVName = None, toCSVPath = None):

    arr = None
    
    if vid:
      imgArr = []
      vid = imageio.get_reader(data)
      for i, im in enumerate(vid):
        imgArr.append(im)
    
    elif not vid:
      imgArr = data

    for i in imgArr:
      img = self.transform(i)[None]
      picToVal = self.visionEncoder(img)[0]
      to1D = nn.MaxPool2d((8, 8))(picToVal).squeeze()
      if arr == None:
        arr = to1D[None, :]
      else:
        arr = torch.cat((arr,to1D[None, :]), 0)

    if vid: os.system("rm ./*.jpg")
  
    encoded = self.posEncoder(arr) #make sure this works

    result = self.encoder(encoded)

    if self.train:
      for i in self.fcLayers:
        result = i(result)
    else:
      for i in range(min(self.outputfc, len(self.fcLayers))):
        result = self.fcLayers[i](result)
        if toCSV:
          pd.DataFrame(result.numpy()).to_csv(f'{toCSVPath}{toCSVName}.csv')
          return
          
    return result

In [5]:
openface = pd.read_csv("data/OpenFace/trial/lie/trial_lie_001.csv")
openface = openface[['gaze_0_x','gaze_0_y','gaze_0_z','gaze_angle_x', 'gaze_angle_y','AU01_r','AU02_r','AU04_r','AU05_r'\
,'AU06_r','AU07_r','AU09_r','AU10_r','AU12_r','AU14_r']]
openface = torch.tensor(np.array(openface)).float()

encoder_loss_fuction = nn.MSELoss()

In [6]:
def preprocess(path):
    imgArr = []
    Openface_Arr = pd.DataFrame()
    idx = 0
    for video in os.listdir(path):
        if video == 'trial_lie_043.mp4':
            continue
        if idx == 10:
            break
        if video.endswith(".mp4"):
            print('Processing video: ', video, 'finished: ', idx)
            vid = imageio.get_reader(path + "/" + video)
            for frame_number, im in enumerate(vid):
                im = cv2.resize(im, (240, 320))
                imgArr.append(im)
            
            if video[:-4] + ".csv" in os.listdir("data/OpenFace/trial/lie/"):
                file = pd.read_csv("data/OpenFace/trial/lie/" + video[:-4] + ".csv")
                file = file[['gaze_0_x','gaze_0_y','gaze_0_z','gaze_angle_x',
                                'gaze_angle_y','AU01_r','AU02_r','AU04_r','AU05_r',
                                'AU06_r','AU07_r','AU09_r','AU10_r','AU12_r','AU14_r']]
                Openface_Arr = pd.concat([Openface_Arr, file])
            if video[:-4] + ".csv" in os.listdir("data/OpenFace/trial/truth/"):
                file = pd.read_csv("data/OpenFace/trial/truth/" + video[:-4] + ".csv")
                file = file[['gaze_0_x','gaze_0_y','gaze_0_z','gaze_angle_x',
                                'gaze_angle_y','AU01_r','AU02_r','AU04_r','AU05_r',
                                'AU06_r','AU07_r','AU09_r','AU10_r','AU12_r','AU14_r']]
                Openface_Arr = pd.concat([Openface_Arr, file])

            idx += 1

    return imgArr, np.array(Openface_Arr)

imgArr, Openface_Arr = preprocess("../Videos/")

Processing video:  trial_lie_041.mp4 finished:  0
Processing video:  trial_lie_055.mp4 finished:  1
Processing video:  trial_truth_056.mp4 finished:  2
Processing video:  trial_truth_042.mp4 finished:  3
Processing video:  trial_truth_043.mp4 finished:  4
Processing video:  trial_truth_057.mp4 finished:  5
Processing video:  trial_lie_054.mp4 finished:  6
Processing video:  trial_lie_040.mp4 finished:  7
Processing video:  trial_lie_056.mp4 finished:  8
Processing video:  trial_lie_042.mp4 finished:  9


In [7]:
def train_test_split(imgArr, Openface_Arr, test_size, random_state):
    np.random.seed(random_state)
    idx = np.random.permutation(len(imgArr))
    test_size = int(len(imgArr) * test_size)
    X_train = [imgArr[i] for i in idx[test_size:]]
    X_test = [imgArr[i] for i in idx[:test_size]]
    y_train = [Openface_Arr[i] for i in idx[test_size:]]
    y_test = [Openface_Arr[i] for i in idx[:test_size]]
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(imgArr, Openface_Arr, test_size=0.2, random_state=42)

In [8]:
model = encoderTransformer(1, 1, 15, 'cpu')

print(np.array(X_train[0:10]).shape)
model(X_train[0:10])

1 fc layers
output layer count: 1
(10, 320, 240, 3)


tensor([[ 3.3334, -0.6740, -0.9687,  0.6636,  0.1556,  1.0081, -1.2435,  0.1888,
         -3.2547, -0.8482,  0.5522,  0.5798,  0.4512, -0.5924, -2.5648],
        [ 3.2803,  0.8815, -1.3916,  1.1645,  3.5791, -0.4872, -2.6105, -0.1547,
         -2.1169,  3.3896,  0.4593,  1.9007, -1.0901,  3.8276, -1.6141],
        [ 2.2881,  0.0915,  1.7063,  0.3307,  0.4165,  0.4064,  0.2147,  1.9171,
          0.5738, -0.0935, -0.4436,  2.0648, -1.6303,  0.7222,  0.3801],
        [ 0.9102, -2.3913,  0.2048, -1.0383,  0.8094, -0.8167, -1.0226,  1.6954,
          0.9251, -0.2036, -0.2613, -0.1793, -1.8403,  1.4284,  0.7816],
        [ 4.1569, -0.3388, -1.3424,  0.2793,  1.2752, -0.6394, -0.7906, -0.5279,
         -1.2171,  1.3674, -0.4504, -1.7296, -1.8088,  0.5283, -2.3353],
        [ 3.7187,  2.8601,  0.5067,  0.2309,  1.7436,  0.8203, -2.3202,  0.7853,
         -1.0152, -0.0242,  2.1018,  4.1005, -2.0456,  1.4224,  1.6945],
        [ 2.6632,  1.1737, -2.1844, -1.3761,  0.3152,  0.3916, -1.1029,  1.5

In [9]:
model = encoderTransformer(1, 1, 15, 'cpu')

def train(model, xtrain, ytrain, xtest, ytest, epochs, batch_size):

    train_loss = []
    val_loss = []

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_function = nn.CrossEntropyLoss()

    for _ in range(epochs):
        tot_loss = 0
        for i in range(0, len(xtrain), batch_size):
            optimizer.zero_grad()
            x = xtrain[i:min(i+batch_size, len(xtrain))]
            y = torch.tensor(ytrain[i:min(i+batch_size, len(ytrain))]).float()
            y_pred = model(x)
            loss = loss_function(y_pred, y)
            tot_loss += loss.item()
            loss.backward()
            optimizer.step()

            print("I am here")

        print(f"Epoch: {_+1}, Train Loss: {tot_loss/len(xtrain)}")
        train_loss.append(tot_loss/len(xtrain))

        if _ % 10 == 0:
            tot_loss = 0
            for i in range(0, len(xtest), batch_size):
                x = xtest[i:min(i+batch_size, len(xtest))]
                y = torch.tensor(ytest[i:min(i+batch_size, len(ytest))]).float()
                y_pred = model(x)
                loss = loss_function(y_pred, y)
                tot_loss += loss.item()
            print(f"Epoch: {_+1}, Validation Loss: {tot_loss/len(xtest)}")
            print("--------------------------------------------------")
            print("Real label:" + str(y[0]))
            print("Predicted label:" + str(y_pred[0]))
            print("--------------------------------------------------")
            val_loss.append(tot_loss/len(xtest))

    return train_loss, val_loss

train(model, X_train, y_train, X_test, y_test, 20, 10)

1 fc layers
output layer count: 1
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am here
I am h

KeyboardInterrupt: 