<a href="https://colab.research.google.com/github/mrpranti/Face/blob/main/PredictionOnWebcamVideos_VGG_Vers3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

%cd drive/MyDrive/DL Group Project
%ls

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1q1oh4KNPvyl3ACTsbffzNd5ALcMrB42e/DL Group Project
[0m[01;34m'Draft Version'[0m/      [01;34mProject_Code[0m/           [01;34m'Project Paper'[0m/
 [01;34mLecture_Notebooks[0m/  [01;34m'Project Organization'[0m/


In [None]:
import numpy as np
import pandas as pd
import matplotlib
matplotlib.style.use('ggplot')
import matplotlib.pyplot as plt
import cv2
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch

In [None]:
# Project paths, relative to the working directory selected by the `%cd`
# in the first cell.
# CSV with the test data
test_csv_path = 'Project_Code/Data/test.csv'
# folder that holds the keypoint model checkpoint (model.pth)
keypoint_model_path = 'Project_Code/OutputFolder/Model03_400epochs'
# folder that holds the Caffe face detector files (prototxt + caffemodel)
face_model_path = 'Project_Code/FaceRecognizerModel'
# folder the captured webcam photo is written to (note the trailing slash:
# file names are appended by plain string concatenation)
output_path = 'Project_Code/WebcamTest/'

# run on the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Size of images: side length (in pixels) of the square grayscale face crop
# that is fed to the keypoint model
SIZE = 96

In [None]:
# Load the Caffe face detector: the network definition (prototxt) and the
# trained weights (caffemodel) both live in `face_model_path`.
modelFile = f"{face_model_path}/res10_300x300_ssd_iter_140000.caffemodel"
configFile = f"{face_model_path}/deploy.prototxt.txt"
net = cv2.dnn.readNetFromCaffe(configFile, modelFile)

# 1) Prepare Facial Keypoints Detection Model

In [None]:
# load additional libraries 
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# to understand the idea behind the following structure:
# https://pytorch.org/docs/stable/generated/torch.nn.Module.html
class FaceKeypointModel(nn.Module):
    """VGG-style CNN that regresses 30 values from a single-channel image.

    The network consists of twelve blocks of
    ``Conv2d(kernel_size=3, padding=2) -> LeakyReLU(0.1) -> BatchNorm2d``.
    A 2x2 max-pool follows every second block except the last one. The final
    feature map is globally average-pooled to a 512-vector, passed through
    dropout and two linear layers.

    Input:  tensor of shape (batch, 1, H, W). Global average pooling makes the
            head independent of the spatial size.
    Output: tensor of shape (batch, 30).
    """

    def __init__(self):
        super(FaceKeypointModel, self).__init__()

        # model adapted from the concept of VGG net and facial-keypoint-detection.ipynb
        # (kernel 3 with padding 2 grows each spatial dimension by 2 per conv)
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=2)
        self.batch_normal1 = nn.BatchNorm2d(32)

        self.conv2 = nn.Conv2d(32, 32, kernel_size=3, padding=2)
        self.batch_normal2 = nn.BatchNorm2d(32)

        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=2)
        self.batch_normal3 = nn.BatchNorm2d(64)

        self.conv4 = nn.Conv2d(64, 64, kernel_size=3, padding=2)
        self.batch_normal4 = nn.BatchNorm2d(64)

        self.conv5 = nn.Conv2d(64, 96, kernel_size=3, padding=2)
        self.batch_normal5 = nn.BatchNorm2d(96)

        self.conv6 = nn.Conv2d(96, 96, kernel_size=3, padding=2)
        self.batch_normal6 = nn.BatchNorm2d(96)

        self.conv7 = nn.Conv2d(96, 128, kernel_size=3, padding=2)
        self.batch_normal7 = nn.BatchNorm2d(128)

        self.conv8 = nn.Conv2d(128, 128, kernel_size=3, padding=2)
        self.batch_normal8 = nn.BatchNorm2d(128)

        self.conv9 = nn.Conv2d(128, 256, kernel_size=3, padding=2)
        self.batch_normal9 = nn.BatchNorm2d(256)

        self.conv10 = nn.Conv2d(256, 256, kernel_size=3, padding=2)
        self.batch_normal10 = nn.BatchNorm2d(256)

        self.conv11 = nn.Conv2d(256, 512, kernel_size=3, padding=2)
        self.batch_normal11 = nn.BatchNorm2d(512)

        self.conv12 = nn.Conv2d(512, 512, kernel_size=3, padding=2)
        self.batch_normal12 = nn.BatchNorm2d(512)

        self.fc1 = nn.Linear(512, 512)
        self.fc2 = nn.Linear(512, 30)
        self.pool = nn.MaxPool2d(2, 2)
        self.activ = nn.LeakyReLU(0.1)
        # Plain element-wise dropout: it is applied to the already flattened
        # (batch, 512) tensor. nn.Dropout2d is meant for 3-D/4-D feature maps
        # and recent PyTorch releases warn on 2-D input. Dropout layers own no
        # parameters, so existing checkpoints load unchanged.
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Run the network on `x` of shape (batch, 1, H, W); returns (batch, 30)."""
        n_blocks = 12
        for idx in range(1, n_blocks + 1):
            x = getattr(self, f"conv{idx}")(x)
            x = self.activ(x)
            x = getattr(self, f"batch_normal{idx}")(x)
            # down-sample after every second block, but not after the last one
            if idx % 2 == 0 and idx < n_blocks:
                x = self.pool(x)

        # bs is the batch size or respectively the number of instances
        # simultaneously loaded into the model
        bs = x.shape[0]

        # Global average pooling collapses each of the 512 feature maps to one
        # value, giving a 1-dim vector of length 512 per instance
        x = F.adaptive_avg_pool2d(x, 1).reshape(bs, -1)
        # Apply dropout for regularization and preventing the co-adaptation of neurons
        # https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html?highlight=dropout#torch.nn.Dropout
        x = self.dropout(x)
        # two linear layers; there is no activation between them
        x = self.fc1(x)
        out = self.fc2(x)
        return out

In [None]:
model = FaceKeypointModel().to(DEVICE)
# load the model checkpoint; map_location remaps the stored tensors onto
# DEVICE, so a checkpoint written on a GPU still loads when DEVICE is the CPU
checkpoint = torch.load(f"{keypoint_model_path}/model.pth", map_location=DEVICE)
# load model weights state_dict
model.load_state_dict(checkpoint['model_state_dict'])
# switch to inference mode (dropout off, batch norm uses its running statistics)
model.eval()

FaceKeypointModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
  (batch_normal1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
  (batch_normal2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
  (batch_normal3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
  (batch_normal4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv5): Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
  (batch_normal5): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv6): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
  (batch_normal6): BatchN

# 2) Face Recognition on Webcam Image using the new model

In [None]:
# import dependencies
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import cv2
import numpy as np
import PIL
import io
import html
import time

In [None]:
# function to convert the JavaScript object into an OpenCV image
def js_to_image(js_reply):
  """
  Decode a webcam frame delivered by the browser into an OpenCV image.

  Params:
          js_reply: data URL string ("<header>,<base64 payload>") holding the
                    encoded image from the webcam
  Returns:
          img: OpenCV BGR image
  """
  # everything after the first comma is the base64 payload
  encoded_payload = js_reply.split(',')[1]
  raw_bytes = b64decode(encoded_payload)
  # view the encoded file as a flat uint8 buffer, which is what imdecode expects
  pixel_buffer = np.frombuffer(raw_bytes, dtype=np.uint8)
  # flags=1 requests a 3-channel colour image
  return cv2.imdecode(pixel_buffer, flags=1)

# function to convert OpenCV Rectangle bounding box image into base64 byte string to be overlayed on video stream
def bbox_to_bytes(bbox_array):
  """
  Encode an RGBA overlay as a PNG data URL.

  Params:
          bbox_array: Numpy array (pixels) containing rectangle to overlay on video stream.
  Returns:
        bytes: Base64 image byte string, prefixed with 'data:image/png;base64,'
  """
  with io.BytesIO() as png_buffer:
    # interpret the array as RGBA and serialise it as PNG into memory
    PIL.Image.fromarray(bbox_array, 'RGBA').save(png_buffer, format='png')
    encoded = b64encode(png_buffer.getvalue()).decode('utf-8')

  return 'data:image/png;base64,' + encoded

In [None]:
def take_photo(filename='photo.jpg', quality=0.8):
  """
  Capture one webcam photo in the browser, annotate it and save it.

  A "Capture" button and a live preview are shown in the cell output. Once the
  button is clicked, the frame is run through the face detector `net`; every
  detection with confidence > 0.5 gets a bounding box and the keypoints
  predicted by `make_prediction` drawn onto it.

  Params:
          filename: file name, appended to `output_path`, the image is saved under
          quality:  JPEG quality passed to canvas.toDataURL in the browser
  Returns:
          filename: the `filename` argument (without `output_path`)
  Raises:
          ValueError: if the captured frame cannot be decoded
          IOError:    if the annotated image cannot be written
  """
  js = Javascript('''
    async function takePhoto(quality) {
      const div = document.createElement('div');
      const capture = document.createElement('button');
      capture.textContent = 'Capture';
      div.appendChild(capture);

      const video = document.createElement('video');
      video.style.display = 'block';
      const stream = await navigator.mediaDevices.getUserMedia({video: true});

      document.body.appendChild(div);
      div.appendChild(video);
      video.srcObject = stream;
      await video.play();

      // Resize the output to fit the video element.
      google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

      // Wait for Capture to be clicked.
      await new Promise((resolve) => capture.onclick = resolve);

      const canvas = document.createElement('canvas');
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      canvas.getContext('2d').drawImage(video, 0, 0);
      stream.getVideoTracks()[0].stop();
      div.remove();
      return canvas.toDataURL('image/jpeg', quality);
    }
    ''')
  display(js)

  # get photo data (a JPEG data URL)
  data = eval_js('takePhoto({})'.format(quality))
  # get OpenCV format image
  img = js_to_image(data)
  if img is None:
    raise ValueError('Could not decode the image received from the webcam')

  # turn colour picture into grayscale image -> needed for keypoint prediction.
  # js_to_image returns BGR channel order, so the BGR conversion code is required.
  gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

  h, w = img.shape[:2]
  # blob: the image resized to 300 x 300 with the per-channel means subtracted,
  # shape (1, 3, 300, 300)
  blob = cv2.dnn.blobFromImage(cv2.resize(img, (300, 300)), 1.0, (300, 300), (104.0, 117.0, 123.0))
  net.setInput(blob)
  # faces: array of shape (1, 1, N, 7) with one row per candidate detection.
  # For row i:
  #  - [0, 0, i, 2]   confidence that this is really a face
  #  - [0, 0, i, 3:7] box corners (x1, y1, x2, y2) as fractions of width/height
  faces = net.forward()

  # add face bounding boxes and keypoints to image
  for face_idx in range(faces.shape[2]):
    confidence = faces[0, 0, face_idx, 2]
    if confidence <= 0.5:
      continue

    # scale the relative corners to pixel coordinates of the captured image
    box = faces[0, 0, face_idx, 3:7] * np.array([w, h, w, h])
    (x1, y1, x2, y2) = box.astype("int")
    cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0), 2)

    # keypoint prediction (coordinates are returned in full-image pixels)
    x_coord, y_coord = make_prediction(gray, x1, y1, x2, y2)
    for kp_x, kp_y in zip(x_coord, y_coord):
      img = cv2.circle(img, (int(kp_x), int(kp_y)), radius=3, color=(0, 0, 255), thickness=-1)

  # cv2.imwrite reports failure through its return value instead of raising
  if not cv2.imwrite(output_path + filename, img):
    raise IOError('Could not write image to {}'.format(output_path + filename))

  return filename

In [None]:
def make_prediction(gray_img, x1, y1, x2, y2):
  """
  Predict facial keypoints for one face box and map them back to the image.

  Params:
          gray_img: grayscale image (2-dim array) the face was detected in
          x1, y1:   top-left corner of the face box in pixels
          x2, y2:   bottom-right corner of the face box in pixels
  Returns:
          x_coord, y_coord: integer arrays with the keypoint coordinates in
          the pixel frame of `gray_img`; both are empty if the box does not
          overlap the image
  """
  # The detector may report corners outside the image. Negative indices would
  # silently slice from the opposite edge, so clamp the box to the image first.
  img_h, img_w = gray_img.shape[:2]
  x1, x2 = max(int(x1), 0), min(int(x2), img_w)
  y1, y2 = max(int(y1), 0), min(int(y2), img_h)
  if x2 <= x1 or y2 <= y1:
    return np.array([], dtype=int), np.array([], dtype=int)

  # extract face area
  face_box_orig = gray_img[y1:y2, x1:x2]
  h_face, w_face = face_box_orig.shape[:2]

  # resize this face image to SIZE x SIZE
  # if face_box_orig is not square, a black border is added to make it square before it is resized:
  # -> image higher than wide: border on the left and right side -> vert_black is 'True'
  # -> image wider than high:  border on the top and the bottom  -> vert_black is 'False'
  # border_width: width of border on each side
  resized = resize_image(face_box_orig, size=(SIZE, SIZE))
  if isinstance(resized, tuple):
    face_box_resized, vert_black, border_width = resized
  else:
    # only the resized image came back (square crop): nothing was padded
    face_box_resized, vert_black, border_width = resized, False, 0

  # predict keypoints and arrange them as one (x, y) pair per row
  keypoints = predict_keypoints(face_box_resized)
  keypoints_reshaped = keypoints.reshape(-1, 2)

  # now the keypoints coordinates have to be transformed back into the size of
  # the original webcam image: scale from SIZE to the padded square, remove the
  # border on the padded axis, then shift by the box origin
  if vert_black:
    # black border was added to the left and right side
    x_coord = (keypoints_reshaped[:, 0] * h_face) / SIZE - border_width + x1
    y_coord = (keypoints_reshaped[:, 1] * h_face) / SIZE + y1
  else:
    # black border was added at the top and bottom (or not at all)
    x_coord = (keypoints_reshaped[:, 0] * w_face) / SIZE + x1
    y_coord = (keypoints_reshaped[:, 1] * w_face) / SIZE - border_width + y1

  x_coord = x_coord.round().astype(int)
  y_coord = y_coord.round().astype(int)

  return x_coord, y_coord

We have one problem: the face box returned by the detector is not necessarily square. If we take this part of the original image and resize it to 96x96, we get a distorted image, which could reduce the accuracy of our keypoint detector. 
Idea: resize the face image while preserving its aspect ratio and fill the remaining pixels with black. As our model was trained on images that also contained black areas, this should not be a problem. 

In [None]:
# https://stackoverflow.com/questions/44650888/resize-an-image-without-distortion-opencv
def resize_image(img, size=(SIZE,SIZE)):
    """
    Resize `img` to `size` without distortion by padding it to a square first.

    The image is centred on a black square whose side is the longer image
    edge, and that square is then resized. A square input is simply the
    zero-border case, so the function always returns the same 3-tuple.

    Params:
            img:  2-dim (grayscale) or 3-dim (with channels) image array
            size: target size passed to cv2.resize
    Returns:
            resized:      the padded image resized to `size`
            vert_black:   True if the image was higher than wide, i.e. the
                          black border sits on the left and right side;
                          False if it sits on the top and bottom (or if no
                          border was needed)
            border_width: width in pixels of the border on each side, measured
                          before resizing (0 for a square input)
    """
    h, w = img.shape[:2]

    vert_black = h > w
    # side length of the padded square
    dif = max(h, w)

    # INTER_AREA when shrinking, INTER_CUBIC when enlarging
    interpolation = cv2.INTER_AREA if dif > (size[0]+size[1])//2 else cv2.INTER_CUBIC

    # offset of the original image inside the square (one of the two is 0)
    x_pos = (dif - w)//2
    y_pos = (dif - h)//2
    border_width = x_pos if vert_black else y_pos

    if h == w:
        padded = img
    else:
        # img.shape[2:] is () for grayscale and (channels,) for colour input
        padded = np.zeros((dif, dif) + tuple(img.shape[2:]), dtype=img.dtype)
        padded[y_pos:y_pos+h, x_pos:x_pos+w] = img

    # interpolation has to be passed by keyword: the third positional
    # parameter of cv2.resize is `dst`, not the interpolation flag
    return cv2.resize(padded, size, interpolation=interpolation), vert_black, border_width

In [None]:
def predict_keypoints(img):
  """
  Run the keypoint model on one SIZE x SIZE grayscale image.

  Params:
          img: array with SIZE*SIZE pixel values in the range 0..255
  Returns:
          keypoints: numpy array with the raw model output for this image
                     (one row, since a batch of one image is passed in)
  """
  model.eval()
  with torch.no_grad():
    # add the channel axis and scale the pixel values to [0, 1]
    img = img.reshape(1, SIZE, SIZE)
    img = img / 255.0
    img = torch.tensor(img, dtype=torch.float)
    # add the batch axis -> (1, 1, SIZE, SIZE)
    img = img.unsqueeze(0).to(DEVICE)

    # forward pass through the model
    keypoints = model(img).cpu().detach().numpy()
  return keypoints

In [None]:
# Capture one annotated photo and show it inline.
try:
  filename = take_photo()
  print(f'Saved to {filename}')

  # Show the image which was just taken (take_photo stores it under output_path).
  display(Image(output_path + filename))

except Exception as err:
  # Errors will be thrown if the user does not have a webcam or if they do not
  # grant the page permission to access it.
  print(err)

<IPython.core.display.Javascript object>

KeyboardInterrupt: ignored

# 3) Now, let's try a live video stream

In [None]:
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  """
  Inject the JavaScript that drives the live webcam stream into the cell output.

  The script defines `stream_frame(label, imgData)`, which is called from
  Python through `eval_js`. On each call it
    - builds the DOM on first use (status label, <video>, an <img> overlay
      positioned on top of the video, and a "click to stop" instruction),
    - updates the status label and the overlay image when they are non-empty,
    - waits for the next animation frame and returns an object whose 'img'
      entry is the current frame as a JPEG data URL (drawn at 640 x 480,
      quality 0.8), together with 'create' / 'show' / 'capture' timings in
      milliseconds.
  Clicking the video, the overlay or the instruction text sets `shutdown`;
  the following `stream_frame` call then tears the DOM down and returns ''.
  """
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;
    
    var pendingResolve = null;
    var shutdown = false;
    
    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }
    
    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }
    
    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);
      
      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);
           
      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);
      
      const instruction = document.createElement('div');
      instruction.innerHTML = 
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };
      
      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth;
      captureCanvas.height = 480; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);
      
      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();
      
      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }
            
      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }
      
      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;
      
      return {'create': preShow - preCreate, 
              'show': preCapture - preShow, 
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  # register the script in the output frame so eval_js can call stream_frame
  display(js)
  
def video_frame(label, bbox):
  """
  Send the current label and overlay to the browser and fetch the next frame.

  Params:
          label: text/HTML shown as status label ('' keeps the current one)
          bbox:  data URL of the overlay image ('' keeps the current one)
  Returns:
          data: whatever the JavaScript `stream_frame` function returns
  """
  import json

  # json.dumps yields properly quoted and escaped JavaScript string literals,
  # so quotes or backslashes inside `label` cannot break the generated call
  data = eval_js('stream_frame({}, {})'.format(json.dumps(label), json.dumps(bbox)))
  return data

Prediction without facial keypoints

In [None]:
# start streaming video from webcam
video_stream()
# label for video
label_html = 'Capturing...'
# initialize bounding box to empty (no overlay is shown for the first frame)
bbox = ''
count = 0 
# Each iteration sends the previous overlay to the browser, receives the next
# frame, detects faces on it and builds the overlay for the following call.
while True:
    js_reply = video_frame(label_html, bbox)
    # an empty reply means the stream was stopped in the browser
    if not js_reply:
        break

    # convert JS response to OpenCV Image
    img = js_to_image(js_reply["img"])

    # create transparent overlay for bounding box
    # (480 x 640 matches the canvas size used by the JavaScript capture; 4 channels = RGBA)
    bbox_array = np.zeros([480,640,4], dtype=np.uint8)

    h, w = img.shape[:2]
    # the detector input is the frame resized to 300 x 300 with the per-channel means subtracted
    blob = cv2.dnn.blobFromImage(cv2.resize(img, (300, 300)), 1.0, (300, 300), (104.0, 117.0, 123.0))
    net.setInput(blob)
    faces = net.forward()

    # add face bounding boxes to image
    for i in range(faces.shape[2]):
      confidence = faces[0, 0, i, 2]
      # keep only detections the network is reasonably sure about
      if confidence > 0.5:
        # box corners are scaled by width/height to obtain pixel coordinates
        box = faces[0, 0, i, 3:7] * np.array([w, h, w, h])
        (x, y, x1, y1) = box.astype("int")
        cv2.rectangle(bbox_array, (x,y), (x1, y1), (255, 0, 0), 2) 

    # make every drawn pixel opaque and leave the rest fully transparent
    bbox_array[:,:,3] = (bbox_array.max(axis = 2) > 0 ).astype(int) * 255
    # convert overlay of bbox into bytes
    bbox_bytes = bbox_to_bytes(bbox_array)
    # update bbox so next frame gets new overlay
    bbox = bbox_bytes

<IPython.core.display.Javascript object>

Prediction with facial keypoints

In [None]:
# start streaming video from webcam
video_stream()
# label for video
label_html = 'Capturing...'
# initialize bounding box to empty (no overlay is shown for the first frame)
bbox = ''
# Each iteration sends the previous overlay to the browser, receives the next
# frame, detects faces and keypoints on it and builds the overlay for the
# following call.
while True:
    js_reply = video_frame(label_html, bbox)
    # an empty reply means the stream was stopped in the browser
    if not js_reply:
        break

    # convert JS response to OpenCV Image
    img = js_to_image(js_reply["img"])

    # create transparent overlay for bounding box and keypoints
    # (480 x 640 matches the canvas size used by the JavaScript capture; 4 channels = RGBA)
    bbox_array = np.zeros([480,640,4], dtype=np.uint8)

    # grayscale image for keypoint prediction; js_to_image returns BGR channel
    # order, so the BGR conversion code is required
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    h, w = img.shape[:2]
    # the detector input is the colour frame resized to 300 x 300 with the per-channel means subtracted
    blob = cv2.dnn.blobFromImage(cv2.resize(img, (300, 300)), 1.0, (300, 300), (104.0, 117.0, 123.0))
    net.setInput(blob)
    faces = net.forward()

    # add face bounding boxes and keypoints to the overlay
    for face_idx in range(faces.shape[2]):
        confidence = faces[0, 0, face_idx, 2]
        # keep only detections the network is reasonably sure about
        if confidence <= 0.5:
            continue

        # box corners are scaled by width/height to obtain pixel coordinates
        box = faces[0, 0, face_idx, 3:7] * np.array([w, h, w, h])
        (x1, y1, x2, y2) = box.astype("int")
        cv2.rectangle(bbox_array, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0), 2)

        # keypoint prediction (coordinates are returned in full-frame pixels)
        x_coord, y_coord = make_prediction(gray, x1, y1, x2, y2)
        # add keypoints to image
        for kp_x, kp_y in zip(x_coord, y_coord):
            bbox_array = cv2.circle(bbox_array, (int(kp_x), int(kp_y)), radius=3, color=(0, 0, 255), thickness=-1)

    # make every drawn pixel opaque and leave the rest fully transparent
    bbox_array[:,:,3] = (bbox_array.max(axis = 2) > 0 ).astype(int) * 255
    # convert overlay of bbox into bytes
    bbox_bytes = bbox_to_bytes(bbox_array)
    # update bbox so next frame gets new overlay
    bbox = bbox_bytes

<IPython.core.display.Javascript object>

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
