# Project - Deep Learning for Autonomous Vehicles
### Group 11 :  Luca Rezzonico, Aya Zghari, Arwen Giraud

## Milestone 1 - Detection

### Initialization

#### Required installations

In [None]:
# Install MediaPipe (hand/pose landmark detection) and OpenCV into the runtime.
!pip install mediapipe opencv-python
# Constrain the headless OpenCV build to versions below 4.3.
# NOTE(review): presumably a workaround for an incompatibility in the Colab runtime - confirm it is still needed.
!pip install "opencv-python-headless<4.3"

Collecting mediapipe
  Downloading mediapipe-0.8.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32.7 MB)
[K     |████████████████████████████████| 32.7 MB 136 kB/s 
Installing collected packages: mediapipe
Successfully installed mediapipe-0.8.9.1
Collecting opencv-python-headless<4.3
  Downloading opencv_python_headless-4.2.0.34-cp37-cp37m-manylinux1_x86_64.whl (21.6 MB)
[K     |████████████████████████████████| 21.6 MB 1.4 MB/s 
Installing collected packages: opencv-python-headless
Successfully installed opencv-python-headless-4.2.0.34


#### Add dependencies

In [None]:
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from base64 import b64decode, b64encode

import PIL
import io
import html
import cv2
import time
import numpy as np
import mediapipe as mp
import matplotlib.pyplot as plt
%matplotlib inline



```
# Ce texte est au format code
```

#### Import mediapipe library for joint detection

In [None]:
# Short alias for MediaPipe's holistic solution module; the notebook uses it for
# `mp_holistic.Holistic` (the detector) and `mp_holistic.HandLandmark` (landmark ids).
mp_holistic = mp.solutions.holistic

### Constants

In [None]:
# Tip landmark ids of the index, middle, ring and pinky fingers.
# The thumb is deliberately absent from this list: countFingers handles it
# separately using landmarks 4 and 2.
fingers_tips_ids = [mp_holistic.HandLandmark.INDEX_FINGER_TIP, mp_holistic.HandLandmark.MIDDLE_FINGER_TIP,
                    mp_holistic.HandLandmark.RING_FINGER_TIP, mp_holistic.HandLandmark.PINKY_TIP]

### Function definition

Setting Up Darknet for YOLOv4


In [None]:
# Clone the AlexeyAB darknet repository into ./darknet; it is built in the
# following cells and provides the `darknet` Python module imported later.
!git clone https://github.com/AlexeyAB/darknet

Cloning into 'darknet'...
remote: Enumerating objects: 15412, done.[K
remote: Total 15412 (delta 0), reused 0 (delta 0), pack-reused 15412[K
Receiving objects: 100% (15412/15412), 14.02 MiB | 19.51 MiB/s, done.
Resolving deltas: 100% (10356/10356), done.


In [None]:
# Enter the darknet checkout and edit its Makefile in place so the build has
# OPENCV, GPU, CUDNN, CUDNN_HALF and LIBSO enabled (each flag flipped from 0 to 1).
# Note: `%cd` changes the notebook's working directory for all later cells.
%cd darknet
!sed -i 's/OPENCV=0/OPENCV=1/' Makefile
!sed -i 's/GPU=0/GPU=1/' Makefile
!sed -i 's/CUDNN=0/CUDNN=1/' Makefile
!sed -i 's/CUDNN_HALF=0/CUDNN_HALF=1/' Makefile
!sed -i 's/LIBSO=0/LIBSO=1/' Makefile

/content/darknet


In [None]:
# Build darknet with the Makefile settings chosen above, so that the darknet.py
# file can be used afterwards and has its dependencies available.
!make

In [None]:
# Get the scaled YOLOv4 weights file that is pre-trained to detect 80 classes (objects) from a shared Google Drive.
# The inner wget fetches the download page and saves cookies; sed extracts the `confirm` token from it,
# the outer wget then downloads the file to yolov4-csp.weights, and the cookie file is removed afterwards.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1V3vsIaxAlGWvK4Aar9bAiK5U0QFttKwq' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1V3vsIaxAlGWvK4Aar9bAiK5U0QFttKwq" -O yolov4-csp.weights && rm -rf /tmp/cookies.txt

In [None]:
# Import darknet functions to perform object detections.
# NOTE(review): the wildcard import is what brings load_network, network_width,
# network_height, make_image, copy_image_from_bytes, detect_image, free_image and
# bbox2points into the notebook namespace; it can also shadow existing names.
from darknet import *
# Load the YOLOv4 network. The cfg/data/weights paths are relative, so this cell
# must run with the darknet directory as working directory (set by `%cd darknet` above).
network, class_names, class_colors = load_network("cfg/yolov4-csp.cfg", "cfg/coco.data", "yolov4-csp.weights")
# Network dimensions as reported by darknet; frames are resized to this size in darknet_helper.
width = network_width(network)
height = network_height(network)

# darknet helper function to run detection on image
def darknet_helper(img, width, height):
  """
  Run the globally loaded darknet network on a single frame.

  Params:
          img:    OpenCV BGR image (3-dimensional array: rows, columns, channels)
          width:  width the frame is resized to before detection
          height: height the frame is resized to before detection
  Returns:
          detections:   result of darknet's detect_image for the resized frame
          width_ratio:  original image width divided by `width`
          height_ratio: original image height divided by `height`
                        (multiply detection coordinates by these ratios to map
                        them back onto the original frame)
  """
  # get image ratios to convert bounding boxes to proper size
  img_height, img_width, _ = img.shape
  width_ratio = img_width/width
  height_ratio = img_height/height

  # Do all OpenCV preprocessing before allocating the darknet image, so a
  # failure here cannot leave an unfreed darknet buffer behind.
  img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
  img_resized = cv2.resize(img_rgb, (width, height),
                              interpolation=cv2.INTER_LINEAR)

  darknet_image = make_image(width, height, 3)
  try:
    # run model on darknet style image to get detections
    copy_image_from_bytes(darknet_image, img_resized.tobytes())
    detections = detect_image(network, class_names, darknet_image)
  finally:
    # Always release the darknet image, even when detection raises.
    free_image(darknet_image)
  return detections, width_ratio, height_ratio

#### Conversion functions



Function to convert JavaScript object into OpenCV image

In [None]:
def js_to_image(js_reply):
  """
  Decode a data-URL string coming from the browser into an OpenCV image.

  Params:
          js_reply: string of the form '<header>,<base64 payload>' holding the webcam image
  Returns:
          img: OpenCV BGR image
  """
  # Everything after the first comma is the base64-encoded image payload.
  encoded_payload = js_reply.split(',')[1]
  raw_buffer = np.frombuffer(b64decode(encoded_payload), dtype=np.uint8)
  # IMREAD_COLOR (== 1) always yields a 3-channel BGR image.
  return cv2.imdecode(raw_buffer, flags=cv2.IMREAD_COLOR)

Function to convert an OpenCV rectangle bounding-box image into a base64 byte string to be overlaid on the video stream

In [None]:
def bbox_to_bytes(bbox_array):
  """
  Encode an RGBA overlay array as a PNG data URL.

  Params:
          bbox_array: Numpy array (pixels) containing rectangle to overlay on video stream.
  Returns:
        bytes: Base64 image byte string, prefixed with 'data:image/png;base64,'
  """
  # `import PIL` alone does not load the `PIL.Image` submodule; import it
  # explicitly so this function does not depend on another library having
  # imported it first.
  import PIL.Image

  # convert array into PIL image
  bbox_PIL = PIL.Image.fromarray(bbox_array, 'RGBA')
  iobuf = io.BytesIO()
  # format bbox into png for return
  bbox_PIL.save(iobuf, format='png')
  # format return string
  bbox_bytes = 'data:image/png;base64,{}'.format(b64encode(iobuf.getvalue()).decode('utf-8'))

  return bbox_bytes

#### Video functions

JavaScript to properly create our live video stream using our webcam as input

In [None]:
def video_stream():
  """
  Inject the JavaScript that drives the live webcam preview into the notebook output.

  The script defines four functions in the page:
    removeDom        - stops the first video track and removes the preview elements.
    onAnimationFrame - on each animation frame, if a capture is pending, draws the
                       video onto a 640x480 canvas and resolves the pending promise
                       with a JPEG data URL (quality 0.8), or with "" when shutting down.
    createDom        - builds the preview (status label, video element, absolutely
                       positioned overlay image, instruction text) once and requests
                       the webcam with facingMode "environment". Clicking the video,
                       the overlay or the instruction sets the shutdown flag.
    stream_frame     - called from Python (see video_frame): updates the label and
                       overlay image, waits for the next captured frame and returns
                       an object with 'create', 'show', 'capture' timings and 'img';
                       returns '' after a shutdown was requested.

  This function only displays the script; it returns nothing.
  """
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;
    
    var pendingResolve = null;
    var shutdown = false;
    
    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }
    
    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }
    
    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);
      
      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);
           
      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);
      
      const instruction = document.createElement('div');
      instruction.innerHTML = 
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };
      
      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth;
      captureCanvas.height = 480; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);
      
      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();
      
      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }
            
      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }
      
      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;
      
      return {'create': preShow - preCreate, 
              'show': preCapture - preShow, 
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  # Emit the script into the cell output so stream_frame becomes callable via eval_js.
  display(js)

Function to add the bbox to the live video

In [None]:
def video_frame(label, bbox):
  """
  Push a status label and overlay to the browser and fetch the next webcam frame.

  Params:
          label: text shown in the status line of the preview
          bbox:  overlay image as a data-URL string ('' keeps the current overlay)
  Returns:
          The value returned by the JavaScript `stream_frame` function.
  """
  # Both arguments are spliced into the JS call as double-quoted string literals.
  js_call = 'stream_frame("{}", "{}")'.format(label, bbox)
  return eval_js(js_call)

Take a photo and analyse the symbol on it

In [None]:
def take_photo(filename='photo.jpg', quality=0.8):
  """
  Capture one webcam photo in the browser, detect the hand symbol on it and save it.

  Params:
          filename: path the captured photo is written to
          quality:  JPEG quality passed to the browser's canvas.toDataURL
  Returns:
          filename:         the path the photo was saved to
          fingers_statuses: finger status dictionary returned by process_img for the photo
  """
  # The script shows a video preview with a 'Capture' button, waits for a click,
  # grabs the current frame at the video's native size and returns it as a JPEG data URL.
  js = Javascript('''
    async function takePhoto(quality) {
      const div = document.createElement('div');
      const capture = document.createElement('button');
      capture.textContent = 'Capture';
      div.appendChild(capture);

      const video = document.createElement('video');
      video.style.display = 'block';
      const stream = await navigator.mediaDevices.getUserMedia({video: true});

      document.body.appendChild(div);
      div.appendChild(video);
      video.srcObject = stream;
      await video.play();

      // Resize the output to fit the video element.
      google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

      // Wait for Capture to be clicked.
      await new Promise((resolve) => capture.onclick = resolve);

      const canvas = document.createElement('canvas');
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      canvas.getContext('2d').drawImage(video, 0, 0);
      stream.getVideoTracks()[0].stop();
      div.remove();
      return canvas.toDataURL('image/jpeg', quality);
    }
    ''')
  display(js)

  # get photo data (blocks until the user clicks 'Capture')
  data = eval_js('takePhoto({})'.format(quality))

  with mp_holistic.Holistic(min_detection_confidence = 0.5, min_tracking_confidence = 0.5) as holistic:
      # get OpenCV format image
      img = js_to_image(data)
      #img = cv2.flip(img, 1)

      # Only the finger statuses and the RGB image are used here; the overlay is ignored.
      bbox_array, fingers_statuses, image = process_img(img, holistic)

      # Image back to BGR for rendering (process_img returns it in RGB, cv2.imwrite expects BGR)
      image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

      # save image
      cv2.imwrite(filename, image)

  return filename, fingers_statuses

#### Functions for symbol recognition

This function will count the number of fingers up for each hand in the image.

In [None]:
def countFingers(image, hand_landmarks, hand_label, fingers_statuses):
    '''
    Mark which fingers of one hand are open in the shared status dictionary.

    Params:
        image:            The image the landmarks were detected on (not used by this function).
        hand_landmarks:   The landmarks detected for the hand; read through its `.landmark` sequence.
        hand_label:       'Left' or 'Right'
        fingers_statuses: Dictionary keyed '<HAND>_<FINGER>' holding the open/closed status of each finger.
    Returns:
        fingers_statuses: The same dictionary, with the entries of every open finger set to True.
                          Entries are never reset to False here.
    '''
    landmarks = hand_landmarks.landmark

    # Index, middle, ring and pinky: the finger counts as open when its tip has a
    # smaller y than the landmark two indices before it (the PIP joint).
    for tip_id in fingers_tips_ids:
        finger = tip_id.name.split("_")[0]
        tip_is_higher = landmarks[tip_id].y < landmarks[tip_id - 2].y
        if tip_is_higher:
            fingers_statuses[hand_label.upper()+"_"+finger] = True

    # Thumb: compare the x-coordinates of its tip (landmark 4) and its MCP (landmark 2).
    tip_x = landmarks[4].x
    mcp_x = landmarks[2].x

    # The direction that counts as "open" is mirrored between the two hands.
    if hand_label == 'Right':
        thumb_open = tip_x > mcp_x
    elif hand_label == 'Left':
        thumb_open = tip_x < mcp_x
    else:
        thumb_open = False

    if thumb_open:
        fingers_statuses[hand_label.upper()+"_THUMB"] = True

    return fingers_statuses

Analyse image

In [None]:
def _draw_hand_bbox(bbox_array, hand_landmarks, img_width, img_height, color):
    """Draw a rectangle around one detected hand on the overlay and return the overlay."""
    points = hand_landmarks.landmark
    # Landmark coordinates are normalized, so scale x by the image width and y by its height.
    # Horizontal extent: index-finger MCP (5) to pinky MCP (17);
    # vertical extent: middle-finger tip (12) to wrist (0).
    x_min = round(points[5].x * img_width)
    x_max = round(points[17].x * img_width)
    y_min = round(points[12].y * img_height)
    y_max = round(points[0].y * img_height)
    return cv2.rectangle(bbox_array, (x_min, y_min), (x_max, y_max), color, 2)


def process_img(img, holistic):
    """
    Detect both hands on a frame, draw their bounding boxes and read the finger statuses.

    Params:
        img:      OpenCV BGR image.
        holistic: An open `mp_holistic.Holistic` instance used to run the detection.
    Returns:
        bbox_array:       RGBA uint8 overlay with the same height/width as `img`, holding one
                          rectangle per detected hand (right hand (255,0,0), left hand (0,0,255)).
                          The alpha channel is left at 0; the caller sets it.
        fingers_statuses: Dictionary '<HAND>_<FINGER>' -> bool (True for open) for both hands.
        image:            The input frame converted to RGB.
    """
    # Convert an image from one color space to another => BGR to RGB
    image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Array shape is (rows, columns, channels), i.e. height comes first.
    img_height, img_width, _ = image.shape

    # Create the transparent overlay at the size of the frame, so the rectangle
    # coordinates computed below always land inside it.
    bbox_array = np.zeros([img_height, img_width, 4], dtype=np.uint8)

    # Make Detections
    results = holistic.process(image)

    # Initialize a dictionary to store the status (i.e., True for open and False for close) of each finger of both hands.
    fingers_statuses = {'RIGHT_THUMB': False, 'RIGHT_INDEX': False, 'RIGHT_MIDDLE': False, 'RIGHT_RING': False,
                        'RIGHT_PINKY': False, 'LEFT_THUMB': False, 'LEFT_INDEX': False, 'LEFT_MIDDLE': False,
                        'LEFT_RING': False, 'LEFT_PINKY': False}

    # Same treatment for both hands; only the label and the rectangle colour differ.
    hands = (('Right', results.right_hand_landmarks, (255, 0, 0)),
             ('Left', results.left_hand_landmarks, (0, 0, 255)))

    for hand_label, hand_landmarks, color in hands:
        if not hand_landmarks:
            continue
        bbox_array = _draw_hand_bbox(bbox_array, hand_landmarks, img_width, img_height, color)
        fingers_statuses = countFingers(image, hand_landmarks, hand_label, fingers_statuses)

    return bbox_array, fingers_statuses, image

Check if the symbol shown is the correct one

In [None]:
def check_symbol():
  """
  Compare the finger statuses of the current frame with the reference symbol.

  Reads the notebook-level `fingers_statuses` (current frame) and `model_statuses`
  (reference photo). Only the fingers listed in `fingers_tips_ids` are compared
  for each hand, so the thumb entries are not taken into account.

  Returns:
      True when every compared finger has the same status in both dictionaries, False otherwise.
  """
  # Build the '<HAND>_<FINGER>' keys for both hands.
  compared_keys = [hand.upper() + "_" + tip.name.split("_")[0]
                   for hand in ['Right', 'Left']
                   for tip in fingers_tips_ids]

  # Every key is checked; the symbol is correct only if no finger differs.
  differing = [key for key in compared_keys
               if fingers_statuses[key] != model_statuses[key]]
  return not differing

### Main Program



```
# Ce texte est au format code
```

#### Step 1 - Define symbol

Take a photo in order to have the reference symbol

In [None]:
try:
  # take_photo saves the capture and returns its path together with the
  # finger statuses detected on it; these become the reference symbol.
  filename, model_statuses = take_photo('photo.jpg')
  print('Saved to {}'.format(filename))

  # Render the freshly saved photo in the cell output.
  display(Image(filename))
except Exception as capture_error:
  # Raised e.g. when no webcam is available or the page was not granted
  # permission to use it. Note that `model_statuses` stays undefined in that case.
  print(capture_error)

Sanity check : Just to verify fingers were correctly identified



In [None]:
# Show the reference finger statuses captured in step 1 (requires that cell to have succeeded).
print(model_statuses)

#### Step 2 - Detection during live stream

In [None]:
# start streaming video from webcam
video_stream()
# label for video
label_html = 'Capturing...'
# initialize bounding box overlay to empty
bbox = ''
count = 0

with mp_holistic.Holistic(min_detection_confidence = 0.5, min_tracking_confidence = 0.5) as holistic:
    while True:
        # Send the previous overlay to the browser and wait for the next frame;
        # an empty reply means the user stopped the stream.
        js_reply = video_frame(label_html, bbox)
        if not js_reply:
            break

        # convert JS response to OpenCV Image
        img = js_to_image(js_reply["img"])

        # Detect the hands first: process_img returns a fresh transparent overlay
        # for this frame, which the object detections are then drawn onto.
        bbox_array, fingers_statuses, image = process_img(img, holistic)

        # call our darknet helper on the current video frame
        detections, width_ratio, height_ratio = darknet_helper(img, width, height)

        # loop through detections and draw them on transparent overlay image
        # (the loop variable is not called `bbox`, which holds the overlay string)
        for label, confidence, detection_bbox in detections:
            left, top, right, bottom = bbox2points(detection_bbox)
            left, top, right, bottom = int(left * width_ratio), int(top * height_ratio), int(right * width_ratio), int(bottom * height_ratio)
            bbox_array = cv2.rectangle(bbox_array, (left, top), (right, bottom), class_colors[label], 2)
            bbox_array = cv2.putText(bbox_array, "{} [{:.2f}]".format(label, float(confidence)),
                                     (left, top - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, class_colors[label], 2)

        # check_symbol compares the notebook-level fingers_statuses with model_statuses
        correct_symbol = check_symbol()

        # Show nothing at all while the reference symbol is not being shown.
        if not correct_symbol:
          bbox_array = np.zeros_like(bbox_array)

        # Make every drawn pixel opaque and leave the rest transparent.
        bbox_array[:,:,3] = (bbox_array.max(axis = 2) > 0 ).astype(int) * 255
        # convert overlay of bbox into bytes
        bbox_bytes = bbox_to_bytes(bbox_array)
        # update bbox so next frame gets new overlay
        bbox = bbox_bytes

        # Image back to BGR for rendering
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)