# Setup

## Imports

In [None]:
# %pip install opencv-python
# %pip install numpy==1.23.0
# %pip install matplotlib
# %pip install pandas
# %pip install matrepr
# %pip install mediapipe opencv-python-headless
# !wget -q https://storage.googleapis.com/mediapipe-models/gesture_recognizer/gesture_recognizer/float16/1/gesture_recognizer.task

In [None]:
import cv2
import os
import shutil
import numpy as np
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

## Globals

In [None]:
# --- Input / output locations ---
input_video = "new_video.mov"  # Replace with path of input video
output_video = "output_video_clean"  # Output name; create_video appends ".mp4"
output_directory = "videoframes"  # Folder that receives the extracted frames

# Set whether or not you want tracking info in the output video
visualize_tracking = True

# --- Gesture recognizer built from the local "gesture_recognizer.task" model file ---
base_options = python.BaseOptions(model_asset_path='gesture_recognizer.task')
options = vision.GestureRecognizerOptions(base_options=base_options)
recognizer = vision.GestureRecognizer.create_from_options(options)

## Video processing

In [None]:
def create_video(frames, output_name, fps=50):
    """Encode a sequence of frames into the file ``<output_name>.mp4``.

    Args:
        frames: Non-empty sequence of images whose ``.shape`` is
            ``(height, width, channels)``. The output frame size is taken
            from the first frame.
        output_name: Output path without extension; ".mp4" is appended.
        fps: Frame rate passed to the video writer. Defaults to 50, the
            value this function used before it became a parameter.

    Raises:
        ValueError: If ``frames`` is empty.
        IOError: If the video writer could not be opened.
    """
    if len(frames) == 0:
        raise ValueError("create_video() needs at least one frame")

    height, width, _ = frames[0].shape
    output_video_path = "{}.mp4".format(output_name)

    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    try:
        if not video.isOpened():
            raise IOError("Could not open video writer for '{}'".format(output_video_path))

        for frame in frames:
            video.write(frame)
    finally:
        # Always release the writer so the file is closed even if a write fails
        video.release()

In [None]:
# Split video into frames in the videoframes directory
# Runtime: ~60 seconds

vidcap = cv2.VideoCapture(input_video)

# Fail loudly, and before wiping any earlier results, if the video cannot be read
if not vidcap.isOpened():
    vidcap.release()
    raise IOError(f"Could not open input video '{input_video}'")

# Delete directory if it already exists and make a new one
if os.path.exists(output_directory):
    shutil.rmtree(output_directory)
os.makedirs(output_directory)

frame_count = 0

try:
    while True:
        ret, frame = vidcap.read()

        if not ret:
            break

        # Save each frame as an image in the output folder.
        # The index is zero-padded to 4 digits, so an alphabetical sort of the
        # file names matches frame order for up to 10,000 frames.
        frame_name = f"frame_{frame_count:04d}.jpg"
        frame_path = os.path.join(output_directory, frame_name)
        if not cv2.imwrite(frame_path, frame):
            raise IOError(f"Could not write frame image '{frame_path}'")

        frame_count += 1
finally:
    # Release the capture even if reading or writing a frame fails
    vidcap.release()

print(f"Video frames saved in '{output_directory}'")

## Page boundary detection

In [None]:
# Image used as the reference picture when the page area is selected
first_frame_name = "frame_0000.jpg"
first_frame_path = os.path.join(output_directory, first_frame_name)

def get_paper_bounds(image_path):
    """Let the user draw a rectangle on an image and return its bounds.

    The image is shown in an OpenCV window named 'Image' and the rectangle
    is obtained from ``cv2.selectROI``; the window is destroyed afterwards.

    Args:
        image_path: Path of the image to display.

    Returns:
        Tuple ``(x_min, y_min, x_max, y_max)`` of integers.

    Raises:
        FileNotFoundError: If the image could not be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising
        raise FileNotFoundError(f"Could not read image '{image_path}'")

    cv2.namedWindow('Image')
    try:
        cv2.imshow('Image', image)

        # Select box using a mouse event callback
        box = cv2.selectROI('Image', image, fromCenter=False, showCrosshair=True)
    finally:
        # Close the window even if the selection step raises
        cv2.destroyWindow('Image')

    # box is (x, y, width, height); convert to x_min, y_min, x_max, y_max
    return int(box[0]), int(box[1]), int(box[0] + box[2]), int(box[1] + box[3])

# Ask for the page rectangle on the first frame and keep it as the valid tracking area
paper_bounds = get_paper_bounds(first_frame_path)
threshold_x_min, threshold_y_min, threshold_x_max, threshold_y_max = paper_bounds
print(threshold_x_max, threshold_x_min, threshold_y_max, threshold_y_min)

# Annotator

In [None]:
# Walk through the extracted frames in name order, recognize the hand gesture in
# each one, and build the annotated frames of the output video.
#
# Gesture protocol implemented below:
#   Open_Palm    -> start a selection, or end it after thumb tracking
#   (selecting)  -> the thumb tip position grows the selection box
#   Victory      -> highlight bright pixels inside the latest box
#   ILoveYou     -> recolor dark pixels inside the latest box
#   Closed_Fist  -> erase the highlight / recolor
#
# NOTE: every annotated frame is kept in memory in `video_frames`.
video_frames = []

selecting = False
highlighted = False
colored = False

# Colors are written straight into frames loaded with cv2.imread
highlight_color = [255, 233, 160]
changed_color = [255, 0, 150]

# Channel-sum thresholds used to pick "white" background and "black" text pixels
HIGHLIGHT_MIN_INTENSITY = 660  # 220*3
RECOLOR_MAX_INTENSITY = 300

max_x, max_y, min_x, min_y = float('-inf'), float('-inf'), float('inf'), float('inf')

# Use the page bounds selected earlier in the notebook when they exist and
# describe a non-empty rectangle; otherwise fall back to the defaults that
# used to be hard-coded here (and used to overwrite the user's selection).
DEFAULT_PAPER_BOUNDS = (430, 200, 1200, 800)
try:
    paper_bounds = (threshold_x_min, threshold_y_min, threshold_x_max, threshold_y_max)
except NameError:
    paper_bounds = DEFAULT_PAPER_BOUNDS
if not (paper_bounds[0] < paper_bounds[2] and paper_bounds[1] < paper_bounds[3]):
    paper_bounds = DEFAULT_PAPER_BOUNDS
threshold_x_min, threshold_y_min, threshold_x_max, threshold_y_max = paper_bounds

last_gesture = ""
all_boxes = []
mask = None  # allocated when the first selection starts

for filename in sorted(os.listdir(output_directory)):
    print(filename)
    frame_path = os.path.join(output_directory, filename)

    if not os.path.isfile(frame_path):  # Check if it's a file (not a subdirectory)
        continue

    # Create image that we can draw on
    img = cv2.imread(frame_path)
    if img is None:
        # Not a readable image (cv2.imread returns None on failure); skip it
        continue
    img_height, img_width = img.shape[:2]

    # Create image for gesture recognition
    image = mp.Image.create_from_file(frame_path)

    # For each frame, identify the hand gesture in the input image
    recognition_result = recognizer.recognize(image)

    hand_detected = bool(recognition_result.gestures)
    if hand_detected:
        # Get the most probable gesture of the hand
        # Format: Category(index, score, display_name, category_name)
        gesture = recognition_result.gestures[0][0].category_name
        hand_landmarks = recognition_result.hand_landmarks
    else:
        gesture = "None"

    # Perform an action for each given gesture. Frames without a detected hand
    # leave the state untouched but are still annotated and written below.
    if hand_detected:
        if gesture == "Open_Palm":  # Start or stop selecting
            if not selecting and last_gesture != "Closed":
                # Start selecting and reset variables
                selecting = True

                highlighted = False
                colored = False
                mask = np.zeros((img_height, img_width, 3), dtype="uint8")
                max_x, max_y, min_x, min_y = float('-inf'), float('-inf'), float('inf'), float('inf')
                all_boxes = []

            elif selecting and last_gesture == "Selecting":
                # Stop selecting and assign the bounding box
                selecting = False

                if max_x > min_x and max_y > min_y:
                    all_boxes.append((min_x, max_x, min_y, max_y))

            last_gesture = "Closed"

        elif gesture == "None" and last_gesture == "Closed":
            # Sometimes there are gestures during a sequence of open_palm frames that get
            # misidentified as "None", so assign them as "Closed" to keep the selection going
            last_gesture = "Closed"

        elif selecting:  # Specify the selection box coordinates
            # If we are selecting, this should be a Thumbs_Up gesture, but don't check in case it gets misclassified
            # Thumb tip is landmark index 4
            thumb_landmark = hand_landmarks[0][4]
            ind_x = int(thumb_landmark.x * img_width)
            ind_y = int(thumb_landmark.y * img_height)

            if visualize_tracking:
                cv2.putText(img, f"Thumb coordinates: ({ind_x}, {ind_y})",  (50, 150), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                cv2.circle(img, (ind_x, ind_y), 5, (0, 0, 255), -1)  # -1 means a filled circle

            # If the thumb coordinate is not an outlier, update the max and min values
            if (threshold_x_min <= ind_x <= threshold_x_max) and (threshold_y_min <= ind_y <= threshold_y_max):
                max_x = max(max_x, ind_x)
                max_y = max(max_y, ind_y)
                min_x = min(min_x, ind_x)
                min_y = min(min_y, ind_y)

            last_gesture = "Selecting"

        else:  # Not selecting: perform an annotation action
            if gesture == "Victory":  # Highlight
                # Nothing to highlight until a selection box exists
                if not highlighted and all_boxes:
                    # Get the most recent box
                    box_min_x, box_max_x, box_min_y, box_max_y = all_boxes[-1]

                    # Update mask: pixels in the box above a certain intensity (white)
                    # get the highlight color. The channel sum is taken in int32 so it
                    # cannot wrap around in uint8.
                    region = img[box_min_y:box_max_y, box_min_x:box_max_x]
                    bright = region.sum(axis=-1, dtype=np.int32) > HIGHLIGHT_MIN_INTENSITY
                    mask[box_min_y:box_max_y, box_min_x:box_max_x][bright] = highlight_color

                    highlighted = True

            elif gesture == "ILoveYou":  # Change text color
                # Nothing to recolor until a selection box exists
                if not colored and all_boxes:
                    # Get the most recent box
                    box_min_x, box_max_x, box_min_y, box_max_y = all_boxes[-1]

                    # Update mask: pixels in the box below a certain intensity (black)
                    # get the new text color
                    region = img[box_min_y:box_max_y, box_min_x:box_max_x]
                    dark = region.sum(axis=-1, dtype=np.int32) < RECOLOR_MAX_INTENSITY
                    mask[box_min_y:box_max_y, box_min_x:box_max_x][dark] = changed_color

                    colored = True

            elif gesture == "Closed_Fist":  # Erase
                if highlighted or colored:
                    # Reset booleans and mask
                    highlighted = False
                    colored = False
                    mask = np.zeros((img_height, img_width, 3), dtype="uint8")

            last_gesture = "Nonselecting"

    # Apply mask to frame if we have highlighted or changed color:
    # every pixel with a non-zero mask value is replaced by the mask color
    if highlighted or colored:
        painted = mask.any(axis=-1)
        img[painted] = mask[painted]

    if visualize_tracking:
        # Add text to the image
        cv2.putText(img, f"Gesture: {gesture}",  (50, 100), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        # Add selection box to the image (separate names so the running
        # selection extents min_x/max_x/min_y/max_y are never overwritten)
        for box_min_x, box_max_x, box_min_y, box_max_y in all_boxes:
            cv2.rectangle(img, (int(box_min_x), int(box_min_y)), (int(box_max_x), int(box_max_y)), (0, 255, 0), 2)

    # Add frame to video_frames array (every frame, so the output video keeps
    # the frames in which no hand was detected)
    video_frames.append(img)

In [None]:
# Encode the annotated frames collected in video_frames into the output video file
create_video(frames=video_frames, output_name=output_video)