In [71]:
import cv2 as cv
import numpy as np
import keras
from keras.models import load_model
from pynput.keyboard import Key, Controller

In [72]:
# Running-average background of the region of interest. Stays None until
# calibrate_background() seeds it from the first calibration frame.
# (A `global` statement at module level has no effect, so none is used here.)
background = None

# --- Calibration ------------------------------------------------------------
calibration_frames = 120   # frames fed to calibrate_background() before detection starts
frame_count = 0            # calibration frames accumulated so far
recalibrating = True       # True while the background model is being (re)built
accum_weight = 0.1         # weight passed to cv.accumulateWeighted for each new frame

# --- Region of interest -----------------------------------------------------
# The main loop slices the flipped frame as frame[top:bottom, right:left],
# so `right` holds the smaller x coordinate and `left` the larger one.
top, right, bottom, left = 0, 300, 300, 600

# --- Detection delay --------------------------------------------------------
detected = False              # set once a gesture other than 'None' has been predicted
detection_delay_frames = 30   # frames counted after a detection before a key is sent
frames_since_detection = 0

# --- Cooldown after an action -----------------------------------------------
cooldown = False              # while True the main loop skips segmentation/prediction
cooldown_frames = 30          # length of the cooldown, in frames
frames_since_gesture = 0
last_gesture = 'None'         # label drawn on the frame during cooldown

# Keras model handed to get_predicted_gesture() on every processed frame.
model = load_model('./hand_recognition_model.keras')

def calibrate_background(region, accum_weight):
    """Fold one frame of the region of interest into the background model.

    While the module-level ``background`` is None, the call seeds it with a
    float copy of ``region``. Every later call blends ``region`` into the
    existing model in place through ``cv.accumulateWeighted``.

    Args:
        region: Image of the region of interest for the current frame.
        accum_weight: Weight given to ``region`` by ``cv.accumulateWeighted``.

    Returns:
        None. The module-level ``background`` is updated as a side effect.
    """
    global background
    if background is not None:
        # Model already seeded: update the running average in place.
        cv.accumulateWeighted(region, background, accum_weight)
    else:
        # astype() allocates a new array, so the model never aliases `region`.
        background = region.astype('float')
    
def segment(region, threshold=25):
    """Separate the foreground of ``region`` from the background model.

    The module-level ``background`` is cast to uint8 and differenced against
    ``region``; the difference is binarised with ``cv.THRESH_BINARY`` and the
    external contours of the resulting mask are extracted.

    Args:
        region: Image of the region of interest for the current frame.
        threshold: Threshold value passed to ``cv.threshold``; pixels that
            pass are set to 255 in the mask.

    Returns:
        ``None`` when no contour is found, otherwise a tuple
        ``(thresholded, segmented)`` holding the binary mask and the contour
        with the largest ``cv.contourArea``.
    """
    reference = background.astype('uint8')
    delta = cv.absdiff(reference, region)
    _, mask = cv.threshold(delta, threshold, 255, cv.THRESH_BINARY)

    outlines, _ = cv.findContours(mask.copy(), cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
    if len(outlines) > 0:
        largest = max(outlines, key=cv.contourArea)
        return mask, largest
    return None
    
def get_predicted_gesture(model):
    """Classify the image stored in 'temp.jpg' and return its action label.

    The file is read back from disk, converted to grayscale, resized to
    100x100 and reshaped to a single-image batch of shape (1, 100, 100, 1)
    before being passed to ``model.predict_on_batch``. The index of the
    highest score is then mapped to an action label.

    Args:
        model: Object exposing ``predict_on_batch(batch)`` that returns the
            class scores for the batch.

    Returns:
        One of 'None', 'Play/Pause', 'Play Next' or 'Play Previous'.
        'None' is also returned when 'temp.jpg' cannot be read or when the
        predicted class index is not one of the known classes.
    """
    image = cv.imread('temp.jpg')
    if image is None:
        # cv.imread reports a missing or unreadable file by returning None
        # rather than raising; without this guard cvtColor would crash.
        return 'None'

    image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    image = cv.resize(image, (100, 100))

    # Batch of one single-channel 100x100 image.
    image = image.reshape(1, 100, 100, 1)

    prediction = model.predict_on_batch(image)
    gesture = int(np.argmax(prediction))

    # Class index -> action label.
    actions = {
        0: 'None',
        1: 'Play/Pause',     # Ok
        2: 'Play Next',      # Thumbs up
        3: 'Play Previous',  # Thumbs down
        4: 'Play/Pause',     # Fist
        5: 'Play/Pause',     # Five
    }
    return actions.get(gesture, 'None')

In [73]:
keyboard = Controller()
cap = cv.VideoCapture(0)

while True:
    _, frame = cap.read()
    frame = cv.flip(frame, 1)
    frame_copy = frame.copy()
    
    region = cv.cvtColor(frame[top:bottom, right:left], cv.COLOR_BGR2GRAY)
    region = cv.GaussianBlur(region, (7, 7), 1.0)
    
    if recalibrating:
        if frame_count < calibration_frames:
            calibrate_background(region, accum_weight)
            cv.putText(frame, 'Recalibrating...', (50, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            frame_count += 1
        else:
            recalibrating = False
    elif not cooldown:
        segmented_region = segment(region)
        if segmented_region:
            thresholded, segmented = segmented_region
            cv.drawContours(
                frame, [segmented + (right, top)], -1, (0, 0, 255)
            )
            cv.imwrite('temp.jpg', thresholded)
            gesture = get_predicted_gesture(model)
            cv.putText(
                frame, gesture, (50, 50),
                cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2
            )
            cv.imshow('thresholded', thresholded)
            
            if gesture != 'None' and not detected:
                detected = True
                frames_since_detection = 0
            elif frames_since_detection < detection_delay_frames:
                frames_since_detection += 1
            else:
                cooldown = True
                detected = False
                frames_since_gesture = 0
                last_gesture = gesture
                match gesture:
                    case 'Play/Pause':
                        keyboard.tap(Key.media_play_pause)
                    case 'Play Next':
                        keyboard.tap(Key.media_next)
                    case 'Play Previous':
                        keyboard.tap(Key.media_previous)
    elif frames_since_gesture < cooldown_frames:
        cv.putText(
                frame, last_gesture, (50, 50),
                cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2
            )
        frames_since_gesture += 1
    else:
        cooldown = False
        last_gesture = 'None'
                        
    
    cv.rectangle(frame, (left, top), (right, bottom), (0, 0, 255), 2)
    cv.imshow("video feed", frame)
    
    keypress = cv.waitKey(1) & 0xFF
    if keypress == ord("q"):
        break
    elif keypress == ord("r"):
        recalibrating = True
        background = None
        frame_count = 0

cap.release()
cv.destroyAllWindows()