# FINAL SOLUTION (RUN BELOW 2 CELLS)

In [None]:
import cv2
import mediapipe as mp
import numpy as np

def extract_landmarks_from_frame(frame):
    mp_pose = mp.solutions.pose
    pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5)

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = pose.process(frame_rgb)

    landmarks = []
    if results.pose_landmarks:
        for idx, lmk in enumerate(results.pose_landmarks.landmark):
            # Increase weightage for hands and legs
            if idx in [15, 17, 19, 21, 13, 16, 18, 20, 22, 14]:
                landmarks.append([5*lmk.x, 5*lmk.y, lmk.z])
            else:
                landmarks.append([lmk.x, lmk.y, lmk.z])
    else:
        print("No pose landmarks detected in the frame.")
    landmarks = np.array(landmarks).flatten()

    return np.array(landmarks)

def resize_frame(frame, target_width):
    height, width = frame.shape[:2]
    aspect_ratio = width / height
    target_height = int(target_width / aspect_ratio)
    return cv2.resize(frame, (target_width, target_height))

def compare_landmarks(landmarks_1, landmarks_2):
    # Make sure both arrays have the same length
  
    min_length = min(len(landmarks_1), len(landmarks_2))
    landmarks_1 = landmarks_1[:min_length]
    landmarks_2 = landmarks_2[:min_length]

    # Compute similarity
    similarity = np.dot(landmarks_1.flatten(), landmarks_2.flatten()) / \
                 (np.linalg.norm(landmarks_1) * np.linalg.norm(landmarks_2))

    return similarity


In [None]:

gesture_image_path = "gesture_image.png"
gesture_frame = cv2.imread(gesture_image_path)
gesture_landmarks = extract_landmarks_from_frame(gesture_frame)

mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5)

record_video = input("Do you want to record a test video? (y/n): ").lower() == 'y'

if record_video:
    video_output_path = "user_recorded_test_video.mp4"
    frame_width = 640
    frame_height = 480
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(video_output_path, fourcc, 30.0, (frame_width, frame_height))

    cap = cv2.VideoCapture(0)  # Open default camera (index 0)
    print("Recording video. Press 'q' to stop recording.")
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        out.write(frame)
        cv2.imshow("Recording", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    out.release()
    cv2.destroyAllWindows()
    test_video_path = "user_recorded_test_video.mp4"

else:
    test_video_path = "test_video_waving.mp4"

test_video = cv2.VideoCapture(test_video_path)
threshold = float(input("Enter number between 0 and 1 for precision: "))

frame_width = int(test_video.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(test_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
target_width = 640  # Set the target width for resizing
output_path = "output_vid_final.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, 30.0, (target_width, frame_height))

while test_video.isOpened():
    ret, frame = test_video.read()
    if not ret:
        break
    
    frame = resize_frame(frame, target_width)  # Resize frame to fit within target width

    landmarks = extract_landmarks_from_frame(frame)
    if landmarks is not None:
        similarity = compare_landmarks(landmarks, gesture_landmarks)
        if similarity > threshold:
            cv2.putText(frame, "DETECTED", (target_width - 150, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    out.write(frame)
    cv2.imshow("Output Video", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

test_video.release()
out.release()
cv2.destroyAllWindows()


# Experimentations

##  Using OpenCV MatchTemplate

In [6]:
import cv2
import numpy as np

def detect_gesture(frame, gesture_template):
    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    result = cv2.matchTemplate(gray_frame, gesture_template, cv2.TM_CCOEFF_NORMED)
    threshold = 0.5
    locations = np.where(result >= threshold)
    if locations[0].size > 0:
        return True, (locations[1][0], locations[0][0])
    else:
        return False, None

def annotate_frame(frame):
    font = cv2.FONT_HERSHEY_SIMPLEX
    text = 'DETECTED'
    position = (frame.shape[1] - 150, 50)
    font_scale = 1
    font_color = (0, 255, 0)
    thickness = 2
    cv2.putText(frame, text, position, font, font_scale, font_color, thickness, cv2.LINE_AA)
    return frame

def compute_cosine_similarity(frame, gesture_template):
    if gesture_template is None:
        return 0.0
    
    # Flatten the vectors
    vector1 = frame.flatten()
    vector2 = gesture_template.flatten()
    
    # Compute dot product
    dot_product = np.dot(vector1, vector2)
    
    # Compute magnitudes of the vectors
    magnitude1 = np.linalg.norm(vector1)
    magnitude2 = np.linalg.norm(vector2)
    
    # Compute cosine similarity
    cosine_similarity = dot_product / (magnitude1 * magnitude2)
    
    return cosine_similarity

# Read the test video
test_video = cv2.VideoCapture('test_video2.mp4')

# Read the gesture image
gesture_template = cv2.imread('skipping_gesture_image.jpg', cv2.IMREAD_GRAYSCALE)

# Get the dimensions of the test video frames
frame_width = int(test_video.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(test_video.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Resize the gesture template image to match the dimensions of the video frames
gesture_template = cv2.resize(gesture_template, (frame_width, frame_height))

# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
input_frame_rate = 10
output_video = cv2.VideoWriter('output_video.mp4', fourcc, input_frame_rate, (frame_width, frame_height))

while True:
    ret, frame = test_video.read()
    if not ret:
        break

    # Compute similarity
    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    similarity = compute_cosine_similarity(gray_frame, gesture_template)

    threshold = 0.5
    if similarity > threshold:
        frame = annotate_frame(frame)

    # Write the annotated frame to the output video
    output_video.write(frame)

    cv2.imshow('Frame', frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release everything when done
test_video.release()
output_video.release()
cv2.destroyAllWindows()


# Using cosine similarity

In [27]:
import cv2
import numpy as np

def detect_gesture(frame, gesture_template):
    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    result = cv2.matchTemplate(gray_frame, gesture_template, cv2.TM_CCOEFF_NORMED)
    threshold = 0.8
    locations = np.where(result >= threshold)
    if locations[0].size > 0:
        return True, (locations[1][0], locations[0][0])
    else:
        return False, None

def annotate_frame(frame):
    font = cv2.FONT_HERSHEY_SIMPLEX
    text = 'DETECTED'
    position = (frame.shape[1] - 150, 50)
    font_scale = 1
    font_color = (0, 255, 0)
    thickness = 2
    cv2.putText(frame, text, position, font, font_scale, font_color, thickness, cv2.LINE_AA)
    return frame

def compute_cosine_similarity(frame, gesture_template):
    # Flatten the vectors
    vector1 = frame.flatten()
    vector2 = gesture_template.flatten()
    
    # Compute dot product
    dot_product = np.dot(vector1, vector2)
    
    # Compute magnitudes of the vectors
    magnitude1 = np.linalg.norm(vector1)
    magnitude2 = np.linalg.norm(vector2)
    
    # Compute cosine similarity
    cosine_similarity = dot_product / (magnitude1 * magnitude2)
    
    return cosine_similarity

# Open test video
test_video = cv2.VideoCapture('test_video.mp4')

# Open gesture video
gesture_video = cv2.VideoCapture('test_video copy.mp4')

# Read the first frame of the gesture video and use it as a template
ret, gesture_frame = gesture_video.read()
if not ret:
    print("Error: Could not read gesture video.")
    exit()

gesture_template = cv2.cvtColor(gesture_frame, cv2.COLOR_BGR2GRAY)
input_frame_rate = test_video.get(cv2.CAP_PROP_FPS)

# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
output_video = cv2.VideoWriter('output_video.mp4', fourcc, 10.0, (int(test_video.get(3)), int(test_video.get(4))))

while True:
    ret, frame = test_video.read()
    if not ret:
        break
    
    # Compare each frame of test video with each frame of gesture video
    while True:
        ret_gesture, gesture_frame = gesture_video.read()
        if not ret_gesture:
            gesture_video.set(cv2.CAP_PROP_POS_FRAMES, 0)  # Reset gesture video to beginning
            break
        
        gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        gesture_template = cv2.cvtColor(gesture_frame, cv2.COLOR_BGR2GRAY)
        similarity = compute_cosine_similarity(gray_frame, gesture_template)

        threshold = 0.5  # Adjust threshold as needed
        if similarity > threshold:
            frame = annotate_frame(frame)
            break  # If gesture detected, no need to compare with remaining frames of gesture video

    # Write the annotated frame to the output video
    output_video.write(frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release everything when done
test_video.release()
output_video.release()
cv2.destroyAllWindows()

# Display the output video
output_video = cv2.VideoCapture('output_video.mp4')
while output_video.isOpened():
    ret, frame = output_video.read()
    if not ret:
        break
    cv2.imshow('Output Video', frame)
    if cv2.waitKey(25) & 0xFF == ord('q'):
        break
output_video.release()
cv2.destroyAllWindows()


KeyboardInterrupt: 

# Using BG Removal (dropped)

In [30]:
import cv2

# Load the video
video_path = 'test_video2.mp4'
cap = cv2.VideoCapture(video_path)

# Create background subtractor
bg_subtractor = cv2.createBackgroundSubtractorMOG2()

# Get video properties
fps = cap.get(cv2.CAP_PROP_FPS)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
output_path = 'preprocessed_test_video.mp4'
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

while True:
    # Read a frame from the video
    ret, frame = cap.read()
    if not ret:
        break
    
    # Apply background subtraction
    fg_mask = bg_subtractor.apply(frame)
    
    # Apply morphological operations for noise reduction
    fg_mask = cv2.morphologyEx(fg_mask, cv2.MORPH_OPEN, None)
    
    # Apply thresholding to get binary mask
    _, fg_mask = cv2.threshold(fg_mask, 128, 255, cv2.THRESH_BINARY)
    
    # Invert the mask
    fg_mask = cv2.bitwise_not(fg_mask)
    
    # Apply the mask to extract the foreground
    foreground = cv2.bitwise_and(frame, frame, mask=fg_mask)
    
    # Find contours in the foreground mask
    contours, _ = cv2.findContours(fg_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
    if contours:
        # Get the bounding box of the largest contour (assuming it's the person)
        bounding_box = cv2.boundingRect(contours[0])
        x, y, w, h = bounding_box
        
        # Calculate the center of the bounding box
        center_x = x + w // 2
        center_y = y + h // 2
        
        # Calculate the shift needed to center the person
        shift_x = frame.shape[1] // 2 - center_x
        shift_y = frame.shape[0] // 2 - center_y
        
        # Shift the foreground image to center the person
        foreground_centered = cv2.warpAffine(foreground, 
                                              M=np.float32([[1, 0, shift_x], [0, 1, shift_y]]), 
                                              dsize=(frame.shape[1], frame.shape[0]))
        
        # Write the centered foreground frame to the output video
        out.write(foreground_centered)
    else:
        # If no contours found, just write the original foreground frame
        out.write(foreground)
    
    cv2.imshow('Centered Foreground', foreground_centered)
    
    if cv2.waitKey(30) & 0xFF == ord('q'):
        break

# Release resources
cap.release()
out.release()
cv2.destroyAllWindows()


# Using YOLO for Detection first

In [35]:
import cv2
import numpy as np

net = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg")
classes = []
with open("coco.names", "r") as f:
    classes = [line.strip() for line in f.readlines()]

layer_names = net.getLayerNames()
output_layers_names = net.getUnconnectedOutLayersNames()
output_layers_indices = [layer_names.index(layer) for layer in output_layers_names]
output_layers = [layer_names[i] for i in output_layers_indices]

video_capture = cv2.VideoCapture('test_video2.mp4')
frame_count = 0

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
output_video = cv2.VideoWriter('YOLO_preprocess_video.mp4', fourcc, 30.0, (int(video_capture.get(3)), int(video_capture.get(4))))

while True:
    ret, frame = video_capture.read()
    if not ret:
        break
    frame_count += 1
    height, width, channels = frame.shape

    blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    outs = net.forward(output_layers)

    class_ids = []
    confidences = []
    boxes = []
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.5 and class_id == 0:
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)

                x = int(center_x - w / 2)
                y = int(center_y - h / 2)

                boxes.append([x, y, w, h])
                confidences.append(float(confidence))
                class_ids.append(class_id)

    indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)

    for i in range(len(boxes)):
        if i in indexes:
            x, y, w, h = boxes[i]
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

    output_video.write(frame)

video_capture.release()
output_video.release()


## Now there can be two approaches here:-
 1. I can take two similarities one of bounding box coordinates and other one of the grayscaled content inside the bounding box (which might be unecessarily complex)
 2. Or simply MatchTemplate each frame of gesture and test video, if the gesture_video is considerably long we can take 10-20 sample frames and compare each frame with the whole test video

In [None]:
import cv2
import numpy as np

def detect_gesture(gesture_frame, test_frame):
    # Convert frames to grayscale
    gesture_gray = cv2.cvtColor(gesture_frame, cv2.COLOR_BGR2GRAY)
    test_gray = cv2.cvtColor(test_frame, cv2.COLOR_BGR2GRAY)
    
    # Compute the absolute difference between the frames
    diff = cv2.absdiff(gesture_gray, test_gray)
    
    # Threshold the difference image
    _, thresh = cv2.threshold(diff, 30, 255, cv2.THRESH_BINARY)
    
    # Find contours in the thresholded image
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
    # If contours are found, consider it a match
    if len(contours) > 0:
        return True
    else:
        return False

# Load YOLO
net = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg")
classes = []
with open("coco.names", "r") as f:
    classes = [line.strip() for line in f.readlines()]

layer_names = net.getLayerNames()
output_layers_names = net.getUnconnectedOutLayersNames()
output_layers_indices = [layer_names.index(layer) for layer in output_layers_names]
output_layers = [layer_names[i] for i in output_layers_indices]

# Load video
video_capture = cv2.VideoCapture('test_video2.mp4')

# Load gesture frame
gesture_frame = cv2.imread('gesture_frame.jpg')

# Process each frame
frame_count = 0
detected_flag = False
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
output_video = cv2.VideoWriter('YOLO_DETECTED_video.mp4', fourcc, 30.0, (int(video_capture.get(3)), int(video_capture.get(4))))

while True:
    ret, frame = video_capture.read()
    if not ret:
        break
    frame_count += 1
    height, width, channels = frame.shape

    blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    outs = net.forward(output_layers)

    class_ids = []
    confidences = []
    boxes = []
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.5 and class_id == 0:
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)

                x = int(center_x - w / 2)
                y = int(center_y - h / 2)

                boxes.append([x, y, w, h])
                confidences.append(float(confidence))
                class_ids.append(class_id)

    indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)

    for i in range(len(boxes)):
        if i in indexes:
            x, y, w, h = boxes[i]
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

    if detect_gesture(gesture_frame, frame):
        detected_flag = True
        cv2.putText(frame, 'DETECTED', (width - 150, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

    if detected_flag:
        cv2.putText(frame, 'DETECTED', (width - 150, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

    output_video.write(frame)

video_capture.release()
output_video.release()


### Creating Sample Frames from gesture video

In [7]:
import cv2
import random


def sample_gesture_video(gesture_video_path):

    gesture_video = cv2.VideoCapture(gesture_video_path)

    gesture_frames = []

    num_frames_to_sample = 10
    total_frames_seen = 0

    while len(gesture_frames) < num_frames_to_sample:
        ret, frame = gesture_video.read()
        total_frames_seen += 1
        
        if ret:
            if random.random() < num_frames_to_sample / total_frames_seen:
                gesture_frames.append(frame)

    gesture_video.release()

    # Get the dimensions of the video frames
    height, width, _ = gesture_frames[0].shape

    # Define the output video codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter('gesture_sample_frames.mp4', fourcc, 3.0, (width, height))

    # Write sampled frames to the output video
    for frame in gesture_frames:
        out.write(frame)

    # Release the VideoWriter object
    out.release()
    return out


## YOLO Frame Creator

In [9]:
import cv2
import numpy as np

def process_video_with_YOLO(video_path):
    net = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg")
    classes = []
    with open("coco.names", "r") as f:
        classes = [line.strip() for line in f.readlines()]

    layer_names = net.getLayerNames()
    output_layers_names = net.getUnconnectedOutLayersNames()
    output_layers_indices = [layer_names.index(layer) for layer in output_layers_names]
    output_layers = [layer_names[i] for i in output_layers_indices]

    video_capture = cv2.VideoCapture(video_path)
    frame_count = 0

    processed_frames = []

    while True:
        ret, frame = video_capture.read()
        if not ret:
            break
        frame_count += 1
        height, width, channels = frame.shape

        blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
        net.setInput(blob)
        outs = net.forward(output_layers)

        class_ids = []
        confidences = []
        boxes = []
        for out in outs:
            for detection in out:
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]
                if confidence > 0.5 and class_id == 0:
                    center_x = int(detection[0] * width)
                    center_y = int(detection[1] * height)
                    w = int(detection[2] * width)
                    h = int(detection[3] * height)

                    x = int(center_x - w / 2)
                    y = int(center_y - h / 2)

                    boxes.append([x, y, w, h])
                    confidences.append(float(confidence))
                    class_ids.append(class_id)

        indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)

        for i in range(len(boxes)):
            if i in indexes:
                x, y, w, h = boxes[i]
                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
                processed_frames.append(frame)

    video_capture.release()

    return processed_frames



In [10]:
processed_frames = process_video_with_YOLO("test_video2.mp4")
output_video_path = "yolo_test_video2.mp4"
frame_width, frame_height = processed_frames[0].shape[1], processed_frames[0].shape[0]
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, 30.0, (frame_width, frame_height))

for frame in processed_frames:
    out.write(frame)

out.release()


In [None]:
import cv2
import numpy as np

def process_image_with_YOLO(image_path):
    net = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg")
    classes = []
    with open("coco.names", "r") as f:
        classes = [line.strip() for line in f.readlines()]

    layer_names = net.getLayerNames()
    output_layers_names = net.getUnconnectedOutLayersNames()
    output_layers_indices = [layer_names.index(layer) for layer in output_layers_names]
    output_layers = [layer_names[i] for i in output_layers_indices]

    image = cv2.imread(image_path)
    height, width, channels = image.shape

    blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    outs = net.forward(output_layers)

    class_ids = []
    confidences = []
    boxes = []
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.5 and class_id == 0:  # We consider only the class "person"
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)

                x = int(center_x - w / 2)
                y = int(center_y - h / 2)

                boxes.append([x, y, w, h])
                confidences.append(float(confidence))
                class_ids.append(class_id)

    indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)

    for i in range(len(boxes)):
        if i in indexes:
            x, y, w, h = boxes[i]
            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

    return image

# Process the image
input_image_path = "skipping_gesture_image.png"
output_image = process_image_with_YOLO(input_image_path)

# Save the output image
output_image_path = "yolo_skipping_gesture_image.png"
cv2.imwrite(output_image_path, output_image)


In [3]:
processed_frames = process_video_with_YOLO('test_video2.mp4')
len(processed_frames)


360

## With Mediapipe pose landmark detector

In [13]:
import cv2
import mediapipe as mp
import numpy as np

def extract_landmarks_from_frame(frame):
    mp_pose = mp.solutions.pose
    pose = mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5, min_tracking_confidence=0.5)

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = pose.process(frame_rgb)

    landmarks = None
    if results.pose_landmarks:
        landmarks = np.array([[lmk.x, lmk.y, lmk.z] for lmk in results.pose_landmarks.landmark]).flatten()

    return landmarks

def save_gesture_landmarks(landmarks, save_path):
    np.save(save_path, landmarks)

def load_gesture_landmarks(load_path):
    return np.load(load_path)

def compare_landmarks(landmarks_1, landmarks_2):
    # Make sure both arrays have the same length
    min_length = min(len(landmarks_1), len(landmarks_2))
    landmarks_1 = landmarks_1[:min_length]
    landmarks_2 = landmarks_2[:min_length]

    # Compute similarity
    similarity = np.dot(landmarks_1.flatten(), landmarks_2.flatten()) / \
                 (np.linalg.norm(landmarks_1) * np.linalg.norm(landmarks_2))

    return similarity



In [15]:

gesture_image_path = "skipping_gesture_image.jpg"
gesture_frame = cv2.imread(gesture_image_path)
gesture_landmarks = extract_landmarks_from_frame(gesture_frame)
save_path = "gesture_landmarks.npy"
save_gesture_landmarks(gesture_landmarks, save_path)

gesture_landmarks = load_gesture_landmarks(save_path)

mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5)

test_video_path = "test_video2.mp4"
test_video = cv2.VideoCapture(test_video_path)

threshold = 0.9

frame_width = int(test_video.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(test_video.get(cv2.CAP_PROP_FRAME_HEIGHT))

output_path = "mp_output_vid.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, 30.0, (frame_width, frame_height))

while test_video.isOpened():
    ret, frame = test_video.read()
    if not ret:
        break

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    landmarks = extract_landmarks_from_frame(frame)

    if landmarks is not None:
        similarity = compare_landmarks(landmarks, gesture_landmarks)

        if similarity > threshold:
            cv2.putText(frame, "DETECTED", (frame_width - 150, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

    out.write(frame)

    cv2.imshow("Output Video", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

test_video.release()
out.release()
cv2.destroyAllWindows()


## Added function of user recording

In [19]:
import cv2
import mediapipe as mp
import numpy as np

def extract_landmarks_from_frame(frame):
    mp_pose = mp.solutions.pose
    pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5)

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = pose.process(frame_rgb)

    landmarks = []
    if results.pose_landmarks:
        
        for idx, lmk in enumerate(results.pose_landmarks.landmark):
            # Increase weightage for hands
            if idx in [15, 17, 19, 21, 13, 16, 18, 20, 22, 14]:
                landmarks.append([5*lmk.x , 5*lmk.y , lmk.z])
            else:
                landmarks.append([0.1*lmk.x, 0.1*lmk.y, lmk.z])
        landmarks = np.array(landmarks).flatten()
        

    return np.array(landmarks) 



def save_gesture_landmarks(landmarks, save_path):
    np.save(save_path, landmarks)

def load_gesture_landmarks(load_path):
    return np.load(load_path)

def compare_landmarks(landmarks_1, landmarks_2):
    # Make sure both arrays have the same length
  
    min_length = min(len(landmarks_1), len(landmarks_2))
    landmarks_1 = landmarks_1[:min_length]
    landmarks_2 = landmarks_2[:min_length]

    # Compute similarity
    similarity = np.dot(landmarks_1.flatten(), landmarks_2.flatten()) / \
                 (np.linalg.norm(landmarks_1) * np.linalg.norm(landmarks_2))

    return similarity


In [20]:
gesture_image_path = "wave_gesture.png"
gesture_frame = cv2.imread(gesture_image_path)
gesture_landmarks = extract_landmarks_from_frame(gesture_frame)
# save_path = "gesture_landmarks.npy"
# save_gesture_landmarks(gesture_landmarks, save_path)

# gesture_landmarks = load_gesture_landmarks(save_path)

mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5)

record_video = input("Do you want to record a test video? (y/n): ").lower() == 'y'

if record_video:
    video_output_path = "user_recorded_test_video.mp4"
    frame_width = 640
    frame_height = 480
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(video_output_path, fourcc, 30.0, (frame_width, frame_height))

    cap = cv2.VideoCapture(0)  # Open default camera (index 0)
    print("Recording video. Press 'q' to stop recording.")
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        out.write(frame)
        cv2.imshow("Recording", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    out.release()
    cv2.destroyAllWindows()
    test_video_path = "user_recorded_test_video.mp4"

else:
    test_video_path = "waving_test_video.mp4"

test_video = cv2.VideoCapture(test_video_path)
#Set threshold as needed
threshold = float(input("Enter number between 0 and 1 for precision: "))

frame_width = int(test_video.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(test_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
output_path = "output_vid_final.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, 30.0, (frame_width, frame_height))
while test_video.isOpened():
    ret, frame = test_video.read()
    if not ret:
        break
    
    landmarks = extract_landmarks_from_frame(frame)
    if landmarks is not None:
        similarity = compare_landmarks(landmarks, gesture_landmarks)
        if similarity > threshold:
            cv2.putText(frame, "DETECTED", (frame_width - 150, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    out.write(frame)
    cv2.imshow("Output Video", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
test_video.release()
out.release()
cv2.destroyAllWindows()


  similarity = np.dot(landmarks_1.flatten(), landmarks_2.flatten()) / \


In [32]:
cap = cv2.VideoCapture(0) 
if not cap.isOpened():
    print("Error: Could not open camera.")
cap.release()
cv2.destroyAllWindows() 


# Final Solution

In [3]:

gesture_image_path = "gesture_image.png"
gesture_frame = cv2.imread(gesture_image_path)
gesture_landmarks = extract_landmarks_from_frame(gesture_frame)

mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5)

record_video = input("Do you want to record a test video? (y/n): ").lower() == 'y'

if record_video:
    video_output_path = "user_recorded_test_video.mp4"
    frame_width = 640
    frame_height = 480
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(video_output_path, fourcc, 30.0, (frame_width, frame_height))

    cap = cv2.VideoCapture(0)  # Open default camera (index 0)
    print("Recording video. Press 'q' to stop recording.")
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        out.write(frame)
        cv2.imshow("Recording", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    out.release()
    cv2.destroyAllWindows()
    test_video_path = "user_recorded_test_video.mp4"

else:
    test_video_path = "test_video_waving.mp4"

test_video = cv2.VideoCapture(test_video_path)
threshold = float(input("Enter number between 0 and 1 for precision: "))

frame_width = int(test_video.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(test_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
target_width = 640  # Set the target width for resizing
output_path = "output_vid_final.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, 30.0, (target_width, frame_height))

while test_video.isOpened():
    ret, frame = test_video.read()
    if not ret:
        break
    
    frame = resize_frame(frame, target_width)  # Resize frame to fit within target width

    landmarks = extract_landmarks_from_frame(frame)
    if landmarks is not None:
        similarity = compare_landmarks(landmarks, gesture_landmarks)
        if similarity > threshold:
            cv2.putText(frame, "DETECTED", (target_width - 150, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    out.write(frame)
    cv2.imshow("Output Video", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

test_video.release()
out.release()
cv2.destroyAllWindows()
