In [3]:
pip install opencv-python

Collecting opencv-python
  Using cached opencv_python-4.10.0.84-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting numpy>=1.21.2 (from opencv-python)
  Using cached numpy-2.2.1-cp310-cp310-win_amd64.whl.metadata (60 kB)
Using cached opencv_python-4.10.0.84-cp37-abi3-win_amd64.whl (38.8 MB)
Using cached numpy-2.2.1-cp310-cp310-win_amd64.whl (12.9 MB)
Installing collected packages: numpy, opencv-python
Successfully installed numpy-2.2.1 opencv-python-4.10.0.84
Note: you may need to restart the kernel to use updated packages.




In [4]:
pip install mediapipe

Collecting mediapipe
  Using cached mediapipe-0.10.20-cp310-cp310-win_amd64.whl.metadata (9.9 kB)
Collecting absl-py (from mediapipe)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting attrs>=19.1.0 (from mediapipe)
  Using cached attrs-24.3.0-py3-none-any.whl.metadata (11 kB)
Collecting flatbuffers>=2.0 (from mediapipe)
  Using cached flatbuffers-24.12.23-py2.py3-none-any.whl.metadata (876 bytes)
Collecting jax (from mediapipe)
  Using cached jax-0.4.38-py3-none-any.whl.metadata (22 kB)
Collecting jaxlib (from mediapipe)
  Using cached jaxlib-0.4.38-cp310-cp310-win_amd64.whl.metadata (1.1 kB)
Collecting matplotlib (from mediapipe)
  Using cached matplotlib-3.10.0-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting numpy<2 (from mediapipe)
  Using cached numpy-1.26.4-cp310-cp310-win_amd64.whl.metadata (61 kB)
Collecting opencv-contrib-python (from mediapipe)
  Using cached opencv_contrib_python-4.10.0.84-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting prot



In [5]:
import numpy as np
import cv2 as cv
import mediapipe as mp

# URL of the IP camera video stream that frames are read from
IP_CAMERA_URL = "http://192.168.1.78:8080/video"

# Initialize MediaPipe Hands and Drawing modules
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

# Define landmark indices for key points in the thumb
THUMB_TIP = 4
THUMB_MCP = 2

cap = cv.VideoCapture(IP_CAMERA_URL)

# try/finally guarantees that the stream and the preview window are released
# even when the loop is stopped by an exception such as a kernel interrupt.
try:
    if not cap.isOpened():
        # Report and fall through to the cleanup instead of calling exit(),
        # which would end the whole notebook session.
        print("Cannot open camera")
    else:
        # Initialize the MediaPipe Hands module
        with mp_hands.Hands(
                model_complexity=0,
                min_detection_confidence=0.5,
                min_tracking_confidence=0.5) as hands:

            while cap.isOpened():
                # Capture frame-by-frame
                ret, frame = cap.read()
                if not ret:
                    print("Can't receive frame (stream end?). Exiting ...")
                    break

                # Convert the BGR frame to RGB as required by MediaPipe
                image_rgb = cv.cvtColor(frame, cv.COLOR_BGR2RGB)

                # Process the frame and recognize hands
                results = hands.process(image_rgb)

                # Frame-level flag: becomes True as soon as one hand shows a raised thumb
                thumbs_up = False

                # If hands are detected, draw landmarks on the frame
                hand_count = 0
                if results.multi_hand_landmarks:
                    hand_count = len(results.multi_hand_landmarks)
                    for hand_landmarks in results.multi_hand_landmarks:
                        mp_drawing.draw_landmarks(
                            frame,
                            hand_landmarks,
                            mp_hands.HAND_CONNECTIONS)
                        # Get thumb tip and thumb MCP coordinates
                        thumb_tip = hand_landmarks.landmark[THUMB_TIP]
                        thumb_mcp = hand_landmarks.landmark[THUMB_MCP]

                        # For thumbs-up, the thumb tip should be higher than thumb MCP
                        if thumb_tip.y < thumb_mcp.y:  # because higher y value is lower on image
                            thumbs_up = True

                        # Format the coordinates explicitly: the landmark's own string
                        # form spans several lines, which putText cannot lay out.
                        tip_text = f'Thumb TIP: ({thumb_tip.x:.2f}, {thumb_tip.y:.2f})'
                        mcp_text = f'Thumb MCP: ({thumb_mcp.x:.2f}, {thumb_mcp.y:.2f})'
                        cv.putText(frame, tip_text, (10, 60), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv.LINE_AA)
                        cv.putText(frame, mcp_text, (10, 90), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv.LINE_AA)
                        cv.putText(frame, f'Thumbs-up: {thumbs_up}', (10, 120), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv.LINE_AA)

                # Overlay the hand count on the frame
                cv.putText(frame, f'Hands detected: {hand_count}', (10, 30), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv.LINE_AA)

                # Display the frame with hand landmarks
                cv.imshow('Hand Recognition', frame)

                # Mask to the low byte so the key comparison matches the other cells
                if cv.waitKey(1) & 0xFF == ord('q'):
                    break
finally:
    # Release the capture and destroy all OpenCV windows
    cap.release()
    cv.destroyAllWindows()

KeyboardInterrupt: 

In [2]:
#@markdown We implemented some functions to visualize the hand landmark detection results. <br/> Run the following cell to activate the functions.

from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np
import cv2 as cv2


MARGIN = 10  # pixels
FONT_SIZE = 1
FONT_THICKNESS = 1
HANDEDNESS_TEXT_COLOR = (88, 205, 54) # vibrant green

def draw_landmarks_on_image(rgb_image, detection_result):
  """Return a copy of `rgb_image` with every detected hand drawn on it.

  For each hand the landmarks and their connections are drawn, and the
  handedness category name is written just above the hand's bounding box.

  Args:
    rgb_image: image array to annotate; it is copied, never modified.
    detection_result: object exposing `hand_landmarks` and `handedness`,
      two parallel sequences with one entry per detected hand.

  Returns:
    The annotated copy of `rgb_image`.
  """
  annotated_image = np.copy(rgb_image)
  # The image size is the same for every hand, so read it once.
  height, width = annotated_image.shape[:2]

  # Loop through the detected hands to visualize.
  for hand_landmarks, handedness in zip(detection_result.hand_landmarks,
                                        detection_result.handedness):
    # Draw the hand landmarks.
    hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
    hand_landmarks_proto.landmark.extend([
      landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in hand_landmarks
    ])
    solutions.drawing_utils.draw_landmarks(
      annotated_image,
      hand_landmarks_proto,
      solutions.hands.HAND_CONNECTIONS,
      solutions.drawing_styles.get_default_hand_landmarks_style(),
      solutions.drawing_styles.get_default_hand_connections_style())

    # Get the top left corner of the detected hand's bounding box.
    left = int(min(landmark.x for landmark in hand_landmarks) * width)
    top = int(min(landmark.y for landmark in hand_landmarks) * height)

    # Keep the label inside the image: putText anchors text at its bottom-left
    # corner, so a hand touching the top or side border would otherwise push
    # the label out of view.
    label = f"{handedness[0].category_name}"
    (label_width, label_height), _ = cv2.getTextSize(
      label, cv2.FONT_HERSHEY_DUPLEX, FONT_SIZE, FONT_THICKNESS)
    text_x = min(max(left, 0), max(width - label_width, 0))
    text_y = max(top - MARGIN, label_height + MARGIN)

    # Draw handedness (left or right hand) on the image.
    cv2.putText(annotated_image, label,
                (text_x, text_y), cv2.FONT_HERSHEY_DUPLEX,
                FONT_SIZE, HANDEDNESS_TEXT_COLOR, FONT_THICKNESS, cv2.LINE_AA)

  return annotated_image

In [4]:
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

# Initialize the hand landmarker
base_options = python.BaseOptions(model_asset_path='hand_landmarker.task')
options = vision.HandLandmarkerOptions(base_options=base_options, num_hands=2)

IP_CAMERA_URL = "http://192.168.1.78:8080/video"
# Consecutive failed reads tolerated before the stream is considered dead;
# without a limit a stopped stream keeps the cell spinning forever.
MAX_CONSECUTIVE_EMPTY_FRAMES = 30

cap = cv2.VideoCapture(IP_CAMERA_URL)

# try/finally releases the capture and closes the window however the loop ends;
# the with-block does the same for the landmarker.
try:
    if not cap.isOpened():
        print("Cannot open camera")

    with vision.HandLandmarker.create_from_options(options) as detector:
        empty_frames = 0
        while cap.isOpened():
            # Capture frame-by-frame
            success, frame = cap.read()
            if not success:
                print("Ignoring empty camera frame.")
                empty_frames += 1
                if empty_frames >= MAX_CONSECUTIVE_EMPTY_FRAMES:
                    print("Camera stream stopped delivering frames. Exiting ...")
                    break
                continue
            empty_frames = 0

            # Convert the BGR image from OpenCV to an RGB image for mediapipe.
            # The pixels are declared as SRGB below, so the channels must really
            # be in RGB order or detection and display both see swapped colours.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)

            # Detect hand landmarks
            detection_result = detector.detect(image)

            # Visualize the landmarks on the image
            annotated_image = draw_landmarks_on_image(image.numpy_view(), detection_result)

            # Convert the image to BGR format for OpenCV visualization
            annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)

            # Display the annotated image
            cv2.imshow("Hand Landmarks", annotated_image)

            # Break the loop when 'q' key is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Release the capture and close windows
    cap.release()
    cv2.destroyAllWindows()

# GESTURE


In [7]:
#@markdown We implemented some functions to visualize the gesture recognition results. <br/> Run the following cell to activate the functions.
from matplotlib import pyplot as plt
import mediapipe as mp
from mediapipe.framework.formats import landmark_pb2
import math

# Switch off every axis spine, tick mark and tick label so that figures
# show nothing but the images themselves.
plt.rcParams.update({
    rc_key: False
    for rc_key in (
        'axes.spines.top',
        'axes.spines.right',
        'axes.spines.left',
        'axes.spines.bottom',
        'xtick.labelbottom',
        'xtick.bottom',
        'ytick.labelleft',
        'ytick.left',
        'xtick.labeltop',
        'xtick.top',
        'ytick.labelright',
        'ytick.right',
    )
})

# Short aliases for the MediaPipe drawing helpers used by the display functions.
mp_drawing_styles = mp.solutions.drawing_styles
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands


def display_one_image(image, title, subplot, titlesize=16):
    """Show `image` in one subplot cell and return the position of the next cell.

    Args:
        image: image passed to `plt.imshow`.
        title: text placed above the image; skipped when it is empty.
        subplot: `(rows, cols, index)` triple selecting the subplot cell.
        titlesize: font size of the title; the padding is derived from it.

    Returns:
        The `(rows, cols, index + 1)` triple addressing the following cell.
    """
    plt.subplot(*subplot)
    plt.imshow(image)

    if len(title) != 0:
        plt.title(
            title,
            fontsize=int(titlesize),
            color='black',
            fontdict={'verticalalignment':'center'},
            pad=int(titlesize/1.5),
        )

    n_rows, n_cols, position = subplot[0], subplot[1], subplot[2]
    return (n_rows, n_cols, position + 1)


def display_batch_of_images_with_gestures_and_hand_landmarks(images, results):
    """Displays a batch of images with the gesture category and its score along with the hand landmarks.

    Args:
        images: iterable of objects exposing `numpy_view()`.
        results: iterable of `(top_gesture, multi_hand_landmarks)` pairs, one
            per image. `top_gesture` must expose `category_name` and `score`;
            `multi_hand_landmarks` is an iterable of per-hand landmark lists.

    Returns:
        None. The figure is shown with `plt.show()`. An empty batch is a no-op.
    """
    # Images and labels.
    images = [image.numpy_view() for image in images]
    if not images:
        # Nothing to draw; without this guard the grid maths below divides by zero.
        return
    results = list(results)
    gestures = [top_gesture for (top_gesture, _) in results]
    multi_hand_landmarks_list = [multi_hand_landmarks for (_, multi_hand_landmarks) in results]

    # Auto-squaring: this will drop data that does not fit into square or square-ish rectangle.
    rows = int(math.sqrt(len(images)))
    cols = len(images) // rows
    shown = rows * cols

    # Size and spacing.
    FIGSIZE = 13.0
    SPACING = 0.1
    subplot = (rows, cols, 1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE, FIGSIZE/cols*rows))
    else:
        plt.figure(figsize=(FIGSIZE/rows*cols, FIGSIZE))

    # The title size depends only on the grid shape, so compute it once.
    dynamic_titlesize = FIGSIZE*SPACING/max(rows, cols) * 40 + 3

    # Display gestures and hand landmarks.
    for i, (image, gesture) in enumerate(zip(images[:shown], gestures[:shown])):
        title = f"{gesture.category_name} ({gesture.score:.2f})"
        annotated_image = image.copy()

        for hand_landmarks in multi_hand_landmarks_list[i]:
            hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
            hand_landmarks_proto.landmark.extend([
                landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in hand_landmarks
            ])

            mp_drawing.draw_landmarks(
                annotated_image,
                hand_landmarks_proto,
                mp_hands.HAND_CONNECTIONS,
                mp_drawing_styles.get_default_hand_landmarks_style(),
                mp_drawing_styles.get_default_hand_connections_style())

        subplot = display_one_image(annotated_image, title, subplot, titlesize=dynamic_titlesize)

    # Layout.
    plt.tight_layout()
    plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()

In [1]:
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
import cv2
import time
# Address of the IP camera stream read by the gesture-recognition loop
IP_CAMERA_URL = "http://192.168.1.78:8080/video"
# Relative path of the gesture recognizer model file
model_path = 'gesture_recognizer.task'
base_options = python.BaseOptions(model_asset_path=model_path)

In [None]:
BaseOptions = mp.tasks.BaseOptions
GestureRecognizer = mp.tasks.vision.GestureRecognizer
GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions
GestureRecognizerResult = mp.tasks.vision.GestureRecognizerResult
VisionRunningMode = mp.tasks.vision.RunningMode

# Latest recognition state, written by the result callback and read by the
# display loop. Both names are initialised here so the loop can run before the
# first callback has fired.
recognized_gesture = ""
landmarks = []


def print_result(result: GestureRecognizerResult, output_image: mp.Image, timestamp_ms: int):
    """Result callback: store the latest gestures and hand landmarks for the display loop.

    Args:
        result: recognition result; its `gestures` and `hand_landmarks` are read.
        output_image: unused here; part of the callback signature.
        timestamp_ms: unused here; part of the callback signature.
    """
    global recognized_gesture
    global landmarks
    if result.gestures:
        # One entry per item in result.gestures, using its first category
        # (presumably the top-scoring one - TODO confirm). The string is built
        # in a single assignment so the display loop never reads a partial value.
        recognized_gesture = ", ".join(
            f"{gesture[0].category_name} ({gesture[0].score:.2f})"
            for gesture in result.gestures)
    else:
        recognized_gesture = "No gesture"
    # Always overwrite, so landmarks of a hand that left the frame are not
    # drawn on later frames.
    landmarks = result.hand_landmarks or []


# Create a gesture recognizer instance with the live stream mode:
options = GestureRecognizerOptions(
    base_options=BaseOptions(model_asset_path='gesture_recognizer.task'),
    running_mode=VisionRunningMode.LIVE_STREAM,
    num_hands=2,
    result_callback=print_result)

# Initialize MediaPipe and OpenCV
mp_image = mp.Image
mp_image_format = mp.ImageFormat

# Pairs of landmark indices joined by a line when drawing each hand
connections = [
    (0, 1), (1, 2), (2, 3), (3, 4),  # Thumb
    (0, 5), (5, 6), (6, 7), (7, 8),  # Index finger
    (0, 9), (9, 10), (10, 11), (11, 12),  # Middle finger
    (0, 13), (13, 14), (14, 15), (15, 16),  # Ring finger
    (0, 17), (17, 18), (18, 19), (19, 20)  # Pinky
]

with GestureRecognizer.create_from_options(options) as recognizer:
    # Start capturing video from the camera stream
    cap = cv2.VideoCapture(IP_CAMERA_URL)

    # try/finally releases the capture and the window however the loop ends.
    try:
        if not cap.isOpened():
            # Report and fall through to the cleanup instead of calling exit(),
            # which would end the whole notebook session.
            print("Error: Could not open the webcam.")
        else:
            # Monotonic reference time: the timestamps sent to the recognizer
            # must never go backwards, which a wall-clock adjustment could cause.
            start_time = time.monotonic()
            last_timestamp_ms = -1

            while True:
                ret, frame = cap.read()
                if not ret:
                    print("Error: Could not read frame.")
                    break

                # Convert the frame (OpenCV image) to MediaPipe's Image object
                numpy_frame_from_opencv = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
                mp_image_object = mp_image(image_format=mp_image_format.SRGB, data=numpy_frame_from_opencv)

                # Force strictly increasing timestamps even when two frames
                # arrive within the same millisecond.
                elapsed_ms = int((time.monotonic() - start_time) * 1000)
                frame_timestamp_ms = max(elapsed_ms, last_timestamp_ms + 1)
                last_timestamp_ms = frame_timestamp_ms

                recognizer.recognize_async(mp_image_object, timestamp_ms=frame_timestamp_ms)

                # Overlay the recognized gesture on the frame
                cv2.putText(frame, f"Gesture: {recognized_gesture}", (10, 50),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

                # Snapshot the shared list once per frame so the callback cannot
                # swap it between drawing the points and drawing the lines.
                hands_to_draw = landmarks
                frame_height, frame_width = frame.shape[:2]

                # Draw hand landmarks on the frame
                for hand in hands_to_draw:
                    points = [(int(landmark.x * frame_width), int(landmark.y * frame_height))
                              for landmark in hand]
                    for point in points:
                        cv2.circle(frame, point, 5, (0, 255, 0), -1)
                    # Draw connections
                    for start, end in connections:
                        cv2.line(frame, points[start], points[end], (255, 0, 0), 2)

                # Display the frame using OpenCV (optional for visualization)
                cv2.imshow('Webcam Feed', frame)

                # Break the loop if the user presses the 'q' key
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        # Release the capture and close OpenCV windows
        cap.release()
        cv2.destroyAllWindows()


# TRAIN