Import the required libraries

In [None]:
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

import cv2

from pyo import *
from music21 import pitch

from math import sqrt

Function to convert a frequency into a readable note name

In [None]:
def frequency_to_note(freq):
    """Return the note name (with octave) closest to a frequency.

    Args:
        freq: Frequency in hertz.

    Returns:
        The ``nameWithOctave`` string of a music21 ``Pitch`` set to *freq*,
        or an empty string when *freq* is zero or negative.
    """
    # A non-positive frequency has no musical pitch (the Hz -> pitch mapping
    # is logarithmic), so hand back an empty label instead of letting the
    # conversion raise while the oscillator is parked at 0 Hz.
    if freq <= 0:
        return ""
    p = pitch.Pitch()
    p.frequency = freq
    return p.nameWithOctave

Set up the gesture recognizer

In [None]:
# Path of the MediaPipe gesture-recognizer model file, relative to the
# working directory.
model_path = r"gesture_recognizer.task"

# Short aliases for the MediaPipe Tasks classes used below.
BaseOptions = mp.tasks.BaseOptions
GestureRecognizer = vision.GestureRecognizer
GestureRecognizerOptions = vision.GestureRecognizerOptions
VisionRunningMode = vision.RunningMode

# Read the model into memory inside a context manager so the file handle is
# closed right away instead of being left open for the garbage collector.
with open(model_path, "rb") as model_file:
    model_buffer = model_file.read()

# The recognizer is run on one still image at a time (IMAGE mode).
options = GestureRecognizerOptions(
    base_options=BaseOptions(model_asset_buffer=model_buffer),
    running_mode=VisionRunningMode.IMAGE,
)

Initialization of the library that will play the sound

In [None]:
# Initialize Pyo
# The output device must be selected BEFORE the server is booted: pyo only
# reads the device choice while booting, so setting it afterwards has no
# effect.
s = Server()
# NOTE(review): device index 3 is machine-specific -- check the index of the
# desired output with pa_list_devices() and adjust if needed.
s.setOutputDevice(3)
s.boot()
s.start()

# Create Pyo objects for sound synthesis
# base tone to be played
oscillator = Sine()
# 5 Hz modulating wave (its rate is changed from the capture loop)
vibrato = Sine(freq=5, mul=1)
# tone multiplied by the modulating wave, used as input of the distortion
osc_with_vibrato = oscillator * vibrato
# distortion output
disto = Disto(osc_with_vibrato, drive=0.75, slope=0.5, mul=1, add=1)
# no distortion output
noDisto = oscillator * vibrato
# True while the distorted output is the one that should be heard
Disto_bool = False

Initialization of the hand landmark detection

In [None]:
# MediaPipe Hands solution module; its HAND_CONNECTIONS constant is used
# later when drawing the detected hands.
mphands = mp.solutions.hands
# Hand landmark detector, created with the library's default settings.
hands = mphands.Hands()

## Video capture loop
Press Escape to exit.

In [None]:
# Main loop: read webcam frames, detect hands and gestures, and drive the
# Pyo oscillator from the finger positions. Press Escape to exit.
cap = cv2.VideoCapture(0)
with GestureRecognizer.create_from_options(options) as recognizer:
    try:
        while True:
            # cap.read() returns (success flag, BGR frame). Stop cleanly when
            # the camera delivers nothing instead of failing inside cv2.flip.
            frame_ok, image = cap.read()
            if not frame_ok:
                print("Error: could not read a frame from the camera")
                break

            # Mirror the frame so on-screen movement matches the player's.
            image = cv2.flip(image, 1)
            frame_height, frame_width = image.shape[:2]

            # OpenCV frames are BGR while MediaPipe expects RGB, so run the
            # detection on a converted copy and keep `image` for display.
            rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            # processes the frame to look for hands and their position
            results = hands.process(rgb_image)
            if results.multi_hand_landmarks:
                # if there are hands on the image read the gesture being made
                mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_image)
                gesture_recognition_result = recognizer.recognize(mp_image)

                if gesture_recognition_result.gestures:
                    for gesture in gesture_recognition_result.gestures:
                        if not gesture:
                            continue
                        # set the output to be played depending on the hand gestures
                        if gesture[0].category_name == 'Closed_Fist':
                            Disto_bool = True
                        elif gesture[0].category_name == 'Open_Palm':
                            Disto_bool = False

                for hand_landmarks in results.multi_hand_landmarks:
                    # draws the hand points and the lines connecting them
                    mp.solutions.drawing_utils.draw_landmarks(
                        image,
                        hand_landmarks,
                        mphands.HAND_CONNECTIONS,
                        mp.solutions.drawing_styles.get_default_hand_landmarks_style(),
                    )
                    # extract index fingertip (8) and thumb tip (4) positions;
                    # landmark coordinates are normalised, so scale to pixels
                    indexFinger = hand_landmarks.landmark[8]
                    thumbFinger = hand_landmarks.landmark[4]
                    x_index = int(indexFinger.x * frame_width)
                    y_index = int(indexFinger.y * frame_height)
                    x_thumb = int(thumbFinger.x * frame_width)

                    # horizontal thumb-to-index distance, scaled to 0..8
                    thumb_distance = abs(x_index - x_thumb) / frame_width * 8

                    # x coordinates must be compared against half the frame
                    # WIDTH to decide which side of the picture the hand is on
                    if x_index > frame_width / 2 and x_thumb > frame_width / 2:
                        # right side: the index height sets the tone and the
                        # thumb-index distance sets the modulation rate
                        oscillator.freq = y_index + 100
                        vibrato.setFreq(thumb_distance)
                    else:
                        # left side: the index height (normalised by the frame
                        # HEIGHT) sets the volume
                        oscillator.setMul(y_index / frame_height * 10 + 1)

                # displays note played and if there is distortion or not;
                # skipped while the frequency is not positive, because such a
                # value has no note name and the lookup would fail
                current_freq = oscillator.freq
                if current_freq > 0:
                    cv2.putText(image, frequency_to_note(current_freq), (10, 40), cv2.FONT_HERSHEY_PLAIN, fontScale=1.2, color=(255, 255, 0), thickness=2)
                if Disto_bool:
                    cv2.putText(image, 'Distorsion On', (10, 60), cv2.FONT_HERSHEY_PLAIN, fontScale=1.2, color=(255, 255, 0), thickness=2)
            else:
                # if there are no hands on the image stop playing
                oscillator.setFreq(0)

            # Control if distortion is to be played or not
            if Disto_bool:
                disto.play()
                disto.out()
                noDisto.stop()
            else:
                noDisto.play()
                noDisto.out()
                disto.stop()

            cv2.imshow('Theremin', image)
            # 27 is the key code of Escape
            if cv2.waitKey(1) == 27:
                break

    except Exception as e:
        print(f"Error: {e}")

    finally:
        # Release the camera and close the window however the loop ended,
        # then stop the Pyo server.
        cap.release()
        cv2.destroyAllWindows()
        s.stop()