In [1]:
import mediapipe as mp
import cv2 as cv
import numpy as np
import time
import sounddevice as sd
import librosa

In [2]:
class HandDetector:
    """Static wrapper around a single MediaPipe Hands instance.

    All state lives on the class: `getLandmarks` stores the detection result
    of the most recent frame in `landmarks`, and the other helpers read the
    first detected hand from there.
    """

    # Colour conversion applied to each frame before it is handed to MediaPipe.
    color_scheme = cv.COLOR_BGR2RGB
    # Shared detector instance, created once when the class body is executed.
    hands = mp.solutions.hands.Hands()
    # Result of the last `getLandmarks` call; empty/None means "no hand".
    landmarks = []
    # Maximum middle-tip-to-wrist distance for the hand to count as closed.
    # NOTE(review): presumably in MediaPipe's normalized image coordinates — confirm.
    closure_threshold = 0.25

    @staticmethod
    def _landmarkXY(landmark_id):
        """Return the (x, y) of `landmark_id` on the first detected hand.

        Returns None when no hand is available. The emptiness check covers
        both the initial `[]` (before any frame was processed) and the None
        stored by `getLandmarks` when detection finds nothing; the original
        `!= None` guard let the empty list through and crashed on `[0]`.
        """
        detected = HandDetector.landmarks
        if not detected:
            return None
        point = detected[0].landmark[landmark_id]
        return np.array([point.x, point.y])

    @staticmethod
    def getLandmarks(frame):
        """Run hand detection on `frame` and store the result in `landmarks`.

        The frame is converted with `color_scheme` first. Nothing is returned;
        `HandDetector.landmarks` is overwritten with `multi_hand_landmarks`.
        """
        imgColor = cv.cvtColor(frame, HandDetector.color_scheme)
        processed = HandDetector.hands.process(imgColor)
        HandDetector.landmarks = processed.multi_hand_landmarks

    @staticmethod
    def findThumb() -> np.ndarray:
        """Return the thumb tip as `[x, y]`, or an empty array if no hand is stored."""
        point = HandDetector._landmarkXY(mp.solutions.hands.HandLandmark.THUMB_TIP)
        return point if point is not None else np.array([])

    @staticmethod
    def findIndex() -> np.ndarray:
        """Return the index fingertip as `[x, y]`, or an empty array if no hand is stored."""
        point = HandDetector._landmarkXY(mp.solutions.hands.HandLandmark.INDEX_FINGER_TIP)
        return point if point is not None else np.array([])

    @staticmethod
    def getMiddleToWristDistance() -> float:
        """Return the Euclidean (x, y) distance from middle fingertip to wrist.

        When no hand is stored the sentinel 1000 is returned, which is far
        above `closure_threshold`, so `isClosed` reports False.
        """
        middleTip = HandDetector._landmarkXY(mp.solutions.hands.HandLandmark.MIDDLE_FINGER_TIP)
        wrist = HandDetector._landmarkXY(mp.solutions.hands.HandLandmark.WRIST)
        if middleTip is None or wrist is None:
            return 1000

        return np.linalg.norm(middleTip - wrist)

    @staticmethod
    def isClosed() -> bool:
        """Return True when the middle-tip-to-wrist distance is within `closure_threshold`."""
        return HandDetector.getMiddleToWristDistance() <= HandDetector.closure_threshold


In [3]:
# Angular scale of the cosine ramp used by getShift.
sensitivity = 12.32
# Magnitude of the largest shift getShift can return.
intensity = 1000

def getShift(dist):
    """Map a thumb-to-index distance onto a shift in [-intensity, intensity].

    Distances below 0.045 give +intensity, distances in [0.045, 0.3] follow
    a cosine ramp scaled by `sensitivity`, and everything else gives
    -intensity.
    """
    lower_bound = 0.045
    upper_bound = 0.3

    # Fingers (almost) touching: saturate at the maximum.
    if dist < lower_bound:
        return intensity

    # Anything that is not inside the ramp interval saturates at the minimum.
    inside_ramp = lower_bound <= dist <= upper_bound
    if not inside_ramp:
        return -1 * intensity

    phase = sensitivity * (dist - lower_bound)
    return intensity * np.cos(phase)

In [7]:
def draw_freqShift(freq, frame):
    """Write the value of `freq` as text onto `frame`.

    The text colour depends on the sign of `freq`: positive values scale the
    third colour component, all other values scale the first one.
    """
    if freq > 0:
        text_color = (0, 100, freq * 255)
    else:
        text_color = (-1 * freq * 255, 100, 0)

    label = str(freq)
    origin = (10, 25)
    scale = 1
    line_width = 1

    cv.putText(frame, label, origin, cv.FONT_HERSHEY_SIMPLEX, scale, text_color, line_width)

In [8]:
cam = cv.VideoCapture(0)

samplerate = 44100
chunk_duration = 0.5
chunk_size = int(samplerate * chunk_duration)
volume = 3.0


def _read_frame():
    """Grab one mirrored frame from `cam`, or None when the read fails."""
    ok, raw = cam.read()
    if not ok:
        # Check before touching the frame: a failed read yields no image to flip.
        return None
    return cv.flip(raw, 1)


def _shift_to_semitones(shift):
    """Convert a getShift value into a signed `n_steps` for pitch shifting.

    The magnitude is log2(|shift|) and the sign follows `shift`. Magnitudes
    of 1 or less map to 0 so that values near the zero crossing neither flip
    sign nor run off towards -inf.
    """
    magnitude = abs(shift)
    if magnitude <= 1:
        return 0.0
    return float(np.sign(shift) * np.log2(magnitude))


def _record_gesture():
    """Record microphone audio and pitch samples while the hand stays closed.

    Returns `(recorded, pitch_shifts, keep_running)`. `recorded` is a list of
    `(chunk, n_shifts)` pairs where `n_shifts` is how many pitch samples had
    been collected when that audio chunk arrived. `keep_running` is False
    when the user pressed 'q' or the camera stopped delivering frames.
    """
    recorded = []
    pitch_shifts = []

    def audio_callback(indata, frames, time_info, status):
        # Remember how far the video loop had got, so each chunk can later
        # be matched with the shifts sampled during its own time window.
        recorded.append((indata.copy(), len(pitch_shifts)))

    keep_running = True
    # The context manager starts the stream and stops *and closes* it on exit.
    with sd.InputStream(samplerate=samplerate, channels=1,
                        blocksize=chunk_size, callback=audio_callback):
        while True:
            frame = _read_frame()
            if frame is None:
                keep_running = False
                break

            HandDetector.getLandmarks(frame)
            if not HandDetector.isClosed():
                break

            thumbLoc = HandDetector.findThumb()
            indexLoc = HandDetector.findIndex()

            frequency_shift = getShift(np.linalg.norm(thumbLoc - indexLoc))
            pitch_shifts.append(frequency_shift)
            draw_freqShift(int(100 * (frequency_shift / intensity)) / 100, frame)

            cv.imshow('cam', frame)
            if cv.waitKey(1) == ord('q'):
                keep_running = False
                break

    return recorded, pitch_shifts, keep_running


def _play_back(recorded, pitch_shifts):
    """Pitch-shift every recorded chunk and play the result (blocking)."""
    if not recorded:
        return

    processed_audio = []
    window_start = 0
    current_shift = 0.0
    for chunk, window_end in recorded:
        window = pitch_shifts[window_start:window_end]
        if window:
            # Average of the shifts sampled while this chunk was captured;
            # with no new samples the previous chunk's shift is reused.
            current_shift = float(np.mean(window))
        window_start = window_end

        samples = chunk.flatten()
        try:
            shifted = librosa.effects.pitch_shift(
                samples, sr=samplerate, n_steps=_shift_to_semitones(current_shift))
        except Exception as exc:
            # Best effort: keep the audio, but say why it was not shifted.
            print(f"Pitch shift failed ({exc}); using unshifted audio")
            shifted = samples
        processed_audio.append(shifted)

    print("Playing back")
    # Clip after applying the gain so the amplified signal stays in [-1, 1].
    output = np.clip(np.concatenate(processed_audio) * volume, -1.0, 1.0)

    sd.play(output, samplerate)
    sd.wait()


try:
    while True:
        frame = _read_frame()
        if frame is None:
            break

        HandDetector.getLandmarks(frame)
        if HandDetector.isClosed():
            recorded, pitch_shifts, keep_running = _record_gesture()
            if not keep_running:
                break
            _play_back(recorded, pitch_shifts)

        cv.imshow('cam', frame)
        if cv.waitKey(1) == ord('q'):
            break
finally:
    # Runs on 'q', on a camera failure and on KeyboardInterrupt alike.
    cam.release()
    cv.destroyAllWindows()



Playing back
Playing back
Playing back
Playing back


KeyboardInterrupt: 