In [2]:
import mediapipe as mp
import cv2 as cv
import numpy as np
import time
import sounddevice as sd
import librosa

In [3]:
class HandDetector:
    """Static helper that runs MediaPipe Hands on a frame and answers
    simple geometric questions about the first detected hand.

    All state lives on the class: ``getLandmarks`` stores the latest detection
    in ``HandDetector.landmarks`` and the other methods read it from there.
    """

    # Colour conversion code applied to every frame before it is processed.
    color_scheme = cv.COLOR_BGR2RGB
    # Single shared MediaPipe Hands instance, created when the class is defined.
    hands = mp.solutions.hands.Hands()
    # Latest ``multi_hand_landmarks`` result; empty or None means "no hand".
    landmarks = []
    # ``isClosed`` is True when the middle-tip-to-wrist distance is <= this.
    closure_threshold = 0.25
    # Sentinel distance reported when no hand is available (far above threshold).
    _NO_HAND_DISTANCE = 1000

    @staticmethod
    def _landmarkXY(landmark_id) -> np.ndarray:
        """Return ``[x, y]`` of ``landmark_id`` on the first hand.

        Returns an empty array when no hand is available. This covers both
        ``None`` (nothing detected) and the initial empty list, which the
        original ``!= None`` test let through to an IndexError.
        """
        if not HandDetector.landmarks:
            return np.array([])
        point = HandDetector.landmarks[0].landmark[landmark_id]
        return np.array([point.x, point.y])

    @staticmethod
    def getLandmarks(frame):
        """Run hand detection on ``frame`` and cache the result.

        ``frame`` is colour-converted with ``color_scheme`` and passed to
        ``hands.process``; its ``multi_hand_landmarks`` attribute is stored in
        ``HandDetector.landmarks``. Nothing is returned.
        """
        converted = cv.cvtColor(frame, HandDetector.color_scheme)
        detection = HandDetector.hands.process(converted)
        HandDetector.landmarks = detection.multi_hand_landmarks

    @staticmethod
    def findThumb() -> np.ndarray:
        """Return ``[x, y]`` of the thumb tip, or an empty array without a hand."""
        return HandDetector._landmarkXY(mp.solutions.hands.HandLandmark.THUMB_TIP)

    @staticmethod
    def findIndex() -> np.ndarray:
        """Return ``[x, y]`` of the index fingertip, or an empty array without a hand."""
        return HandDetector._landmarkXY(mp.solutions.hands.HandLandmark.INDEX_FINGER_TIP)

    @staticmethod
    def getMiddleToWristDistance() -> float:
        """Return the 2-D distance between the middle fingertip and the wrist.

        Returns the sentinel value 1000 when no hand is available, so that
        ``isClosed`` reports ``False`` in that case.
        """
        middle_tip = HandDetector._landmarkXY(mp.solutions.hands.HandLandmark.MIDDLE_FINGER_TIP)
        wrist = HandDetector._landmarkXY(mp.solutions.hands.HandLandmark.WRIST)
        if middle_tip.size == 0 or wrist.size == 0:
            return HandDetector._NO_HAND_DISTANCE
        return float(np.linalg.norm(middle_tip - wrist))

    @staticmethod
    def isClosed() -> bool:
        """Return True when the middle-tip-to-wrist distance is within ``closure_threshold``."""
        return bool(HandDetector.getMiddleToWristDistance() <= HandDetector.closure_threshold)


In [4]:
class Recorder:
    """Placeholder recorder: every method is currently a no-op stub."""

    def __init__(self, recording):
        """Accept ``recording`` without storing or using it."""

    def record(self) -> None:
        """Stub; does nothing and returns ``None``."""

    def changeFrequency(self, newFrequency) -> None:
        """Stub; ignores ``newFrequency`` and returns ``None``."""

    def playback(self) -> None:
        """Stub; does nothing and returns ``None``."""

In [None]:
# --- Tunable parameters ---------------------------------------------------
sensitivity = 25  # decay rate of the pitch control as thumb and index move apart
intensity = 2     # multiplier turning log2(control value) into semitones

epsilon = 0.1     # currently unused


samplerate = 44100
chunk_duration = 0.5
chunk_size = int(samplerate * chunk_duration)
volume = 3.0


def _read_mirrored_frame(capture):
    """Return the next frame flipped horizontally, or None if the read failed."""
    ok, raw = capture.read()
    if not ok or raw is None:
        return None
    return cv.flip(raw, 1)


def _current_pitch_shift():
    """Return the pitch control value for the cached landmarks, or None without a hand."""
    thumbLoc = HandDetector.findThumb()
    indexLoc = HandDetector.findIndex()
    # Both finders return an empty array when no hand is detected; the norm of
    # an empty difference is 0, which would otherwise yield a bogus 10000.
    if thumbLoc.size == 0 or indexLoc.size == 0:
        return None
    return 10000 * np.exp(-1 * sensitivity * np.linalg.norm(indexLoc - thumbLoc))


def _semitones_per_chunk(shift_times, shift_values, n_chunks):
    """Return one semitone offset per audio chunk.

    Pitch values are sampled once per video frame while audio arrives in
    ``chunk_duration``-second blocks, so they cannot be paired by index.
    Each chunk gets the mean of the samples whose timestamp (seconds since
    recording started) falls inside its window; a chunk with no samples
    reuses the previous chunk's value (0 for the first).
    """
    times = np.asarray(shift_times, dtype=float)
    semitones = np.array(
        [intensity * np.log2(s) if s > 0 else 0.0 for s in shift_values],
        dtype=float,
    )
    plan = []
    current = 0.0
    for i in range(n_chunks):
        in_window = (times >= i * chunk_duration) & (times < (i + 1) * chunk_duration)
        if in_window.any():
            current = float(semitones[in_window].mean())
        plan.append(current)
    return plan


def _render_playback(chunks, shift_times, shift_values):
    """Pitch-shift every recorded chunk and return the joined signal, or None if empty."""
    plan = _semitones_per_chunk(shift_times, shift_values, len(chunks))
    processed_audio = []
    for chunk, semitones in zip(chunks, plan):
        samples = chunk.flatten()
        try:
            processed_audio.append(
                librosa.effects.pitch_shift(samples, sr=samplerate, n_steps=semitones)
            )
        except Exception as exc:
            # Best effort: keep the unshifted audio, but report what went wrong.
            print(f"Pitch shift failed ({exc!r}); using unshifted chunk")
            processed_audio.append(samples)
    if not processed_audio:
        return None
    # Keep amplified samples inside the [-1, 1] range.
    return np.clip(np.concatenate(processed_audio) * volume, -1.0, 1.0)


cam = cv.VideoCapture(0)
quit_requested = False

try:
    while not quit_requested:
        frame = _read_mirrored_frame(cam)
        if frame is None:
            break
        HandDetector.getLandmarks(frame)

        handClosed = HandDetector.isClosed()
        if handClosed:
            audio_chunks = []
            pitch_shifts = []
            shift_times = []

            def audio_callback(indata, frames, time_info, status):
                # Called by sounddevice for every block of recorded samples.
                if status:
                    print(status)
                audio_chunks.append(indata.copy())

            # The context manager starts the stream and stops/closes it on
            # exit, including when an exception interrupts the loop.
            with sd.InputStream(samplerate=samplerate, channels=1,
                                blocksize=chunk_size, callback=audio_callback):
                record_start = time.monotonic()
                while handClosed:
                    frame = _read_mirrored_frame(cam)
                    if frame is None:
                        break
                    HandDetector.getLandmarks(frame)

                    frequency_shift = _current_pitch_shift()
                    if frequency_shift is not None:
                        pitch_shifts.append(frequency_shift)
                        shift_times.append(time.monotonic() - record_start)
                        print(frequency_shift)

                    handClosed = HandDetector.isClosed()

                    cv.imshow('cam', frame)
                    if cv.waitKey(1) == ord('q'):
                        quit_requested = True
                        break

            if quit_requested:
                break

            # Play back what was recorded while the hand was closed.
            output = _render_playback(list(audio_chunks), shift_times, pitch_shifts)
            if output is not None:
                print("Playing back")
                sd.play(output, samplerate)
                sd.wait()

            if frame is None:
                break

        cv.imshow('cam', frame)
        if cv.waitKey(1) == ord('q'):
            quit_requested = True
finally:
    # Always free the camera and close windows, even on Ctrl-C or an error.
    cam.release()
    cv.destroyAllWindows()



94.13026040839989
100.52525449115223
95.21155351864618
54.862401107211156
56.09907691241127
47.67241695385338
43.403714369447634
36.76261721744163
30.454220244606404
15.0591098377365
13.962514090894173
12.744165909889821
11.48770972414814
9.943297631351252
9.160765366284311
8.389142543108399
7.381271824606963
6.858583220370524
6.511294879293391
5.940109895163551
5.356024921728641
5.47940005443541
5.309494203826892
5.538581244600092
5.538058380342028
5.5050716011472005
5.3998304284960374
5.619724639162352
4.98090423729946
5.52225994255583
4.993396531604405
5.134960994452202
5.074657461984561
4.878379211474169
4.991432128245769
4.89084938800341
4.651597594471334
4.50901641130188
4.642758390714349
4.600374233256506
4.584131744190069
5.059107462088599
4.6652822095939595
4.851778836098202
5.055955023949996
4.854022192830215
6.882158521219551
14.371915993760798
46.33886981560573
704.3819783716735
735.2525557833221
816.0389466237477
927.1900698048153
917.9326221058949
924.0607347207178
875.34

KeyboardInterrupt: 