In [3]:
import gradio as gr
from transformers import pipeline
import pandas as pd
import numpy as np
from pynput import keyboard
import time
import json
from pynput.keyboard import Key, Controller, KeyCode

In [5]:

pipe = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-fused")

In [6]:
# Load the macros from the JSON file
with open('macros.json', 'r') as file:
    data = json.load(file)

# Extract macros from the loaded data
macros = data["macros"]

# Creates a controller
board = Controller()

# Create a dictionary to store macros
macro_functions = {}

# Iterates over every macro
for macro in macros:
    # Macro details
    name = macro["name"]
    keycodes = macro["keycodes"]
    delay = macro["delay"]

# Defines the macro function
    macro_function = lambda keycodes=keycodes, delay=delay: [
        # Press the key according to the keycode
        (board.press(Key.enter) if keycode == Key.enter else
        board.press(Key.space) if keycode == Key.space else
        # If keycode is a single character then press that key
        board.press(keycode) if len(keycode) == 1 else
        # Else if the key is in the form <...> then press that key
        board.press(KeyCode.from_vk(int(keycode[1:-1]))),
        # Delay
        time.sleep(delay),
        # Release the key according to the key code
        board.release(Key.enter) if keycode == Key.enter else
        board.release(Key.space) if keycode == Key.space else
        # If keycode is a single character then release that key
        board.release(keycode) if len(keycode) == 1 else
        # Else if the key is in the form <...> then release that key
        board.release(KeyCode.from_vk(int(keycode[1:-1]))))
        # Iterate over every keycode in the macro
        for keycode in keycodes
    ]

    # Add the macro function to the dictionary with the macro name as the key
    macro_functions[name] = macro_function
    

In [11]:
#empty array 
audio_array = np.empty([])

word_detected = False


actions = []
scores = [0.998,0.01,0.22]
for x in macro_functions.keys():
    actions.append(x)

#actions = ["nothing","clap","snap",]


#CLAP model's prediction function
def classify(audio):
    global audio_array, word_detected, actions
    i = 0
    
    samplerate, array = audio
    
    #converts the input live audio array to numpy array and appends it to current audio array
    live_array = np.array(array)
    audio_array = np.append(audio_array, live_array)
    
    dimension = audio_array.shape
    
    
    #removes 5 second of audio from array if it's longer than 5 seconds
    if((dimension[0]) > (240000)):
        audio_array = audio_array[-240000:]
        print("Audio size: ", audio_array.shape)
    
    #Pass audio to model for predictions
    result = pipe(audio_array, candidate_labels=actions)
    
    #Formats the result to a dataframe and retrives the highest scoring label
    df = pd.DataFrame(result)
    max = df.idxmax(numeric_only=True)
    index = int(max[0])
    word = df.iat[index,1]
    print(df)
    print(word)
    
    #Checks if the highest scored label was an action label
    if word != "nothing":
        word_detected = True
        print("word was detected: ", word)
        print("pressing macros...")
        macro_functions[word]()
        
   
    #if word is detected or if the audio reaches 5 seconds, 
    #then reset the audio array to empty
    if (word_detected == True):
        audio_array = np.empty([1,1])
        print("All contents of audio array was removed.")
        word_detected = False
    
    return df

In [29]:
#Input - list of strings, list of corresponding scores
#Output - list of dictionaries where 'score' contains score corresponding to 'label' value
def joinDictionary (labels, scores):
    combinedDictionary = []
    dictionary = {'score': 0 ,'label':'test'}
    for i in range(len(labels)):
        #Create new dictionary for each label
        dictionary['score'] = scores[i]
        dictionary['label'] = labels[i]
        #add new dictionary to combined list
        combinedDictionary.append(dictionary.copy())
    return combinedDictionary

In [30]:
result = joinDictionary(actions,scores)
result

[{'score': 0.998, 'label': 'snap'}, {'score': 0.01, 'label': 'clap'}]

In [17]:
#Settings for Gradio Interface
demo = gr.Interface(
    fn=classify,
    inputs=gr.Audio(sources=["microphone"],streaming=True),
    outputs=[gr.Dataframe()],
    live=True,
    
)

In [18]:
#Launches Gradio Interface
demo.launch()

Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




  index = int(max[0])


      score    label
0  0.985457  nothing
1  0.012450     clap
2  0.002093     snap
nothing


  index = int(max[0])


      score    label
0  0.984766  nothing
1  0.007645     clap
2  0.007589     snap
nothing


  index = int(max[0])


      score    label
0  0.815339  nothing
1  0.124464     clap
2  0.060197     snap
nothing


  index = int(max[0])


      score    label
0  0.907062  nothing
1  0.049106     clap
2  0.043832     snap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.997823  nothing
1  0.002063     snap
2  0.000114     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.999049  nothing
1  0.000853     snap
2  0.000098     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.987953  nothing
1  0.011319     snap
2  0.000728     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.990216  nothing
1  0.008543     snap
2  0.001242     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.907811  nothing
1  0.074624     snap
2  0.017565     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.667695  nothing
1  0.249116     snap
2  0.083189     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.661870  nothing
1  0.254722     snap
2  0.083407     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.992509  nothing
1  0.005514     snap
2  0.001976     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.994151  nothing
1  0.003092     snap
2  0.002757     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.666250  nothing
1  0.280024     snap
2  0.053726     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.566406  nothing
1  0.333300     snap
2  0.100294     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.567533  nothing
1  0.324499     snap
2  0.107968     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.433014     snap
1  0.378117  nothing
2  0.188869     clap
snap
word was detected:  snap
pressing macros...
All contents of audio array was removed.
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.781083  nothing
1  0.116421     snap
2  0.102496     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.646688  nothing
1  0.252831     snap
2  0.100482     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.897787  nothing
1  0.082384     snap
2  0.019829     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.886087  nothing
1  0.100275     snap
2  0.013638     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.987812  nothing
1  0.009721     snap
2  0.002466     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.972438  nothing
1  0.023564     snap
2  0.003998     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.986834  nothing
1  0.009905     snap
2  0.003261     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.987566  nothing
1  0.009104     snap
2  0.003331     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.977448  nothing
1  0.017496     snap
2  0.005056     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.989988  nothing
1  0.006669     snap
2  0.003343     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.982173  nothing
1  0.013370     snap
2  0.004457     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.598388  nothing
1  0.312322     snap
2  0.089290     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.539362  nothing
1  0.268283     snap
2  0.192356     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.477945  nothing
1  0.386492     snap
2  0.135564     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.433478  nothing
1  0.387610     snap
2  0.178911     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.561025     snap
1  0.248823  nothing
2  0.190152     clap
snap
word was detected:  snap
pressing macros...
All contents of audio array was removed.
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.799757  nothing
1  0.107882     clap
2  0.092362     snap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.805852  nothing
1  0.097949     snap
2  0.096200     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.791509  nothing
1  0.152730     snap
2  0.055761     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.768896  nothing
1  0.194687     snap
2  0.036416     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.949550  nothing
1  0.041568     snap
2  0.008882     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.986738  nothing
1  0.012034     snap
2  0.001228     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.997641  nothing
1  0.001880     snap
2  0.000479     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.996247  nothing
1  0.003282     snap
2  0.000471     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.997347  nothing
1  0.002475     snap
2  0.000178     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.992797  nothing
1  0.005993     snap
2  0.001211     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.966876  nothing
1  0.027527     snap
2  0.005596     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.988042  nothing
1  0.006499     clap
2  0.005459     snap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.984959  nothing
1  0.007805     clap
2  0.007236     snap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.993608  nothing
1  0.004030     snap
2  0.002361     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.993374  nothing
1  0.005826     snap
2  0.000800     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.998080  nothing
1  0.001735     snap
2  0.000185     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.998043  nothing
1  0.001752     snap
2  0.000204     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.995226  nothing
1  0.004398     snap
2  0.000376     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.998329  nothing
1  0.001463     snap
2  0.000208     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.998071  nothing
1  0.001284     snap
2  0.000645     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.996904  nothing
1  0.002581     snap
2  0.000515     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.996392  nothing
1  0.002989     snap
2  0.000619     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


     score    label
0  0.99391  nothing
1  0.00499     snap
2  0.00110     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.963997  nothing
1  0.027330     snap
2  0.008674     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.949888  nothing
1  0.034074     snap
2  0.016038     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.864726  nothing
1  0.088720     snap
2  0.046554     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.824729  nothing
1  0.096748     clap
2  0.078523     snap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.825330  nothing
1  0.126974     clap
2  0.047696     snap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.723237  nothing
1  0.152603     clap
2  0.124159     snap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.939379  nothing
1  0.032878     clap
2  0.027743     snap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.911107  nothing
1  0.048734     snap
2  0.040159     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.926012  nothing
1  0.040159     snap
2  0.033830     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.942254  nothing
1  0.031245     clap
2  0.026501     snap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.962089  nothing
1  0.024177     snap
2  0.013734     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.854061  nothing
1  0.093128     snap
2  0.052811     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.810118  nothing
1  0.116175     snap
2  0.073707     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.832078  nothing
1  0.119724     snap
2  0.048198     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.962130  nothing
1  0.025601     snap
2  0.012269     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.938408  nothing
1  0.044543     snap
2  0.017049     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.977673  nothing
1  0.014915     snap
2  0.007412     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.974650  nothing
1  0.016721     snap
2  0.008629     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.989532  nothing
1  0.006993     snap
2  0.003475     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.983471  nothing
1  0.011633     snap
2  0.004896     clap
nothing
Audio size:  (240000,)


  index = int(max[0])


      score    label
0  0.963917  nothing
1  0.020486     snap
2  0.015597     clap
nothing
