In [200]:
import gradio as gr
from transformers import pipeline
import pandas as pd
import numpy as np
from pynput import keyboard
import time
import json
from pynput.keyboard import Key, Controller, KeyCode
import eng_to_ipa as ipa
import librosa

In [201]:
import sys 
sys.path.append("./clap-ipa")
from clap.encoders import *
import torch.nn.functional as F
from transformers import DebertaV2Tokenizer, AutoProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

speech_encoder = SpeechEncoder.from_pretrained('anyspeech/clap-ipa-tiny-speech')
phone_encoder = PhoneEncoder.from_pretrained('anyspeech/clap-ipa-tiny-phone')
phone_encoder.eval().to(device)
speech_encoder.eval().to(device)

tokenizer = DebertaV2Tokenizer.from_pretrained('charsiu/IPATokenizer')
processor = AutoProcessor.from_pretrained('openai/whisper-tiny')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [202]:
pipe = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-fused")

In [203]:
import time

def ipa_converted_vals(labels, audio):
    ipa_val_list = []

    for x in range(len(labels)):
        start = time.time()
        audio_input = processor(audio, return_attention_mask=True, sampling_rate=16000,max_length=32000,return_tensors='pt')
        #ipa_input = tokenizer(ipa.convert(labels[x], keep_punct=True), return_token_type_ids=False, return_tensors='pt')
        ipa_input = tokenizer(labels[x], return_token_type_ids=False, return_tensors='pt')
        with torch.no_grad():
            speech_embed = speech_encoder(**audio_input).pooler_output
            phone_embed = phone_encoder(**ipa_input).pooler_output

        similarity = F.cosine_similarity(speech_embed,phone_embed,dim=-1)
        ipa_val_list.append((similarity.cpu().detach().numpy())[0])

    return ipa_val_list

In [204]:
clap_ipa_sr = 16000
some_audio,sampling_rate = librosa.load("tutorials/audio-samples/angelo-charge.mp3")
#some_audio = librosa.resample(some_audio, orig_sr=sampling_rate, target_sr=16000)
#sampling_rate = 16000
ipa_converted_vals(["ˌriːˈləʊdɪŋ"], some_audio)

tensor([[[-0.0621, -0.0983, -0.1066,  ..., -0.0855, -0.1235, -0.1019],
         [-0.2386, -0.2121, -0.2164,  ..., -0.1248, -0.2898, -0.1806],
         [-0.4460, -0.3629, -0.4164,  ..., -0.2980, -0.4323, -0.3578],
         ...,
         [-0.9251, -0.9251, -0.9251,  ..., -0.9251, -0.9251, -0.9251],
         [-0.9251, -0.9251, -0.9251,  ..., -0.9251, -0.9251, -0.9251],
         [-0.9251, -0.9251, -0.9251,  ..., -0.9251, -0.9251, -0.9251]]])


[0.2577246]

In [205]:
np.mean(some_audio)

-1.5587171e-06

In [206]:
sampling_rate

22050

In [207]:
# Load the macros from the JSON file
with open('macros.json', 'r') as file:
    data = json.load(file)

# Extract macros from the loaded data
macros = data["macros"]

# Creates a controller
board = Controller()

# Create a dictionary to store macros
macro_functions = {}

# Iterates over every macro
for macro in macros:
    # Macro details
    name = macro["name"]
    keycodes = macro["keycodes"]
    delay = macro["delay"]

# Defines the macro function
    macro_function = lambda keycodes=keycodes, delay=delay: [
        # Press the key according to the keycode
        (board.press(Key.enter) if keycode == Key.enter else
        board.press(Key.space) if keycode == Key.space else
        # If keycode is a single character then press that key
        board.press(keycode) if len(keycode) == 1 else
        # Else if the key is in the form <...> then press that key
        board.press(KeyCode.from_vk(int(keycode[1:-1]))),
        # Delay
        time.sleep(delay),
        # Release the key according to the key code
        board.release(Key.enter) if keycode == Key.enter else
        board.release(Key.space) if keycode == Key.space else
        # If keycode is a single character then release that key
        board.release(keycode) if len(keycode) == 1 else
        # Else if the key is in the form <...> then release that key
        board.release(KeyCode.from_vk(int(keycode[1:-1]))))
        # Iterate over every keycode in the macro
        for keycode in keycodes
    ]

    # Add the macro function to the dictionary with the macro name as the key
    macro_functions[name] = macro_function
    

In [208]:
#Input - list of strings, list of corresponding scores
#Output - list of dictionaries where 'score' contains score corresponding to 'label' value
def joinDictionary (labels, scores):
    combinedDictionary = []
    dictionary = {'score': 0 ,'label':'test'}
    for i in range(len(labels)):
        #Create new dictionary for each label
        dictionary['score'] = scores[i]
        dictionary['label'] = labels[i]
        #add new dictionary to combined list
        combinedDictionary.append(dictionary.copy())
    return combinedDictionary

In [209]:
#empty array 
audio_array = np.empty([])

word_detected = False

actions = ["silence"]
#scores = [0.998,0.01,0.22]
for x in macro_functions.keys():
    actions.append(x)



#CLAP model's prediction function
def classify(audio):
    global audio_array, word_detected, actions
    i = 0
    
    samplerate, array = audio
    
    #converts the input live audio array to numpy array and appends it to current audio array
    live_array = np.array(array)
    audio_array = np.append(audio_array, live_array)
    
    dimension = audio_array.shape
    
    
    #removes 5 second of audio from array if it's longer than 5 seconds
    if((dimension[0]) > (240000)):
        audio_array = audio_array[-240000:]
        print("Audio size: ", audio_array.shape)
    
    #Pass audio to model for predictions
    result = pipe(audio_array/32768, candidate_labels=actions)
    
    #Formats the result to a dataframe and retrives the highest scoring label
    df = pd.DataFrame(result)
    max = df.idxmax(numeric_only=True)
    index = int(max[0])
    word = df.iat[index,1]
    print(df)
    print(word)
    
    #Checks if the highest scored label was an action label
    if word != "silence":
        word_detected = True
        print("word was detected: ", word)
        print("pressing macros...")
        macro_functions[word]()
        
   
    #if word is detected or if the audio reaches 5 seconds, 
    #then reset the audio array to empty
    if (word_detected == True):
        audio_array = np.empty([1,1])
        print("All contents of audio array was removed.")
        word_detected = False
    
    return df

In [210]:
#Settings for Gradio Interface
demo = gr.Interface(
    fn=classify,
    inputs=gr.Audio(sources=["microphone"],streaming=True),
    outputs=[gr.Dataframe()],
    live=True,
    
)

In [211]:
#Launches Gradio Interface
demo.launch()

Running on local URL:  http://127.0.0.1:7899

To create a public link, set `share=True` in `launch()`.




In [212]:
ncoamencoamerelncoamerelncoamencoamerel

NameError: name 'ncoamencoame' is not defined

  index = int(max[0])


      score          label
0  0.933101        silence
1  0.054790  a single clap
2  0.012109          voice
silence


  index = int(max[0])


      score          label
0  0.989742        silence
1  0.009274          voice
2  0.000984  a single clap
silence


  index = int(max[0])


      score          label
0  0.997286        silence
1  0.001994          voice
2  0.000719  a single clap
silence
Audio size:  (240000,)


  index = int(max[0])


      score          label
0  0.996898        silence
1  0.001915          voice
2  0.001187  a single clap
silence
Audio size:  (240000,)


  index = int(max[0])


      score          label
0  0.896285        silence
1  0.098730  a single clap
2  0.004985          voice
silence
Audio size:  (240000,)


  index = int(max[0])


      score          label
0  0.759861          voice
1  0.239913        silence
2  0.000226  a single clap
voice
word was detected:  voice
pressing macros...
