## This code is for Training and Evaluation of Custom Multilingual KWS
Audio File should be in the format of **1-sec wav files**.


Datasets needs:
1. Background noise from Google Speech Command v2
2. Trained Multilingual Embedding Features (ref:https://github.com/harvard-edge/multilingual_kws) 
3. Target keyword for **training** from **microphone**
4. Unknow_file as non-Target audio file for **training**
5. Testing from **real-time audio**

_training KWS code based on https://github.com/harvard-edge/multilingual_kws_

In [1]:
# Run  this
#!pip install ipywidgets widgetsnbextension pandas-profiling
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


### 1) Ask User to Type Custom Keyword
keyword name should be without any space

In [74]:
KEYWORD = "heybonnie"

### 2) Record audio from microphone

In [75]:
import os
from func import audio_record, kws_train, input_data
from pathlib import Path

pwd_path = !pwd
BASE_DIR = pwd_path[0]


keyword_dir = os.path.join(BASE_DIR , './content/target_kw/recording/',KEYWORD )
print(keyword_dir)
if not os.path.exists(keyword_dir):
    os.mkdir(keyword_dir)
record_name = KEYWORD+'.wav'
print(os.path.join(keyword_dir, record_name))

/Users/nasim/Documents/Active_2023/MLEng_bootcampt/Spotify/code/multilingual_kws_v3/./content/target_kw/recording/heybonnie
/Users/nasim/Documents/Active_2023/MLEng_bootcampt/Spotify/code/multilingual_kws_v3/./content/target_kw/recording/heybonnie/heybonnie.wav


call record function:

### 3) Audio Segmentation
via pyannote speaker segmentation

In [77]:
from func import spk_segment
print(keyword_dir)

spk_segments_path = spk_segment.segment (KEYWORD,keyword_dir)

/Users/nasim/Documents/Active_2023/MLEng_bootcampt/Spotify/code/multilingual_kws_v3/./content/target_kw/recording/heybonnie


wav_path: No such file or directory


1.5746875 2.4859375000000004 0.08868749999999954
3.4478125000000004 4.0046875 0.44306250000000047
5.034062500000001 6.147812500000001 0.44306250000000047
6.8734375 7.5315625 0.3418125
8.560937500000001 9.1515625 0.4093125000000018
10.518437500000001 11.1596875 0.3586875000000018
12.3746875 13.2184375 0.1561875
14.4840625 15.1759375 0.3080625
16.9478125 17.6903125 0.2574375
19.5634375 20.2721875 0.2911875
21.892187500000002 22.6515625 0.24056250000000365
23.9846875 24.524687500000002 0.45993749999999634
26.4484375 27.0390625 0.4093125
28.844687500000003 29.688437500000003 0.1561875
Speaker Segmentation Completed.


### 4) Training

In [78]:
spk_segments_path

'/Users/nasim/Documents/Active_2023/MLEng_bootcampt/Spotify/code/multilingual_kws_v3/./content/target_kw/recording/heybonnie/extractions'

In [79]:

base_model_dir = Path(BASE_DIR).joinpath('./content/multilingual_context_73_0.8011')
unknown_words_dir = Path(BASE_DIR).joinpath('./content/unknown_files/')
background_noise_dir =  Path(BASE_DIR).joinpath('./content/speech_commands_v0.02/_background_noise_/')


model, five_samples, dev_samples, test_samples = kws_train.train(keyword= KEYWORD,
                 samples_dir= spk_segments_path,
                 embedding= base_model_dir,
                 unknown_words= unknown_words_dir,
                 background_noise= background_noise_dir)

14 training samples found:

train_percent: 36.0
size of validation and test sets: 9 9
Training model




shape_train_init: 5
shape_valid_init: 9
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [80]:
# keyword_dir = os.path.join(pwd_path[0] + '/content/target_kw/recording/',KEYWORD )

#import model
#model, five_samples, dev_samples, test_samples= kws_train.train(KEYWORD, spk_segments_path)


offline evaluation

In [81]:
nontarget_dir = Path(BASE_DIR).joinpath('./content/nontarget_mswc_microset_wav/en/clips/')
background_noise_dir =  Path(BASE_DIR).joinpath('./content/speech_commands_v0.02/_background_noise_/')


target_pred, nontarget_pred= kws_train.predict(KEYWORD, model, test_samples,
                                   nontarget_dir, background_noise_dir )
frr_val,far_val, _, _= kws_train.report_results (target_pred, nontarget_pred)
frr_val,far_val

Number of non-target examples 9
Test accuracy on testset: 0.78
Estimated accuracy on non-target samples: 1.00
FRR: 22.222 %
FAR: 0.0 %


(0.2222222222222222, 0.0)

In [82]:
import glob
import tensorflow as tf
import numpy as np
file_lst = glob.glob(os.path.join(spk_segments_path, "*.wav"))
filepath = file_lst[0]
filepath

'/Users/nasim/Documents/Active_2023/MLEng_bootcampt/Spotify/code/multilingual_kws_v3/./content/target_kw/recording/heybonnie/extractions/009_heybonnie_detection_16.948sec.wav'

In [83]:

audio_binary = tf.io.read_file(filepath)

#===========================================
model_settings = input_data.standard_microspeech_model_settings(label_count=1)

audio, _ = tf.audio.decode_wav(
audio_binary,
desired_channels=1,
desired_samples=model_settings["desired_samples"]
)

audio = tf.squeeze(audio, axis=-1)

#===========================================
# Test the trained FS-KWS model on test sets
test_spectrograms = input_data.to_micro_spectrogram(model_settings, audio)
print(np.asarray(test_spectrograms).shape)
test_spectrograms = test_spectrograms[np.newaxis, :, :, np.newaxis]
print(np.asarray(test_spectrograms).shape)

# fetch softmax predictions from the finetuned model:
predictions = model.predict(test_spectrograms)
categorical_predictions = np.argmax(predictions[0])

predictions[0], categorical_predictions

(49, 40)
(1, 49, 40, 1)


(array([0.00607816, 0.02762094, 0.9663009 ], dtype=float32), 2)

### Evaluation 
(from mic and process in 1 sec audio chunk)


Configure the KWS model with
* detection threshold
* wav file location
* Keyword to use as a label
* Filename to save results to

In [84]:
import sys
import pickle


In [87]:
import pyaudio
import numpy as np
import struct
import time


def stream_proc_audio (duration: int, model):
    p = pyaudio.PyAudio()

    def _find_input_device():
        device_index = None            
        for i in range( p.get_device_count() ):     
            devinfo = p.get_device_info_by_index(i)   
            print( "Device %d: %s"%(i,devinfo["name"]) )

            for record_name in ["mic","input"]:
                if record_name in devinfo["name"].lower():
                    print( "Found an input: device %d - %s"%(i,devinfo["name"]) )
                    device_index = i
                    return device_index

        if device_index == None:
            print( "No preferred input found; using default input device." )

        return device_index

    device_index = _find_input_device()


    CHANNELS = 1
    RATE = 16000
    INPUT_BLOCK_TIME = 0.0125
    CHUNK = int(RATE*INPUT_BLOCK_TIME) # FRAMES_PER_BUFFER

    def _eval_stream (KEYWORD, model,frames):
        model_settings = input_data.standard_microspeech_model_settings(label_count=3)

        #===========================================
        # Test the trained FS-KWS model on test sets
        test_spectrograms = input_data.to_micro_spectrogram(model_settings, frames)
        #print(np.asarray(test_spectrograms).shape)
        test_spectrograms = test_spectrograms[np.newaxis, :, :, np.newaxis]
        #print(np.asarray(test_spectrograms).shape)

        # fetch softmax predictions from the finetuned model:
        pred = model.predict(test_spectrograms)
        categorical_pred = np.argmax(pred, axis=1)
        return pred,categorical_pred

    stream = p.open(format=pyaudio.paFloat32,
                    channels=CHANNELS,
                    rate=RATE,
                    output=True,
                    input=True,
                    input_device_index = device_index,
                    frames_per_buffer=CHUNK)

    print("Start Streaming...")
    frames = np.array([])
    for i in range(int(duration*RATE/CHUNK)): #go for a LEN seconds
        block = np.fromstring(stream.read(CHUNK),dtype=np.float32)
        frames = np.append(frames, block)
        
        
        if len(frames) == RATE: # 1-sec audio captures
            t = time.time()
            #pred, categorical_pred = kws_train.eval_stream(KEYWORD, model,frames)
            pred, categorical_pred = _eval_stream (KEYWORD, model,frames)
            
            if categorical_pred == 1:
                if pred[0][categorical_pred] >= 0.8:
                    print( "Other words")
            elif categorical_pred == 2:
                if pred[0][categorical_pred] >= 0.8:
                    print( "KEYWORD")
            elif categorical_pred == 0:
                if pred[0][categorical_pred] >= 0.55:
                    print("Background Noise/Silence")
            frames = []
            print("processing time for a chunk:", time.time() - t)



    stream.stop_stream()
    stream.close()
    p.terminate()
    #write(os.path.join(record_save_path,record_name),  sample_rate, wav)
    print("Sreaming Completed")
    return False

In [88]:
# processing time for 1-sec audio is on average 60 ms
duration = 20
stream_flag = stream_proc_audio(duration, model)

Device 0: MacBook Air Microphone
Found an input: device 0 - MacBook Air Microphone
Start Streaming...
Background Noise/Silence
processing time for a chunk: 0.056452035903930664
Other words
processing time for a chunk: 0.06367111206054688
processing time for a chunk: 0.06404590606689453
processing time for a chunk: 0.06337714195251465
processing time for a chunk: 0.06424808502197266
Background Noise/Silence
processing time for a chunk: 0.059992074966430664
Background Noise/Silence
processing time for a chunk: 0.06065011024475098
Other words
processing time for a chunk: 0.05787372589111328
processing time for a chunk: 0.058476924896240234
Background Noise/Silence
processing time for a chunk: 0.05749201774597168
Other words
processing time for a chunk: 0.06404995918273926
processing time for a chunk: 0.05981302261352539
processing time for a chunk: 0.06360292434692383
KEYWORD
processing time for a chunk: 0.05605816841125488
Background Noise/Silence
processing time for a chunk: 0.058536052