<a href="https://www.kaggle.com/code/joshuaokolo/device-voice-remover?scriptVersionId=104052792" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Device Voice Remover

Removes the device's voice (e.g. Siri, Google, Alexa) from an audio recording. More precisely, it selects the user's (speaker's) voice from the mixed audio sample.

# Import Dataset

In [None]:
# Modified file from the original version of noisyspeech_synthesizer.py
# Source: https://github.com/microsoft/MS-SNSD/blob/master/noisyspeech_synthesizer.py
# @author: chkarada

import glob
import numpy as np
import soundfile as sf
import os
import argparse
import configparser as CP
from audiolib import audioread, audiowrite, snr_mixer

def main(cfg):
    """Synthesize a noisy-speech dataset from clean-speech and noise clips.

    For every generated utterance, clean clips are concatenated (separated by
    silence) until they exceed the configured length, a noise track of equal
    length is assembled the same way, and the pair is mixed once per SNR
    level with ``snr_mixer``. The mixed, clean and noise signals are written
    to ``NoisySpeech_training``, ``CleanSpeech_training`` and
    ``Noise_training`` next to this script.

    Args:
        cfg: Mapping of option name to string value. Keys read:
            ``snr_lower``, ``snr_upper``, ``total_snrlevels``, ``speech_dir``,
            ``noise_dir``, ``sampling_rate``, ``audioformat``, ``total_hours``,
            ``audio_length``, ``silence_length``, ``noise_types_excluded``.
            The literal string ``'None'`` for ``speech_dir`` / ``noise_dir``
            selects the default ``clean_train`` / ``noise_train`` folders, and
            for ``noise_types_excluded`` disables filtering.

    Raises:
        AssertionError: If the clean-speech or noise directory does not exist.
        ValueError: If no clean-speech files, or no noise files after
            exclusion, match ``audioformat``.
    """
    snr_lower = float(cfg["snr_lower"])
    snr_upper = float(cfg["snr_upper"])
    total_snrlevels = float(cfg["total_snrlevels"])

    script_dir = os.path.dirname(__file__)

    clean_dir = os.path.join(script_dir, 'clean_train')
    if cfg["speech_dir"] != 'None':
        clean_dir = cfg["speech_dir"]
    if not os.path.exists(clean_dir):
        # Raised explicitly instead of `assert False` so the check is not
        # stripped when Python runs with -O.
        raise AssertionError("Clean speech data is required")

    noise_dir = os.path.join(script_dir, 'noise_train')
    if cfg["noise_dir"] != 'None':
        noise_dir = cfg["noise_dir"]
    if not os.path.exists(noise_dir):
        raise AssertionError("Noise data is required")

    # NOTE: the configured sampling rate is parsed (so a missing or malformed
    # value still fails early) but is then overridden by the hard-coded
    # 16000 that the notebook author introduced. `fs` is later rebound to
    # whatever `audioread` returns.
    fs = float(cfg["sampling_rate"])
    fs = 16000  # change
    audioformat = cfg["audioformat"]
    total_hours = float(cfg["total_hours"])
    audio_length = float(cfg["audio_length"])
    silence_length = float(cfg["silence_length"])

    total_secs = total_hours * 60 * 60
    total_samples = int(total_secs * fs)
    audio_length = int(audio_length * fs)  # seconds -> samples

    # Modification
    # Original: SNR = np.linspace(snr_lower, snr_upper, total_snrlevels)
    # Change: replace 'total_snrlevels' with 'int(total_snrlevels)'
    # Reason: np.linspace expects an integer number of points.
    SNR = np.linspace(snr_lower, snr_upper, int(total_snrlevels))  # change with int
    # Modification
    # New line: 'SNR = np.round(SNR, 5)'
    # Reason: To set SNR levels at consistent intervals (these values also
    # end up in the output file names).
    SNR = np.round(SNR, 5)  # added

    cleanfilenames = glob.glob(os.path.join(clean_dir, audioformat))
    noisefilenames = glob.glob(os.path.join(noise_dir, audioformat))
    if cfg["noise_types_excluded"] != 'None':
        # Strip whitespace so "Babble, AirConditioner" matches both tags, and
        # drop empty tags (e.g. from a trailing comma), which would otherwise
        # match - and exclude - every file.
        excluded_tags = tuple(
            tag.strip()
            for tag in cfg["noise_types_excluded"].split(',')
            if tag.strip()
        )
        noisefilenames = [
            fn for fn in noisefilenames
            if not os.path.basename(fn).startswith(excluded_tags)
        ]

    # Fail with a clear message before creating any output directory; without
    # this check np.random.randint(0, 0) raises an opaque ValueError.
    if not cleanfilenames:
        raise ValueError(
            f"No clean speech files matching '{audioformat}' found in '{clean_dir}'"
        )
    if not noisefilenames:
        raise ValueError(
            f"No noise files matching '{audioformat}' left in '{noise_dir}' "
            "after applying noise_types_excluded"
        )

    noisyspeech_dir = os.path.join(script_dir, 'NoisySpeech_training')
    clean_proc_dir = os.path.join(script_dir, 'CleanSpeech_training')
    noise_proc_dir = os.path.join(script_dir, 'Noise_training')
    for out_dir in (noisyspeech_dir, clean_proc_dir, noise_proc_dir):
        os.makedirs(out_dir, exist_ok=True)

    num_clean = len(cleanfilenames)
    num_noise = len(noisefilenames)

    filecounter = 0
    num_samples = 0

    # Modification
    # Original: while num_samples < total_samples:
    # Change: re-seed NumPy's RNG with an incrementing counter (ran_num) at
    # the start of every iteration.
    # Reason: This program randomly picks the files used to build the
    # database, so each run would otherwise create a new set of clean files.
    # The clean files are used to create a transcript for Word Error Rate
    # (WER) scoring; with a fixed seed sequence the program can be re-run
    # without changing the transcript file. The order of the random draws
    # below must therefore not be altered.
    ran_num = 0  # add
    while num_samples < total_samples:
        np.random.seed(ran_num)  # add
        ran_num = ran_num + 1  # add

        idx_s = np.random.randint(0, num_clean)
        clean, fs = audioread(cleanfilenames[idx_s])

        # Append further clean clips, separated by silence, until the
        # utterance is longer than the requested length.
        while len(clean) <= audio_length:
            idx_s = idx_s + 1
            if idx_s >= num_clean - 1:
                idx_s = np.random.randint(0, num_clean)
            newclean, fs = audioread(cleanfilenames[idx_s])
            cleanconcat = np.append(clean, np.zeros(int(fs * silence_length)))
            clean = np.append(cleanconcat, newclean)

        idx_n = np.random.randint(0, num_noise)
        noise, fs = audioread(noisefilenames[idx_n])

        # Extend the noise track only when it is shorter than the speech.
        if len(noise) < len(clean):
            while len(noise) <= len(clean):
                idx_n = idx_n + 1
                if idx_n >= num_noise - 1:
                    idx_n = np.random.randint(0, num_noise)
                newnoise, fs = audioread(noisefilenames[idx_n])
                noiseconcat = np.append(noise, np.zeros(int(fs * silence_length)))
                noise = np.append(noiseconcat, newnoise)
        noise = noise[0:len(clean)]
        filecounter = filecounter + 1

        for snr in SNR:
            clean_snr, noise_snr, noisy_snr = snr_mixer(clean=clean, noise=noise, snr=snr)
            snr_tag = str(snr)
            noisyfilename = f"noisy{filecounter}_SNRdb_{snr_tag}_clnsp{filecounter}.wav"
            cleanfilename = f"clnsp{filecounter}.wav"
            noisefilename = f"noisy{filecounter}_SNRdb_{snr_tag}.wav"
            noisypath = os.path.join(noisyspeech_dir, noisyfilename)
            cleanpath = os.path.join(clean_proc_dir, cleanfilename)
            noisepath = os.path.join(noise_proc_dir, noisefilename)
            audiowrite(noisy_snr, fs, noisypath, norm=False)
            # The clean file name does not depend on the SNR, so it is
            # rewritten on every pass of this loop.
            audiowrite(clean_snr, fs, cleanpath, norm=False)
            audiowrite(noise_snr, fs, noisepath, norm=False)
            num_samples = num_samples + len(noisy_snr)

if __name__ == "__main__":
    # Command-line entry point: locate the .cfg file next to this script,
    # load the requested section and hand it to main().
    parser = argparse.ArgumentParser()
    # Configurations: read noisyspeech_synthesizer.cfg
    parser.add_argument(
        "--cfg",
        default="noisyspeech_synthesizer.cfg",
        help="Read noisyspeech_synthesizer.cfg for all the details",
    )
    parser.add_argument("--cfg_str", type=str, default="noisy_speech")
    args = parser.parse_args()

    cfgpath = os.path.join(os.path.dirname(__file__), args.cfg)
    if not os.path.exists(cfgpath):
        # Explicit raise rather than `assert`, which is stripped under -O.
        raise FileNotFoundError(f"No configuration file as [{cfgpath}]")

    # Use the public constructor argument instead of assigning the private
    # `_interpolation` attribute.
    cfg = CP.ConfigParser(interpolation=CP.ExtendedInterpolation())
    cfg.read(cfgpath)

    # raw=True returns the stored strings without interpolation, matching
    # what reading the private `_sections` mapping used to provide.
    main(dict(cfg.items(args.cfg_str, raw=True)))

## Randomly sample files to make a database

In [None]:
# Lines added at line 85 of noisyspeech_synthesizer.py (marked "# add"):
ran_num = 0  # add
while num_samples < total_samples:
    np.random.seed(ran_num)  # add
    ran_num = ran_num + 1  # add

In [None]:
# Modified file noisyspeech_synthesizer.cfg
# Source: https://github.com/microsoft/MS-SNSD/blob/master/noisyspeech_synthesizer.cfg
# Configuration for generating Noisy Speech Dataset

# - sampling_rate: Specify the sampling rate. Default is 16 kHz
# - audioformat: default is .wav
# - audio_length: Minimum Length of each audio clip (noisy and clean speech) in seconds that will be generated by augmenting utterances. 
# - silence_length: Duration of silence introduced between clean speech utterances.
# - total_hours: Total number of hours of data required. Units are in hours. 
# - snr_lower: Lower bound for SNR required (default: 0 dB)
# - snr_upper: Upper bound for SNR required (default: 40 dB)
# - total_snrlevels: Number of SNR levels required (default: 5, which means there are 5 levels between snr_lower and snr_upper)
# - noise_dir: Default is None. But specify the noise directory path if noise files are not in the source directory
# - Speech_dir: Default is None. But specify the speech directory path if speech files are not in the source directory
# - noise_types_excluded: Noise files starting with the following tags to be excluded in the noise list. Example: noise_types_excluded: Babble, AirConditioner
# Specify 'None' if no noise files to be excluded.

[noisy_speech]

sampling_rate: 16000
audioformat: *.wav
audio_length: 10
silence_length: 0.2
total_hours: 3.0
snr_lower: 0.1
snr_upper: 10.1
total_snrlevels: 10
    
# Modification
# Original: 
# noise_dir: None
# Speech_dir: None
# Change:
# noise_dir: your directory for noise wav files (sample rate = 16000 samples/s)
# speech_dir: your directory for clean speech wav files (sample rate = 16000 samples/s)
# Replace both placeholder values below with real directory paths before running.
noise_dir: your directory for noise wav files (sample rate = 16000 samples/s)
speech_dir: your directory for clean speech wav files (sample rate = 16000 samples/s)
noise_types_excluded: None

In [None]:
import os

from pathlib import Path

from google.colab import drive

drive.mount('/content/drive/')

!python/content/drive/MyDrive/YourPath/noisyspeech_synthesizer.py

# Algorithm to find user's voice

In [None]:
# Define Lists

mlt = list of floats from 0.001 to 2.000 

# eg (0.001, 0.002,  …  2.000)

# storing  absolute value correlation values

corr_abs = empty list 

# storing m values, where m is a float

m_val = empty list

# note y1 = Device’s Voice

y1 = normalize(noisy28_SNRdb_0.1.wav)

# note y2 = Mixed Voices    

y2 = normalize(noisy28_SNRdb_0.1_clnsp28.wav)

# normalize() scales each file to the range -1 to 1, i.e. both signals share the same scale

for m in mlt do:

    X = normalize(y2 - m * y1)

    y = y1

    corr = correlation between X and y

    append absolute value of corr to corr_abs

ind = index of min value in corr_abs

m1 = value of mlt at index ind

user_voice = normalize(y2 - m1 * y1)

In [None]:
text = Speech Recognition API(S_new.wav)

print(text)

# References

http://festvox.org/cmu_arctic/

https://github.com/microsoft/MS-SNSD

http://festvox.org/cmu_arctic/cmu_arctic_report.pdf

https://omdena.com/blog/how-to-remove-a-devices-voice-from-a-mixed-audio-signal/