In [68]:
from scipy.stats import norm
import scipy.signal as sig
from scipy.io import wavfile
import copy
import os
import random
from pydub import AudioSegment
from pydub.utils import make_chunks
from tqdm import tqdm
import numpy as np

In [2]:
# create a folder for the CH 10 sampled audio

In [3]:
# Root of the silence-removed batch-1 dataset.
ROOT_DIR = '/preproc/datasets_silence_removed/mms_batch_1'

# Names of the entries found directly under the root (listing order is
# whatever the filesystem returns).
subdir = os.listdir(path=ROOT_DIR)
subdir

['mms_20220417',
 'mms_20220610',
 'mms_20220520',
 'mms_20220529',
 'mms_20220430',
 'mms_20220501',
 'mms_20220404']

In [4]:
# Peek at the first five entries of the first folder's "CH 10" directory.
os.listdir(f'{ROOT_DIR}/{subdir[0]}/CH 10')[:5]

['0000000997_CHDIR_495_2022-04-17_19_16_56_0.wav',
 '0000001226_CHDIR_495_2022-04-17_23_32_39_17.wav',
 '0000000239_CHDIR_495_2022-04-17_04_42_58_6.wav',
 '0000000056_CHDIR_495_2022-04-17_01_00_29_0.wav',
 '0000001200_CHDIR_495_2022-04-17_22_59_16_4.wav']

In [5]:
# Build one flat list holding the full path of every .wav file found in the
# "CH 10" directory of each entry in `subdir` (all files are kept; no
# sub-sampling happens here).
wavfile_list = [
    f'{ROOT_DIR}/{sub}/CH 10/' + fname
    for sub in subdir
    for fname in os.listdir(f'{ROOT_DIR}/{sub}/CH 10/')
    if fname.endswith('.wav')
]

len(wavfile_list)

26492

In [6]:
# Shuffle the list reproducibly.
# os.listdir() gives no ordering guarantee, so sort first to get a stable
# starting order, then shuffle with a dedicated, fixed-seed generator. That way
# Restart & Run All (and re-running this cell) always produces the same order,
# and therefore the same batches further down.
SHUFFLE_SEED = 42
wavfile_list.sort()
random.Random(SHUFFLE_SEED).shuffle(wavfile_list)
wavfile_list[:10]

['/preproc/datasets_silence_removed/mms_batch_1/mms_20220520/CH 10/0000001264_CHDIR_495_2022-05-20_16_18_50_4.wav',
 '/preproc/datasets_silence_removed/mms_batch_1/mms_20220610/CH 10/0000001348_CHDIR_495_2022-06-09_16_31_38_0.wav',
 '/preproc/datasets_silence_removed/mms_batch_1/mms_20220529/CH 10/0000000078_CHDIR_495_2022-05-29_02_29_18_36.wav',
 '/preproc/datasets_silence_removed/mms_batch_1/mms_20220404/CH 10/0000000135_CHDIR_495_2022-04-04_02_38_39_9.wav',
 '/preproc/datasets_silence_removed/mms_batch_1/mms_20220520/CH 10/0000001026_CHDIR_495_2022-05-20_12_57_44_0.wav',
 '/preproc/datasets_silence_removed/mms_batch_1/mms_20220430/CH 10/0000000642_CHDIR_495_2022-04-30_17_37_15_9.wav',
 '/preproc/datasets_silence_removed/mms_batch_1/mms_20220404/CH 10/0000000806_CHDIR_495_2022-04-04_17_32_47_23.wav',
 '/preproc/datasets_silence_removed/mms_batch_1/mms_20220417/CH 10/0000000999_CHDIR_495_2022-04-17_19_20_51_5.wav',
 '/preproc/datasets_silence_removed/mms_batch_1/mms_20220404/CH 10/000

In [7]:
## Cut the shuffled list into BATCH consecutive slices of SIZE paths each.
## Paths beyond the first BATCH*SIZE entries are not assigned to any batch.
SIZE = 2500
BATCH = 10
wavfile_batch_list = [
    wavfile_list[start:start + SIZE]
    for start in range(0, BATCH * SIZE, SIZE)
]

len(wavfile_batch_list)

10

In [None]:
# Sanity check: show the first file path stored in the first batch.
wavfile_batch_list[0][0]

'/preproc/datasets_silence_removed/mms_batch_1/mms_20220520/CH 10/0000001264_CHDIR_495_2022-05-20_16_18_50_4.wav'

In [153]:
# Concatenate the files of each batch into one long recording and keep the
# first hour of samples from it.
SAMPLING_RATE = 16000        # samples per second used for slicing
                             # NOTE(review): assumes 16 kHz mono input - confirm
ONE_HOUR_SECONDS = 1*60*60
N_BATCHES_TO_CONCAT = 2      # only the first two batches are processed here

sample_sliced_list = []
for i in range(N_BATCHES_TO_CONCAT):
    # Start every batch from an empty segment so that an empty batch can never
    # silently reuse the audio accumulated for the previous batch.
    sound = AudioSegment.empty()

    # Wrapping the list itself (not enumerate(...)) lets tqdm know the total
    # and draw a real progress bar.
    for audio in tqdm(wavfile_batch_list[i], desc=f'batch {i}'):
        sound += AudioSegment.from_wav(audio)

        # stop reading files as soon as more than one hour has been collected
        if sound.duration_seconds > ONE_HOUR_SECONDS:
            break

    # convert the AudioSegment object into the array of raw samples
    sample = sound.get_array_of_samples()

    # keep the first hour of audio by slicing according to the sampling rate;
    # if the batch held less than one hour the slice is simply shorter
    sample_sliced = sample[:SAMPLING_RATE*ONE_HOUR_SECONDS]
    sample_sliced_list.append(sample_sliced)

933it [00:32, 29.03it/s] 
934it [00:29, 31.40it/s] 


In [154]:
# Number of samples kept for each of the first two batches.
tuple(len(sample_sliced_list[k]) for k in (0, 1))

(57600000, 57600000)

In [147]:
# Write the concatenated audio to disk. export() hands back the output file
# object still open, so close it explicitly instead of leaving the handle
# alive in the notebook's output history.
sound.export('audio_chunk.wav', format='wav').close()

<_io.BufferedRandom name='audio_chunk.wav'>

In [163]:
# Specially for CH 14: the batch is shorter than one hour, so repeat the
# recording (doubling it each pass) until it lasts longer than one hour.
sound = AudioSegment.from_wav('/preproc/batched_1_hr_audio/CH14_batch_0.wav')

# Doubling a zero-length segment never makes it longer, which would loop
# forever - fail loudly instead.
if sound.duration_seconds <= 0:
    raise ValueError('CH14_batch_0.wav contains no audio; cannot extend it to one hour')

while sound.duration_seconds <= 1*60*60:
    sound += sound

# export() returns the still-open output file; close it once the data is written
sound.export('/preproc/batched_1_hr_audio/CH14_batch_0_upsampled.wav', format='wav').close()

<_io.BufferedRandom name='/preproc/batched_1_hr_audio/CH14_batch_0_upsampled.wav'>

In [141]:
# helper function to get the SNR from the 'cleaned' audio --> taking CH 10 as the baseline
def signalPower(x):
    """Return the mean power (average of the squared samples) of ``x``.

    Parameters
    ----------
    x : array_like
        Signal samples. Any numeric dtype or plain Python sequence is accepted.

    Returns
    -------
    numpy.float64
        ``mean(x**2)`` computed in double precision.
    """
    # Square in float64: squaring integer PCM samples in their own dtype
    # (e.g. int16) silently wraps around and yields a meaningless power.
    samples = np.asarray(x, dtype=np.float64)
    return np.average(samples**2)

def SNRsystem(inputSig, outputSig, verbose=True):
    """Compute an SNR figure in dB for ``outputSig`` measured against ``inputSig``.

    The difference ``outputSig - inputSig`` is treated as the noise. With
    ``powS`` the mean power of ``outputSig`` and ``powN`` the mean power of
    that difference, the returned value is
    ``10 * log10(abs((powS - powN) / powN))``.

    Parameters
    ----------
    inputSig : array_like
        Baseline signal samples.
    outputSig : array_like
        Signal samples to evaluate; must be broadcast-compatible with
        ``inputSig``.
    verbose : bool, default True
        When true, print ``powS`` and ``powN``, then the ratio before and
        after taking its absolute value (the original behaviour).

    Returns
    -------
    numpy.float64
        The value in dB. If both signals are identical the noise power is
        zero and the result is not finite.
    """
    # Work in float64 so that subtracting and squaring integer PCM samples
    # (e.g. int16) cannot wrap around.
    inputSig = np.asarray(inputSig, dtype=np.float64)
    outputSig = np.asarray(outputSig, dtype=np.float64)

    noise = outputSig-inputSig
    powS = signalPower(outputSig)
    powN = signalPower(noise)

    ratio = (powS-powN)/powN
    if verbose:
        print(powS, powN)
        print(ratio)
        print(np.abs(ratio))

    # abs() keeps log10 defined when the noise power exceeds the output power;
    # the sign of the ratio is therefore not reflected in the result.
    return 10*np.log10(np.abs(ratio))

In [144]:
# Try the SNR helper on the first two one-hour sample arrays:
# batch 0 acts as the input, batch 1 as the output.
snr = SNRsystem(
    inputSig=np.array(sample_sliced_list[0]),
    outputSig=np.array(sample_sliced_list[1]),
)
snr

3223664.6499034897 237977.0019732118
12.546118419738585
12.546118419738585


10.985093824198952