In [1]:
import os
import time
import argparse
from random import choice, randint
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from sphfile import SPHFile
import librosa

import torch
from torch.utils.data import Dataset, DataLoader
import pickle

In [9]:
def compute_stft(audio_data, win_length=2048, hop_length=512, n_fft=2048):
    '''
    Return the Short Time Fourier Transform of `audio_data`.

    Thin wrapper around `librosa.stft`: the window length, hop length and FFT
    size are forwarded unchanged as keyword arguments, and whatever
    `librosa.stft` returns is handed back to the caller.
    '''
    stft_options = {
        'win_length': win_length,
        'hop_length': hop_length,
        'n_fft': n_fft,
    }
    spectrum = librosa.stft(audio_data, **stft_options)
    return spectrum

In [10]:
# Both locations are resolved relative to the kernel's current working
# directory, so they change if the notebook is launched from somewhere else.

# Parent of the working directory; the relative paths read from the list files
# are joined onto this.
BASE_DIR = os.path.join(os.getcwd(), '../')
# Directory holding the file lists (noise_list.txt, <split>_set.txt) and the
# pkl_files/ output tree.
DATA_DIR = os.path.join(os.getcwd(), '../data')

In [13]:
# Settings for a small trial run.
# NOTE(review): presumably these are passed to pickle_stft_data_snr, but the
# call is not visible in this part of the notebook -- confirm.
# The module-level `alpha` and `snr` do not influence pickle_stft_data_snr:
# it assigns its own `snr` and overwrites `alpha` inside its loop.
num_samples = 1
split = 'train'
alpha = 0.8
snr = 15
k = 512

In [15]:
def pickle_stft_data_snr(split, num_samples=1000, alpha=0.8, k=512):
    '''
    Build and pickle clean / noisy STFT pairs for one dataset split.

    For each of the first `num_samples` entries of `<DATA_DIR>/<split>_set.txt`
    a noise recording is picked at random from `<DATA_DIR>/noise_list.txt`,
    a random window of it is scaled so that the mixture has an integer SNR
    drawn uniformly from 15..20 dB (inclusive), and the scaled noise is added
    to the clean speech. The STFTs of the clean and of the mixed signal are
    collected and written to
    `<DATA_DIR>/pkl_files/<split>_data/clean_mixed_data_snr_<num_samples>.pickle`
    as a dict with the keys 'clean' and 'mixed' (two parallel lists).

    Parameters
    ----------
    split : str
        Name of the split; selects the `<split>_set.txt` list and the output
        sub-directory.
    num_samples : int
        Number of utterances to process, taken in order from the start of the
        split list. Must not exceed the number of listed files.
    alpha : float
        Unused. Kept only so existing callers keep working; the noise gain is
        always derived from the sampled SNR.
    k : int
        Passed to `compute_stft` as both `win_length` and `n_fft`. The hop
        length is left at `compute_stft`'s default.

    Raises
    ------
    ValueError
        If the selected noise window has no measurable power, so it cannot be
        scaled to the requested SNR.
    '''
    # Read both path lists up front: the files are closed right away and blank
    # lines are skipped so they can never be picked as a path.
    with open(os.path.join(DATA_DIR, 'noise_list.txt')) as noise_file:
        noise_list = [line.rstrip() for line in noise_file if line.strip()]
    with open(os.path.join(DATA_DIR, '%s_set.txt'%(split))) as split_file:
        file_list = [line.rstrip() for line in split_file if line.strip()]

    clean_mixed_data_dict = {'clean': [], 'mixed': []}

    for itr in tqdm(range(num_samples)):
        # Integer SNR in dB, drawn from the whole 15..20 range (both ends included).
        snr = randint(15, 20)

        sph_data = SPHFile(os.path.join(BASE_DIR, file_list[itr]))
        samp_rate = sph_data.format['sample_rate']

        # Randomly pick a noise recording and load it at the speech sample rate.
        noise_path = os.path.join(BASE_DIR, choice(noise_list))
        noise_signal, noise_rate = librosa.load(noise_path, sr=samp_rate)
        assert(noise_rate == samp_rate == 16000)

        # Scale the raw samples by the sample bit depth.
        # NOTE(review): assumes signed integer PCM, giving roughly [-1, 1) -- confirm.
        clean_signal = sph_data.content / (2**(sph_data.format['sample_sig_bits'] - 1))

        len_signal = min(clean_signal.shape[0], noise_signal.shape[0])

        # Randomly place a window inside the noise recording; when the noise is
        # not longer than the speech the window starts at 0.
        start_n = randint(0, max(0, noise_signal.shape[0] - clean_signal.shape[0]))
        noise_signal = noise_signal[start_n:start_n+len_signal]
        clean_signal = clean_signal[0:len_signal]

        p_noise = np.average(noise_signal**2)
        p_signal = np.average(clean_signal**2)
        # Written as "not > 0" so that NaN (empty window) is rejected as well
        # as an all-zero window; either would give an inf/NaN gain below.
        if not p_noise > 0:
            raise ValueError('Noise window from %s has no power; cannot mix at %d dB SNR'
                             % (noise_path, snr))

        # Gain that makes p_signal / (gain**2 * p_noise) equal 10**(snr/10).
        noise_gain = np.sqrt(p_signal/p_noise * 10**-(snr/10))

        mixed_signal = clean_signal + noise_gain * noise_signal
        # Sanity check: the achieved SNR should match the sampled one.
        print('SNR -- %f' %(10*np.log10(p_signal/(noise_gain**2 * p_noise))))

        clean_mixed_data_dict['clean'].append(compute_stft(clean_signal, win_length=k, n_fft=k))
        clean_mixed_data_dict['mixed'].append(compute_stft(mixed_signal, win_length=k, n_fft=k))

    out_path = os.path.join(DATA_DIR, 'pkl_files/%s_data/clean_mixed_data_snr_%d.pickle'%(split, num_samples))
    # Make sure the output directory exists so the write cannot fail after all
    # the STFTs have already been computed.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'wb') as handle:
        pickle.dump(clean_mixed_data_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

  0%|          | 0/1 [00:00<?, ?it/s]

SNR -- 15.000000
