In [1]:
import numpy as np
import librosa
from glob import glob
import soundfile as sf
import random

In [2]:
# Collect every .flac file sitting exactly two directories below LibriSpeech/test-clean.
# NOTE(review): the pattern is relative, so it resolves against the kernel's working directory.
files = glob('../speech-bahasa/LibriSpeech/test-clean/*/*/*.flac')
# Last expression of the cell: displays how many files matched.
len(files)

2620

In [3]:
def read_flac(file):
    """
    Load an audio file with soundfile and return it as a single channel.

    Parameters
    ----------
    file:
        forwarded unchanged to `sf.read`.

    Returns
    --------
    result: Tuple[np.array, int]
        (samples, samplerate) as given by `sf.read`. When the decoded array is 2d,
        only column 0 is kept; a 1d array is returned untouched. No resampling is done.
    """
    samples, samplerate = sf.read(file)
    is_multichannel = len(samples.shape) == 2
    mono = samples[:, 0] if is_multichannel else samples
    return mono, samplerate

In [4]:
def combine_speakers(files, n = 5):
    """
    Mix `n` randomly drawn recordings into one waveform, each starting at a random offset.

    Parameters
    ----------
    files: sequence
        pool to draw from; every drawn item is loaded with `read_flac`.
    n: int, optional (default=5)
        how many recordings to draw (without replacement, via `random.sample`).

    Returns
    --------
    result: Tuple[np.array, List[np.array]]
        (mixture, tracks). `tracks[0]` is the first recording exactly as loaded. Every
        later track is zero-padded in front by its start offset and, if needed, at the
        end up to the mixture length at the moment it was added, so tracks may differ
        in length.
    """
    chosen = random.sample(files, n)
    waves = [read_flac(path)[0] for path in chosen]

    mixture = waves[0]
    tracks = [mixture]
    for wave in waves[1:]:
        # The start offset is a random 1%-150% of the current mixture length, so the
        # new recording may overlap the mixture or begin after a stretch of zeros.
        start = int(random.uniform(0.01, 1.5) * len(mixture))
        track = np.pad(wave, (start, 0))

        # Zero-pad whichever of the two is shorter at the end so they can be summed.
        shortfall = len(mixture) - len(track)
        if shortfall > 0:
            track = np.pad(track, (0, shortfall))
        else:
            mixture = np.pad(mixture, (0, -shortfall))

        tracks.append(track)
        mixture = mixture + track
    return mixture, tracks

In [5]:
def padding_sequence_1d(seq, maxlen = None, padding: str = 'post', pad_int = 0):
    """
    Pad 1d sequences with `pad_int` so that they share a common length.

    Parameters
    ----------
    seq: List[List[int]]
        sequences to pad; plain lists / tuples and 1d np.array are all accepted.
    maxlen: int, optional (default=None)
        If None (or 0), will calculate max length in the function.
        Sequences longer than `maxlen` are NOT truncated, they are returned as is.
    padding: str, optional (default='post')
        If `pre`, will add `pad_int` on the starting side, else add it on the end side.
    pad_int: int, optional (default=0)
        padding value.

    Returns
    --------
    result: List[List[int]]
        one padded python list per input sequence; wrap it with `np.array` to get a
        2d array. An empty `seq` returns an empty list.

    Raises
    ------
    ValueError
        if `padding` is neither `post` nor `pre`.
    """
    if padding not in ['post', 'pre']:
        raise ValueError('padding only supported [`post`, `pre`]')

    # `.tolist()` only exists on numpy arrays; fall back to `list()` so the documented
    # List[List[int]] input works as well.
    rows = [s.tolist() if hasattr(s, 'tolist') else list(s) for s in seq]
    if not rows:
        # Nothing to pad, and `max()` of an empty list would raise.
        return []

    if not maxlen:
        maxlen = max(len(row) for row in rows)

    padded_seqs = []
    for row in rows:
        # A negative repeat count gives an empty filler, so rows longer than `maxlen`
        # pass through unchanged.
        filler = [pad_int] * (maxlen - len(row))
        padded_seqs.append(row + filler if padding == 'post' else filler + row)
    return padded_seqs

In [6]:
def sampling(combined, y, frame_duration_ms = 700, sample_rate = 16000):
    """
    Cut a mixture and its per-speaker tracks into consecutive, non-overlapping frames.

    Parameters
    ----------
    combined: np.array
        1d mixed waveform.
    y: iterable of np.array
        per-speaker tracks, indexed with the same sample offsets as `combined`.
    frame_duration_ms: int, optional (default=700)
        frame length in milliseconds.
    sample_rate: int, optional (default=16000)
        samples per second, used to turn `frame_duration_ms` into a sample count.

    Returns
    --------
    result: Tuple[List[np.array], List[List[np.array]]]
        (results, Y). `results[k]` is frame k of `combined`; `Y[k]` holds frame k of
        every track that has at least one non-zero sample in that frame (may be empty).
        A trailing partial frame is dropped.

    Raises
    ------
    ValueError
        if the frame length works out to less than one sample.
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0))
    if n <= 0:
        # With n == 0 the offset would never advance and the loop would never end.
        raise ValueError('frame_duration_ms and sample_rate must give at least 1 sample per frame')

    results, Y = [], []
    # `- n + 1` keeps a final frame that ends exactly on the last sample.
    for offset in range(0, len(combined) - n + 1, n):
        frame = slice(offset, offset + n)
        results.append(combined[frame])
        # A track counts as active in this frame unless all of its samples are exactly 0.
        Y.append([track[frame] for track in y if np.any(track[frame] != 0)])
    return results, Y

In [7]:
# Draw and mix the default 5 recordings. No random seed is set in the visible cells,
# so every run of this cell produces a different mixture.
combined, y = combine_speakers(files)
# The per-speaker tracks come back with unequal lengths: zero-pad them at the end to the
# longest one, then stack them into a single array.
padded_y = np.array(padding_sequence_1d(y))

In [14]:
# Cut the mixture and the padded tracks into 2000 ms frames (sample_rate keeps its 16000 default).
results, Y = sampling(combined, padded_y, frame_duration_ms = 2000)
# Per frame: (frame index, number of tracks with at least one non-zero sample in it).
[(no, len(i)) for no, i in enumerate(Y)]

[(0, 1),
 (1, 2),
 (2, 2),
 (3, 2),
 (4, 2),
 (5, 2),
 (6, 1),
 (7, 1),
 (8, 0),
 (9, 0),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 0),
 (16, 0),
 (17, 0),
 (18, 0),
 (19, 0),
 (20, 0),
 (21, 1),
 (22, 1),
 (23, 0),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1)]

In [9]:
import IPython.display as ipd

In [15]:
# Play the mixed audio of frame 1 at 16 kHz.
ipd.Audio(results[1], rate = 16000)

In [16]:
# Play the first track that `sampling` kept for frame 1 (same 16 kHz rate as the mix).
ipd.Audio(Y[1][0], rate = 16000)

In [17]:
# Play the second track kept for frame 1; this index only exists when that frame has
# at least two active tracks.
ipd.Audio(Y[1][1], rate = 16000)