In [1]:
import malaya_speech
import numpy as np
from malaya_speech import Pipeline
import matplotlib.pyplot as plt
import IPython.display as ipd

### Creating dummy audio

In [2]:
import random
import malaya_speech.augmentation.waveform as augmentation

# Sampling rate handed to `malaya_speech.load` and `augmentation.random_sampling`
# by the helper functions defined in this cell.
sr = 8000
# Number of recordings to mix; passed as `n` to `combine_speakers`.
speakers_size = 4

def read_wav(f):
    """Load the audio file `f` through `malaya_speech.load` at the notebook-level rate `sr`.

    The result of `malaya_speech.load` is returned untouched; callers in this
    notebook take element 0 of it as the waveform.
    """
    loaded = malaya_speech.load(f, sr = sr)
    return loaded


def random_sampling(s, length):
    """Delegate to `augmentation.random_sampling` for signal `s`.

    The notebook-level rate `sr` is supplied automatically; `length` is
    forwarded unchanged (the caller in this notebook passes milliseconds).
    """
    sampled = augmentation.random_sampling(s, sr = sr, length = length)
    return sampled

def combine_speakers(files, n = 5, limit = 4):
    """Mix `n` randomly chosen recordings into one heavily overlapped signal.

    Parameters
    ----------
    files : sequence
        Candidate audio files; `n` of them are drawn with `random.sample`
        (which raises ValueError when `n > len(files)`).
    n : int
        Number of recordings to mix.
    limit : int
        Maximum number of separate source tracks returned. Speakers with
        index >= `limit - 1` are summed into one shared final track.

    Returns
    -------
    left : np.ndarray
        The mixture of all speakers.
    y : list of np.ndarray
        The source tracks, each zero-padded to `len(left)`.

    Both outputs are multiplied by the same factor so that the largest
    absolute sample among the mixture and all sources equals 0.9.
    """
    chosen = random.sample(files, n)
    waves = [read_wav(f)[0] for f in chosen]

    # Common clip length in milliseconds: the shortest recording, capped by a
    # random value between 3 and 7 seconds.
    shortest_ms = min(len(w) / sr for w in waves) * 1000
    clip_ms = int(min(shortest_ms, random.randint(3000, 7000)))
    waves = [random_sampling(w, length = clip_ms) for w in waves]

    y = [waves[0]]
    left = waves[0].copy()
    combined = None

    for i in range(1, n):
        right = waves[i].copy()
        # Each new speaker overlaps 98-100% of its own length with the mix so far.
        overlap = random.uniform(0.98, 1.0)
        len_overlap = int(overlap * len(right))
        minus = max(len(left) - len_overlap, 0)

        # Delay the new speaker by `minus` samples, then extend the running
        # mixture on the right so both arrays have equal length.
        padded_right = np.pad(right, (minus, 0))
        left = np.pad(left, (0, len(padded_right) - len(left)))
        left = left + padded_right

        if i >= (limit - 1):
            # Speakers beyond the track budget share one summed track.
            if combined is None:
                combined = padded_right
            else:
                # np.pad returns a new array, so the in-place add below never
                # touches an array that is also referenced elsewhere.
                combined = np.pad(
                    combined, (0, len(padded_right) - len(combined))
                )
                combined += padded_right
        else:
            y.append(padded_right)

    if combined is not None:
        y.append(combined)

    # Bring every source to the mixture's final length.
    y = [np.pad(track, (0, len(left) - len(track))) for track in y]

    # Peak over the mixture AND every source, measured on absolute values so
    # a dominant negative excursion is also kept within +/-0.9. (Previously
    # only sources that needed padding were inspected, and only their signed
    # maximum.)
    peak = max(np.max(np.abs(track)) for track in [left] + y)

    # An all-zero mixture has nothing to normalise; avoid dividing by zero.
    mix_scaling = 1 / peak * 0.9 if peak > 0 else 1.0

    left = left * mix_scaling
    y = [track * mix_scaling for track in y]

    return left, y

In [3]:
from glob import glob

# glob returns matches in arbitrary, filesystem-dependent order; sort so the
# candidate list is the same on every machine and every run.
wavs = sorted(glob('speech/example-speaker/*.wav'))
len(wavs)

8

#### Generate overlapping multi-speaker audio

In [4]:
# Mix `speakers_size` recordings; `left` is the mixture, `y` the per-source tracks.
left, y = combine_speakers(files = wavs, n = speakers_size)
# Show (mixture duration in seconds, number of source tracks).
(len(left) / sr, len(y))

(3.011125, 4)

In [5]:
# Inline player for the mixture at the notebook's sampling rate.
ipd.Audio(data = left, rate = sr)

#### Add background music

In [6]:
# Bind the music's sampling rate to its own name. Rebinding the global `sr`
# here would silently change the rate used by `read_wav`, `random_sampling`
# and `combine_speakers` whenever earlier cells are re-run.
background_music, music_sr = malaya_speech.load('speech/song/Lights-February-Air-sample.wav', sr = None)
music_sr

44100

In [7]:
# Bring the 8000 Hz speech mixture up to 44100 Hz so it can be summed with the music.
resampled_left = malaya_speech.resample(left, 8000, 44100)
# Add the music (truncated to the speech length, scaled by 0.4), then
# resample the result from 44100 Hz down to 16000 Hz.
resampled_left = malaya_speech.resample(
    resampled_left + background_music[:len(resampled_left)] * 0.4,
    44100,
    16000,
)

In [8]:
import soundfile as sf

# Save the speech + music mixture as a 16000 Hz WAV file.
sf.write(
    file = 'background-multispeaker.wav',
    data = resampled_left,
    samplerate = 16000,
)