In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import sys

SOURCE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))
sys.path.insert(0, SOURCE_DIR)

In [3]:
import tensorflow as tf

tf.compat.v1.enable_eager_execution()

In [4]:
from glob import glob

wavs = glob('../speech/example-speaker/*.wav')
len(wavs)

8

In [5]:
import random
import malaya_speech.augmentation.waveform as augmentation
import malaya_speech
import numpy as np

sr = 8000
speakers_size = 4

# noise = malaya_speech.load('noise.wav', sr = sr)[0]

def read_wav(f):
    return malaya_speech.load(f, sr = sr)


def random_sampling(s, length):
    return augmentation.random_sampling(s, sr = sr, length = length)

def to_mel(y):
    mel = malaya_speech.featurization.universal_mel(y)
    mel[mel <= np.log(1e-2)] = np.log(1e-2)
    return mel

def combine_speakers(files, n = 5, limit = 4):
    w_samples = random.sample(files, n)
    w_samples = [read_wav(f)[0] for f in w_samples]
    w_lens = [len(w) / sr for w in w_samples]
    w_lens = int(min(min(w_lens) * 1000, random.randint(2000, 10000)))
    w_samples = [random_sampling(w, length = w_lens) for w in w_samples]
    y = [w_samples[0]]
    left = w_samples[0].copy()

    combined = None

    for i in range(1, n):
        right = w_samples[i].copy()
        overlap = random.uniform(0.98, 1.0)
        print(i, overlap)
        len_overlap = int(overlap * len(right))
        minus = len(left) - len_overlap
        if minus < 0:
            minus = 0
        padded_right = np.pad(right, (minus, 0))
        left = np.pad(left, (0, len(padded_right) - len(left)))

        left = left + padded_right

        if i >= (limit - 1):
            if combined is None:
                combined = padded_right
            else:
                combined = np.pad(
                    combined, (0, len(padded_right) - len(combined))
                )
                combined += padded_right

        else:
            y.append(padded_right)

    if combined is not None:
        y.append(combined)
        
    maxs = [max(left)]
    for i in range(len(y)):
        if len(y[i]) != len(left):
            y[i] = np.pad(y[i], (0, len(left) - len(y[i])))
            maxs.append(max(y[i]))
            
    max_amp = max(maxs)
    mix_scaling = 1 / max_amp * 0.95
    left = left * mix_scaling
    
    for i in range(len(y)):
        y[i] = y[i] * mix_scaling

#     for i in range(len(y)):
#         if len(y[i]) != len(left):
#             y[i] = np.pad(y[i], (0, len(left) - len(y[i])))
#             y[i] = y[i] / np.max(np.abs(y[i]))

#     left = left / np.max(np.abs(left))
        
    return left, y

# y, _ = malaya_speech.load('../speech/example-speaker/husein-zolkepli.wav')
# y = np.expand_dims(y, 0).astype(np.float32)
# y.shape

In [26]:
left, y = combine_speakers(wavs, 4)
y = np.array([y]).astype(np.float32)
left = np.array([left]).astype(np.float32)
x = np.random.normal(size = y.shape).astype(np.float32)
len(left) / sr, len(y)

1 0.991702905607049
2 0.9913397320058862
3 0.993411257433154


(0.000125, 1)

In [92]:
import torch

x_pt = torch.from_numpy(x)
x_tf = tf.constant(x)
y_pt = torch.from_numpy(y)
y_tf = tf.constant(y)

In [93]:
x_tf

<tf.Tensor: id=245, shape=(1, 4, 21889), dtype=float32, numpy=
array([[[-0.01159313,  2.4352384 , -0.49477547, ...,  0.5427677 ,
         -0.50842077,  1.3719425 ],
        [-0.62613714,  0.40092194,  0.98418045, ..., -0.33143365,
          0.03194261,  0.6151522 ],
        [ 1.2165703 ,  0.6483659 ,  0.9452555 , ..., -0.12862363,
         -0.5408644 , -0.96836466],
        [ 1.8071035 ,  0.52183706, -1.4022133 , ..., -0.6891508 ,
         -0.8197269 , -0.27434656]]], dtype=float32)>

In [94]:
# T, B, C
source = y_pt.permute((2,0,1))
estimate_source = x_pt.permute((2,0,1))

In [95]:
source_lengths = torch.tensor(
    [estimate_source.shape[0]] * estimate_source.shape[1]
)

In [96]:
num_samples = (
    source_lengths.contiguous().reshape(1, -1, 1).float()
)
num_samples.shape

torch.Size([1, 1, 1])

In [97]:
def get_mask(source, source_lengths):
    T, B, _ = source.size()
    mask = source.new_ones((T, B, 1))
    for i in range(B):
        mask[source_lengths[i] :, i, :] = 0
    return mask

mask = get_mask(source, source_lengths)

In [98]:
estimate_source *= mask

In [100]:
EPS = 1e-8

In [101]:
mean_target = torch.sum(source, dim=0, keepdim=True) / num_samples
mean_estimate = (
    torch.sum(estimate_source, dim=0, keepdim=True) / num_samples
)
zero_mean_target = source - mean_target
zero_mean_estimate = estimate_source - mean_estimate
# mask padding position along T
zero_mean_target *= mask
zero_mean_estimate *= mask
s_target = zero_mean_target  # [T, B, C]
s_estimate = zero_mean_estimate  # [T, B, C]
# s_target = <s', s>s / ||s||^2
dot = torch.sum(s_estimate * s_target, dim=0, keepdim=True)  # [1, B, C]
s_target_energy = (
    torch.sum(s_target ** 2, dim=0, keepdim=True) + EPS
)  # [1, B, C]
proj = dot * s_target / s_target_energy  # [T, B, C]
# e_noise = s' - s_target
e_noise = s_estimate - proj  # [T, B, C]
# SI-SNR = 10 * log_10(||s_target||^2 / ||e_noise||^2)
si_snr_beforelog = torch.sum(proj ** 2, dim=0) / (
    torch.sum(e_noise ** 2, dim=0) + EPS
)
si_snr = 10 * torch.log10(si_snr_beforelog + EPS)  # [B, C]

In [102]:
si_snr

tensor([[-46.3737, -58.6511, -40.0519, -53.9084]])

In [77]:
# T, B, C
source_tf = tf.transpose(y_tf, [2,0,1])
estimate_source_tf = tf.transpose(x_tf, [2,0,1])

In [78]:
source_lengths_tf = tf.tile([tf.shape(source_tf)[0]], [tf.shape(source_tf)[1]])

In [79]:
source_lengths_tf.shape

TensorShape([Dimension(1)])

In [80]:
mask = tf.cast(
        tf.sequence_mask(source_lengths_tf, tf.reduce_max(source_lengths_tf)),
        source_tf.dtype,
    )
mask = tf.transpose(mask)
mask.shape

TensorShape([Dimension(21889), Dimension(1)])

In [81]:
mask = tf.expand_dims(mask, 2)
estimate_source_tf *= mask

In [82]:
estimate_source.shape

torch.Size([21889, 1, 4])

In [83]:
source_lengths_tf

<tf.Tensor: id=207, shape=(1,), dtype=int32, numpy=array([21889], dtype=int32)>

In [84]:
num_samples = tf.cast(tf.reshape(source_lengths_tf, (1, -1, 1)), tf.float32)
num_samples

<tf.Tensor: id=225, shape=(1, 1, 1), dtype=float32, numpy=array([[[21889.]]], dtype=float32)>

In [87]:
mean_target = tf.reduce_sum(source_tf, axis = 0, keepdims = True) / num_samples
mean_estimate = (
    tf.reduce_sum(estimate_source_tf, axis = 0, keepdims = True) / num_samples
)
zero_mean_target_tf = source_tf - mean_target
zero_mean_estimate_tf = estimate_source_tf - mean_estimate

zero_mean_target_tf *= mask
zero_mean_estimate_tf *= mask

In [90]:
zero_mean_estimate

tensor([[[-0.0130, -0.6242,  1.2104,  1.8028]],

        [[ 2.4338,  0.4028,  0.6422,  0.5176]],

        [[-0.4962,  0.9861,  0.9391, -1.4065]],

        ...,

        [[ 0.5414, -0.3295, -0.1348, -0.6934]],

        [[-0.5098,  0.0339, -0.5471, -0.8240]],

        [[ 1.3705,  0.6171, -0.9746, -0.2786]]])

In [91]:
zero_mean_estimate_tf

<tf.Tensor: id=244, shape=(21889, 1, 4), dtype=float32, numpy=
array([[[-0.01298668, -0.6242194 ,  1.2103798 ,  1.8028239 ]],

       [[ 2.4338448 ,  0.40283966,  0.6421755 ,  0.5175575 ]],

       [[-0.49616903,  0.9860982 ,  0.9390651 , -1.406493  ]],

       ...,

       [[ 0.54137415, -0.32951593, -0.13481408, -0.69343036]],

       [[-0.5098143 ,  0.03386034, -0.5470548 , -0.82400644]],

       [[ 1.370549  ,  0.6170699 , -0.9745551 , -0.2786261 ]]],
      dtype=float32)>