In [1]:
import os

# Expose only CUDA device index 1 to this process. This is done before
# TensorFlow is imported in the next cell.
os.environ.update({'CUDA_VISIBLE_DEVICES': '1'})

In [2]:
import tensorflow as tf
import malaya_speech
import malaya_speech.train
from malaya_speech.train.model import unet
from malaya_speech.utils import tf_featurization
import malaya_speech.augmentation.waveform as augmentation
import IPython.display as ipd
import numpy as np
from malaya_speech.utils.tf_featurization import separation_exponent, EPSILON


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [3]:
# !pip3 install museval
import museval

In [4]:
class Model:
    """TF1 graph that separates a mixture into `size` sources with one U-Net each.

    Attributes created by ``__init__``:
        X:       float32 placeholder for the mixture signal.
        Y:       float32 placeholder holding one reference signal per row.
        stft:    list with the return value of ``tf_featurization.get_stft(Y[i])``
                 for every source (a 2-tuple; only element ``[1]`` is used here,
                 presumably the magnitude features -- confirm in get_stft).
        outputs: ``logits`` of one ``unet.Model`` per source, each built inside
                 its own ``model_{i}`` variable scope.
        loss:    per-source mean absolute difference between ``outputs[i]`` and
                 ``stft[i][1]``.
        cost:    sum of the per-source losses.
        istft:   per-source signals rebuilt from the masked mixture STFT; each
                 is also exported under the op name ``logits_{i}``.
    """

    def __init__(self, size = 2):
        """Build placeholders, per-source U-Nets, losses and reconstructions.

        Args:
            size: number of sources to separate (rows of ``Y`` / U-Nets built).
        """
        # NOTE(review): `(None)` is plain `None` (fully unknown shape), not the
        # 1-tuple `(None,)` -- confirm which one was intended.
        self.X = tf.placeholder(tf.float32, (None))
        self.Y = tf.placeholder(tf.float32, (size, None))

        stft_X, D_X = tf_featurization.get_stft(self.X)

        # Features of every reference source, in source order.
        self.stft = [tf_featurization.get_stft(self.Y[k]) for k in range(size)]

        # One independent U-Net per source; all of them read the mixture features.
        self.outputs = []
        for k in range(size):
            with tf.variable_scope(f'model_{k}'):
                self.outputs.append(unet.Model(D_X).logits)

        self.loss = [
            tf.reduce_mean(tf.abs(predicted - reference[1]))
            for predicted, reference in zip(self.outputs, self.stft)
        ]
        self.cost = tf.reduce_sum(self.loss)

        # Shared denominator of the ratio masks; EPSILON keeps it non-zero.
        powered = [o ** separation_exponent for o in self.outputs]
        output_sum = tf.reduce_sum(powered, axis=0) + EPSILON

        self.istft = [
            self._masked_istft(D, output_sum, stft_X, size) for D in self.outputs
        ]

        # Give every reconstructed signal a stable, predictable op name.
        for k, signal in enumerate(self.istft):
            tf.identity(signal, name = f'logits_{k}')

    def _masked_istft(self, D, output_sum, stft_X, size):
        """Turn one U-Net output into a signal by ratio-masking the mixture STFT."""
        mask = (D ** separation_exponent + (EPSILON / size)) / output_sum
        mask = tf_featurization.extend_mask(mask)

        # Merge the two leading axes into one so the mask can be cut to the
        # number of STFT frames of the mixture.
        dims = tf.shape(mask)
        merged = tf.concat([[dims[0] * dims[1]], dims[2:]], axis=0)
        mask = tf.reshape(mask, merged)
        mask = mask[:tf.shape(stft_X)[0]]

        masked_stft = tf.cast(mask, dtype=tf.complex64) * stft_X
        # Keep only column 0 of the inverse STFT result.
        return tf_featurization.istft(masked_stft, self.X)[:,0]

In [5]:
# Clear the default graph first so that re-running this cell builds the model
# in a fresh graph instead of adding to the existing one.
tf.compat.v1.reset_default_graph()
model = Model()
sess = tf.InteractiveSession()
# Initialise every global variable; the trained weights are loaded from the
# checkpoint in the following cell.
sess.run(tf.global_variables_initializer())

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [6]:
# Load the trained weights for every global variable of the graph built above.
CHECKPOINT_PATH = 'speech-enhancement-unet/model.ckpt-500000'

var_lists = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, CHECKPOINT_PATH)

INFO:tensorflow:Restoring parameters from speech-enhancement-unet/model.ckpt-500000


In [7]:
import pickle

# Held-out evaluation examples. pickle executes arbitrary code on load, so this
# file must come from a trusted source.
TEST_SET_PATH = 'test-set-speech-enhancement.pkl'

with open(TEST_SET_PATH, 'rb') as fopen:
    results = pickle.load(fopen)

In [8]:
# Inspect one example: a tuple of three arrays. Element 0 is fed to `model.X`
# and elements 1: are fed to `model.Y` in the cells below.
results[0]

(array([ 0.01514318,  0.01278879,  0.00750327, ..., -0.21006574,
        -0.21571529, -0.21963991]),
 array([0.01038297, 0.00744865, 0.00195621, ..., 0.00248288, 0.00225717,
        0.00489053]),
 array([ 0.00476021,  0.00534014,  0.00554706, ..., -0.21254863,
        -0.21797246, -0.22453044]))

In [9]:
# Sanity check on the first example: total cost plus the per-source losses.
sess.run(
    [model.cost, model.loss],
    feed_dict = {
        model.X: results[0][0],
        model.Y: results[0][1:],
    },
)

[0.9264984, [0.50170124, 0.42479718]]

In [10]:
from tqdm import tqdm

# Per-example losses: total cost, loss of source 0 and loss of source 1.
total, voice, noise = [], [], []

# Per-example separation metrics for source 0 (estimate `model.istft[0]`
# against reference `example[1]`).
SDR, ISR, SAR = [], [], []

for example in tqdm(results):
    # Fetch the losses and the reconstructed signals in ONE session call so the
    # network runs once per example instead of twice, and so the losses and the
    # evaluated signal come from the same forward pass.
    c, l, separated = sess.run(
        [model.cost, model.loss, model.istft],
        feed_dict = {model.X: example[0], model.Y: example[1:]},
    )
    total.append(c)
    voice.append(l[0])
    noise.append(l[1])

    y_ = separated[0]
    # museval.evaluate returns four metric arrays; the third one is not used here.
    sdr, isr, _, sar = museval.evaluate(np.reshape(example[1], (1, -1)),
                                        np.reshape(y_, (1, -1)))
    # nanmean: ignore entries that museval reports as NaN.
    SDR.append(np.nanmean(sdr))
    ISR.append(np.nanmean(isr))
    SAR.append(np.nanmean(sar))

100%|██████████| 100/100 [03:49<00:00,  2.30s/it]


In [11]:
# Average total cost (`model.cost`) over all evaluated examples.
np.mean(total)

0.85896146

In [12]:
# Average loss of the first source (`model.loss[0]`) over all evaluated examples.
np.mean(voice)

0.46849293

In [13]:
# Average loss of the second source (`model.loss[1]`) over all evaluated examples.
np.mean(noise)

0.39046836

In [14]:
# Average of the per-example SDR values (each already reduced with np.nanmean).
np.mean(SDR)

12.128058507584178

In [15]:
# Average of the per-example ISR values (each already reduced with np.nanmean).
np.mean(ISR)

14.670673275971492

In [16]:
# Average of the per-example SAR values (each already reduced with np.nanmean).
np.mean(SAR)

15.019682991750814