In [1]:
import os

# Expose only the GPU with index 1 to CUDA; set here, before TensorFlow is
# imported in a later cell.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import sys

# A notebook has no __file__, so anchor on the working directory: the parent of
# the directory this notebook runs from is put first on sys.path so the local
# checkout is imported in preference to any installed copy.
# (The previous os.path.abspath(__name__) treated the module name as a file
# path and only resolved to the same directory by accident.)
SOURCE_DIR = os.path.dirname(os.getcwd())
# Guarded so that re-running the cell does not stack duplicate entries.
if sys.path[:1] != [SOURCE_DIR]:
    sys.path.insert(0, SOURCE_DIR)

In [3]:
import tensorflow as tf
import malaya_speech
import malaya_speech.train
from malaya_speech.train.model import unet
from malaya_speech.utils import tf_featurization
from tensorflow.keras.layers import Multiply
import IPython.display as ipd
import numpy as np






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [4]:
def get_stft(X):
    """
    Compute STFT features for every row of a batch.

    ``tf_featurization.get_stft`` is applied to ``X[i]`` for each ``i`` inside a
    ``tf.while_loop``; the two results of every call are collected in
    TensorArrays and stacked along a new leading (batch) axis.

    Parameters
    ----------
    X : tf.Tensor
        Batched input whose first axis is the batch axis. In this notebook it
        is a float32 tensor fed as (batch, samples).

    Returns
    -------
    stft_X : tf.Tensor
        complex64 tensor; static shape is set to (None, None, 2049, 1).
    D_X : tf.Tensor
        float32 tensor; static shape is set to (None, None, 512, 1024, 1).
    """
    batch_size = tf.shape(X)[0]
    stft_X = tf.TensorArray(dtype = tf.complex64, size = batch_size, dynamic_size = False, infer_shape = False)
    D_X = tf.TensorArray(dtype = tf.float32, size = batch_size, dynamic_size = False, infer_shape = False)

    def condition(i, stft_acc, D_acc):
        return i < batch_size

    def body(i, stft_acc, D_acc):
        stft_x, D_x = tf_featurization.get_stft(X[i])
        # Write through the loop-carried TensorArrays, not the outer ones
        # captured by closure, so each iteration's write is chained onto the
        # previous one via the loop state.
        return i + 1, stft_acc.write(i, stft_x), D_acc.write(i, D_x)

    _, stft_X, D_X = tf.while_loop(condition, body, (0, stft_X, D_X))
    stft_X = stft_X.stack()
    # infer_shape = False leaves the stacked shapes unknown; pin the known
    # trailing dimensions so downstream layers see static sizes.
    stft_X.set_shape((None, None, 2049, 1))
    D_X = D_X.stack()
    D_X.set_shape((None, None, 512, 1024, 1))
    return stft_X, D_X

class Model:
    """
    Graph for a 3D U-Net that predicts ``size`` feature maps from one input.

    Attributes
    ----------
    X : float32 placeholder, shape (None, None); fed as (batch, samples) in this notebook.
    Y : float32 placeholder, shape (None, size, None); ``Y[:, i]`` is the i-th target.
    stft : list of ``get_stft(Y[:, i])`` result pairs, one per target.
    outputs : ``logits`` of ``unet.Model3D`` built on the ``D_X`` features of ``X``.
    loss : list with one mean-absolute-error tensor per target.
    cost : sum of the entries of ``loss``.
    """

    def __init__(self, size = 4):
        self.X = tf.placeholder(tf.float32, (None, None))
        self.Y = tf.placeholder(tf.float32, (None, size, None))

        # stft_X is not used by the active code; only the commented-out
        # reconstruction further down refers to it.
        stft_X, D_X = get_stft(self.X)

        # One (stft, D) pair per target slice of Y.
        self.stft = [get_stft(self.Y[:, source]) for source in range(size)]

        network = unet.Model3D(
            D_X,
            dropout = 0.0,
            training = True,
            cout = size,
            kernel_size = 5,
        )
        self.outputs = network.logits

        # Channel `source` of the network output is compared with the D
        # features of target `source`.
        self.loss = [
            tf.reduce_mean(tf.abs(self.outputs[:, :, :, :, source: source + 1] - target_D))
            for source, (_, target_D) in enumerate(self.stft)
        ]

        self.cost = tf.reduce_sum(self.loss)
        
#         separation_exponent = 2
#         EPSILON = 1e-10
        
#         output_sum = tf.reduce_sum([o[0] ** separation_exponent for o in self.outputs], axis=0) + EPSILON
        
#         self.istft = []
#         for no, D in enumerate(self.outputs):
#             D = D[0]

#             instrument_mask = (D ** separation_exponent + (EPSILON / size)) / output_sum
#             instrument_mask = tf_featurization.extend_mask(instrument_mask)
#             old_shape = tf.shape(instrument_mask)
#             new_shape = tf.concat(
#                 [[old_shape[0] * old_shape[1]], old_shape[2:]],
#                 axis=0)
#             instrument_mask = tf.reshape(instrument_mask, new_shape)

#             instrument_mask = instrument_mask[:tf.shape(stft_X[0])[0]]
#             masked_stft = tf.cast(instrument_mask, dtype=tf.complex64) * stft_X[0]
#             self.istft.append(tf_featurization.istft(masked_stft, self.X)[:,0])

In [5]:
# Clear the default graph first so re-running this cell builds the model into
# an empty graph instead of adding a second copy of its ops.
tf.compat.v1.reset_default_graph()
model = Model()
# The session is created after the model so it is bound to the graph that now
# contains the model; the variables are then initialised in that session.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [6]:
# Display the symbolic network output and the per-target get_stft result pairs
# to check their static shapes.
model.outputs, model.stft

(<tf.Tensor 'output/mul:0' shape=(?, ?, 512, 1024, 4) dtype=float32>,
 [(<tf.Tensor 'TensorArrayStack_2/TensorArrayGatherV3:0' shape=(?, ?, 2049, 1) dtype=complex64>,
   <tf.Tensor 'TensorArrayStack_3/TensorArrayGatherV3:0' shape=(?, ?, 512, 1024, 1) dtype=float32>),
  (<tf.Tensor 'TensorArrayStack_4/TensorArrayGatherV3:0' shape=(?, ?, 2049, 1) dtype=complex64>,
   <tf.Tensor 'TensorArrayStack_5/TensorArrayGatherV3:0' shape=(?, ?, 512, 1024, 1) dtype=float32>),
  (<tf.Tensor 'TensorArrayStack_6/TensorArrayGatherV3:0' shape=(?, ?, 2049, 1) dtype=complex64>,
   <tf.Tensor 'TensorArrayStack_7/TensorArrayGatherV3:0' shape=(?, ?, 512, 1024, 1) dtype=float32>),
  (<tf.Tensor 'TensorArrayStack_8/TensorArrayGatherV3:0' shape=(?, ?, 2049, 1) dtype=complex64>,
   <tf.Tensor 'TensorArrayStack_9/TensorArrayGatherV3:0' shape=(?, ?, 512, 1024, 1) dtype=float32>)])

In [7]:
# Load the example recording, requesting a 44100 Hz sample rate.
# Presumably `y` is the 1-D waveform and `sr` the sample rate actually used; confirm against malaya_speech.load.
y, sr = malaya_speech.load('../speech/example-speaker/husein-zolkepli.wav', sr = 44100)
# Number of samples divided by sample rate: the clip length in seconds (assuming the above).
len(y) / sr

5.630680272108844

In [8]:
# ipd.Audio(y, rate = sr)

In [9]:
# y_ = malaya_speech.augmentation.waveform.sox_augment_high(y, min_bass_gain = 70,
#                                                           reverberance = 50, 
#                                                           negate = 0)
# ipd.Audio(y_, rate = sr)

In [10]:
# noise = y - y_

In [11]:
# Smoke test of the graph: a batch of one clip as input, with the same clip
# repeated 4 times as the targets (4 matches Model's default `size`).
sess.run(
    model.loss,
    feed_dict = {
        model.X: [y],
        model.Y: [[y] * 4],
    },
)

[0.2172515, 0.24013034, 0.25318438, 0.35383564]

In [12]:
# stft[0][1].shape

In [13]:
# outputs = sess.run(model.outputs, feed_dict = {model.X: y_})
# [o.shape for o in outputs]

In [14]:
# sess.run(model.loss, feed_dict = {model.X: y_, model.Y: [y, noise]})

In [15]:
# istft = sess.run(model.istft, feed_dict = {model.X: y_})
# [s.shape for s in istft]

In [16]:
# ipd.Audio(istft[0], rate = sr)

In [17]:
# ipd.Audio(istft[1], rate = sr)

In [18]:
# ipd.Audio(y_, rate = sr)

In [19]:
# Saver created with no arguments, i.e. over the variables currently defined in the default graph.
saver = tf.train.Saver()

In [20]:
# Write the session's current variable values to a checkpoint with prefix
# test/model.ckpt (used here to inspect the on-disk size of the model).
saver.save(sess, 'test/model.ckpt')

'test/model.ckpt'

In [21]:
# List the checkpoint files written by the previous cell, with human-readable sizes.
!ls -lh test

total 384544
-rw-r--r--  1 huseinzolkepli  staff    77B Mar  1 21:44 checkpoint
-rw-r--r--  1 huseinzolkepli  staff   187M Mar  1 21:44 model.ckpt.data-00000-of-00001
-rw-r--r--  1 huseinzolkepli  staff   2.6K Mar  1 21:44 model.ckpt.index
-rw-r--r--  1 huseinzolkepli  staff   454K Mar  1 21:44 model.ckpt.meta


In [22]:
# Display the trainable variables of the default graph (names and shapes).
tf.trainable_variables()

[<tf.Variable 'conv3d/kernel:0' shape=(5, 5, 5, 1, 16) dtype=float32>,
 <tf.Variable 'conv3d/bias:0' shape=(16,) dtype=float32>,
 <tf.Variable 'batch_normalization/gamma:0' shape=(16,) dtype=float32>,
 <tf.Variable 'batch_normalization/beta:0' shape=(16,) dtype=float32>,
 <tf.Variable 'conv3d_1/kernel:0' shape=(5, 5, 5, 16, 32) dtype=float32>,
 <tf.Variable 'conv3d_1/bias:0' shape=(32,) dtype=float32>,
 <tf.Variable 'batch_normalization_1/gamma:0' shape=(32,) dtype=float32>,
 <tf.Variable 'batch_normalization_1/beta:0' shape=(32,) dtype=float32>,
 <tf.Variable 'conv3d_2/kernel:0' shape=(5, 5, 5, 32, 64) dtype=float32>,
 <tf.Variable 'conv3d_2/bias:0' shape=(64,) dtype=float32>,
 <tf.Variable 'batch_normalization_2/gamma:0' shape=(64,) dtype=float32>,
 <tf.Variable 'batch_normalization_2/beta:0' shape=(64,) dtype=float32>,
 <tf.Variable 'conv3d_3/kernel:0' shape=(5, 5, 5, 64, 128) dtype=float32>,
 <tf.Variable 'conv3d_3/bias:0' shape=(128,) dtype=float32>,
 <tf.Variable 'batch_normaliza

In [23]:
# Clean up: delete the temporary checkpoint directory created by saver.save above.
!rm -rf test