In [1]:
import os

# Expose only GPU index 3 to CUDA. This is set before TensorFlow is imported
# in the next cell.
os.environ.update({'CUDA_VISIBLE_DEVICES': '3'})

In [2]:
import malaya_speech
import tensorflow as tf
from malaya_speech.train.model import fastsplit, fastspeech, sepformer
import malaya_speech.augmentation.waveform as augmentation
import numpy as np






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [3]:
from glob import glob

# Example single-speaker clips; a later cell mixes a random subset of them
# as a sanity check on the restored model.
wavs = glob('speech/example-speaker/*.wav')
# Displayed as the cell output: number of example clips found.
len(wavs)

8

In [4]:
from collections import defaultdict

# All .flac files four directory levels below the LibriSpeech root.
librispeech = glob('../speech-bahasa/LibriSpeech/*/*/*/*.flac')
# NOTE: this is not the cell's last expression, so the count is evaluated
# but never displayed.
len(librispeech)


def get_speaker_librispeech(file):
    """Return the speaker ID encoded in a LibriSpeech file path.

    The ID is the text before the first '-' in the last '/'-separated
    component of `file`, e.g. 'a/b/1234-5678-0001.flac' -> '1234'. If the
    file name contains no '-', the whole file name is returned.
    """
    basename = file.rsplit('/', 1)[-1]
    speaker_id, _, _ = basename.partition('-')
    return speaker_id


# Map each LibriSpeech speaker ID to the list of that speaker's files.
speakers = defaultdict(list)
for f in librispeech:
    speaker_id = get_speaker_librispeech(f)
    speakers[speaker_id].append(f)

# Group the VCTK recordings by speaker: the speaker ID is the part of the
# file name before the first '_'.
# NOTE(review): the directory is spelled 'vtck' while the variables say
# 'vctk' — confirm the path is intended.
vctk = glob('vtck/**/*.flac', recursive = True)
vctk_speakers = defaultdict(list)
for f in vctk:
    vctk_speakers[f.rsplit('/', 1)[-1].partition('_')[0]].append(f)

# Group the ST-CMDS recordings. The grouping key is the path minus its last
# 9 characters — presumably the per-utterance suffix plus '.wav', leaving a
# per-speaker prefix; TODO confirm against the corpus naming scheme.
files = glob('../speech-bahasa/ST-CMDS-20170001_1-OS/*.wav')
speakers_mandarin = defaultdict(list)
for f in files:
    utterances = speakers_mandarin[f[:-9]]
    utterances.append(f)
# NOTE: not the cell's last expression, so this count is never displayed.
len(speakers_mandarin)

# Malay speakers, each assembled from one or more recording folders.
# NOTE(review): the '/home/husein/...' entries are absolute paths tied to
# one machine.
male = (
    glob('../youtube/malay2/turki/output-wav-turki/*.wav')
    + glob(
        '../youtube/malay/dari-pasentran-ke-istana/output-wav-dari-pasentran-ke-istana/*.wav'
    )
)
husein = [
    path
    for pattern in (
        '/home/husein/speech-bahasa/audio-wattpad/*.wav',
        '/home/husein/speech-bahasa/audio-iium/*.wav',
        '/home/husein/speech-bahasa/audio/*.wav',
    )
    for path in glob(pattern)
]
speakers_malay = {
    'salina': glob('../youtube/malay2/salina/output-wav-salina/*.wav'),
    'male': male,
    'haqkiem': glob('/home/husein/speech-bahasa/haqkiem/*.wav'),
    'husein': husein,
}

# Pool of speakers that random_speakers() draws from. Only the LibriSpeech
# `speakers` mapping is copied in; the VCTK, Mandarin and Malay dictionaries
# are not merged.
s = dict(speakers)
keys = list(s)


def random_speakers(n):
    """Pick one random utterance from each of `n` distinct random speakers.

    Speakers are drawn without replacement from the module-level `keys`
    list; one file is then chosen at random from that speaker's entry in
    the module-level `s` mapping.

    Returns:
        A list of `n` file paths, one per selected speaker.

    Raises:
        ValueError: from `random.sample` if `n` is negative or larger than
            the number of available speakers.
    """
    # Imported locally because the notebook only imports `random` in a later
    # cell; this keeps the function usable as soon as its own cell has run.
    import random

    return [random.choice(s[k]) for k in random.sample(keys, n)]

In [5]:
import random

# Sample rate passed to every audio load, crop and playback call in this
# notebook.
sr = 8000
# Number of sources: used as the second dimension of the target placeholder
# and as `C` in the loss.
speakers_size = 4

# noise = malaya_speech.load('noise.wav', sr = sr)[0]

def read_wav(f):
    """Load audio file `f` through `malaya_speech.load` at the notebook rate `sr`.

    Returns whatever `malaya_speech.load` returns; callers in this notebook
    take element `[0]` as the waveform.
    """
    loaded = malaya_speech.load(f, sr = sr)
    return loaded


def random_sampling(s, length):
    """Forward to `augmentation.random_sampling` with `sr` fixed to the notebook rate.

    `length` is passed through unchanged. The caller in this notebook
    derives it from seconds * 1000, so it is presumably in milliseconds —
    TODO confirm against the augmentation module.
    """
    cropped = augmentation.random_sampling(s, length = length, sr = sr)
    return cropped

def to_mel(y):
    """Compute `universal_mel` features for `y` and clamp them from below.

    Every value less than or equal to log(1e-2) is overwritten in place with
    log(1e-2); the clamped array is returned.
    """
    floor = np.log(1e-2)
    mel = malaya_speech.featurization.universal_mel(y)
    below_floor = mel <= floor
    mel[below_floor] = floor
    return mel

def combine_speakers(files, n = 5, limit = 4, verbose = True):
    """Build a synthetic, heavily overlapped mixture from `n` random files.

    `n` files are sampled from `files`, loaded with `read_wav`, and cropped
    with `random_sampling` to a common length: the shortest clip's duration,
    capped by a random value between 1000 and 5000 (both expressed as
    seconds * 1000). Each further clip is then added to the running mix so
    that it overlaps 98-100% of its own length with the mix built so far.

    Args:
        files: sequence of audio file paths to sample from.
        n: number of clips to mix.
        limit: maximum number of reference sources returned. Clips with
            index >= limit - 1 are summed into one final source.
        verbose: when True (default, the original behaviour) print the clip
            durations and the overlap ratio drawn for each clip.

    Returns:
        (mix, sources): the mixture, and a list of at most `limit` source
        arrays zero-padded to the mixture length. All of them are scaled by
        one common factor so the largest absolute sample among them is 0.95.
        An all-zero mixture is returned unscaled.

    Raises:
        ValueError: if `n` < 1, or (from `random.sample`) if `n` exceeds
            `len(files)`.
    """
    if n < 1:
        raise ValueError('n must be at least 1')

    w_samples = [read_wav(f)[0] for f in random.sample(files, n)]
    durations = [len(w) / sr for w in w_samples]
    if verbose:
        print(durations)
    # Common crop length: the shortest clip, capped by a random 1000-5000.
    crop_length = int(min(min(durations) * 1000, random.randint(1000, 5000)))
    w_samples = [random_sampling(w, length = crop_length) for w in w_samples]

    y = [w_samples[0]]
    left = w_samples[0].copy()
    combined = None

    for i in range(1, n):
        right = w_samples[i].copy()
        overlap = random.uniform(0.98, 1.0)
        if verbose:
            print(i, overlap)
        len_overlap = int(overlap * len(right))
        # Offset of the new clip inside the mix; never negative.
        minus = max(len(left) - len_overlap, 0)
        padded_right = np.pad(right, (minus, 0))
        left = np.pad(left, (0, len(padded_right) - len(left)))
        left = left + padded_right

        if i >= (limit - 1):
            # Beyond the source limit: fold this clip into the last source.
            if combined is None:
                combined = padded_right
            else:
                combined = np.pad(
                    combined, (0, len(padded_right) - len(combined))
                )
                combined += padded_right
        else:
            y.append(padded_right)

    if combined is not None:
        y.append(combined)

    # Bring every source to the mixture length.
    y = [np.pad(t, (0, len(left) - len(t))) for t in y]

    # Peak-normalise on absolute amplitude across the mix and all sources.
    # The signed maximum ignores negative peaks, and sources that needed no
    # padding must be included as well.
    peak = max(
        (float(np.max(np.abs(t))) for t in [left] + y if len(t)),
        default = 0.0,
    )
    if peak > 0:
        mix_scaling = 1 / peak * 0.95
        left = left * mix_scaling
        y = [t * mix_scaling for t in y]

    return left, y

# y, _ = malaya_speech.load('../speech/example-speaker/husein-zolkepli.wav')
# y = np.expand_dims(y, 0).astype(np.float32)
# y.shape

In [6]:
# Mix one random utterance from each of `count` random speakers in the pool.
count = 4
left, y = combine_speakers(random_speakers(count), count)
# Displays (mixture length in samples / sr, number of reference sources).
len(left) / sr, len(y)

[8.92, 15.685, 14.755, 15.555]
1 0.9888203676809841
2 0.9868842528667883
3 0.984018020025046


(4.15925, 4)

In [7]:
# FastSpeech encoder configuration used for the SepFormer transformer blocks.
# The library's default dict is copied rather than edited in place, so other
# users of `malaya_speech.config.fastspeech_config` in this kernel keep the
# defaults.
dim = 256
config = fastspeech.Config(
    vocab_size = 1,
    **{
        **malaya_speech.config.fastspeech_config,
        'encoder_hidden_size': dim,
        'decoder_hidden_size': dim,
        'encoder_num_hidden_layers': 4,
        'encoder_num_attention_heads': 4,
    }
)

In [8]:
def transformer():
    """Create a new FastSpeech-style encoder from `config`'s self-attention parameters."""
    return sepformer.Encoder_FastSpeech(config.encoder_self_attention_params)

In [9]:
# The same encoder factory is passed for both constructor arguments
# (presumably the two transformer stacks of the model — TODO confirm
# against sepformer.Model).
model = sepformer.Model(transformer, transformer)


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [10]:
# Mixture input, float32 with a trailing dimension of 1; later cells feed it
# as np.expand_dims([waveform], -1).
# NOTE: the placeholders are unnamed, so they get TensorFlow's default names
# in creation order ('Placeholder', 'Placeholder_1'). Later cells look the
# tensors up by those names, so do not reorder or insert placeholders here.
X = tf.placeholder(tf.float32, [None, None, 1])
# Per-example lengths. Only passed to the loss; the model call below does
# not take it.
len_X = tf.placeholder(tf.int32, [None])
logits = model(X)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [11]:
# Give the model output a stable node name; the frozen graph is later
# queried for 'logits' by name.
logits = tf.identity(logits, name = 'logits')

In [12]:
# Reference sources; later cells feed it as [y], i.e. one entry per source.
Y = tf.placeholder(tf.float32, [None, speakers_size, None])
# Drop the trailing size-1 axis of the model output and swap its first two
# axes so the estimate has the same axis order as Y.
estimate_source = tf.transpose(logits[:, :, :, 0], [1, 0, 2])
# The third return value of calculate_loss is not used in this notebook.
loss, max_snr, _ = sepformer.calculate_loss(
    Y, estimate_source, len_X, C = speakers_size
)




In [13]:
# Session on the default graph. Variables are initialised here and then
# overwritten by the checkpoint restore in a later cell.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [14]:
# Directory holding the training checkpoints; pick the most recent one.
path = 'split-speaker-sepformer'
ckpt_path = tf.train.latest_checkpoint(path)
# Displayed as the cell output.
ckpt_path

'split-speaker-sepformer/model.ckpt-3522028'

In [15]:
# Restore every global variable of the graph from the selected checkpoint.
var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
# NOTE: `saver` is rebound to a new Saver in the export cell further down.
saver = tf.train.Saver(var_list = var_list)
saver.restore(sess, ckpt_path)

INFO:tensorflow:Restoring parameters from split-speaker-sepformer/model.ckpt-3522028


In [16]:
# Sanity check on the example clips: mix `count` of them at random.
count = 4
left, y = combine_speakers(wavs, count)
# Displays (mixture length in samples / sr, number of reference sources).
len(left) / sr, len(y)

[3.9295, 9.42725, 9.63625, 5.630625]
1 0.9819298103606807
2 0.9915476835371698
3 0.9806398889779804


(2.752875, 4)

In [17]:
%%time

# Forward pass on the single mixture; [left] adds the batch axis and
# expand_dims adds the trailing size-1 axis expected by X.
l = sess.run(logits, feed_dict = {X: np.expand_dims([left], axis = -1)})
l.shape

CPU times: user 2.33 s, sys: 690 ms, total: 3.02 s
Wall time: 3.1 s


(4, 1, 22023, 1)

In [18]:
# Shape of the reference sources once a batch axis is added, i.e. what is
# fed to Y in the next cell.
np.array([y]).shape

(1, 4, 22023)

In [19]:
# Evaluate the loss and `max_snr` scaled by the number of sources.
# NOTE(review): in the recorded output the second value equals -loss * count;
# confirm how calculate_loss defines max_snr before relying on that relation.
sess.run([loss, max_snr * count], feed_dict = {X: np.expand_dims([left], axis = -1),
                              len_X: [len(left)], Y: [y]})

[2.3875208, array([[-9.550083]], dtype=float32)]

In [20]:
# def get_data(combined_path, speakers_size = 4, sr = 8000):
#     combined, _ = malaya_speech.load(combined_path, sr = sr, scale = False)
#     y = []
#     for i in range(speakers_size):
#         y_, _ = malaya_speech.load(combined_path.replace('combined', str(i)), sr = sr, scale = False)
#         y.append(y_)
#     return combined, y

# combined = glob('split-speaker-8k-test/combined/*.wav')
# len(combined)

In [21]:
# from tqdm import tqdm

# snrs = []

# for i in tqdm(range(len(combined))):
#     x, y = get_data(combined[i])
#     s = sess.run(max_snr * count, feed_dict = {X: np.expand_dims([x], -1), 
#                                                len_X: [len(x)], Y: [y]})[0,0]
#     snrs.append(s)

In [22]:
# np.mean(snrs) # 18.645731

In [23]:
# Re-save the restored weights under 'sepformer-4/'. freeze_graph() below
# reads the checkpoint and its '.meta' file from that directory.
# NOTE: rebinds `saver`, replacing the Saver used for the restore above.
saver = tf.train.Saver()
saver.save(sess, 'sepformer-4/model.ckpt')

'sepformer-4/model.ckpt'

In [24]:
def _is_export_node(node):
    """Return True if graph node `node` should be kept as a freeze output.

    A node qualifies when its op type mentions 'Variable' or (case-
    insensitively) 'gather', or its name mentions 'Placeholder' or 'logits';
    it is then rejected if its name contains any optimizer or bookkeeping
    token listed below.
    """
    wanted = (
        'Variable' in node.op
        or 'gather' in node.op.lower()
        or 'Placeholder' in node.name
        or 'logits' in node.name
    )
    rejected_tokens = (
        'adam',
        'global_step',
        'Assign',
        'ReadVariableOp',
        'Gather',
        'IsVariableInitialized',
    )
    return wanted and not any(token in node.name for token in rejected_tokens)


# Comma-separated node names handed to freeze_graph() as output nodes.
strings = ','.join(
    n.name
    for n in tf.get_default_graph().as_graph_def().node
    if _is_export_node(n)
)
strings.split(',')

['Placeholder', 'Placeholder_1', 'logits', 'Placeholder_2']

In [25]:
def freeze_graph(model_dir, output_node_names):
    """Freeze the checkpoint recorded in `model_dir` into 'frozen_model.pb'.

    The checkpoint's meta graph is imported into a fresh graph, its
    variables are restored and converted to constants, and the resulting
    GraphDef is written next to the checkpoint. The number of ops in the
    frozen graph is printed.

    Args:
        model_dir: directory containing the checkpoint state file.
        output_node_names: comma-separated names of the nodes to keep as
            outputs of the frozen graph.

    Raises:
        AssertionError: if `model_dir` does not exist or holds no checkpoint.
    """
    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    # get_checkpoint_state returns None when the directory has no checkpoint
    # state; fail with a clear message instead of an AttributeError.
    if checkpoint is None or not checkpoint.model_checkpoint_path:
        raise AssertionError(
            'No checkpoint found in directory: %s' % model_dir
        )
    input_checkpoint = checkpoint.model_checkpoint_path

    # os.path keeps the output beside the checkpoint even when the checkpoint
    # path has no directory part (splitting on '/' produced
    # '/frozen_model.pb' in that case).
    output_graph = os.path.join(
        os.path.dirname(input_checkpoint), 'frozen_model.pb'
    )

    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = True
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [26]:
# Freeze the checkpoint saved above, keeping the nodes selected in `strings`.
freeze_graph('sepformer-4', strings)

INFO:tensorflow:Restoring parameters from sepformer-4/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 278 variables.
INFO:tensorflow:Converted 278 variables to const ops.
5587 ops in the final graph.


In [27]:
def load_graph(frozen_graph_filename):
    """Read a serialized GraphDef from disk and import it into a new tf.Graph.

    The graph is imported with `tf.import_graph_def`'s default name prefix;
    later cells look tensors up as 'import/<name>:0'.

    Returns:
        The newly created graph.
    """
    graph_def = tf.GraphDef()
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as pb_file:
        serialized = pb_file.read()
    graph_def.ParseFromString(serialized)

    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(graph_def)
    return graph

In [28]:
# Load the frozen (float) graph written by freeze_graph().
g = load_graph('sepformer-4/frozen_model.pb')

In [29]:
# Separate session bound to the frozen graph, independent of `sess`.
test_sess = tf.Session(graph = g)

In [30]:
# NOTE: rebinds `X` and `logits` from the original graph's tensors to the
# frozen graph's tensors (names carry the 'import/' prefix).
X = g.get_tensor_by_name('import/Placeholder:0')
# Length placeholder; fetched here but not fed by the inference cells below.
X_len = g.get_tensor_by_name('import/Placeholder_1:0')
logits = g.get_tensor_by_name('import/logits:0')

In [41]:
# New random mixture from the speaker pool, for testing the frozen graph.
count = 4
left, y = combine_speakers(random_speakers(count), count)
# Displays (mixture length in samples / sr, number of reference sources).
len(left) / sr, len(y)

[3.61, 13.885, 15.1, 15.045]
1 0.9870830960487033
2 0.9952149705499818
3 0.982222059802885


(1.430125, 4)

In [42]:
%%time

# Inference with the frozen graph; only the mixture placeholder is fed.
l = test_sess.run(logits, feed_dict = {X: np.expand_dims([left], axis = -1)})
l.shape

CPU times: user 164 ms, sys: 59.8 ms, total: 223 ms
Wall time: 173 ms


(4, 1, 11441, 1)

In [46]:
import IPython.display as ipd

# Play back separated output index 3 for the single batch item.
ipd.Audio(l[3,0,:,0], rate = sr)

In [48]:
from tensorflow.tools.graph_transforms import TransformGraph

In [49]:
# Transform specifications handed to TransformGraph in a later cell.
# NOTE(review): what each entry does is defined by TensorFlow's
# graph_transforms tool, not by this notebook — check its documentation
# before changing the list or the quantize_weights fallback range.
transforms = ['add_default_attributes',
             'remove_nodes(op=Identity, op=CheckNumerics)',
             'fold_batch_norms',
             'fold_old_batch_norms',
             'quantize_weights(fallback_min=-1024, fallback_max=1024)',
             'strip_unused_nodes',
             'sort_by_execution_order']

In [50]:
# Frozen graph to transform; the result is written to f'{pb}.quantized'.
pb = 'sepformer-4/frozen_model.pb'

In [51]:
# Read the frozen graph, run the `transforms` list over it with 'Placeholder'
# as input and 'logits' as output, and write the result beside the original
# as '<pb>.quantized'.
input_graph_def = tf.GraphDef()
# tf.gfile.GFile replaces the deprecated tf.gfile.FastGFile; the deprecation
# warning recorded for this cell asks for exactly this change.
with tf.gfile.GFile(pb, 'rb') as f:
    input_graph_def.ParseFromString(f.read())

transformed_graph_def = TransformGraph(
    input_graph_def, ['Placeholder'], ['logits'], transforms
)

with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:
    f.write(transformed_graph_def.SerializeToString())

Instructions for updating:
Use tf.gfile.GFile.


In [52]:
# NOTE: rebinds `g` from the float frozen graph to the transformed one.
g = load_graph('sepformer-4/frozen_model.pb.quantized')

In [53]:
# NOTE: rebinds `test_sess`; the previous session on the float graph is not
# closed here.
test_sess = tf.Session(graph = g)

In [55]:
# Input and output tensors of the transformed graph, looked up under the
# same 'import/' names as in the float graph.
X = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')

In [56]:
%%time

# Same mixture as the float-graph run above, now through the transformed
# graph, so the two timings can be compared.
l = test_sess.run(logits, feed_dict = {X: np.expand_dims([left], axis = -1)})
l.shape

CPU times: user 3.71 s, sys: 101 ms, total: 3.81 s
Wall time: 3.58 s


(4, 1, 11441, 1)

In [59]:
# Play back separated output index 1 from the transformed graph.
ipd.Audio(l[1,0,:,0], rate = sr)