In [1]:
import os
import sys
import glob
import numpy as np
import tensorflow as tf
import importlib
import IPython.display as ipd

import util_audio_preprocess
import util_audio_transform
import util_auditory_model_loss
import util_cochlear_model
import util_recognition_network


In [2]:
"""
Data inputput pipeline

Recommended training data format is tfrecords files containing foreground
(speech) and background (noise) waveforms stored separately. Model was
originally trained with 2-second audio clips (20 kHz sampling rate).

NOTE: the full dataset used to train models in the paper is not included in
this code release as it was compiled from previously published datasets
(speech from Wall Street Journal and Spoken Wikipedia Corpora; background
noise from Audioset). Data available upon request to authors.
"""

filenames = glob.glob('data/toy_dataset*.tfrecords')
batch_size = 8
feature_description = {
    'background/signal': tf.io.FixedLenFeature([], tf.string, default_value=None),
    'foreground/signal': tf.io.FixedLenFeature([], tf.string, default_value=None),
}
bytes_description = {
    'background/signal': {'dtype': tf.float32, 'shape': [40000]}, 
    'foreground/signal': {'dtype': tf.float32, 'shape': [40000]},
}


def parse_tfrecord(tfrecord):
    tfrecord = tf.parse_single_example(tfrecord, features=feature_description)
    for key in bytes_description.keys():
        tfrecord[key] = tf.decode_raw(tfrecord[key], bytes_description[key]['dtype'])
        tfrecord[key] = tf.reshape(tfrecord[key], bytes_description[key]['shape'])
    return tfrecord


def preprocess_audio_batch(batch):
    """
    Function combines foreground (speech) and background (noise) audio
    signals with signal-to-noise ratios drawn uniformly between -20 and
    +10 dB. The returned dictionary contains the noisy speech signal,
    the clean speech signal, and the SNR.
    """
    foreground_signal = batch['foreground/signal']
    background_signal = batch['background/signal']
    snr = tf.random.uniform(
        [tf.shape(foreground_signal)[0], 1],
        minval=-20.0,
        maxval=10.0,
        dtype=foreground_signal.dtype)
    signal_in_noise, signal, noise_scaled = util_audio_preprocess.tf_set_snr(
        foreground_signal,
        background_signal,
        snr)
    batch = {
        'snr': snr,
        'waveform_noisy': signal_in_noise,
        'waveform_clean': signal,
    }
    return batch


tf.reset_default_graph()
tf.random.set_random_seed(0)

dataset = tf.data.TFRecordDataset(filenames=filenames, compression_type='GZIP')
dataset = dataset.map(parse_tfrecord)
dataset = dataset.batch(batch_size)
dataset = dataset.map(preprocess_audio_batch)
dataset = dataset.prefetch(buffer_size=4)
dataset = dataset.shuffle(buffer_size=32)
dataset = dataset.repeat(count=None)

iterator = dataset.make_one_shot_iterator()
input_tensor_dict = iterator.get_next()


Instructions for updating:
Colocations handled automatically by placer.


In [3]:
"""
Model training graph

Components:
1. U-Net audio transform (`util_audio_transform.build_unet`)
2. Auditory model loss function (`util_auditory_model_loss.AuditoryModelLoss`)
3. Tensorflow optimizer to train U-Net weights
"""

### U-Net audio transform
tensor_waveform_noisy = input_tensor_dict['waveform_noisy']
tensor_waveform_clean = input_tensor_dict['waveform_clean']
tensor_waveform_denoised = util_audio_transform.build_unet(tensor_waveform_noisy)

### Build auditory model loss function (specify recognition networks
### to include in the deep feature loss)
list_recognition_networks = [
    'arch1_taskA',
#     'arch2_taskA',
#     'arch3_taskA',
]
auditory_model = util_auditory_model_loss.AuditoryModelLoss(
    list_recognition_networks=list_recognition_networks,
    tensor_wave0=tensor_waveform_clean,
    tensor_wave1=tensor_waveform_denoised)

transform_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='separator')
transform_saver = tf.train.Saver(var_list=transform_var_list, max_to_keep=0)

### Specify loss function (waveform, cochlear model, or deep features)
### and build optimizer object + training operation
loss = auditory_model.loss_cochlear_model # <-- cochlear model loss is lightweight and works well
# loss = auditory_model.loss_deep_features
# loss = auditory_model.loss_waveform
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
train_op = optimizer.minimize(
    loss=loss,
    global_step=None,
    var_list=transform_var_list)


Instructions for updating:
Use keras.layers.conv1d instead.
1 recognition networks included for deep feature loss:
|__ arch1_taskA: models/recognition_networks/arch1_taskA.ckpt-550000
Building waveform loss
Building cochlear model loss
[make_cos_filters_nx] using filter_spacing=`erb`
[make_cos_filters_nx] using filter_spacing=`erb`
Building deep feature loss (recognition network: arch1_taskA)
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [4]:
"""
Simple training routine

Models in the paper were all trained for 600000 steps
with batch size 8 and learning rate 10e-4.
"""

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    auditory_model.load_auditory_model_vars(sess)
    
    for step in range(101):
        _, step_loss = sess.run([train_op, loss])
        if step % 10 == 0:
            print("Loss after training step {:06d} = {:.02f}".format(step, step_loss.mean()))
    
    transform_saver.save(
        sess,
        save_path='new_model.ckpt',
        global_step=step,
        write_meta_graph=False)


Loading `arch1_taskA` variables from models/recognition_networks/arch1_taskA.ckpt-550000
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from models/recognition_networks/arch1_taskA.ckpt-550000
INFO:tensorflow:Restoring parameters from models/recognition_networks/arch1_taskA.ckpt-550000
Loss after training step 000000 = 759.71
Loss after training step 000010 = 594.60
Loss after training step 000020 = 547.26
Loss after training step 000030 = 454.63
Loss after training step 000040 = 478.92
Loss after training step 000050 = 450.72
Loss after training step 000060 = 395.44
Loss after training step 000070 = 427.17
Loss after training step 000080 = 456.67
Loss after training step 000090 = 438.67
Loss after training step 000100 = 431.82
INFO:tensorflow:new_model.ckpt-100 is not in all_model_checkpoint_paths. Manually adding it.
