In [1]:
import os
import sys
import glob
import numpy as np
import tensorflow as tf
import importlib
import IPython.display as ipd

import util_audio_preprocess
import util_audio_transform
import util_auditory_model_loss
import util_cochlear_model
import util_recognition_network

importlib.reload(util_auditory_model_loss)


<module 'util_auditory_model_loss' from '/rdma/vast-rdma/vast/mcdermott/msaddler/auditory-model-denoising/util_auditory_model_loss.py'>

In [2]:
filenames = glob.glob('data/toy_dataset*.tfrecords')
batch_size = 8
feature_description = {
    'background/signal': tf.io.FixedLenFeature([], tf.string, default_value=None),
    'foreground/signal': tf.io.FixedLenFeature([], tf.string, default_value=None),
}
bytes_description = {
    'background/signal': {'dtype': tf.float32, 'shape': [40000]}, 
    'foreground/signal': {'dtype': tf.float32, 'shape': [40000]},
}


def parse_tfrecord(tfrecord):
    tfrecord = tf.parse_single_example(tfrecord, features=feature_description)
    for key in bytes_description.keys():
        tfrecord[key] = tf.decode_raw(tfrecord[key], bytes_description[key]['dtype'])
        tfrecord[key] = tf.reshape(tfrecord[key], bytes_description[key]['shape'])
    return tfrecord


def preprocess_audio_batch(batch):
    foreground_signal = batch['foreground/signal']
    background_signal = batch['background/signal']
    snr = tf.random.uniform(
        [tf.shape(foreground_signal)[0], 1],
        minval=-20.0,
        maxval=10.0,
        dtype=foreground_signal.dtype)
    signal_in_noise, signal, noise_scaled = util_audio_preprocess.tf_set_snr(
        foreground_signal,
        background_signal,
        snr)
    batch = {
        'snr': snr,
        'waveform_noisy': signal_in_noise,
        'waveform_clean': signal,
    }
    return batch


tf.reset_default_graph()
tf.random.set_random_seed(0)

dataset = tf.data.TFRecordDataset(filenames=filenames, compression_type='GZIP')
dataset = dataset.map(parse_tfrecord)
dataset = dataset.batch(batch_size)
dataset = dataset.map(preprocess_audio_batch)
dataset = dataset.prefetch(buffer_size=4)
dataset = dataset.shuffle(buffer_size=32)
dataset = dataset.repeat(count=None)

iterator = dataset.make_one_shot_iterator()
input_tensor_dict = iterator.get_next()
input_tensor_dict


Instructions for updating:
Colocations handled automatically by placer.


{'snr': <tf.Tensor 'IteratorGetNext:0' shape=(?, 1) dtype=float32>,
 'waveform_clean': <tf.Tensor 'IteratorGetNext:1' shape=(?, 40000) dtype=float32>,
 'waveform_noisy': <tf.Tensor 'IteratorGetNext:2' shape=(?, 40000) dtype=float32>}

In [3]:
tensor_waveform_noisy = input_tensor_dict['waveform_noisy']
tensor_waveform_clean = input_tensor_dict['waveform_clean']
with tf.variable_scope('audio_transform'):
    tensor_waveform_denoised = util_audio_transform.build_unet(tensor_waveform_noisy)

list_recognition_networks = [
    'arch1_taskA',
#     'arch2_taskA',
#     'arch3_taskA',
]
auditory_model = util_auditory_model_loss.AuditoryModelLoss(
    list_recognition_networks=list_recognition_networks,
    tensor_wave0=tensor_waveform_clean,
    tensor_wave1=tensor_waveform_denoised)

transform_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='audio_transform')
# loss = auditory_model.loss_waveform
# loss = auditory_model.loss_cochlear_model
loss = auditory_model.loss_deep_features
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
train_op = optimizer.minimize(
    loss=loss,
    global_step=None,
    var_list=transform_var_list)

# transform_saver = tf.train.Saver(var_list=transform_var_list, max_to_keep=0)


Instructions for updating:
Use keras.layers.conv1d instead.
1 recognition networks included for deep feature loss:
|__ arch1_taskA: models/recognition_networks/arch1_taskA.ckpt-550000
Building waveform loss
Building cochlear model loss
[make_cos_filters_nx] using filter_spacing=`erb`
[make_cos_filters_nx] using filter_spacing=`erb`
Building deep feature loss (recognition network: arch1_taskA)
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [4]:
with tf.Session() as sess:
#     example = sess.run(iterator.get_next())
#     for k in example.keys():
#         print(k, example[k].dtype, example[k].shape)
    sess.run(tf.global_variables_initializer())
    auditory_model.load_auditory_model_vars(sess)
    
    for itr0 in range(100):
        _, batch_loss = sess.run([train_op, loss])
        if itr0 % 5 == 0:
            print(itr0, batch_loss.sum())


Loading `arch1_taskA` variables from models/recognition_networks/arch1_taskA.ckpt-550000
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from models/recognition_networks/arch1_taskA.ckpt-550000
INFO:tensorflow:Restoring parameters from models/recognition_networks/arch1_taskA.ckpt-550000
0 2.246965
5 2.2889702
10 2.3068545
15 2.0011575
20 1.9369578
25 1.9962149
30 2.034226
35 2.1364148
40 1.9627442
45 2.2372794
50 1.9959297
55 2.0009422
60 1.8449538
65 1.917606
70 1.6848927
75 1.7655628
80 2.0259156
85 1.8892846
90 2.0066733
95 1.9107757
