In [1]:
import numpy as np
np.set_printoptions(threshold=np.nan)

In [2]:
import tensorflow as tf
from tensorflow.python.platform import gfile
from tensorflow.python.ops import io_ops
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio


In [3]:
data_dir = "train/audio"
summary_dir = "tfboard" # where to save summary logs for Tensorboard
wanted_words = ["silence","unknown","yes","no","up","down","left","right","on","off","stop","go","silence"]

sample_rate = 16000 # per sec
clip_duration_ms = 1000 #

background_volume = 0.1 # how loud background noise should be, [0,1]
background_percent = 0.8 # how many of training samps have noise added

silence_percentage = 10.0 # what percent of training data should be silence
unknown_percentage = 10.0 # what percent of training data should be unknown words

time_shift_ms = 100.0 # range to randomly shift the training audio by time

window_size_ms = 30.0 # millisec length of frequency analysis window
window_stride_ms = 10.0

window_size_samples = int(1600 * window_size_ms / 1000)
window_stride_samples = int(1600 * window_stride_ms / 1000)

dct_coefficient_count = 40 # bins to use for MFCC fingerprint

percent_test = 10 # test set
percent_val = 10 # val set


In [11]:
samps = 1600

wav_filename_ph = tf.placeholder(tf.string,[],name="WAV_file")
wav_loader = io_ops.read_file(wav_filename_ph)
wav_decoder = contrib_audio.decode_wav(wav_loader,desired_channels=1,desired_samples=samps) # length of 1 sec

volume_ph = tf.placeholder(tf.float32,[],name="Foreground_Volume")

# this seems weird, shouldn't it do some kind of conditional scaling??
scaled_wav = tf.multiply(wav_decoder.audio, volume_ph)

time_shift_padding_ph = tf.placeholder(tf.int32,[2,2],name="Padding")
time_shift_offset_ph = tf.placeholder(tf.int32,[2],name="Slicing")

padded_wav = tf.pad(scaled_wav,time_shift_padding_ph,mode="CONSTANT")
sliced_wav = tf.slice(padded_wav,time_shift_offset_ph,[samps,-1])

bg_ph = tf.placeholder(tf.float32,[samps,1],name="Background_Noise")
bg_volume_ph = tf.placeholder(tf.float32,[],name="Background_Volume")

scaled_bg = tf.multiply(bg_ph,bg_volume_ph)

wav_with_bg = tf.add(sliced_wav,scaled_bg)

clamped_wav = tf.clip_by_value(wav_with_bg,-1.0,1.0)

spectrogram = contrib_audio.audio_spectrogram(
    clamped_wav,
    window_size = window_size_samples,
    stride = window_stride_samples,
    magnitude_squared = True
)

mfcc = contrib_audio.mfcc(
    spectrogram,
    wav_decoder.sample_rate,
    dct_coefficient_count = dct_coefficient_count
)




In [12]:
sess = tf.InteractiveSession()

In [13]:
mfcc_array = sess.run(mfcc,feed_dict={
    wav_filename_ph:"train/audio/bird/00b01445_nohash_0.wav",
    volume_ph : 10,
    time_shift_padding_ph: [[1,1],[0,0]],
    time_shift_offset_ph: [2,0],
    bg_ph: np.ones(samps).reshape(samps,1),
    bg_volume_ph: 0.1
})

In [14]:
mfcc_array.shape

(1, 98, 40)

In [18]:
mfcc.shape[2]

Dimension(40)

In [16]:
clamped_wav.shape

TensorShape([Dimension(1600), Dimension(None)])

In [1]:
import sys

In [2]:
sys.version

'3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]'

In [3]:
sys.executable

'/anaconda/envs/py35/bin/python'