In [1]:
import os
import time
import numpy as np
import tensorflow as tf

import core.plot_saver as ps
import core.data_gen as dg

from tensorflow.contrib import learn
from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib

import cPickle as pickle

# Name of the pickle file with the training corpus; create_train_data()
# appends it to '../data/training/TIMIT_extracted/'.
training_data = 'train_data_100.pickle'
# TensorBoard log root; tf_log_dir() joins it onto the current working directory.
rel_log_dir = '../data/tf_logs/'
# File name under which training checkpoints are saved inside the session log directory.
ckpt_dir_name = 'model.ckpt'

# Size of axis 1 of the x_input placeholder (presumably spectrogram frequency bins -- TODO confirm).
frequencies = 128
# Size of axis 2 of the x_input placeholder; also passed to dg.batch_generator as segment_size.
segment_size = 100
# Number of output units of the final dense layer (one per speaker).
total_speakers = 100
# Batch size passed to dg.batch_generator.
batch_size = 128
# Number of units of the GRU cell.
gru_neurons = 256
# train_loops is assigned in the training cell; the value below is an unused alternative.
#train_loops = 30

In [2]:
def tf_log_dir():
    """Build the log directory path for a new TensorBoard session.

    The directory is named ``sess_<unix timestamp>`` and is placed under
    ``rel_log_dir``, interpreted relative to the current working directory.
    The path is only computed; nothing is created on disk here.

    Returns:
        str: The canonical path, as resolved by ``os.path.realpath``.
    """
    session_name = 'sess_%s' % int(time.time())
    return os.path.realpath(os.path.join(os.getcwd(), rel_log_dir, session_name))

# Parse training data to matrices

def create_train_data():
    """Load the pickled corpus and split it into training and validation parts.

    Reads a ``(X, y, speaker_names)`` tuple from the pickle file named by
    ``training_data`` and passes ``X`` and ``y`` to ``dg.splitter``. The
    speaker names are not used.

    Returns:
        tuple: ``(X_t, y_t, X_v, y_v)``. Note that this order differs from the
        order in which ``dg.splitter`` returns the four pieces.
    """
    pickle_path = '../data/training/TIMIT_extracted/' + training_data
    # NOTE(review): unpickling can execute arbitrary code -- only load files
    # from a trusted source.
    with open(pickle_path, 'rb') as pickle_file:
        features, labels, _speaker_names = pickle.load(pickle_file)

    # NOTE(review): the meaning of 0.125 and 8 is defined by dg.splitter, which
    # is not visible here (presumably validation fraction and sentences per
    # speaker -- TODO confirm).
    features_t, features_v, labels_t, labels_v = dg.splitter(features, labels, 0.125, 8)
    return features_t, labels_t, features_v, labels_v

# Create data

# Load the corpus once and build endless batch generators for both splits.
X_t, y_t, X_v, y_v = create_train_data()
train_gen = dg.batch_generator(X_t, y_t, batch_size=batch_size, segment_size=segment_size)
val_gen = dg.batch_generator(X_v, y_v, batch_size=batch_size, segment_size=segment_size)

# Number of rows in each split rounded up to the next multiple of the batch
# size. Derived from batch_size instead of a hard-coded 128 so the values stay
# correct when the configuration changes.
batches_t = ((X_t.shape[0] + batch_size - 1) // batch_size) * batch_size
batches_v = ((X_v.shape[0] + batch_size - 1) // batch_size) * batch_size

In [3]:
# Create basic net infrastructure

#def create_net(frequencies, segment_size, c1_dims, max1_dims, c2_dims, max2_dims, gru_neur, speakers_count):

# Placeholders
with tf.name_scope('Placeholders'):
    # Input batch laid out as (batch, frequencies, segment_size, channels=1).
    x_input = tf.placeholder(tf.float32, shape=(None, frequencies, segment_size, 1))
    # Label batch with one column per speaker. Its width must match the number
    # of units of the final dense layer, so it is tied to total_speakers and
    # not to segment_size (the two are only equal by coincidence in the
    # current configuration).
    out_labels = tf.placeholder(tf.float32, shape=(None, total_speakers))

with tf.name_scope('Convolution_1'):
    conv1 = tf.layers.conv2d(inputs=x_input, filters=32, kernel_size=[8, 1], padding="same", activation=tf.nn.relu)

with tf.name_scope('MaxPooling_1'):
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[4, 4], strides=[2, 2])

with tf.name_scope('Convolution_2'):
    conv2 = tf.layers.conv2d(inputs=pool1, filters=64, kernel_size=[8, 1], padding="same", activation=tf.nn.relu)

with tf.name_scope('MaxPooling_2'):
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[4, 4], strides=[2, 2])

with tf.name_scope('Reshape'):
    # pool2 is laid out as (batch, frequency, time, channels).
    dim1 = int(pool2.shape[3] * pool2.shape[1])  # features per time step (channels * frequency)
    dim2 = int(pool2.shape[2])                   # number of time steps
    # Bring the time axis to the front before flattening. Reshaping pool2
    # directly to (batch, dim1, dim2) would neither keep the values of one
    # time step together nor put time on the axis the GRU iterates over.
    time_major = tf.transpose(pool2, [0, 2, 1, 3])
    lstm_input = tf.reshape(time_major, [-1, dim2, dim1])

with tf.name_scope('GRU'):
    # static_rnn expects a Python list with one (batch, features) tensor per
    # time step, hence the unstack along the time axis.
    x_gru = tf.unstack(lstm_input, dim2, 1)
    gru_cell = tf.contrib.rnn.GRUCell(gru_neurons)
    dense, _ = tf.contrib.rnn.static_rnn(gru_cell, x_gru, dtype=tf.float32)
    # Only the output of the last time step feeds the classifier.
    gru_out = dense[-1]

with tf.name_scope('Dense'):
    # Softmax layer: `network` holds per-speaker probabilities, not logits.
    network = tf.layers.dense(inputs=gru_out, units=total_speakers, activation=tf.nn.softmax)
    #network = tf.layers.dense(inputs=gru_out, units=speakers_count, activation=tf.nn.softmax)
    
    #return network

In [4]:
# Optimizer

with tf.name_scope('Optimizer'):
    # Cross entropy and optimizer
    #
    # `network` is the output of a softmax layer, i.e. it already contains
    # probabilities. tf.nn.softmax_cross_entropy_with_logits expects unscaled
    # logits and applies softmax itself, so passing `network` to it would
    # squash the values twice and distort loss and gradients. The cross
    # entropy is therefore computed directly from the probabilities.
    # Clipping keeps log() away from zero.
    clipped_probs = tf.clip_by_value(network, 1e-10, 1.0)
    per_example_loss = -tf.reduce_sum(out_labels * tf.log(clipped_probs), axis=1)
    cross_entropy = tf.reduce_mean(per_example_loss)
    tf.summary.scalar('loss', cross_entropy)
    optimizer = tf.train.AdamOptimizer().minimize(cross_entropy)

In [6]:
# Training

# Number of optimisation steps executed by this cell.
train_loops = 5

# The session is intentionally left open: later cells evaluate tensors with it.
sess = tf.Session()

# Accuracy ops are created once, before the loop. Building them inside the
# loop would add a new set of nodes to the graph on every iteration.
correct_pred = tf.equal(tf.argmax(network, 1), tf.argmax(out_labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

sess.run(tf.global_variables_initializer())

# Tensorboard
tb_merged = tf.summary.merge_all()
tb_saver = tf.train.Saver()
tb_log_dir = tf_log_dir()
tb_train_writer = tf.summary.FileWriter(tb_log_dir, sess.graph)

run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()

for step in range(train_loops):
    # Get next batch; the builtin next() works with Python 2 and Python 3 iterators.
    x_b_t, y_b = next(train_gen)
    # Reshape the batch with channel as last dimension. The batch axis is
    # inferred (-1) so a batch shorter than batch_size does not break the step.
    x_b = np.reshape(x_b_t, [-1, frequencies, segment_size, 1])

    # Execute training
    feed_dict = {x_input: x_b, out_labels: y_b}
    _, loss_value = sess.run([optimizer, cross_entropy], feed_dict=feed_dict,
                             options=run_options, run_metadata=run_metadata)

    # Accuracy on the batch just trained on. It is run without trace options so
    # run_metadata keeps describing the training step logged below.
    sess_acc = sess.run(accuracy, feed_dict=feed_dict)
    #print("Round %d accuracy: %s" % (step, sess_acc))

    if step % 10 == 0:
        tb_summary_str = sess.run(tb_merged, feed_dict=feed_dict)
        tb_train_writer.add_run_metadata(run_metadata, 'step_{:04d}'.format(step))
        tb_train_writer.add_summary(tb_summary_str, step)
        tb_train_writer.flush()

    if step % 100 == 0:
        # Use the configured checkpoint name instead of repeating the literal.
        ckpt_file = os.path.join(tb_log_dir, ckpt_dir_name)
        tb_saver.save(sess, ckpt_file, global_step=step)

# Flush pending events and release the event file.
tb_train_writer.close()

# The save call stays the last expression so the cell still displays the path
# of the final model file.
model_file = os.path.join(tb_log_dir, 'final_model.save')
tb_saver.save(sess, model_file)

'/home/sebastian/Dokumente/uni/BT/PA_Code/data/tf_logs/sess_1491859407/final_model.save'

In [85]:

# Build the evaluation set from the validation split.
# NOTE(review): the literal 100 is presumably the segment size -- confirm
# against dg.generate_test_data.
test_x_data, test_y_data = dg.generate_test_data(X_v, y_v, 100)

# Show the shapes before and after segmenting. The formatted form prints the
# same text under Python 2 and Python 3.
# NOTE(review): the second line reports y_v.shape next to test_x_data.shape;
# confirm whether test_y_data.shape was intended.
print('%s %s' % (X_v.shape, y_v.shape))
print('%s %s' % (test_x_data.shape, y_v.shape))
#print test_x_data.shape, test_y_data.shape

#net_output = network.eval(feed_dict={x_input: np.reshape(test_x_data, [260, 128, 100, 1]), out_labels: np.reshape(test_y_data[:100,], [-1, 100])}, session=sess)
#gru_output = gru_out.eval(feed_dict={x_input: np.reshape(test_x_data, [260, 128, 100, 1]), out_labels: np.reshape(test_y_data[:100,], [-1, 100])}, session=sess)


(100, 1, 128, 800) (100,)
(260, 1, 128, 100) (100,)


In [82]:
# Evaluate the GRU output for the test segments inside this cell. No earlier
# active cell assigns gru_output, so without this line the cell fails with a
# NameError after "Restart & Run All". Only x_input has to be fed: gru_out
# does not depend on the labels.
gru_output = sess.run(
    gru_out,
    feed_dict={x_input: np.reshape(test_x_data, [-1, frequencies, segment_size, 1])})

# Single-argument print calls give the same output under Python 2 and 3.
print(gru_output[1:5, 1:5])
print(gru_output.shape)
print(pool2.shape)

[[-0.01962449 -0.00752905  0.00128846 -0.0131578 ]
 [-0.02017676 -0.01848101  0.00381372 -0.0185026 ]
 [-0.03182903 -0.03185349  0.00299494 -0.01037239]
 [-0.02004599 -0.01478151  0.00450617 -0.01441335]]
(260, 256)
(?, 30, 23, 64)
