In [1]:
import glob
import os
import librosa
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from matplotlib.pyplot import specgram
import time
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer the model_selection module and fall back for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
%matplotlib inline
plt.style.use('ggplot')

# Figure typography shared by every plot in the notebook: Ubuntu fonts,
# bold axis labels and tick labels smaller than the body text.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Ubuntu',
    'font.monospace': 'Ubuntu Mono',
    'font.size': 12,
    'axes.labelsize': 11,
    'axes.labelweight': 'bold',
    'axes.titlesize': 12,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
    'legend.fontsize': 11,
    'figure.titlesize': 13,
})

## TENSORFLOW GRAPH

In [2]:
# Log scaled melspectrogram parameters
# Every network input is one BANDS x FRAMES patch with a single channel
# (see the shape of the X placeholder).
BANDS = 128
FRAMES = 128
# Number of classification classes (width of the Y placeholder and of the
# final fully connected layer)
N_LABELS = 10

### Helper functions

In [3]:
def new_weights(shape):
    """Create a trainable weight variable sampled from a truncated normal distribution (stddev = 0.1)
    Args:
        shape (list_like): shape of the weight tensor, e.g.
            [height, width, in_channels, out_channels] for a convolution filter or
            [num_inputs, num_outputs] for a fully connected layer
    Return
        weight (a tensor variable)
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def new_biases(length):
    """Create a trainable 1-D bias variable with every entry set to 1.0
    Args:
        length (int): number of bias values
    Return
        bias (a tensor variable)
    """
    return tf.Variable(tf.constant(1.0, shape=[length]))

In [4]:
def new_conv_layer(input,
                   num_input_channels,
                   filter_size,
                   num_filters,
                   use_pooling=True):
    """Build a convolutional layer: convolution + bias, optional 4x2 max pooling, then ReLU
    Args:
        input (4-D tensor): the previous layer
        num_input_channels (int): number of channels in the previous layer
        filter_size (int): width and height of each (square) filter
        num_filters (int): number of filters, i.e. number of output channels
        use_pooling (bool): whether to apply 4x2 max pooling before the ReLU
    Return:
        layer (4-D tensor): output of the convolutional layer
        weights (4-D tensor variable): the filter weights, returned so they can be inspected or plotted
    """
    # Filter bank in the layout tf.nn.conv2d expects:
    # [height, width, in_channels, out_channels].
    weights = new_weights(shape=[filter_size, filter_size,
                                 num_input_channels, num_filters])

    # One bias value per filter (output channel).
    biases = new_biases(length=num_filters)

    # Stride-1 convolution without padding, so each spatial dimension
    # shrinks by filter_size - 1.
    convolved = tf.nn.conv2d(input=input,
                             filter=weights,
                             strides=[1, 1, 1, 1],
                             padding='VALID')

    # The bias is broadcast over the batch and both spatial dimensions.
    pre_activation = convolved + biases

    if use_pooling:
        # Non-overlapping 4x2 windows: keep the largest value in each window.
        pool_window = [1, 4, 2, 1]
        pre_activation = tf.nn.max_pool(value=pre_activation,
                                        ksize=pool_window,
                                        strides=pool_window,
                                        padding='VALID')

    # ReLU is applied after pooling. Because relu(max_pool(x)) == max_pool(relu(x)),
    # the result is the same as the usual order, but the ReLU only has to
    # process the pooled tensor (1/8 of the elements for a 4x2 window).
    return tf.nn.relu(pre_activation), weights

In [5]:
def flatten_layer(layer):
    """Collapse every dimension after the first into a single feature dimension
    Args:
        layer (4-D tensor): the previous layer, expected as [n_samples, height, width, n_channels]
    Return:
        layer_flat (2-D tensor): the same values reshaped to [n_samples, n_features]
        n_features (int): height * width * n_channels
    """
    # Static shape of everything except the batch dimension.
    per_sample_shape = layer.get_shape()[1:4]
    n_features = per_sample_shape.num_elements()

    # -1 lets TensorFlow work out the batch dimension so that the total
    # number of elements is unchanged by the reshape.
    return tf.reshape(layer, [-1, n_features]), n_features

In [6]:
def new_fc_layer(input_layer,
                 num_inputs,
                 num_outputs,
                 activation=""):
    """Build a fully connected layer: input_layer x weights + biases, then an optional activation
    Args:
        input_layer (2-D tensor): the previous layer, [n_samples, num_inputs]
        num_inputs (int): number of input features from the previous layer
        num_outputs (int): number of output features
        activation (string): "sigmoid" or "relu" (case-insensitive); any other
            value, including the default "", leaves the output linear
    Return:
        layer (2-D tensor): output layer
        weights (2-D tensor variable)
    """
    weights = new_weights(shape=[num_inputs, num_outputs])
    biases = new_biases(length=num_outputs)

    # Affine transform of the inputs.
    layer = tf.matmul(input_layer, weights) + biases

    # Look the non-linearity up by name; unknown names mean "no activation".
    known_activations = {"sigmoid": tf.nn.sigmoid, "relu": tf.nn.relu}
    nonlinearity = known_activations.get(activation.lower())
    if nonlinearity is not None:
        layer = nonlinearity(layer)
    return layer, weights

### Placeholder variables

In [7]:
# Network input: a batch of single-channel BANDS x FRAMES patches.
X = tf.placeholder(tf.float32, shape=[None, BANDS, FRAMES, 1])
# Training target: one row of N_LABELS values per patch (fed with one-hot rows).
Y = tf.placeholder(tf.float32, shape=[None, N_LABELS])

### Convolutional layer 1

In [8]:
#Convolution layer 1 params
filter_l1_size = 5
n_filter_l1 = 24
# NOTE(review): stride_l1 and activation_func_l1 are not passed to
# new_conv_layer, which hard-codes its own pooling window and ReLU;
# changing them here has no effect on the graph.
stride_l1 = [1, 4, 2, 1]
activation_func_l1 = "relu"
layer_conv1, weights_1 = new_conv_layer(input=X, 
                                        num_input_channels=1, 
                                        filter_size=filter_l1_size, 
                                        num_filters=n_filter_l1, 
                                        use_pooling=True)

In [9]:
layer_conv1

<tf.Tensor 'Relu:0' shape=(?, 31, 62, 24) dtype=float32>

### Convolutional layer 2

In [10]:
#Convolution layer 2 params
filter_l2_size = 5
n_filter_l2 = 48
# NOTE(review): stride_l2 and activation_func_l2 are not passed to
# new_conv_layer, which hard-codes its own pooling window and ReLU;
# changing them here has no effect on the graph.
stride_l2 = [1, 4, 2, 1]
activation_func_l2 = "relu"
layer_conv2, weights_2 = new_conv_layer(input=layer_conv1, 
                                        num_input_channels=n_filter_l1, 
                                        filter_size=filter_l2_size, 
                                        num_filters=n_filter_l2, 
                                        use_pooling=True)


In [11]:
layer_conv2

<tf.Tensor 'Relu_1:0' shape=(?, 6, 29, 48) dtype=float32>

### Convolutional layer 3

In [12]:
#Convolution layer 3 params
filter_l3_size = 5
n_filter_l3 = 48
# NOTE(review): activation_func_l3 is not passed to new_conv_layer, which
# always applies ReLU.
activation_func_l3 = "relu"

# Third convolution keeps the full resolution of its output (no pooling).
layer_conv3, weights_3 = new_conv_layer(input=layer_conv2, 
                                        num_input_channels=n_filter_l2, 
                                        filter_size=filter_l3_size, 
                                        num_filters=n_filter_l3, 
                                        use_pooling=False)

In [13]:
layer_conv3

<tf.Tensor 'Relu_2:0' shape=(?, 2, 25, 48) dtype=float32>

### Dropout 1 (on the output of convolutional layer 3)

In [14]:
# Probability of keeping a unit; fed at run time and shared by both dropout
# layers (0.5 in the training loops, 1.0 in make_prediction).
keep_prob = tf.placeholder(tf.float32)
layer_conv3_drop = tf.nn.dropout(layer_conv3, keep_prob=keep_prob)

In [15]:
layer_conv3_drop

<tf.Tensor 'dropout/mul:0' shape=(?, 2, 25, 48) dtype=float32>

### Flatten layer

In [16]:
# Flatten the dropped-out conv features so they can feed the dense layers.
layer_flat, num_features = flatten_layer(layer_conv3_drop)

In [17]:
layer_flat

<tf.Tensor 'Reshape:0' shape=(?, 2400) dtype=float32>

In [18]:
num_features

2400

### Fully-connected layer 1

In [19]:
# Fully connected layer 1 (the fourth trainable layer, after the three
# convolutional layers): num_features inputs -> num_hidden ReLU units.
num_hidden = 64
activation_func_l4 = "relu"
layer_fc1, weights_fc1 = new_fc_layer(layer_flat, num_inputs=num_features, num_outputs=num_hidden, activation=activation_func_l4)


In [20]:
layer_fc1

<tf.Tensor 'Relu_3:0' shape=(?, 64) dtype=float32>

### Dropout 2 (on the output of fully-connected layer 1)

In [21]:
# Second dropout layer; reuses the keep_prob placeholder of the first one.
layer_fc1_drop = tf.nn.dropout(layer_fc1, keep_prob=keep_prob)

In [22]:
layer_fc1_drop

<tf.Tensor 'dropout_1/mul:0' shape=(?, 64) dtype=float32>

### Fully-connected layer 2

In [23]:
# Output layer: no activation is requested, so layer_fc2 holds the raw
# per-class logits.
layer_fc2, weights_fc2 = new_fc_layer(layer_fc1_drop, num_inputs=num_hidden, num_outputs=N_LABELS)

In [24]:
layer_fc2

<tf.Tensor 'add_4:0' shape=(?, 10) dtype=float32>

### Predicted class

In [25]:
# Class probabilities derived from the logits.
# NOTE(review): the evaluation cells further down assign the name y_pred to
# the array returned by make_prediction, after which this tensor is no longer
# reachable through that name.
y_pred = tf.nn.softmax(layer_fc2)

In [26]:
y_pred

<tf.Tensor 'Softmax:0' shape=(?, 10) dtype=float32>

### Cost function and Optimization

In [27]:
BETA = 0.001 #L2 regularization penalty factor
LEARNING_RATE = 0.01 # step size passed to the Adam optimizer

In [28]:
# Cross-entropy computed from the raw logits (layer_fc2), not from the
# softmax output; the op applies the softmax internally.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=layer_fc2,labels=Y)

In [29]:
##L2 regularization
# The L2 penalty on the two fully connected weight matrices is added to the
# cross-entropy before averaging; the convolutional filters are not penalised.
cost = tf.reduce_mean(cross_entropy + BETA*tf.nn.l2_loss(weights_fc1) +  BETA*tf.nn.l2_loss(weights_fc2))
optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(cost)

In [30]:
# correct_prediction = tf.equal(tf.argmax(y_,1), tf.argmax(Y,1))
# accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

### Load training data

In [2]:
N_PARTS = 10
def load_train_data(parent_dirs):
    """Load the pre-computed training arrays from every directory in parent_dirs.

    For each part number 1 .. N_PARTS-1 and each directory, three files are read:
    cnn_train_features_full_part_<k>.npy, cnn_train_labels_full_part_<k>.npy
    and cnn_train_file_names_full_part_<k>.npy.

    Args:
        parent_dirs (list of str): directories that contain the part files
    Return:
        train_features, train_labels, train_names (arrays): the parts
        concatenated along axis 0, in loading order
    """
    print("Load train data")
    print("Loading.........")
    start_time = time.time()
    # Each list starts with the empty float array the accumulators used to be
    # initialised with, so the dtype of the concatenated result is unchanged.
    feature_chunks = [np.zeros(0)]
    label_chunks = [np.zeros(0)]
    name_chunks = [np.zeros(0)]

    # NOTE(review): only parts 1 .. N_PARTS-1 are loaded; confirm that leaving
    # out part N_PARTS is intentional.
    for part in np.arange(N_PARTS-1):
        suffix = "_full_part_" + str(part+1) + ".npy"
        for parent_dir in parent_dirs:
            prefix = parent_dir + "/cnn_train_"
            # SECURITY: allow_pickle=True executes pickled objects on load;
            # only use it on files produced by a trusted pipeline.
            feature_chunks.append(np.load(prefix + "features" + suffix, allow_pickle=True))
            label_chunks.append(np.load(prefix + "labels" + suffix, allow_pickle=True))
            name_chunks.append(np.load(prefix + "file_names" + suffix, allow_pickle=True))

        print("Load part {0} successfully".format(part+1))

    # Concatenate once at the end instead of re-copying the growing arrays on
    # every iteration.
    train_features = np.concatenate(feature_chunks, axis=0)
    train_labels = np.concatenate(label_chunks, axis=0)
    train_names = np.concatenate(name_chunks, axis=0)
    print("---Running time: {0} seconds ---".format(time.time() - start_time))
    return train_features, train_labels, train_names
    

In [3]:
# The test arrays start out empty; they are filled by load_test_data later in
# the notebook.
test_features = None
test_labels = None
test_names = None
test_IDs = None
parent_dirs = ["train_data"]
train_features, train_labels, train_names = load_train_data(parent_dirs)

Load train data
Loading.........
Load part 1 successfully
Load part 2 successfully
Load part 3 successfully
Load part 4 successfully
Load part 5 successfully
Load part 6 successfully
Load part 7 successfully
Load part 8 successfully
Load part 9 successfully
---Running time: 101.455986023 seconds ---


In [6]:
train_features[0].shape

(22144,)

In [33]:
def count_frames(data):
    """Count the patches that fetch_patch extracts from each flattened clip.

    A clip of length L is treated as a BANDS x (L // BANDS) matrix and
    contributes (L // BANDS) - FRAMES patches; clips with fewer columns than
    that contribute none.

    Args:
        data (sequence of 1-D arrays): flattened clips
    Return:
        total_frames (int): number of patches over all clips
        frame_ID (1-D array): for every patch, the index in data of its clip
    """
    total_frames = 0
    frame_ID = []
    for clip_index, clip in enumerate(data):
        # Floor division keeps the count an integer on Python 2 and 3 alike.
        n_columns = np.shape(clip)[0] // BANDS
        # Clamp at zero so a short clip cannot subtract from the running total.
        n_frames = max(n_columns - FRAMES, 0)
        total_frames += n_frames
        frame_ID.extend([clip_index] * n_frames)
    return total_frames, np.asarray(frame_ID)

In [34]:
def one_hot_encode(labels, n_classes=None):
    """Convert integer class labels into one-hot rows.

    Args:
        labels (1-D integer array): class index of every sample
        n_classes (int, optional): number of columns of the result. When
            omitted, the width is large enough for the largest label present,
            so a subset that happens to miss some classes is still encoded
            correctly (counting distinct labels alone would index out of bounds).
    Return:
        encoded (2-D float array): shape (len(labels), n_classes), with a
        single 1 per row in the column given by the label
    """
    labels = np.asarray(labels)
    n_samples = len(labels)
    if n_classes is None:
        n_distinct = len(np.unique(labels))
        # The largest label decides the width whenever labels have gaps.
        highest = int(labels.max()) + 1 if n_samples else 0
        n_classes = max(n_distinct, highest)
    encoded = np.zeros((n_samples, n_classes))
    encoded[np.arange(n_samples), labels] = 1
    return encoded

In [35]:
# The labels are stored as floats; cast to the builtin int (the np.int alias
# was removed in NumPy 1.24) so they can be used as column indices.
train_one_hot = one_hot_encode(train_labels.astype(int))

In [36]:
train_one_hot[0]

array([ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [47]:
# Keep a small stratified subset (1% of the clips, fixed seed) as "dummy"
# training data; the remaining 99% returned by the split is discarded.
dummy_train_x, _, dummy_train_y, _ = train_test_split(train_features, train_one_hot, test_size=0.99, random_state=4, stratify=train_labels)

In [55]:
# tt: total number of patches in the dummy subset;
# idi: for every patch, the index of the clip it belongs to.
tt, idi = count_frames(dummy_train_x)

In [38]:
def fetch_patch(data, data_labels):
    """Cut every flattened clip into overlapping BANDS x FRAMES patches.

    Each clip is reshaped to BANDS rows and a window of FRAMES columns is slid
    across it one column at a time. Every patch inherits its clip's label.

    Args:
        data (sequence of 1-D arrays): flattened clips; each length must be a
            multiple of BANDS
        data_labels (array): one label entry (scalar or row) per clip
    Return:
        features (float32 array): shape (n_patches, BANDS, FRAMES, 1)
        labels (float32 array): one entry per patch
    """
    patches = []
    labels = []
    # Builtin int replaces the np.int alias removed in NumPy 1.24.
    for label, clip in zip(data_labels.astype(int), data):
        # Floor division: the reshape needs an integer column count on
        # Python 3 as well as Python 2.
        n_columns = np.shape(clip)[0] // BANDS
        spectrogram = np.reshape(clip, (BANDS, n_columns))
        # NOTE(review): window starts run from 0 to n_columns - FRAMES - 1, so
        # the last window that would still fit is not used. count_frames relies
        # on the same count; change both together if this is unintended.
        for start in range(n_columns - FRAMES):
            patches.append(spectrogram[:, start:start + FRAMES])
            labels.append(label)
    # len(patches) avoids converting the whole patch list to an array a second
    # time just to read its length.
    features = np.asanyarray(patches, dtype=np.float32).reshape(len(patches), BANDS, FRAMES, 1)
    labels = np.asanyarray(labels, dtype=np.float32)
    return features, labels

In [39]:
# NOTE: this overwrites the clip-level arrays with patch-level arrays, so the
# cell cannot be run twice without re-running the train_test_split cell first.
dummy_train_x, dummy_train_y = fetch_patch(dummy_train_x, dummy_train_y)

In [40]:
np.shape(dummy_train_x)

(31487, 128, 128, 1)

### Tensorflow session

In [None]:
init = tf.global_variables_initializer()
# Batch size used by both training loops below.
BATCH_SIZE = 100
# Passed as num_epochs to dummy_optimize and optimize.
training_epochs = 300
# NOTE(review): the saver is created but no save/restore call is visible in
# this notebook.
saver = tf.train.Saver()

# The session is left open on purpose: the training and prediction helpers
# use this global session.
session = tf.Session()
    
session.run(init)

### Helper function to perform optimization iterations 

In [None]:
def dummy_optimize(num_epochs, train_x, train_y):
    """Run num_epochs single-batch optimisation steps and plot the cost per step.

    Despite its name, num_epochs counts mini-batch steps here: each step
    trains on one contiguous slice of BATCH_SIZE samples whose start position
    cycles through the data. Dropout is active (keep_prob = 0.5).

    Args:
        num_epochs (int): number of optimisation steps
        train_x (4-D array): input patches
        train_y (2-D array): one-hot labels aligned with train_x
    """
    start_time = time.time()
    # Start from an empty history; an np.empty seed would put an uninitialised
    # value at the front of the plot and into the axis limit.
    cost_history = []
    # Number of admissible batch start positions, clamped to at least 1 so the
    # modulo below is defined when there are BATCH_SIZE samples or fewer.
    n_offsets = max(train_y.shape[0] - BATCH_SIZE, 1)
    print("Training.......")
    print("------ Elapsed time ------- Epoch ---- Cost ")
    for itr in range(num_epochs):
        offset = (itr * BATCH_SIZE) % n_offsets
        batch_x = train_x[offset:(offset + BATCH_SIZE), :, :, :]
        batch_y = train_y[offset:(offset + BATCH_SIZE), :]
        feed_dict_train = {X: batch_x, Y: batch_y, keep_prob: 0.5}
        _, c = session.run([optimizer, cost], feed_dict=feed_dict_train)
        if itr % 100 == 0:
            print("------ {:12.7f} ------- {:5d} ---- {:12.10f} ".format((time.time() - start_time), itr, c))
        cost_history.append(c)
    print("---Training time: %s seconds ---" % (time.time() - start_time))
    if not cost_history:
        # Nothing was trained, so there is nothing to plot.
        return
    plt.figure(figsize=(10,5))
    plt.plot(cost_history)
    plt.axis([0, len(cost_history) - 1, 0, np.max(cost_history)])
    plt.show()

In [None]:
def optimize(num_epochs, train_x, train_y):
    """Train for num_epochs epochs on shuffled mini-batches and plot the cost per epoch.

    The sample order is reshuffled at the start of every epoch. The value
    recorded for an epoch is the cost of its last mini-batch. Dropout is
    active (keep_prob = 0.5).

    Args:
        num_epochs (int): number of epochs
        train_x (4-D array): input patches
        train_y (2-D array): one-hot labels aligned with train_x
    """
    print("Training.......")
    print("------ Elapsed time ------- Epoch ---- Cost ")
    # Start from an empty history; an np.empty seed would put an uninitialised
    # value at the front of the plot and into the axis limit.
    cost_history = []
    n_samples = np.shape(train_x)[0]
    # NOTE(review): the factor 8 means each epoch visits only about one eighth
    # of the shuffled samples; confirm this sub-sampling is intended.
    iterations = n_samples // (8 * BATCH_SIZE) + 1
    samples = np.arange(n_samples)
    start_time = time.time()
    for epoch in range(num_epochs):
        np.random.shuffle(samples)
        for itr in range(iterations):
            offset = itr * BATCH_SIZE
            batch = samples[offset:offset + BATCH_SIZE]
            feed_dict_train = {X: train_x[batch], Y: train_y[batch], keep_prob: 0.5}
            _, c = session.run([optimizer, cost], feed_dict=feed_dict_train)
        if epoch % 5 == 0:
            print("------ {:12.7f} ------- {:5d} ---- {:12.10f} ".format((time.time() - start_time), epoch, c))
        cost_history.append(c)
    print("---Training time: %s seconds ---" % (time.time() - start_time))
    if not cost_history:
        # Nothing was trained, so there is nothing to plot.
        return
    plt.figure(figsize=(10,5))
    plt.plot(cost_history)
    plt.axis([0, len(cost_history) - 1, 0, np.max(cost_history)])
    plt.show()
    

In [None]:
dummy_optimize(training_epochs, dummy_train_x, dummy_train_y)

In [None]:
optimize(training_epochs, dummy_train_x, dummy_train_y)

### Load test data

In [None]:
def dummy_load_test_data(parent_dirs, n_folds=2):
    """Load the first n_folds test folds from every directory in parent_dirs.

    For each fold number 1 .. n_folds and each directory, four files are read:
    cnn_test_features_full_fold_<k>.npy, cnn_test_labels_full_fold_<k>.npy,
    cnn_test_file_names_full_fold_<k>.npy and cnn_test_ID_full_fold_<k>.npy.

    Args:
        parent_dirs (list of str): directories that contain the fold files
        n_folds (int): number of folds to load (default 2, a small subset for
            quick checks)
    Return:
        test_features, test_labels, test_names, test_id (arrays): the folds
        concatenated along axis 0, in loading order
    """
    print("Load test data")
    print("Loading.........")
    start_time = time.time()
    # Each list starts with the empty float array the accumulators used to be
    # initialised with, so the dtype of the concatenated result is unchanged.
    feature_chunks = [np.zeros((0, 128, 128, 1))]
    label_chunks = [np.zeros(0)]
    name_chunks = [np.zeros(0)]
    id_chunks = [np.zeros(0)]

    for fold in np.arange(n_folds):
        suffix = "_full_fold_" + str(fold+1) + ".npy"
        for parent_dir in parent_dirs:
            prefix = parent_dir + "/cnn_test_"
            # SECURITY: allow_pickle=True executes pickled objects on load;
            # only use it on files produced by a trusted pipeline.
            feature_chunks.append(np.load(prefix + "features" + suffix, allow_pickle=True))
            label_chunks.append(np.load(prefix + "labels" + suffix, allow_pickle=True))
            name_chunks.append(np.load(prefix + "file_names" + suffix, allow_pickle=True))
            id_chunks.append(np.load(prefix + "ID" + suffix, allow_pickle=True))

        print("Load fold {0} successfully".format(fold+1))

    # Concatenate once at the end instead of re-copying the growing arrays on
    # every iteration.
    test_features = np.concatenate(feature_chunks, axis=0)
    test_labels = np.concatenate(label_chunks, axis=0)
    test_names = np.concatenate(name_chunks, axis=0)
    test_id = np.concatenate(id_chunks, axis=0)
    print("---Running time: {0} seconds ---".format(time.time() - start_time))
    return test_features, test_labels, test_names, test_id

In [None]:
train_features = None
train_labels = None
train_names = None

In [None]:
dummy_train_x = None
dummy_train_y = None

In [None]:


dummy_test_x, dummy_test_y, dummy_test_names, dummy_test_IDs = dummy_load_test_data(["test_data"])

In [None]:
np.shape(dummy_test_x)

In [None]:
def load_test_data(parent_dirs, n_folds=20):
    """Load the test folds from every directory in parent_dirs.

    For each fold number 1 .. n_folds and each directory, four files are read:
    cnn_test_features_full_fold_<k>.npy, cnn_test_labels_full_fold_<k>.npy,
    cnn_test_file_names_full_fold_<k>.npy and cnn_test_ID_full_fold_<k>.npy.

    Args:
        parent_dirs (list of str): directories that contain the fold files
        n_folds (int): number of folds to load (default 20)
    Return:
        test_features, test_labels, test_names, test_id (arrays): the folds
        concatenated along axis 0, in loading order
    """
    print("Load test data")
    print("Loading.........")
    start_time = time.time()
    # Each list starts with the empty float array the accumulators used to be
    # initialised with, so the dtype of the concatenated result is unchanged.
    feature_chunks = [np.zeros((0, 128, 128, 1))]
    label_chunks = [np.zeros(0)]
    name_chunks = [np.zeros(0)]
    id_chunks = [np.zeros(0)]

    for fold in np.arange(n_folds):
        suffix = "_full_fold_" + str(fold+1) + ".npy"
        for parent_dir in parent_dirs:
            prefix = parent_dir + "/cnn_test_"
            # SECURITY: allow_pickle=True executes pickled objects on load;
            # only use it on files produced by a trusted pipeline.
            feature_chunks.append(np.load(prefix + "features" + suffix, allow_pickle=True))
            label_chunks.append(np.load(prefix + "labels" + suffix, allow_pickle=True))
            name_chunks.append(np.load(prefix + "file_names" + suffix, allow_pickle=True))
            id_chunks.append(np.load(prefix + "ID" + suffix, allow_pickle=True))

        print("Load fold {0} successfully".format(fold+1))

    # Concatenate once at the end instead of re-copying the growing arrays on
    # every iteration.
    test_features = np.concatenate(feature_chunks, axis=0)
    test_labels = np.concatenate(label_chunks, axis=0)
    test_names = np.concatenate(name_chunks, axis=0)
    test_id = np.concatenate(id_chunks, axis=0)
    print("---Running time: {0} seconds ---".format(time.time() - start_time))
    return test_features, test_labels, test_names, test_id

In [None]:
train_features = None
train_labels = None
train_names = None

test_features, test_labels, test_names, test_IDs = load_test_data(["test_data"])

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

### Helper function to make prediction for test set

In [None]:
def make_prediction(test_f, test_l, test_ID):
    """Predict one class per sound clip by averaging the patch-level probabilities.

    All patches that share an ID are fed through the network with dropout
    disabled (keep_prob = 1.0); their softmax outputs are averaged and the
    most probable class is taken as the clip's prediction.

    Args:
        test_f (4-D array): input patches
        test_l (2-D array): one-hot labels, one row per patch
        test_ID (1-D array): clip identifier of every patch
    Return:
        y_pred (1-D array): predicted class per clip, ordered by np.unique(test_ID)
        y_true (1-D array): true class per clip, taken from the clip's first patch
    """
    # Build the softmax output from the logits here: the name `y_` used
    # previously is not defined anywhere in the notebook, and the global
    # `y_pred` tensor is rebound to an array by the evaluation cells.
    class_probabilities = tf.nn.softmax(layer_fc2)
    predicted = []
    actual = []
    for sound_ID in np.unique(test_ID):
        indices = np.where(test_ID == sound_ID)
        input_x = np.float32(test_f[indices])
        patch_probs = session.run(class_probabilities,
                                  feed_dict={X: input_x, keep_prob: 1.0})
        # Average over the clip's patches, then pick the most probable class.
        predicted.append(np.argmax(np.mean(patch_probs, axis=0)))
        # Every patch of a clip carries the same label; use the first row.
        actual.append(np.argmax(test_l[indices].astype(int)[0]))
    return np.array(predicted), np.array(actual)
    
    
    

In [None]:
#Dummy test
# NOTE: this rebinds the global name y_pred (the softmax tensor) to an array
# of predicted classes.
# Builtin int replaces the np.int alias removed in NumPy 1.24.
y_pred, y_true = make_prediction(dummy_test_x, one_hot_encode(dummy_test_y.astype(int)), dummy_test_IDs)
p,r,f,s = precision_recall_fscore_support(y_true, y_pred, average='micro') 
print("F-Score: {0}".format(round(f,3)))
print("Confusion Matrix: ")
print(confusion_matrix(y_true, y_pred))

In [None]:
# Patch-level class probabilities for the clip with ID 5, dropout disabled.
i = np.where(dummy_test_IDs == 5)
ipu = np.float32(dummy_test_x[i])
# The name `y_` used previously is not defined anywhere in the notebook;
# the softmax is built from the logits instead.
pred_y = session.run(tf.nn.softmax(layer_fc2), feed_dict={X: ipu, keep_prob:1})

In [None]:
# NOTE(review): axis=1 sums over the classes of each patch, which for softmax
# output is ~1 per row; summing over patches (axis=0) may have been intended.
# The cell also overwrites pred_y, so it cannot be re-run on its own.
pred_y = np.sum(pred_y, axis=1)

In [None]:
pred_y

In [None]:
# dummy_test_y holds one class id per patch (it is one-hot encoded only when
# passed to make_prediction), so the true class is the value itself; np.argmax
# of a single scalar would always return 0.
int(dummy_test_y[i][0])

In [None]:
p,r,f,s = precision_recall_fscore_support(y_true, y_pred, average='micro')

print("F-Score: {0}".format(round(f,3)))
print("Confusion Matrix: ")
print(confusion_matrix(y_true, y_pred))

In [None]:
# NOTE(review): this repeats the previous cell on the same y_true / y_pred;
# make_prediction is never called on the full test set in between.
p,r,f,s = precision_recall_fscore_support(y_true, y_pred, average='micro')

print("F-Score: {0}".format(round(f,3)))
print("Confusion Matrix: ")
print(confusion_matrix(y_true, y_pred))

In [None]:
i = np.where(test_IDs == 941)

In [None]:
ipu = test_features[i].astype(np.float32)

In [None]:
# Builtin int replaces the np.int alias removed in NumPy 1.24.
true = one_hot_encode(test_labels.astype(int))[i][0]

In [None]:
np.argmax(true)

In [None]:
# Builtin int replaces the np.int alias removed in NumPy 1.24.
(test_labels.astype(int))[i][0]

In [None]:
# The name `y_` used previously is not defined anywhere in the notebook; the
# softmax is built from the logits instead. keep_prob is 1.0 so that dropout
# is disabled for prediction, as in make_prediction.
pred = session.run(tf.nn.softmax(layer_fc2),feed_dict={X: ipu, keep_prob:1.0})

In [None]:
np.argmax(np.mean(pred, axis=0))

In [None]:
np.mean(pred, axis=0)