# Challenge: MIR with Feedforward Neural Networks #

You will be assigned to a team of ~4 students. The goal is very straight forward:

* Design the feedforwad Neural Network that achieves the best "test" performance on the Iowa Musical Instrument Samples and the GTZAN datasets.

* We suggest that you use python and tensorflow, but if you have more experience using another toolbox or language, feel free to use anything.

* Divide work among team members: 
    * Individuals train different models in parallel on their computers 
    * Continuously share results and progress with the team 
    * Always help each other
    

* In an email to the members of the teaching staff, submit your results as a jupyter notebook (or similar script) that clearly shows the architecture of the model, your training and cross-validation of hyperparameters, and the test accuracy.

* NEVER use the test-set until you have finished tuning your hyper-parameters.

* Although the model MUST BE a simple feedforward Neural Network, you may research about and use ways to improve these simple models. Some ideas include: dropout, regularization, momentum, and data augmentation, but there are many more. Always include clear comments in your code that indicate what you are doing, with links and references to original papers/websites/ideas.

* The team achieving the best result will be pinned on r/DLfMIR.

* You may continue submitting your individual results even after the workshop is over. 

In [45]:
import numpy as np
import scipy.io
import tensorflow as tf
import matplotlib.pyplot as plt
import random, time, copy
# from utils import show_graph
tf.set_random_seed(0)
np.random.seed(0)

In [64]:
# let's reload the music dataset we used previously
dataset = scipy.io.loadmat('../data/GTZAN_small.mat')
data = dataset['dat_all']

In [65]:
# general data parameters
N = data.shape[0]
D = data.shape[1]-1
C = 10

# split into training, validation, and test sets
perc_tr = 0.8
perc_vl = 0.1
perc_ts = 0.1

# randomly shuffle the data (just to make sure)
np.random.permutation(data)

# separate into training, validation, and test sets
# data
x_tr = data[:int(N*perc_tr),0:-1]
x_vl = data[int(N*perc_tr):int(N*perc_tr+N*perc_vl),0:-1]
x_ts = data[-int(N*perc_ts):,0:-1]
# labels
y_tr = data[:int(N*perc_tr),-1].reshape(int(N*perc_tr),1)
y_vl = data[int(N*perc_tr):int(N*perc_tr+N*perc_vl),-1].reshape(int(N*perc_vl),1)
y_ts = data[-int(N*perc_ts):,-1].reshape(int(N*perc_ts),1)

# we won't use the variable `data` after this point.
del data

temp = np.zeros((y_tr.shape[0],C))
temp[np.arange(y_tr.shape[0]),y_tr.astype(int)[:,0]] = 1
y_tr = temp
temp = np.zeros((y_vl.shape[0],C))
temp[np.arange(y_vl.shape[0]),y_vl.astype(int)[:,0]] = 1
y_vl = temp
temp = np.zeros((y_ts.shape[0],C))
temp[np.arange(y_ts.shape[0]),y_ts.astype(int)[:,0]] = 1
y_ts = temp
del temp

In [66]:
def get_jacobian(y, x): # NO LOG
    shapey = list(y.shape)
    print type(shapey), shapey
    if len(shapey) == 1:
        return tf.convert_to_tensor([tf.gradients(y[i],x) for i in range(int(shapey[0]))])
    elif len(shapey) == 2:
        return tf.convert_to_tensor([[tf.gradients(y[i,j],x) for j in range(int(shapey[1]))] for i in range(int(shapey[0]))])
    elif len(shapey) == 3:
        return tf.convert_to_tensor([[[tf.gradients(y[i,j,k],x) for k in range(int(shapey[2]))] for j in range(int(shapey[1]))] for i in range(int(shapey[0]))])
    else:
        return tf.gradients(y,x)

def get_batch(size=20):
    inds = np.random.choice(N, size, replace=False)
    x, y = x_tr[inds,:], y_tr[inds,:]
    return x, y

def train(sess, nepochs=25, batch_size=25, calc_loss=True):
    if calc_loss:
        losses = np.ndarray(shape=(nepochs), dtype=float)
    for i in range(nepochs):
#         x, y = x_tr, y_tr
#         x, y = get_batch(size=batch_size)
#         x, y = tf.train.batch([x_tr, y_tr], batch_size)
        sess.run(GD_step, feed_dict={X: x_tr, y: y_tr})
        if calc_loss:
            loss = sess.run(loss_f, feed_dict={X: x_tr, y: y_tr})
            losses[i] = loss
            print "epoch ", i, " loss ", loss
    if calc_loss:
        return losses

def test():
    predicted_labels = sess.run(max_y_index, feed_dict={X: x_vl, y: y_vl})
    vl_acc = np.mean(predicted_labels == np.argmax(y_vl, axis = 1))
    print "The accuracy on the validation set is: ", vl_acc
    predicted_labels = sess.run(max_y_index, feed_dict={X: x_tr, y: y_tr})
    tr_acc = np.mean(predicted_labels == np.argmax(y_tr, axis = 1))
    print "The accuracy on the training set is: ", tr_acc
    return vl_acc

def graph(losses):
    plt.plot(losses)
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.show()
    time.sleep(5)
    plt.close()

In [67]:
# general parameters
N = x_tr.shape[0] # number of training examples
D = x_tr.shape[1] # dimensionality of the data
C = y_tr.shape[1] # number of unique labels in the dataset
print N, D, C
# hyperparameters
H = 128 # number of hidden units. In general try to stick to a power of 2
lr = 0.001 # the learning rate (previously refered to in the notes as alpha)
dr = 0.4
stand_dev = 0.01

2400 44101 10


In [71]:
# let's initialize the weights
W_h = tf.Variable(tf.random_normal((D,H), stddev = 0.01))
W_o = tf.Variable(tf.random_normal((H,C), stddev = 0.01))
B_h = tf.Variable(tf.random_normal((1,H), stddev = 0.01))
B_o = tf.Variable(tf.random_normal((1,C), stddev = 0.01))

Ws = [W_h, B_h, W_o, B_o]

X = tf.placeholder("float", shape=[None,D])
y = tf.placeholder("float", shape=[None,C])

# we now do the forward pass until we obtain the scores
h1 = tf.layers.dense(X,100,tf.nn.softmax)
# h2 = tf.layers.dense(tf.layers.dropout(h1,rate=dr),100,tf.nn.softmax)
# h3 = tf.layers.dense(tf.layers.dropout(h2,rate=dr),50,tf.nn.relu)
# h4 = tf.layers.dense(tf.layers.dropout(h1,rate=dr),20,tf.nn.relu)
# scores = tf.layers.dense(h4,C,tf.nn.softmax)
h = tf.nn.sigmoid(tf.add(tf.matmul(X,W_h), B_h))
# h2 = tf.nn.relu(tf.add(tf.matmul(X,W_h), B_h))
# h3 = tf.nn.relu(tf.add(tf.matmul(X,W_h), B_h))
# h4 = tf.nn.relu(tf.add(tf.matmul(X,W_h), B_h))
scores = tf.add(tf.matmul(h, W_o), B_o)
y_hat = tf.nn.softmax(scores)
max_y_index = tf.argmax(y_hat, axis = 1)
# sampled_y_index = tf.to_int32(tf.multinomial(tf.log(y_hat), 1)[0][0])
# print y_hat.shape
loss_f = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=scores))
# GD_step = tf.train.GradientDescentOptimizer(lr).minimize(loss_f)
GD_step = tf.train.AdamOptimizer(lr).minimize(loss_f)

# FISHER INFOS
# jacobian = tf.gradients(tf.log(y_hat[0,sampled_y_index]), Ws)
# ys = copy.deepcopy(y_hat)
# print y_hat
# jacobians = tf.gradients(tf.log(y_hat), Ws)#, grad_ys=y_hat) # of shape Ws. Shouldn't it depend on shape of y and form a tensor?
# print y_hat
# jacobians = get_jacobian(y_hat, X)


# print len(jacobians)
# for grad in jacobians:
#     print grad.shape
# lin_fisher = [tf.multiply(stand_dev, tf.matmul(tf.transpose(jacobian), jacobian)) for jacobian in jacobians]

# # fishy_softmax = tf.diag(tf.divide(1,y)) # fishy because its a helper to fisher using y, the output
# # fishy doesn't depend on activation function of W or W. but the dimensions should match
# # fisher should be of dimension W.shape[0]xW.shape[0] so that it's inverse can be multiplied by the grad of J wrt W (dJ_dW) of shape W
# # datapoint by datapoint or batches? (expected value, divide later)
# fishy_softmax = tf.divide(1,y) # fishy because its a helper to fisher using y, the output
# print fishy_softmax.shape
# print jacobians[2].shape
# print "CHECK"
# print y[2].shape
# print tf.divide(jacobians[2],y[2][None,:])
# def fisher(jacobian, W, out):
#         fishy = tf.divide(1,y)
#         for jacobian in jacobians:
#             total = np.zeros(jacobian.shape)
#             for yi in y:
#                 total += tf.matmul(tf.transpose(jacobian), tf.divide(jacobian,tf.divide(1,y)))
            
#         [sum(tf.matmul(tf.transpose(jacobian), tf.divide(jacobian,tf.divide(1,y)))) for jacobian in jacobians]
# softmax_fisher = [sum(tf.matmul(tf.transpose(jacobian), tf.divide(jacobian,tf.divide(1,y))) for jacobian in jacobians]

# fishy_sigmoid = tf.diag(tf.divide(tf.divide(1,y),tf.subtract(1,y))) # fishy because its a helper to fisher using y, the output
# fishy_sigmoid = tf.divide(tf.divide(1,y),tf.subtract(1,y)) # fishy because its a helper to fisher using y, the output
# sigmoid_fisher = [tf.matmul(tf.transpose(jacobian), tf.matmul(fishy_sigmoid, jacobian)) for jacobian in jacobians]



In [77]:
# print x_tr[1:100,:].shape, D
# print y_tr[1:100,:].shape, C
# print x_tr.shape
print "init"
# print x_tr[0:1,:].shape
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
print "finished init"
# print y_hat
# grads = sess.run(tf.gradients(tf.log(y_hat), Ws), feed_dict={X: x_tr, y: y_tr})#, grad_ys=y_hat)) # of shape Ws. Shouldn't it depend on shape of y and form a tensor?
# y_hat = sess.run(y_hat, feed_dict={X: x_tr, y: y_tr})
# y_hat = sess.run(y_hat, feed_dict={X: x_tr[1:2,:], y: y_tr[1:2,:]})
print len(sess.run(tf.gradients(y_hat[0], W_o), feed_dict={X: x_tr[0:1,:], y: y_tr[0:1,:]}))
# print y_hat.shape
jacobians = sess.run(get_jacobian(y_hat[0,:], W_o), feed_dict={X: x_tr[0:1,:], y: y_tr[0:1,:]})
print jacobians.shape
# with sess.as_default():
    

    #     print jacobians.eval(feed_dict={X: x_tr, y: y_tr})
# print sess.run(loss_f, feed_dict={X: x_tr, y: y_tr})
sess.close()

init
finished init
1
<type 'list'> [Dimension(10)]
(10, 1, 128, 10)


In [None]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
print "hi"

In [69]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
print x_tr[np.random.randint(N)].shape
print np.array([x_tr[np.random.randint(N)]]).shape

print x_tr[0:np.random.randint(N),:].shape
print np.array([[x_tr[0:np.random.randint(N),:]]]).shape
grads = sess.run(jacobians, feed_dict={X: np.array([x_tr[np.random.randint(N)]])})
# for F_w in F:
#     F_w = 
# print grads.get_shape()
# print grads.shape()
print len(grads)
for grad in grads:
    print grad.shape
sess.close()
#  def compute_fisher(data, sess, plot_diffs=False, disp_freq=10):
#         # computer Fisher information for each parameter

#         # initialize Fisher information for most recent task
#         F = []
#         for weights in Ws:
#             F.append(np.zeros(weights.get_shape().as_list())

#         # sampling a random class from softmax
        

#         if(plot_diffs):
#             # track differences in mean Fisher info
#             F_prev = deepcopy(F)
#             mean_diffs = np.zeros(0)

#         for i in range(num_samples):
#             # compute first-order derivatives
#             grads = sess.run(jacobian, feed_dict={X: x_tr[np.random.randint(N)]})
#             # square the derivatives and add to total
#             for i in range(len(F)):
#                 F[i] += np.square(ders[i])
#             if(plot_diffs):
#                 if i % disp_freq == 0 and i > 0:
#                     # recording mean diffs of F
#                     F_diff = 0
#                     for v in range(len(self.F_accum)):
#                         F_diff += np.sum(np.absolute(self.F_accum[v]/(i+1) - F_prev[v]))
#                     mean_diff = np.mean(F_diff)
#                     mean_diffs = np.append(mean_diffs, mean_diff)
#                     for v in range(len(self.F_accum)):
#                         F_prev[v] = self.F_accum[v]/(i+1)
#                     plt.plot(range(disp_freq+1, i+2, disp_freq), mean_diffs)
#                     plt.xlabel("Number of samples")
#                     plt.ylabel("Mean absolute Fisher difference")
#                     display.display(plt.gcf())
#                     display.clear_output(wait=True)

#         # divide totals by number of samples
#         for v in range(len(self.F_accum)):
#             self.F_accum[v] /= num_samples

(44101,)
(1, 44101)
(2047, 44101)
(1, 1, 97, 44101)
4
(44101, 128)
(1, 128)
(128, 10)
(1, 10)


In [None]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# we now ask tensorflow to run actual datadata through the graph.
# The data must be passed in using the feed_dict argument.

# for example, if I want to obtain the initial loss before doing any training:
loss = sess.run(loss_f, feed_dict={X: x_tr, y: y_tr})
print "The initial loss is: ", loss

# If what you want is to train the network using all the training data, then you have to ask:
sess.run(GD_step, feed_dict={X: x_tr, y: y_tr})

# you can loop over this to train over more than one epoch.

# If you want to obtain the accuracy of the network on the training set:
predicted_labels = sess.run(max_y_index, feed_dict={X: x_tr, y: y_tr})
tr_acc = np.mean(predicted_labels == np.argmax(y_tr, axis = 1))
loss = sess.run(loss_f, feed_dict={X: x_tr, y: y_tr})
print "The final training loss is: ", loss
print "The accuracy on the training set is: ", tr_acc

# If you want to obtain the accuracy of the network on the validation set:
predicted_labels = sess.run(max_y_index, feed_dict={X: x_vl, y: y_vl})
vl_acc = np.mean(predicted_labels == np.argmax(y_vl, axis = 1))
print "The accuracy on the validation set is: ", vl_acc 

sess.close()

The initial loss is:  2.30259
The final training loss is:  2.35615
The accuracy on the training set is:  0.105
The accuracy on the validation set is:  0.0933333333333


In [None]:
print "Starting"
nepochs = 120
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
test()
# losses = train(sess, nepochs=10)
# print losses
losses = np.ndarray(shape=(nepochs), dtype=float)
for i in range(nepochs):
#         x, y = tf.train.batch([x_tr, y_tr], batch_size)
#     inds = range(N)
#     np.random.shuffle(inds)
#     x_tr, y_tr = x_tr(inds), y_tr(inds)
    sess.run(GD_step, feed_dict={X: x_tr, y: y_tr})
    loss = sess.run(loss_f, feed_dict={X: x_tr, y: y_tr})
    losses[i] = loss
    print "epoch ", i, " loss ", loss
    test()
test()
sess.close()
graph(losses)


Starting
The accuracy on the validation set is:  0.103333333333
The accuracy on the training set is:  0.0995833333333
epoch  0  loss  2.30238
The accuracy on the validation set is:  0.1
The accuracy on the training set is:  0.11
epoch  1  loss  2.30217
The accuracy on the validation set is:  0.0966666666667
The accuracy on the training set is:  0.127083333333
epoch  2  loss  2.30195
The accuracy on the validation set is:  0.1
The accuracy on the training set is:  0.154583333333
epoch  3  loss  2.30173
The accuracy on the validation set is:  0.0966666666667
The accuracy on the training set is:  0.180833333333
epoch  4  loss  2.3015
The accuracy on the validation set is:  0.0966666666667
The accuracy on the training set is:  0.21
epoch  5  loss  2.30126
The accuracy on the validation set is:  0.1
The accuracy on the training set is:  0.2425
epoch  6  loss  2.30101
The accuracy on the validation set is:  0.1
The accuracy on the training set is:  0.266666666667
epoch  7  loss  2.30075
The 

The accuracy on the training set is:  0.661666666667
epoch  64  loss  2.28269
The accuracy on the validation set is:  0.113333333333
The accuracy on the training set is:  0.664166666667
epoch  65  loss  2.28238
The accuracy on the validation set is:  0.113333333333
The accuracy on the training set is:  0.666666666667
epoch  66  loss  2.28206
The accuracy on the validation set is:  0.116666666667
The accuracy on the training set is:  0.667916666667
epoch  67  loss  2.28175
The accuracy on the validation set is:  0.12
The accuracy on the training set is:  0.67125
epoch  68  loss  2.28144
The accuracy on the validation set is:  0.12
The accuracy on the training set is:  0.674166666667
epoch  69  loss  2.28112
The accuracy on the validation set is:  0.12
The accuracy on the training set is:  0.674583333333
epoch  70  loss  2.2808
The accuracy on the validation set is:  0.12
The accuracy on the training set is:  0.67875
epoch  71  loss  2.28048
The accuracy on the validation set is:  0.12
T

In [None]:

print x_tr
# x, y = get_batch(2)
# print type(x_tr)
# print type(x)

In [None]:
print len(x_vl)
# print type(x_tr)

In [None]:

# show_graph(tf.get_default_graph().as_graph_def())