# Weight initialization test
## 10-layer net with 500 neurons per layer

In [None]:
import numpy as np

# Seed NumPy's global RNG so the random input below (and any later
# np.random draws made in the same kernel session) repeat exactly on a
# fresh Restart & Run All.
SEED = 0
np.random.seed(SEED)

# Synthetic input: 1000 samples of 500-D unit-Gaussian data.
# Rows are samples, columns are features.
D = np.random.randn(1000, 500)
# Ten hidden layers with 500 units each.
hidden_layer_sizes = [500]*10
# One activation name per hidden layer (same length as hidden_layer_sizes).
nonlinearities = ['tanh']*len(hidden_layer_sizes)

In [None]:
# Lookup table from nonlinearity name to its elementwise function.
act = {
    'relu': lambda x: np.maximum(0, x),
    'tanh': np.tanh,
}

# Forward pass through the stack of hidden layers.
# Hs maps layer index -> activations produced by that layer.
Hs = {}
for i, fan_out in enumerate(hidden_layer_sizes):
    # The first layer consumes the raw data; every later layer consumes
    # the activations cached for the layer before it.
    X = Hs[i-1] if i > 0 else D
    fan_in = X.shape[1]
    # Weight initialization under test: Gaussian draws scaled by a fixed 0.01.
    W = 0.01 * np.random.randn(fan_in, fan_out)

    # Affine map (no bias term) followed by this layer's nonlinearity.
    H = act[nonlinearities[i]](X.dot(W))
    Hs[i] = H

In [None]:
# Report the mean and standard deviation of the input and of every
# hidden layer's activations.
input_msg = 'input layer had mean %f and std %f'
layer_msg = 'hidden layer %d had mean %f and std %f'

print(input_msg % (np.mean(D), np.std(D)))
layer_means = [np.mean(acts) for acts in Hs.values()]
layer_stds = [np.std(acts) for acts in Hs.values()]
# Hs keys are the 0-based layer indices, so they also index the two
# lists above; layers are reported 1-based.
for i, H in Hs.items():
    print(layer_msg % (i+1, layer_means[i], layer_stds[i]))

# TODO: plot the means and standard deviations (no plotting code follows in this cell)


input layer had mean -0.000200 and std 0.999938
hidden layer 1 had mean 0.000015 and std 0.213441
hidden layer 2 had mean 0.000021 and std 0.047523
hidden layer 3 had mean -0.000001 and std 0.010607
hidden layer 4 had mean -0.000001 and std 0.002372
hidden layer 5 had mean 0.000001 and std 0.000528
hidden layer 6 had mean -0.000000 and std 0.000118
hidden layer 7 had mean 0.000000 and std 0.000026
hidden layer 8 had mean -0.000000 and std 0.000006
hidden layer 9 had mean -0.000000 and std 0.000001
hidden layer 10 had mean 0.000000 and std 0.000000
