In [385]:
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.models import load_model
from tensorflow.keras import initializers
import numpy as np
import tensorflow as tf

In [2]:
def sigmoid(X):
    return 1/(1+np.exp(-X))

In [307]:
def get_model(input_dim=2):
    model = Sequential()
    model.add(Dense(3, name='D1', input_dim=input_dim, kernel_initializer=initializers.RandomNormal(stddev=0.5), 
                    bias_initializer=initializers.RandomNormal(stddev=0.5)))
    model.add(Activation('sigmoid', name='A1'))
    model.add(Dense(2, name='D2', kernel_initializer=initializers.RandomNormal(stddev=0.5), 
                    bias_initializer=initializers.RandomNormal(stddev=0.5)))
    model.add(Activation('sigmoid', name='A2'))
    model.add(Dense(3,  name='D3', kernel_initializer=initializers.RandomNormal(stddev=0.5), 
                    bias_initializer=initializers.RandomNormal(stddev=0.5)))
    model.add(Activation('softmax', name='A3'))
    model.compile(SGD(lr=0.1), loss='mse')
    model.save('simple_model.hdf5')
    return model

model = get_model()
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
D1 (Dense)                   (None, 3)                 9         
_________________________________________________________________
A1 (Activation)              (None, 3)                 0         
_________________________________________________________________
D2 (Dense)                   (None, 2)                 8         
_________________________________________________________________
A2 (Activation)              (None, 2)                 0         
_________________________________________________________________
D3 (Dense)                   (None, 3)                 9         
_________________________________________________________________
A3 (Activation)              (None, 3)                 0         
Total params: 26
Trainable params: 26
Non-trainable params: 0
_________________________________________________________

In [308]:
#model.save('simple_model.hdf5')

In [309]:
lr = 1
model = load_model('simple_model.hdf5')
#model = get_model()
model.compile(SGD(lr=lr), loss='mse')

# model.summary()
weights = model.get_weights()
print(weights)

[array([[-0.3315773 , -0.9603926 , -0.6638519 ],
       [-0.89203614,  0.06406464, -0.70949006]], dtype=float32), array([ 0.21058255, -1.1298051 ,  0.24498418], dtype=float32), array([[ 0.9079479 ,  0.18939708],
       [-0.887635  , -0.41770914],
       [ 0.59481746,  0.71403545]], dtype=float32), array([-0.80065525,  0.51503426], dtype=float32), array([[-0.06259476, -0.641043  , -0.435419  ],
       [ 0.63357514,  0.05387408,  0.02064688]], dtype=float32), array([-0.18157978,  0.18466745, -0.6780159 ], dtype=float32)]


In [310]:
def sigmoid_jac(Xin):
    return sigmoid(Xin)*(1-sigmoid(Xin))

In [311]:
sigmoid_jac(np.array([1, 2]))

array([0.19661193, 0.10499359])

In [312]:
def softmax(z):
    exps = np.exp(z)
    sums = np.sum(exps)
    return np.divide(exps, sums)


def softmax_jac(Xin):
    sm = softmax(Xin)
    return np.diag(sm) - sm.reshape(-1, 1).dot(sm.reshape(1, -1))
    
softmax_jac(np.array([1, 2, 3]))        

array([[ 0.08192507, -0.02203304, -0.05989202],
       [-0.02203304,  0.18483645, -0.1628034 ],
       [-0.05989202, -0.1628034 ,  0.22269543]])

In [313]:
sm = softmax(np.array([1, 2, 3]))
sm

array([0.09003057, 0.24472847, 0.66524096])

In [304]:
X = np.array([[3.4, 2.1]])

In [305]:
model.predict(X)

array([[0.26419955, 0.41198996, 0.32381058]], dtype=float32)

In [208]:
D1_out = X.dot(weights[0]) + weights[1]
A1_out = sigmoid(D1_out)
D2_out = A1_out.dot(weights[2]) + weights[3]
A2_out = sigmoid(D2_out)
y_ = A2_out.dot(weights[4]) + weights[5] # D3_out
print(y_)

[[-0.81235484]]


In [209]:
# si el y es igual a la salida entonces los pesos no se modifican
y = np.array([[0.93]])
#y = np.array([[out]])

In [210]:
loss = (y-y_)**2
print(loss)

[[3.03580038]]


In [211]:
model.evaluate(X, y)



3.0358004570007324

In [212]:
model.fit(X, y)



<tensorflow.python.keras.callbacks.History at 0x63ba90610>

In [213]:
new_weights = model.get_weights()

# Para D1
$\theta_t = \theta_{t-1} - \mathrm{learning\_rate} * g_t$

In [238]:
(sigmoid_dif(D1_out).T*weights[2].dot(sigmoid_dif(D2_out).T*weights[4]*(-2*(y-y_)))).dot(X)

array([[-0.15024746, -0.0927999 ],
       [ 0.07026835,  0.04340104]])

In [242]:
X*(sigmoid_dif(D1_out).T*weights[2].dot(sigmoid_dif(D2_out).T*weights[4]*(-2*(y-y_))))

array([[-0.15024746, -0.0927999 ],
       [ 0.07026835,  0.04340104]])

### Derivada del MSE evaluada en y_

In [215]:
# Derivada de la loss respecto a y_
mse_der = -2*(y-y_)
print(mse_der)
prop_grad_y_ = mse_der

[[-3.48470968]]


### Delta para D3

In [216]:
# Gradiente de y_ respecto a cada parámetro
# Recordar que cuando derivo una capa dense respecto a cada parámetro, me da la entrada a esa cada. El Bias es como si entrarada con 1

y__grad_ws_d3 = A2_out
y__grad_bias_d3 = 1

delta_ws_d3 = y__grad_ws_d3*prop_grad_y_*lr
delta_bias_d3 = y__grad_bias_d3*prop_grad_y_*lr
print(delta_ws_d3, delta_bias_d3)

[[-2.26015587 -0.94712252 -1.49670461]] [[-3.48470968]]


In [217]:
# Pesos utlima capa modificados
new_weights[4].T - (weights[4].T - delta_ws_d3)

array([[7.67218125e-08, 5.60790036e-08, 1.47877194e-07]])

In [218]:
# Bias ultima capa modificado
new_weights[5].T - (weights[5].T - delta_bias_d3)

array([[9.2371895e-08]])

### A2

In [219]:
def sigmoid_dif(X):
    return sigmoid(X)*(1-sigmoid(X))

In [220]:
# Con que entro a A2? D2_out
# Necesito y__grad_in_d3 (Gradiente de y_ respecto a la entrada al bloque d3)
y__grad_in_d3 = weights[4]
prop_grad_d3 = y__grad_in_d3*prop_grad_y_
print(prop_grad_d3) # Entrada de D3
prop_grad_A2 = sigmoid_dif(D2_out).T*prop_grad_d3*lr
print(prop_grad_A2) # Entrada de A2

[[0.84538307]
 [1.28369178]
 [1.53521471]]
[[0.19267999]
 [0.25407076]
 [0.37617463]]


### D2

In [221]:
delta_ws_d2 = (prop_grad_A2*A1_out).T
delta_bias_d2 = prop_grad_A2.T

In [222]:
print(new_weights[2] - (weights[2] - delta_ws_d2))
print(new_weights[3] - (weights[3] - delta_bias_d2))

[[-6.25856622e-09 -5.46572199e-09  1.02110236e-08]
 [-1.62406742e-08 -1.44461989e-08 -1.64342799e-08]]
[[-1.48808340e-08  1.29551381e-09  1.10791643e-09]]


## A1

In [223]:
d2_out_grad_in_d2 = weights[2]
prop_grad_d2 = d2_out_grad_in_d2.dot(prop_grad_A2)
print(prop_grad_d2) 
prop_grad_A1 = sigmoid_dif(D1_out).T*prop_grad_d2*lr
print(prop_grad_A1) 

[[-0.19309713]
 [ 0.082746  ]]
[[-0.04419043]
 [ 0.02066716]]


### D1

In [224]:
prop_grad_A1

array([[-0.04419043],
       [ 0.02066716]])

In [249]:
delta_ws_d1 = (X*prop_grad_A1).T
delta_bias_d1 = prop_grad_A1.T
print(delta_ws_d1)

[[-0.15024746  0.07026835]
 [-0.0927999   0.04340104]]


In [250]:
print(new_weights[0] - (weights[0] - delta_ws_d1))
print(new_weights[1] - (weights[1] - delta_bias_d1))

[[ 1.94892052e-08 -8.69921603e-09]
 [-1.77648721e-08 -9.97487437e-09]]
[[-2.40702032e-08 -4.74994019e-09]]


In [393]:
A3_model = Model(model.input, model.get_layer('A3').input)

In [401]:
A3_model

<tensorflow.python.keras.engine.training.Model at 0x642dd54d0>

In [403]:
model = load_model('simple_model.hdf5')
inputs = tf.constant(X)

with tf.GradientTape() as tape:
    preds = model(inputs)
    loss = model.loss(tf.constant(y), preds)

grads = tape.gradient(loss, A3_model(inputs))
print(grads)
# grads = tape.gradient(loss, model.get_layer("D3").trainable_variables)
# print(grads)

None


In [361]:
model.loss

<function tensorflow.python.keras.losses.mean_squared_error(y_true, y_pred)>

In [362]:
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
D1 (Dense)                   (None, 3)                 9         
_________________________________________________________________
A1 (Activation)              (None, 3)                 0         
_________________________________________________________________
D2 (Dense)                   (None, 2)                 8         
_________________________________________________________________
A2 (Activation)              (None, 2)                 0         
_________________________________________________________________
D3 (Dense)                   (None, 3)                 9         
_________________________________________________________________
A3 (Activation)              (None, 3)                 0         
Total params: 26
Trainable params: 26
Non-trainable params: 0
_________________________________________________________

In [178]:
model.trainable_weights

[<tf.Variable 'dense_63_4/kernel:0' shape=(2, 2) dtype=float32, numpy=
 array([[-0.0132913 ,  0.39277452],
        [ 0.61490476, -0.11834475]], dtype=float32)>,
 <tf.Variable 'dense_63_4/bias:0' shape=(2,) dtype=float32, numpy=array([ 0.9016673 , -0.33314416], dtype=float32)>,
 <tf.Variable 'dense_64_4/kernel:0' shape=(2, 3) dtype=float32, numpy=
 array([[-0.4778273 ,  0.3336253 , -0.03871878],
        [ 0.07612087,  0.27854922, -0.02718207]], dtype=float32)>,
 <tf.Variable 'dense_64_4/bias:0' shape=(3,) dtype=float32, numpy=array([ 0.36176467, -0.86578524, -0.10165475], dtype=float32)>,
 <tf.Variable 'dense_65_4/kernel:0' shape=(3, 1) dtype=float32, numpy=
 array([[0.47252566],
        [0.73365825],
        [0.7718087 ]], dtype=float32)>,
 <tf.Variable 'dense_65_4/bias:0' shape=(1,) dtype=float32, numpy=array([1.4085784], dtype=float32)>]

In [179]:
model.output

<tf.Tensor 'dense_65_4/Identity:0' shape=(None, 1) dtype=float32>

In [180]:
from tensorflow.keras import backend as K

In [184]:
K.gradients(model.output, model.trainable_weights)

RuntimeError: tf.gradients is not supported when eager execution is enabled. Use tf.GradientTape instead.

In [None]:
tf.GradientTape()