# 循环神经网络

尝试使用`Numpy`手搓最简单的RNN实现

## 准备数据

目标是用单词 'hello' 进行训练，希望输入 'hell' 时模型能推理出 'o'

把 'hello' 看作一个序列，每个字母是序列中的一个元素，并对字母进行 one-hot 编码；由于这里只有 4 个不同的字母，所以直接手动指定编码

In [1]:
# Problem sizes: the vocabulary holds the four distinct letters of 'hello'
# (h, e, l, o) and the recurrent layer has 100 hidden units.
vocab_size, hidden_size = 4, 100

In [2]:
import numpy as np

# One-hot code for each distinct letter of 'hello': the position of the single
# 1 identifies the letter (h -> 0, e -> 1, l -> 2, o -> 3). The rows of a 4x4
# integer identity matrix are exactly these codes.
h, e, l, o = np.eye(4, dtype=int)

# Show the four codes, one per line.
print(h, e, l, o, sep="\n")

[1 0 0 0]
[0 1 0 0]
[0 0 1 0]
[0 0 0 1]


假设循环神经网络的hidden size是100,那么定义$W_{xh}$和$W_{hh}$

In [3]:
# Small random initial weights: standard-normal samples scaled by 0.01.
# Wxh has shape (hidden_size, vocab_size) and multiplies the input vector;
# Whh has shape (hidden_size, hidden_size) and multiplies the previous hidden
# state.
Wxh = 0.01 * np.random.randn(hidden_size, vocab_size)
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)

# Dump both matrices, Wxh first.
print(Wxh, Whh, sep="\n")

[[ 5.92502552e-03 -7.33528084e-04 -2.16685118e-03 -1.14955544e-02]
 [ 1.12833707e-03 -4.78431979e-03  3.54684520e-03  1.05841052e-02]
 [-1.75124823e-02  1.64949831e-02 -7.32777155e-03  3.26675532e-04]
 [-9.63455319e-03  8.73468623e-04 -7.67929423e-03  1.26905481e-02]
 [ 9.18227362e-03 -1.76067057e-03 -1.80828829e-03  3.50406896e-04]
 [ 1.88733397e-02  8.98231889e-03  7.91688701e-03  1.30615601e-03]
 [ 5.69821953e-04  3.02105834e-03  1.73476810e-02  1.29855185e-02]
 [ 1.34365821e-02 -5.80780468e-04  1.96866429e-02 -3.67169045e-03]
 [-3.09441579e-02  6.73085410e-03 -7.76482693e-03 -3.67422919e-03]
 [ 3.26858648e-03 -1.27307587e-02  3.85483936e-03 -8.62420914e-03]
 [-7.69468566e-04  1.03007380e-05  6.04930208e-03 -5.97513871e-04]
 [-1.07095932e-02 -4.59434892e-04 -2.18589318e-03  1.82322067e-02]
 [-2.51614565e-03 -1.04412751e-02 -1.98481817e-02  1.67955121e-02]
 [ 1.96950809e-03  3.99032164e-03  5.28512153e-04 -7.51025044e-03]
 [-2.00964600e-03  4.43853475e-03  1.37182304e-03 -1.78493406e

定义输出的权重


In [4]:
# Output weights of shape (vocab_size, hidden_size): standard-normal samples
# scaled by 0.01. They multiply the hidden state to give one score per
# vocabulary entry.
Why = 0.01 * np.random.randn(vocab_size, hidden_size)

print(Why)

[[ 5.40440538e-03  1.82651093e-03  5.42709042e-03  4.86957076e-03
   1.16163758e-02 -2.98937304e-03  3.39659020e-03  1.62961152e-02
   1.67404362e-02 -1.43593448e-03 -1.30902564e-02 -2.12379659e-02
   3.13155872e-03  3.12894840e-03  3.49985073e-03  3.70308923e-03
  -7.01942201e-03 -4.69654452e-03  4.67649983e-03 -2.18750295e-03
  -6.19330821e-03  5.93136924e-03  5.79769643e-03 -1.02969608e-04
  -8.03689472e-03  4.20611150e-03  3.45185225e-03 -1.58737712e-02
   6.52240288e-03  1.17852553e-02  9.51535691e-03  6.15548567e-04
  -1.74343026e-03 -2.41587235e-03  2.10111530e-02  3.44654275e-03
  -1.76250209e-03 -6.74957654e-03  4.46682237e-03  1.44537143e-02
  -1.39753709e-02 -4.34802731e-03  3.38016588e-03  5.00250211e-04
  -1.02755242e-02  1.51780955e-02  1.76948536e-03 -7.85768760e-04
  -2.45734843e-02  1.72308773e-02 -2.75388806e-03 -1.16707374e-03
   2.08437708e-02 -4.03361821e-03 -1.51222136e-03 -1.28957163e-02
   3.58981894e-03  1.82441388e-03  9.99146158e-03 -1.70922113e-03
  -7.61772

由于网络使用偏置（bias），这里定义两个偏置向量，分别用于隐藏层（hidden）和输出层（output）

In [5]:
# Biases start at zero and are stored as column vectors:
# bh is (hidden_size, 1) for the hidden layer,
# by is (vocab_size, 1) for the output layer.
bh = np.zeros(shape=(hidden_size, 1))
by = np.zeros(shape=(vocab_size, 1))

# Dump both bias vectors, bh first.
print(bh, by, sep="\n")

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]
[[0.]
 [0.]
 [0.]
 [0.]]


定义损失函数

$$
\begin{aligned}
    h_t &= \tanh(W_{xh}x_t + W_{hh}h_{t-1} + b_h) \\
    y_t &= W_{hy}h_t + b_y
\end{aligned}
$$

对输出 $y_t$ 使用 softmax 得到各类别的概率 $p_t$，softmax 定义为 $\sigma(z_i) = \frac{e^{z_{i}}}{\sum_{j=1}^K e^{z_{j}}},\quad i=1,2,\dots,K$

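损失使用交叉熵函数。因为类别数 $M>2$（这里 $M$ 即 vocab_size，等于 4），所以为 $-\sum_{c=1}^{M} y_{o,c}\log(p_{o,c})$，其中 $y_{o,c}$ 在样本 $o$ 的真实类别为 $c$ 时取 1、否则取 0，$p_{o,c}$ 是模型预测样本 $o$ 属于类别 $c$ 的概率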

In [None]:
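# hprev is the initial hidden state: the first time step has no previous state of its own.
# In the original author's code the inputs are not one-hot encoded, so the encoding is done
# inside lossFun; here the vectors were defined by hand already, so a different approach is used.
# inputs = [[1,0,0,0],[0,1,0,0]...]
# targets= [[1,0,0,0],[0,1,0,0]...]
# NOTE(review): the loss line below indexes ps[t][targets[t],0], which picks a single
# probability only when targets[t] is an integer class index; a one-hot targets[t] as in
# the example above would select several entries instead -- confirm which form is intended.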


def lossFun(inputs, targets,hprev):
    xs = {}
    hs = {}
    ys = {}
    ps = {}
    
    hs[-1] = np.copy(hprev)
    loss = 0
    
    for t in range(len(inputs)):
        xs[t] = inputs[t]
        hs[t] = np.tanh(np.dot(Wxh,xs[t]) + np.dot(Whh,hs[t-1])+ bh) ## 计算隐藏状态
        ys[t] = np.dot(Why,hs[t])+by ##计算输出
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) ## softmax
        loss += -np.log(ps[t][targets[t],0]) ## 交叉熵损失
    
    ##反向传播
    ### 定义变量存储偏导
    dWxh = np.zeros_like(Wxh)
    dWhh = np.zeros_like(Whh)
    dWhy = np.zeros_like(Why)
    dbh = np.zeros_like(bh)
    dby = np.zeros_like(by)
    
    dhnext = np.zeros_like(hs[0])