## 第一题

In [None]:
import numpy as np
def softmax(x):
  """Return the softmax of ``x``, normalised over all of its elements.

  The maximum entry is subtracted before exponentiating so that large
  inputs do not overflow; this does not change the result.
  """
  shifted = np.array(x) - np.max(x)
  exps = np.exp(shifted)
  return exps / np.sum(exps)

class myRNN:
  """Vanilla RNN over sequences of integer token indices, trained with truncated BPTT.

  Model per time step ``t`` (``x[t]`` is used as a column index into ``U``):
    s[t] = tanh(U[:, x[t]] + W s[t-1])
    o[t] = softmax(V s[t])
  """

  def __init__(self, data_dim, hidden_dim=100, bptt_back=4):
    """Create the network.

    data_dim: vocabulary size (number of rows of V / columns of U).
    hidden_dim: size of the hidden state.
    bptt_back: how many time steps each error term is propagated back in time.
    """
    self.data_dim = data_dim
    self.hidden_dim = hidden_dim
    self.bptt_back = bptt_back

    # U: input weights, W: recurrent weights, V: output weights.
    # Each is drawn uniformly from +-1/sqrt(fan_in).
    in_bound = np.sqrt(1.0 / self.data_dim)
    hid_bound = np.sqrt(1.0 / self.hidden_dim)
    self.U = np.random.uniform(-in_bound, in_bound, (self.hidden_dim, self.data_dim))
    self.W = np.random.uniform(-hid_bound, hid_bound, (self.hidden_dim, self.hidden_dim))
    self.V = np.random.uniform(-hid_bound, hid_bound, (self.data_dim, self.hidden_dim))

  def forward(self, x):
    """Run the network over the index sequence ``x``.

    Returns ``[o, s]`` where ``o`` has shape (T, data_dim) and ``s`` has shape
    (T + 1, hidden_dim). The extra last row of ``s`` stays zero and doubles as
    the initial state, because ``s[-1]`` is what ``s[t-1]`` resolves to at t == 0.
    """
    T = len(x)
    s = np.zeros((T + 1, self.hidden_dim))
    o = np.zeros((T, self.data_dim))

    for t in range(T):
      s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t - 1]))
      # Softmax written inline (max-shifted for stability) so the class is self-contained.
      logits = self.V.dot(s[t])
      exps = np.exp(logits - np.max(logits))
      o[t] = exps / np.sum(exps)

    return [o, s]

  def predict(self, x):
    """Return the most probable output index for every time step of ``x``."""
    o, _ = self.forward(x)
    return np.argmax(o, axis=1)

  def loss(self, x, y):
    """Average cross-entropy per target token over several (x[i], y[i]) sequences."""
    cost = 0.0
    for xi, yi in zip(x, y):
      o, _ = self.forward(xi)
      # Probability assigned to the correct index at every time step.
      picked = o[np.arange(len(yi)), yi]
      cost -= np.sum(np.log(picked))

    n_tokens = np.sum([len(yi) for yi in y])
    return cost / n_tokens

  def bptt(self, x, y):
    """Return ``[dU, dW, dV]``, the gradients of the summed cross-entropy for one sample.

    Each time step's error is propagated back at most ``bptt_back`` steps.
    """
    dU = np.zeros(self.U.shape)
    dW = np.zeros(self.W.shape)
    dV = np.zeros(self.V.shape)

    o, s = self.forward(x)
    # Gradient w.r.t. the softmax logits is (o - one_hot(y)); o is local to this
    # call, so it can be modified in place.
    delta_o = o
    delta_o[np.arange(len(y)), y] -= 1.0

    for t in reversed(range(len(y))):
      dV += np.outer(delta_o[t], s[t])
      # Error at the pre-activation of step t: the tanh derivative must be taken
      # at s[t], the state that produced o[t].
      delta_t = self.V.T.dot(delta_o[t]) * (1.0 - s[t] ** 2)

      oldest = max(0, t - self.bptt_back)
      for step in range(t, oldest - 1, -1):
        dW += np.outer(delta_t, s[step - 1])
        dU[:, x[step]] += delta_t
        # Move one step back in time: through W (transposed) and tanh at s[step-1].
        delta_t = self.W.T.dot(delta_t) * (1.0 - s[step - 1] ** 2)

    return [dU, dW, dV]

  def sgd_step(self, x, y, learning_rate):
    """Apply one plain SGD update computed from a single sample."""
    dU, dW, dV = self.bptt(x, y)

    self.U -= learning_rate * dU
    self.W -= learning_rate * dW
    self.V -= learning_rate * dV

  def train(self, X_train, y_train, learning_rate=0.005, n_epoch=5):
    """Train with per-sample SGD, printing the loss after every epoch.

    The learning rate is halved whenever the epoch loss increases.
    """
    losses = []

    for epoch in range(n_epoch):
      for i in range(len(y_train)):
        self.sgd_step(X_train[i], y_train[i], learning_rate)

      loss = self.loss(X_train, y_train)
      losses.append(loss)
      print('epoch {0}: loss = {1}'.format(epoch+1, loss))
      if len(losses) > 1 and losses[-1] > losses[-2]:
        learning_rate *= 0.5
        print('decrease learning_rate to', learning_rate)

In [None]:
# Build the toy model with data_dim=4; hidden_dim and bptt_back keep their defaults.
rnn = myRNN(4)

In [None]:
# Toy data set: 12 rows, i.e. the same 4-row pattern repeated three times.
# Within each group of four, every row of y equals the following row of X,
# and the last row of y is all zeros.
X = np.array([
    [1,0,0,0],
    [0,1,0,0],
    [0,0,1,0],
    [0,0,0,1],
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1],
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1]
])
y =  np.array([
    [0,1,0,0],
    [0,0,1,0],
    [0,0,0,1],
    [0,0,0,0],
    [0, 1, 0, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1],
    [0, 0, 0, 0],
    [0, 1, 0, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1],
    [0, 0, 0, 0],
])

In [None]:
# Train with the default learning rate and epoch count.
# NOTE(review): train() passes each row X[i] / y[i] to the model as one sequence,
# and the model uses the row's entries as integer indices, so the 0/1 values
# here are consumed as token ids rather than as one-hot vectors - confirm intent.
rnn.train(X,y)

epoch 1: loss = 0.8700298718487586
epoch 2: loss = 0.6327577010423717
epoch 3: loss = 0.5063252751893155
epoch 4: loss = 0.4178235284559139
epoch 5: loss = 0.348172729305062


In [None]:
import numpy as np

# 定义RNN类
class RNN:
    """Character-level vanilla RNN with a forward pass and sampling (no training code).

    Per time step: h_t = tanh(Wxh x_t + Whh h_{t-1} + bh), p_t = softmax(Why h_t + by),
    where x_t is the one-hot encoding of an integer input index.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create small random weights, zero biases and a zero initial hidden state."""
        # Keep the sizes on the instance so that no method depends on module-level globals.
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.Wxh = np.random.randn(hidden_size, input_size) * 0.01
        self.Whh = np.random.randn(hidden_size, hidden_size) * 0.01
        self.Why = np.random.randn(output_size, hidden_size) * 0.01
        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((output_size, 1))
        # Initial hidden state used by forward() when none is supplied.
        self.h = np.zeros((hidden_size, 1))

    def forward(self, inputs, h_prev=None):
        """Run the network over a sequence of integer indices.

        inputs: iterable of indices in [0, input_size).
        h_prev: optional (hidden_size, 1) starting state; defaults to ``self.h``.

        Returns ``(ps, hs)``: dicts keyed by time step holding the output
        distributions and hidden states; ``hs[-1]`` is the starting state.
        """
        start = self.h if h_prev is None else h_prev
        hs = {-1: np.copy(start)}
        ps = {}
        for t, x in enumerate(inputs):
            x_one_hot = np.zeros((self.input_size, 1))
            x_one_hot[x] = 1
            hs[t] = np.tanh(np.dot(self.Wxh, x_one_hot) + np.dot(self.Whh, hs[t - 1]) + self.bh)
            logits = np.dot(self.Why, hs[t]) + self.by
            # Shift by the maximum before exponentiating to avoid overflow.
            exps = np.exp(logits - np.max(logits))
            ps[t] = exps / np.sum(exps)

        return ps, hs

    def sample(self, seed, n):
        """Generate ``n`` indices following ``seed``.

        Returns a list of ``n + 1`` ints that starts with ``seed``. Each next
        index is drawn from the network's output distribution, and the hidden
        state is carried from one generated step to the next. ``self.h`` is
        left untouched.
        """
        x = seed
        h = self.h
        generated_text = [x]

        for _ in range(n):
            ps, hs = self.forward([x], h)
            h = hs[0]
            # Sample from the softmax output (not from the hidden state).
            x = int(np.random.choice(self.output_size, p=ps[0].ravel()))
            generated_text.append(x)

        return generated_text

# Input and output sizes (one slot per code point 0-255)
input_size = output_size = 256
# Hidden layer size
hidden_size = 128

# Create the RNN object (weights are random; nothing in this cell trains it)
rnn = RNN(input_size, hidden_size, output_size)

# Read text from the user
user_input = input("请输入文本：")

# Convert the text to character indices
# NOTE(review): the tables only cover code points 0-255, so any other character
# (e.g. Chinese text) raises KeyError in the lookup below.
char_to_idx = {chr(i): i for i in range(256)}
idx_to_char = {i: chr(i) for i in range(256)}
input_text = [char_to_idx[char] for char in user_input]

# Generate the continuation, seeded with the last input character
# NOTE(review): an empty input makes input_text[-1] raise IndexError.
generated_text = rnn.sample(input_text[-1], 500)

# Convert the generated indices back to a string
generated_text = ''.join([idx_to_char[idx] for idx in generated_text])

# Output
# NOTE(review): nothing is printed or displayed after this comment.

## 第二题

In [None]:
import numpy as np

In [None]:
class Loss():
  """Cross-entropy between a predicted distribution ``y_hat`` and a target ``y``."""

  def forward(self,y_hat,y):
    """Return ``-sum(y * log(y_hat))``.

    The logarithm is taken of the prediction (not of the target); a small
    constant keeps ``log(0)`` from producing -inf.
    """
    delta=1e-7
    return -np.sum(np.asarray(y)*np.log(np.asarray(y_hat)+delta))

  def backward(self,y_hat,y):
    """Return the derivative of ``forward`` with respect to ``y_hat``: ``-y / (y_hat + delta)``."""
    delta=1e-7
    return -np.asarray(y)/(np.asarray(y_hat)+delta)

In [None]:
class Tanh():
  """Hyperbolic tangent activation: (e^x - e^-x) / (e^x + e^-x)."""

  def forward(self, input):
    '''
    Evaluate tanh element-wise.

    ``np.tanh`` is used instead of the explicit exponential formula, which
    overflows to inf/inf = nan once |input| is large.
    :param input: scalar or array of pre-activations
    :return: tanh(input)
    '''
    return np.tanh(input)

  def backward(self, output):
    '''
    Derivative of tanh expressed through its own output.
    :param output: the value returned by forward
    :return: 1 - output ** 2
    '''
    return 1 - output ** 2

In [None]:
class SoftMax():
  """Softmax activation: e^x / sum(e^x), normalised over all elements of the input."""

  def forward(self, input):
    '''
    Evaluate the softmax.

    The maximum is subtracted before exponentiating so large inputs do not
    overflow, and the denominator is exactly the sum of the exponentials so
    the result sums to 1.
    :param input: array of logits
    :return: array of probabilities with the same shape
    '''
    logits = np.asarray(input, dtype=float)
    if logits.size == 0:
      return logits
    result = np.exp(logits - np.max(logits))
    return result / np.sum(result)

  def backward(self, output):
    '''
    Element-wise term output * (1 - output).

    This is only the diagonal of the softmax Jacobian; cross terms between
    different outputs are not included.
    :param output: the value returned by forward
    :return: output * (1 - output)
    '''
    return output * (1 - output)

In [None]:
class Relu():
  """Rectified linear activation: max(0, x)."""

  def forward(self,input):
    """
    Evaluate max(0, input) element-wise.
    """
    return np.maximum(0,input)

  def backward(self,output):
    """
    Return the ReLU derivative: 1 where ``output`` is positive, 0 elsewhere.

    A new array is returned; the caller's array is not modified.
    """
    values = np.asarray(output)
    return (values > 0).astype(values.dtype)

In [None]:
class WRNN():
  """Recurrent network with two stacked hidden states, implemented with NumPy.

  Per time step t:
    a_t = tanh(Ux x_t + Ws a_{t-1} + S1)
    b_t = tanh(Vx a_t + Rx a_{t-1} + Tx b_{t-1} + S2)
    o_t = softmax(Qx b_t + S3)

  forward() handles one time step and records it; backward() then runs
  back-propagation through time over the recorded steps.
  """

  # Names of all trainable arrays; "<name>Grad" and "<name>GradList" are derived from them.
  _PARAM_NAMES = ("Ux", "Vx", "Qx", "Rx", "Ws", "Tx", "S1", "S2", "S3")

  def __init__(self,input_size,state_size,output_size,times=1,maxlen=32,learningRate = 0.01):
    """
    input_size: length of each input vector x
    state_size: length of both hidden states a and b
    output_size: length of each output vector
    times: initial value of the time-step counter
    maxlen: stored as ``maxLen``; not used by any method
    learningRate: accepted for compatibility; the rate is passed to fit()/update()
    """
    self.input_size = input_size
    self.output_size = output_size
    self.hidden_a_size = state_size
    self.state_size  = state_size
    self.hidden_b_size = state_size

    # Weight matrices
    self.Ux = np.random.uniform(-0.5, 0.5, (self.state_size, input_size))
    self.Vx = np.random.uniform(-0.5, 0.5, (self.state_size, self.state_size))
    self.Qx = np.random.uniform(-0.5, 0.5, (self.output_size, self.state_size))
    self.Rx = np.random.uniform(-0.5, 0.5, (self.state_size, self.state_size))
    self.Ws = np.random.uniform(-0.5, 0.5, (self.state_size, self.state_size))
    self.Tx = np.random.uniform(-0.5, 0.5, (self.state_size, self.state_size))

    # Biases
    self.S1 = np.zeros(self.state_size)
    self.S2 = np.zeros(self.state_size)
    self.S3 = np.zeros(self.output_size)

    self.times = times
    self.maxLen = maxlen
    self._reset_history()

  def _reset_history(self):
    """Start a new sequence: zero initial states at index 0 and empty gradient history."""
    self.stateList_a = [np.zeros(self.state_size)]    # a_0, then a_1 .. a_T
    self.stateList_b = [np.zeros(self.state_size)]    # b_0, then b_1 .. b_T
    self.stateList_ha = [np.zeros(self.state_size)]   # pre-activations of a
    self.stateList_hb = [np.zeros(self.state_size)]   # pre-activations of b
    self.outputList = [np.zeros(self.output_size)]    # placeholder so outputList[t] is o_t
    # Gradient history, kept so other optimisers (momentum, Adam, ...) can be added later.
    for name in self._PARAM_NAMES:
      setattr(self, name + "GradList", [])

  def forward(self,inputX,times,maxLen):
    """Compute and record one time step; returns the output distribution o_t.

    times: 1-based index of this step; ``times == 1`` starts a new sequence
           and clears everything recorded so far.
    maxLen: unused.
    """
    if times - 1 == 0:
      self._reset_history()

    inputX = np.asarray(inputX, dtype=float)
    a_prev = self.stateList_a[-1]
    b_prev = self.stateList_b[-1]

    pre_a = np.dot(self.Ux,inputX)+np.dot(self.Ws,a_prev)+self.S1
    state_a = np.tanh(pre_a)

    pre_b = np.dot(self.Vx,state_a)+np.dot(self.Rx,a_prev)+np.dot(self.Tx,b_prev)+self.S2
    state_b = np.tanh(pre_b)

    # Softmax written inline (max-shifted for stability) so the class is self-contained.
    logits = np.dot(self.Qx,state_b)+self.S3
    exps = np.exp(logits - np.max(logits))
    output = exps / np.sum(exps)

    self.stateList_ha.append(pre_a)
    self.stateList_a.append(state_a)
    self.stateList_hb.append(pre_b)
    self.stateList_b.append(state_b)
    self.outputList.append(output)

    self.times = times
    return output

  def backward(self,X,inputY,T):
    """Back-propagation through time over the last T recorded steps.

    Computes the gradient of the total cross-entropy
    ``-sum_t sum(inputY[t] * log(o_t))`` with respect to every parameter and
    stores it in ``self.<name>Grad`` (also appended to ``self.<name>GradList``).

    X: the input vectors, X[t-1] being the input of step t
    inputY: the target vectors, inputY[t-1] being the target of step t
    T: number of time steps recorded by forward()
    """
    for name in self._PARAM_NAMES:
      setattr(self, name + "Grad", np.zeros(shape = getattr(self, name).shape))

    # Error reaching a_t / b_t from later time steps (none for the last step).
    da_future = np.zeros(self.state_size)
    db_future = np.zeros(self.state_size)

    for t in range(T, 0, -1):
      x_t = np.asarray(X[t-1], dtype=float)
      y_t = np.asarray(inputY[t-1], dtype=float)
      a_t, a_prev = self.stateList_a[t], self.stateList_a[t-1]
      b_t, b_prev = self.stateList_b[t], self.stateList_b[t-1]

      # Softmax + cross-entropy: d loss / d logits = o * sum(y) - y.
      d_logits = self.outputList[t] * np.sum(y_t) - y_t
      self.QxGrad += np.outer(d_logits, b_t)
      self.S3Grad += d_logits

      # Hidden state b: error from the output plus error from b_{t+1}.
      d_pre_b = (np.dot(self.Qx.T, d_logits) + db_future) * (1 - b_t ** 2)
      self.VxGrad += np.outer(d_pre_b, a_t)
      self.RxGrad += np.outer(d_pre_b, a_prev)
      self.TxGrad += np.outer(d_pre_b, b_prev)
      self.S2Grad += d_pre_b

      # Hidden state a: error from b_t plus error from step t+1.
      d_pre_a = (np.dot(self.Vx.T, d_pre_b) + da_future) * (1 - a_t ** 2)
      self.UxGrad += np.outer(d_pre_a, x_t)
      self.WsGrad += np.outer(d_pre_a, a_prev)
      self.S1Grad += d_pre_a

      # a_{t-1} feeds a_t through Ws and b_t through Rx; b_{t-1} feeds b_t through Tx.
      da_future = np.dot(self.Ws.T, d_pre_a) + np.dot(self.Rx.T, d_pre_b)
      db_future = np.dot(self.Tx.T, d_pre_b)

    for name in self._PARAM_NAMES:
      getattr(self, name + "GradList").append(getattr(self, name + "Grad"))

  def update(self,learningRate):
    """Plain gradient-descent step using the gradients stored by backward()."""
    for name in self._PARAM_NAMES:
      param = getattr(self, name)
      param -= getattr(self, name + "Grad")*learningRate

  def predict(self, input):
    """Run a single step from a fresh zero state and return its output.

    Because the step is issued with ``times == 1`` this clears the recorded history.
    """
    return self.forward(input, 1, 12)

  def fit(self, X, Y, loss=None, epochs=100, learningRate=0.1):
    '''
    Train on one sequence with full-batch gradient descent.
    :param X: sequence of input vectors
    :param Y: sequence of target vectors, aligned with X
    :param loss: unused; the cross-entropy described in backward() is always used
    :param epochs: number of passes over the sequence
    :param learningRate: step size passed to update()
    :return: None
    '''
    T = len(X)
    for i in range(epochs):
      print("epochs:", i)
      # Forward pass over the whole sequence; step 1 resets the recorded history.
      for times, (x, y) in enumerate(zip(X, Y), start=1):
        self.forward(x, times, T)
      self.backward(X, Y, T)
      self.update(learningRate)


In [None]:
import torch
class WRNN2():
  """PyTorch version of the two-hidden-state recurrent network, trained through autograd.

  Per time step t (both hidden states use ReLU):
    a_t = relu(Ux x_t + Ws a_{t-1} + S1)
    b_t = relu(Vx a_t + Rx a_{t-1} + Tx b_{t-1} + S2)
    o_t = softmax(Qx b_t + S3)
  """

  def __init__(self,input_size,state_size,output_size,times=1,maxlen=32,learningRate = 0.01):
    """
    input_size: length of each input vector x
    state_size: length of both hidden states a and b
    output_size: length of each output vector
    times: initial value of the time-step counter
    maxlen: stored as ``maxLen``; not used by any method
    learningRate: accepted for compatibility; the rate is passed to fit()
    """
    self.input_size = input_size
    self.output_size = output_size
    self.hidden_a_size = state_size
    self.state_size  = state_size
    self.hidden_b_size = state_size

    # Weight matrices: zero-mean uniform, scaled by fan-in. Note that
    # torch.normal(-0.5, 0.5, ...) means mean=-0.5 / std=0.5, which biases every
    # weight negative and leaves the ReLU units without any gradient.
    self.Ux = self._init_weight(self.state_size, input_size)
    self.Vx = self._init_weight(self.state_size, self.state_size)
    self.Qx = self._init_weight(self.output_size, self.state_size)
    self.Rx = self._init_weight(self.state_size, self.state_size)
    self.Ws = self._init_weight(self.state_size, self.state_size)
    self.Tx = self._init_weight(self.state_size, self.state_size)

    # Biases
    self.S1 = torch.zeros(self.state_size,requires_grad=True)
    self.S2 = torch.zeros(self.state_size,requires_grad=True)
    self.S3 = torch.zeros(self.output_size,requires_grad=True)

    self.times = times
    self.maxLen = maxlen

    self.softmax = torch.nn.Softmax(dim=0)
    self.relu = torch.nn.ReLU()
    self._reset_history()

  @staticmethod
  def _init_weight(rows, cols):
    """Return a trainable (rows, cols) leaf tensor drawn from U(-1/sqrt(cols), 1/sqrt(cols))."""
    bound = cols ** -0.5
    return torch.empty(rows, cols).uniform_(-bound, bound).requires_grad_()

  @staticmethod
  def _cross_entropy(y_hat, y):
    """Cross-entropy for outputs that are already probabilities: -sum(y * log(y_hat))."""
    return -(y * torch.log(y_hat.clamp_min(1e-12))).sum()

  def _reset_history(self):
    """Start a new sequence with zero hidden states and no recorded outputs."""
    self.stateList_a = [torch.zeros(self.state_size)]
    self.stateList_b = [torch.zeros(self.state_size)]
    self.stateList_ha = [torch.zeros(self.state_size)]
    self.stateList_hb = [torch.zeros(self.state_size)]
    self.outputList = []

  def _step(self, x, a_prev, b_prev):
    """Evaluate one time step; returns (pre_a, a, pre_b, b, output)."""
    pre_a = self.Ux@x + self.Ws@a_prev + self.S1
    state_a = self.relu(pre_a)
    pre_b = self.Vx@state_a + self.Rx@a_prev + self.Tx@b_prev + self.S2
    state_b = self.relu(pre_b)
    output = self.softmax(self.Qx@state_b+self.S3)
    return pre_a, state_a, pre_b, state_b, output

  def forward(self,inputX,times,maxLen=0):
    """Compute and record one time step; returns the output probabilities.

    times: 1-based index of this step; ``times == 1`` starts a new sequence
           and clears everything recorded so far.
    maxLen: unused.
    """
    if times - 1 == 0:
      self._reset_history()

    pre_a, state_a, pre_b, state_b, output = self._step(
        inputX, self.stateList_a[-1], self.stateList_b[-1])
    self.stateList_ha.append(pre_a)
    self.stateList_a.append(state_a)
    self.stateList_hb.append(pre_b)
    self.stateList_b.append(state_b)
    self.outputList.append(output)

    self.times = times
    return output

  def get_parameters(self):
    """Return every trainable tensor, in the order expected by the optimiser."""
    return [self.S1,self.S2,self.S3,self.Ux,self.Vx,self.Qx,self.Rx,self.Ws,self.Tx]

  def predict(self,X):
    """Return the list of output distributions for the rows of ``X``.

    The recorded history is not modified.
    NOTE(review): state a continues from the most recent recorded a, while
    state b restarts from zero; this asymmetry is kept as found - confirm
    whether both should start from zero.
    """
    state_a = self.stateList_a[-1]
    state_b = torch.zeros(self.state_size)
    output_list = []
    for i in range(X.shape[0]):
      _, state_a, _, state_b, output = self._step(X[i], state_a, state_b)
      output_list.append(output)
    return output_list

  def fit(self, X, Y, loss=None, epochs=10, learningRate=0.01):
    '''
    Train on one sequence with SGD, printing the summed loss of every epoch.
    :param X: sequence of input vectors
    :param Y: sequence of target vectors, aligned with X
    :param loss: callable ``loss(y_hat, y)`` applied to the softmax output of
                 each step; defaults to cross-entropy on probabilities
                 (torch.nn.CrossEntropyLoss would apply a second softmax)
    :param epochs: number of passes over the sequence
    :param learningRate: SGD step size
    :return: None
    '''
    optimizer = torch.optim.SGD(self.get_parameters(),learningRate)
    T = len(X)
    if loss is None:
      loss = self._cross_entropy

    for i in range(epochs):
      loss_sum = 0
      print("epochs:", i)
      # forward() already records each output, so nothing is appended here.
      for times, (x, y) in enumerate(zip(X, Y), start=1):
        y_hat = self.forward(x, times, T)
        loss_sum += loss(y_hat,y)

      print("loss:",loss_sum)
      if not torch.is_tensor(loss_sum):
        # Empty sequence: there is nothing to differentiate.
        continue
      optimizer.zero_grad()
      loss_sum.backward()
      optimizer.step()
  

In [None]:
# Small network for the toy problem: 4 inputs, hidden state size 10, 4 outputs.
rnn=WRNN2(4, 10 , 4)

In [None]:
import numpy as np

In [None]:
# Toy input sequence: four time steps, one 4-element float vector per step.
X = torch.tensor([
    [3.,1.,0.,0.],
    [1.,0.,1.,0.],
    [1.,0.,0.,1.],
    [0.,1.,0.,0.]
])

  y = torch.tensor(y)


In [None]:
# Targets for the four time steps: one-hot rows (the 4x4 identity).
Y = torch.tensor([
    [1.,0.,0.,0.],
    [0.,1.,0.,0.],
    [0.,0.,1.,0.],
    [0.,0.,0.,1.]
])

In [None]:
# Train on the single toy sequence; fit() prints the summed loss once per epoch.
rnn.fit(X,Y,epochs=100,learningRate=0.01)

epochs: 0
loss: tensor(4.7687, grad_fn=<AddBackward0>)
epochs: 1
loss: tensor(4.7664, grad_fn=<AddBackward0>)
epochs: 2
loss: tensor(4.7641, grad_fn=<AddBackward0>)
epochs: 3
loss: tensor(4.7618, grad_fn=<AddBackward0>)
epochs: 4
loss: tensor(4.7598, grad_fn=<AddBackward0>)
epochs: 5
loss: tensor(4.7573, grad_fn=<AddBackward0>)
epochs: 6
loss: tensor(4.7549, grad_fn=<AddBackward0>)
epochs: 7
loss: tensor(4.7524, grad_fn=<AddBackward0>)
epochs: 8
loss: tensor(4.7500, grad_fn=<AddBackward0>)
epochs: 9
loss: tensor(4.7475, grad_fn=<AddBackward0>)
epochs: 10
loss: tensor(4.7450, grad_fn=<AddBackward0>)
epochs: 11
loss: tensor(4.7426, grad_fn=<AddBackward0>)
epochs: 12
loss: tensor(4.7403, grad_fn=<AddBackward0>)
epochs: 13
loss: tensor(4.7377, grad_fn=<AddBackward0>)
epochs: 14
loss: tensor(4.7351, grad_fn=<AddBackward0>)
epochs: 15
loss: tensor(4.7325, grad_fn=<AddBackward0>)
epochs: 16
loss: tensor(4.7299, grad_fn=<AddBackward0>)
epochs: 17
loss: tensor(4.7273, grad_fn=<AddBackward0>)
ep

In [None]:
# Step through the sequence one input at a time. Passing i+1 as `times` means the
# first call (times == 1) clears the network's recorded states before running.
for i in range(X.shape[0]):
  out = rnn.forward(X[i],i+1,1)
  print(out)

tensor([0.2500, 0.2500, 0.2500, 0.2500], grad_fn=<SoftmaxBackward0>)
tensor([0.2500, 0.2500, 0.2500, 0.2500], grad_fn=<SoftmaxBackward0>)
tensor([0.2500, 0.2500, 0.2499, 0.2500], grad_fn=<SoftmaxBackward0>)
tensor([0.2500, 0.2500, 0.2500, 0.2500], grad_fn=<SoftmaxBackward0>)


  output = self.softmax(output)


In [None]:
# Output distributions for every row of X, returned as a list of tensors.
rnn.predict(X)

[tensor([0.7990, 0.0115, 0.0034, 0.1861], grad_fn=<SoftmaxBackward0>),
 tensor([0.0568, 0.4419, 0.0110, 0.4903], grad_fn=<SoftmaxBackward0>),
 tensor([0.8650, 0.0033, 0.0015, 0.1301], grad_fn=<SoftmaxBackward0>),
 tensor([6.3992e-05, 3.3829e-03, 6.6728e-06, 9.9655e-01],
        grad_fn=<SoftmaxBackward0>)]

In [None]:
# One-hot targets again (identical values to the earlier definition of Y).
Y = torch.tensor([
    [1.,0.,0.,0.],
    [0.,1.,0.,0.],
    [0.,0.,1.,0.],
    [0.,0.,0.,1.]
])

## 第三题

In [None]:
# Colab-only helper: opens the browser upload dialog.
from google.colab import files

uploaded = files.upload()

# Report the name and size of each uploaded file.
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving train.txt to train.txt
User uploaded file "train.txt" with length 25813320 bytes


In [None]:
!ls


Coding.txt  sample_data  train.txt


In [None]:
# Lookup tables filled in by the next cell:
# Char_Number maps a character to its index, Number_Char maps str(index) back.
Char_Number={}
Number_Char={}

In [None]:
# Build the character <-> index tables from Coding.txt, one vocabulary entry per line.
# The first character of each line is the entry; its line number is the index.
i = 0
with open("Coding.txt", encoding="utf-8") as f:
  for line in f:
    Char_Number[line[0]] = i
    # Store only the character (not the whole line with its newline) so that
    # Number_Char is the exact inverse of Char_Number. Keys stay str(index).
    Number_Char[str(i)] = line[0]
    i += 1
print(i)

4371


In [None]:
from torch.nn import functional as F

In [None]:
# 4371 x 4371 table of integer one-hot rows: K[i] is the code of vocabulary index i.
K = F.one_hot(torch.arange(4371))

In [None]:
# Peek at the first line of the training text; the file is closed again right away.
file_name = "train.txt"
with open(file_name, encoding="utf-8") as train_file:
  str1 = train_file.readline()

将汉字进行one_hot编码
返回train_list长度的训练集，one_hot编码后的数据，target是紧跟着训练集后面的数据，长度与训练集相同

In [None]:
def readFile(file_name,train_list=50,step=10):
  """Yield (train_code, target_code) pairs of one-hot encoded text windows.

  Lines are appended (newlines removed) to a buffer. After each line, if the
  buffer holds more than ``2 * train_list`` characters, one pair is yielded:
  the first ``train_list`` characters as input and the following
  ``train_list`` characters as target. The buffer then drops its first
  ``step`` characters.

  Relies on the module-level one-hot table ``K`` and the ``Char_Number`` map.
  The file is closed when the generator finishes or is closed.
  """
  vocab_size = K.shape[0]
  cur_str = ""
  with open(file_name, encoding="utf-8") as train_file:
    for line in train_file:
      cur_str += line.replace("\n","")
      if len(cur_str) > train_list*2:
        train_data = cur_str[0:train_list]
        target_data = cur_str[train_list:train_list*2]
        train_code = torch.zeros(train_list,vocab_size)
        target_code = torch.zeros(train_list,vocab_size)
        for i in range(train_list):
          train_code[i] = K[Char_Number[train_data[i]]]
          target_code[i] = K[Char_Number[target_data[i]]]
        yield train_code,target_code
        cur_str = cur_str[step:]

In [None]:
# Create the window generator and pull the first (input, target) pair from it.
get_data = readFile(file_name)
X,Y = next(get_data)

In [None]:
# Sanity check: vocabulary index of the first character in the window.
X[0].argmax()

tensor(385)

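梯度不更新：最初猜测是 one_hot 编码中零太多（用 4371 维表示一个汉字，所用汉字表共有 4371 个字符）。不过损失恒为 419.1375 ≈ 50 × ln(4371)，说明每一步的输出都是均匀分布，即隐藏层输出全为零。更可能的原因是：权重用 torch.normal(-0.5, 0.5) 初始化（均值为 -0.5），在输入非负时 ReLU 单元全部“死亡”；同时 CrossEntropyLoss 又对已经过 softmax 的输出再做了一次 softmax，使梯度进一步变小。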

In [None]:
rnn=WRNN2(4371, 7000 , 4371)
# Create the generator once: building it inside the loop restarts the file every
# time, so every iteration would train on the same first window. zip() also stops
# cleanly if the file runs out before 100 windows.
get_data = readFile(file_name)
for i, (X,Y) in zip(range(100), get_data):
  print("batch_size:",i)
  rnn.fit(X,Y,epochs=10,learningRate=0.1)

batch_size: 0
epochs: 0
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epochs: 1
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epochs: 2
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epochs: 3
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epochs: 4
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epochs: 5
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epochs: 6
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epochs: 7
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epochs: 8
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epochs: 9
loss: tensor(419.1375, grad_fn=<AddBackward0>)
batch_size: 1
epochs: 0
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epochs: 1
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epochs: 2
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epochs: 3
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epochs: 4
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epochs: 5
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epochs: 6
loss: tensor(419.1375, grad_fn=<AddBackward0>)
epo

In [None]:
# Prompt text that the generation cells below encode and feed to the network.
demo="康熙的意思是，这种奏折是秘密奏报，并非正式公文，要李林盛自己书写，不会写汉字则写清字好了。"


In [None]:
def getX(demo):
  """
  Encode a string as one-hot rows.

  demo: str
  return: tensor of shape [len(demo), 4371]; row i is the one-hot code of demo[i],
          looked up through Char_Number and the table K
  """
  encoded = torch.zeros(len(demo),4371)
  for row, ch in enumerate(demo):
    encoded[row] = K[Char_Number[ch]]

  return encoded

In [None]:
def decode(Y):
  """
  Turn a sequence of score vectors back into text.

  Y: iterable of tensors (or a tensor of shape [L, 4371]); the arg-max of each
     row is taken as the vocabulary index
  return: str, the entries of Number_Char for those indices joined together
  """
  pieces = []
  for row in Y:
    # argmax() returns a 0-dim tensor; convert it to a plain int first, because
    # str(tensor) is "tensor(5)" rather than "5" and would miss every key.
    index = int(row.argmax())
    pieces.append(Number_Char[str(index)])
  return "".join(pieces)

In [None]:
# Generate roughly 500 characters, one prompt-length chunk at a time.
# Integer division is required because range() does not accept a float.
count = 500 // len(demo) + 1
# Encode the prompt once; afterwards each generated chunk becomes the next input
# (re-encoding the prompt inside the loop would discard that chunk every time).
X = getX(demo)
for i in range(count):
  Y = rnn.predict(X)
  str1 = decode(Y)
  print(str1,end="")
  X = getX(str1)

In [None]:
def readFile2(file_name,train_list=50,step=10):
  """Yield (train_code, target_code) pairs whose target overlaps the input window.

  Lines are appended (newlines removed) to a buffer. After each line, if the
  buffer holds more than ``3 * train_list`` characters, one pair is yielded:
  the first ``train_list`` characters as input, and as target the
  ``train_list`` characters starting at offset ``train_list - 20``. The buffer
  then drops its first ``step`` characters.

  NOTE(review): for train_list < 20 the target offset becomes negative and the
  slice counts from the end of the buffer - confirm that case is never used.

  Relies on the module-level one-hot table ``K`` and the ``Char_Number`` map.
  The file is closed when the generator finishes or is closed.
  """
  vocab_size = K.shape[0]
  cur_str = ""
  with open(file_name, encoding="utf-8") as train_file:
    for line in train_file:
      cur_str += line.replace("\n","")
      if len(cur_str) > train_list*3:
        train_data = cur_str[0:train_list]
        # Longer than train_list; only its first train_list characters are encoded.
        target_data = cur_str[train_list-20:train_list*2]
        train_code = torch.zeros(train_list,vocab_size)
        target_code = torch.zeros(train_list,vocab_size)
        for i in range(train_list):
          train_code[i] = K[Char_Number[train_data[i]]]
          target_code[i] = K[Char_Number[target_data[i]]]
        yield train_code,target_code
        cur_str = cur_str[step:]

In [None]:
import torch.nn as nn
file_name = "train.txt"
rnn=WRNN2(4371, 5000 , 4371)
# Create the generator once: building it inside the loop restarts the file every
# time, so every iteration would train on the same first window. zip() also stops
# cleanly if the file runs out before 100 windows.
get_data = readFile2(file_name)
for i, (X,Y) in zip(range(100), get_data):
  print("batch_size:",i)
  rnn.fit(X,Y,loss=nn.BCELoss(),epochs=10,learningRate=0.1)

batch_size: 0
epochs: 0
loss: tensor(0.1073, grad_fn=<AddBackward0>)
epochs: 1
loss: tensor(0.1073, grad_fn=<AddBackward0>)
epochs: 2
loss: tensor(0.1073, grad_fn=<AddBackward0>)
epochs: 3
loss: tensor(0.1073, grad_fn=<AddBackward0>)
epochs: 4
loss: tensor(0.1073, grad_fn=<AddBackward0>)
epochs: 5
loss: tensor(0.1073, grad_fn=<AddBackward0>)
epochs: 6
loss: tensor(0.1073, grad_fn=<AddBackward0>)
epochs: 7
loss: tensor(0.1073, grad_fn=<AddBackward0>)
epochs: 8
loss: tensor(0.1073, grad_fn=<AddBackward0>)
epochs: 9
loss: tensor(0.1073, grad_fn=<AddBackward0>)
batch_size: 1
epochs: 0
loss: tensor(0.1073, grad_fn=<AddBackward0>)
epochs: 1
loss: tensor(0.1073, grad_fn=<AddBackward0>)
epochs: 2
loss: tensor(0.1073, grad_fn=<AddBackward0>)


### 使用embedding编码试试

In [None]:
import torch.nn as nn

In [None]:
# Width of the dense character code used in place of the 4371-wide one-hot rows.
coding_dim =100

In [None]:
# Embedding table with one coding_dim-wide row per vocabulary index.
embedding = nn.Embedding(4371, coding_dim)

In [None]:
# Table of shape (4371, coding_dim): row i is the embedding of vocabulary index i.
# embedding(K[0]) would instead look up the 0/1 entries of a one-hot row, giving
# every character but one the same vector. detach() keeps the encoded batches out
# of the autograd graph, since the embedding itself is never trained here.
coding2 = embedding.weight.detach()

In [None]:
def readFile3(file_name,train_list=50,step=10):
  """Yield (train_code, target_code) pairs with embedded inputs and one-hot targets.

  Lines are appended (newlines removed) to a buffer. After each line, if the
  buffer holds more than ``2 * train_list`` characters, one pair is yielded:
  the first ``train_list`` characters encoded through the ``coding2`` table
  (``coding_dim`` values each), and the following ``train_list`` characters
  one-hot encoded through ``K``. The buffer then drops its first ``step``
  characters.

  Relies on the module-level ``coding2``, ``coding_dim``, ``K`` and ``Char_Number``.
  The file is closed when the generator finishes or is closed.
  """
  vocab_size = K.shape[0]
  cur_str = ""
  with open(file_name, encoding="utf-8") as train_file:
    for line in train_file:
      cur_str += line.replace("\n","")
      if len(cur_str) > train_list*2:
        train_data = cur_str[0:train_list]
        target_data = cur_str[train_list:train_list*2]
        train_code = torch.zeros(train_list,coding_dim)
        target_code = torch.zeros(train_list,vocab_size)
        for i in range(train_list):
          train_code[i] = coding2[Char_Number[train_data[i]]]
          target_code[i] = K[Char_Number[target_data[i]]]
        yield train_code,target_code
        cur_str = cur_str[step:]

In [None]:
import torch.nn as nn
file_name = "train.txt"
rnn=WRNN2(coding_dim, 5000 , 4371)
# Create the generator once: building it inside the loop restarts the file every
# time, so every iteration would train on the same first window. zip() also stops
# cleanly if the file runs out before 100 windows.
get_data = readFile3(file_name)
for i, (X,Y) in zip(range(100), get_data):
  print("batch_size:",i)
  rnn.fit(X,Y,epochs=10,learningRate=0.1)