# LSTM, GRU, 勾配クリッピング

- LSTM, GRU(ともに順伝播計算のみ)のクラス
    - 系列を返すモデル等一部拡張したモデルは下の方のセルにある
    - back propagation のためには状態の保持などが必要
- 勾配クリッピングを行う関数

In [0]:
import numpy as np

In [0]:
# Activation function definition
def sigmoid(x):
    """Return the element-wise logistic sigmoid, 1 / (1 + exp(-x))."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

## LSTM

### LSTMCell クラス
LSTMの順伝播(1ステップ)を計算  
入力:   
　$x$ : 各時刻の入力データ, shape = (#data, input_dim)  
　$h^{(t-1)}$ : 前の時刻の出力, shape = (#data, output_dim)  
　$s^{(t-1)}$ : 前の時刻の状態, shape = (#data, output_dim)
 
出力：　  
　$h^{(t)}$ : 出力, shape = (#data, output_dim)  
　$s^{(t)}$ : 状態, shape = (#data, output_dim)

In [0]:
# Demo: np.split cuts the 4-element array into 2 equal-sized sub-arrays.
np.split(np.array([1,2,3,4]),2)

[array([1, 2]), array([3, 4])]

In [0]:
class LSTMCell:
    """Single-step LSTM cell (forward pass only).

    The parameters of the four internal transforms are stored side by side
    in one matrix each, in the column order (z, i, f, o), so a single pair
    of matrix products yields every pre-activation at once.
    """

    # Store the dimensions and create the parameters
    def __init__(self, input_dim, output_dim):
        self.output_dim = output_dim
        self.input_dim = input_dim
        # Parameters are drawn uniformly from [0, 1)
        stacked = output_dim * 4
        self.U = np.random.rand(input_dim, stacked)    # input weights, (input_dim, output_dim*4)
        self.W = np.random.rand(output_dim, stacked)   # recurrent weights, (output_dim, output_dim*4)
        self.b = np.random.rand(stacked)               # biases, (output_dim*4,)

    # One step of forward propagation
    def forward(self, x, h_prev, s_prev):
        """Advance the cell by one time step.

        x is the input at this step, (#data, input_dim); h_prev and s_prev
        are the previous output and state, both (#data, output_dim).
        Returns the new output h and the new state s.
        """
        pre_activation = np.dot(x, self.U) + np.dot(h_prev, self.W) + self.b
        raw_z, raw_i, raw_f, raw_o = np.hsplit(pre_activation, 4)
        candidate = np.tanh(raw_z)
        input_gate = sigmoid(raw_i)
        forget_gate = sigmoid(raw_f)
        output_gate = sigmoid(raw_o)
        # All products below are element-wise
        s = forget_gate * s_prev + input_gate * candidate
        return output_gate * np.tanh(s), s

### LSTM クラス
順伝播(全ステップ)を計算  
入力:   
　$x$ : 入力データ, shape = (#data, max_len, input_dim)
 
出力：　  
　$h^{(max\_len)}$ : 出力, shape = (#data, output_dim)

In [0]:
class LSTM:
    """Runs an LSTMCell over a whole sequence and returns the last output."""

    # Store the dimensions and create the cell
    def __init__(self, max_len, input_dim, output_dim):
        self.max_len = max_len
        self.input_dim = input_dim
        self.output_dim = output_dim
        # The cell that performs the per-step computation
        self.cell = LSTMCell(input_dim, output_dim)
        # "Previous output" and "previous state" used at the first time step
        self.h0 = np.zeros(output_dim)
        self.s0 = np.zeros(output_dim)

    def forward(self, x):
        """Return h after max_len steps; x has shape (#data, max_len, input_dim)."""
        batch_shape = (x.shape[0], self.output_dim)
        # Repeat h0 / s0 once per sample to build the initial h and s
        h = np.broadcast_to(self.h0, batch_shape)
        s = np.broadcast_to(self.s0, batch_shape)
        # Apply the single-step computation once per time step
        for step in range(self.max_len):
            h, s = self.cell.forward(x[:, step, :], h, s)
        return h

## GRU

### GRUCell クラス
GRUの順伝播(1ステップ)を計算  
入力:   
　$x$ : 各時刻の入力データ, shape = (#data, input_dim)  
　$h^{(t-1)}$ : 前の時刻の出力, shape = (#data, output_dim)  
 
出力：　  
　$h^{(t)}$ : 出力, shape = (#data, output_dim)  

In [0]:
class GRUCell:
    """Single-step GRU cell (forward pass only)."""

    # Store the dimensions and create the parameters
    def __init__(self, input_dim, output_dim):
        self.output_dim = output_dim
        self.input_dim = input_dim
        # Parameters are drawn uniformly from [0, 1); the creation order
        # (U_*, then W_*, then b_*) determines the values under a fixed seed.
        input_shape = (input_dim, output_dim)
        recurrent_shape = (output_dim, output_dim)
        self.U_r = np.random.rand(*input_shape)
        self.U_z = np.random.rand(*input_shape)
        self.U_h_tilde = np.random.rand(*input_shape)
        self.W_r = np.random.rand(*recurrent_shape)
        self.W_z = np.random.rand(*recurrent_shape)
        self.W_h_tilde = np.random.rand(*recurrent_shape)
        self.b_r = np.random.rand(output_dim)
        self.b_z = np.random.rand(output_dim)
        self.b_h_tilde = np.random.rand(output_dim)

    # One step of forward propagation
    def forward(self, x, h_prev):
        """Return the new output h from input x and previous output h_prev."""
        # Reset gate
        reset = sigmoid(np.dot(x, self.U_r) + np.dot(h_prev, self.W_r) + self.b_r)
        # Update gate
        update = sigmoid(np.dot(x, self.U_z) + np.dot(h_prev, self.W_z) + self.b_z)
        # Candidate output, computed from the reset-gated previous output
        candidate = np.tanh(
            np.dot(x, self.U_h_tilde)
            + np.dot(reset * h_prev, self.W_h_tilde)
            + self.b_h_tilde
        )
        return (1 - update) * h_prev + update * candidate

### GRU クラス
順伝播(全ステップ)を計算  
入力:   
　$x$ : 入力データ, shape = (#data, max_len, input_dim)
 
出力：　  
　$h^{(max\_len)}$ : 出力, shape = (#data, output_dim)

In [0]:
class GRU:
    """Runs a GRUCell over a whole sequence and returns the last output."""

    # Store the dimensions and create the cell
    def __init__(self, max_len, input_dim, output_dim):
        self.max_len = max_len
        self.input_dim = input_dim
        self.output_dim = output_dim
        # The cell that performs the per-step computation
        self.cell = GRUCell(input_dim, output_dim)
        # "Previous output" used at the first time step (i.e. the initial state)
        self.h0 = np.zeros(output_dim)

    def forward(self, x):
        """Return h after max_len steps; x has shape (#data, max_len, input_dim)."""
        # Repeat h0 once per sample to build the initial h
        h = np.broadcast_to(self.h0, (x.shape[0], self.output_dim))
        # Apply the single-step computation once per time step
        for step in range(self.max_len):
            h = self.cell.forward(x[:, step, :], h)
        return h

## LSTMとGRUのテスト
※backpropがないので学習はできません

In [0]:
# Input: 2 samples, sequence length 5, 10-dimensional features.
# The builtin float is used because the np.float alias was removed in NumPy 1.24.
x = np.ones((2, 5, 10), dtype=float)

In [6]:
# Fix the seed so the randomly initialised weights are reproducible, then run
# an LSTM (max_len=5, input_dim=10, output_dim=2) forward on x.
np.random.seed(1)
lstm = LSTM(5,10,2)
lstm.forward(x)

array([[0.99766685, 0.99757613],
       [0.99766685, 0.99757613]])

In [0]:
# Fix the seed so the randomly initialised weights are reproducible, then run
# a GRU (max_len=5, input_dim=10, output_dim=2) forward on x.
np.random.seed(1)
gru = GRU(5,10,2)
gru.forward(x)

array([[0.99999887, 0.99995055],
       [0.99999887, 0.99995055]])

## 勾配クリッピング

In [0]:
# gs: list of gradient matrices (or vectors),  v: threshold on the gradient norm
def grad_clip(gs, v):
    """Rescale the gradients in gs so that their joint L2 norm is at most v.

    The norm is computed over all gradients together. When it exceeds v a
    new list of rescaled arrays is returned; otherwise gs itself is returned
    unchanged.
    """
    # Joint norm over every gradient in the list
    total_norm = np.sqrt(sum(np.sum(g ** 2) for g in gs))
    # Shrink only when the norm is above the threshold
    if total_norm > v:
        return [g / total_norm * v for g in gs]
    return gs

### grad_clipのテスト

In [8]:
# Data used in place of real gradients.
# Here the parameters are assumed to be collected in a single matrix.
# The builtin float is used because the np.float alias was removed in NumPy 1.24.
g = np.arange(6, dtype=float).reshape(3, 2)
print(g)
print('norm =', np.linalg.norm(g))

[[0. 1.]
 [2. 3.]
 [4. 5.]]
norm = 7.416198487095663


In [9]:
# Clip with threshold 8
g_8 = grad_clip([g], 8)[0]
g_8    # unchanged

array([[0., 1.],
       [2., 3.],
       [4., 5.]])

In [10]:
# Clip with threshold 5
a_5 = grad_clip([g],5)[0]
a_5

array([[0.        , 0.67419986],
       [1.34839972, 2.02259959],
       [2.69679945, 3.37099931]])

In [11]:
# Check the norm after clipping
np.linalg.norm(a_5)

5.0

# BiLSTM
上で実装したLSTMを利用して双方向LSTMを実装します。

In [0]:
class BiLSTM:
    """Bidirectional LSTM built from two independent LSTM instances.

    One LSTM reads the sequence from the first step to the last, the other
    reads it from the last step to the first; their final outputs are merged
    according to merge_method ("add", "mean" or "concat").
    """

    # Store the settings and create the two directional LSTMs
    def __init__(self, max_len, input_dim, output_dim, merge_method):
        self.merge_method = merge_method
        self.output_dim = output_dim
        self.max_len = max_len

        self.forwardLSTM = LSTM(max_len, input_dim, output_dim)
        self.backwardLSTM = LSTM(max_len, input_dim, output_dim)

    def forward(self, x):
        """Return the merged final outputs of both directions.

        x has shape (#data, max_len, input_dim). The result has shape
        (#data, output_dim) for "add"/"mean" and (#data, 2*output_dim) for
        "concat".

        Raises NotImplementedError for any other merge_method.
        """
        fh = self.forwardLSTM.forward(x)
        # The backward direction must see the time axis reversed; feeding it
        # the same x would just run a second forward-direction LSTM.
        bh = self.backwardLSTM.forward(x[:, ::-1, :])
        if self.merge_method == "add":
            return fh + bh
        elif self.merge_method == "mean":
            return (fh + bh) / 2
        elif self.merge_method == "concat":
            return np.concatenate((fh, bh), axis=1)
        else:
            # NotImplemented is a non-callable constant; the exception class
            # is NotImplementedError.
            raise NotImplementedError("unknown merge method")

# BiLSTMのテスト

In [0]:
# Input: 2 samples, sequence length 5, 10-dimensional features.
# The builtin float is used because the np.float alias was removed in NumPy 1.24.
x = np.ones((2, 5, 10), dtype=float)

In [0]:
# Fix the seed so the randomly initialised weights are reproducible, then run
# a BiLSTM (max_len=5, input_dim=10, output_dim=10, merge by "add") forward on x.
np.random.seed(1)
lstm = BiLSTM(5,10,10,"add")
lstm.forward(x)

array([[1.99920705, 1.99980443, 1.99978512, 1.99973577, 1.99977834,
        1.99978311, 1.99967825, 1.99964642, 1.9998059 , 1.99977405],
       [1.99920705, 1.99980443, 1.99978512, 1.99973577, 1.99977834,
        1.99978311, 1.99967825, 1.99964642, 1.9998059 , 1.99977405]])

## 以下は演習問題の解答例
必要に応じて、講義内容としても利用してください。

## Peephole Connectionsを追加したLSTM

In [0]:
class LSTMCell_P:
    """Single-step LSTM cell with peephole connections (forward pass only)."""

    # Store the dimensions and create the parameters
    def __init__(self, input_dim, output_dim):
        self.output_dim = output_dim
        self.input_dim = input_dim
        # Parameters are drawn uniformly from [0, 1). They are created in the
        # order U_i, U_f, U_o, U_z, W_*, b_*, p_i, p_f, p_o, which determines
        # the values obtained under a fixed seed.
        families = (
            ("U_", (input_dim, output_dim)),    # input weights
            ("W_", (output_dim, output_dim)),   # recurrent weights
            ("b_", (output_dim,)),              # biases
        )
        for prefix, shape in families:
            for gate in ("i", "f", "o", "z"):
                setattr(self, prefix + gate, np.random.rand(*shape))
        # Peephole weights
        for gate in ("i", "f", "o"):
            setattr(self, "p_" + gate, np.random.rand(output_dim))

    # One step of forward propagation
    def forward(self, x, h_prev, s_prev):
        """Return the new output h and state s for one time step."""
        candidate = np.tanh(np.dot(x, self.U_z) + np.dot(h_prev, self.W_z) + self.b_z)
        # Input and forget gates peek at the previous state
        in_gate = sigmoid(
            np.dot(x, self.U_i) + np.dot(h_prev, self.W_i) + self.p_i * s_prev + self.b_i
        )
        forget_gate = sigmoid(
            np.dot(x, self.U_f) + np.dot(h_prev, self.W_f) + self.p_f * s_prev + self.b_f
        )
        s = forget_gate * s_prev + in_gate * candidate
        # The output gate peeks at the freshly updated state
        out_gate = sigmoid(
            np.dot(x, self.U_o) + np.dot(h_prev, self.W_o) + self.p_o * s + self.b_o
        )
        return out_gate * np.tanh(s), s

In [0]:
class LSTM_P:
    """Runs an LSTMCell_P over a whole sequence and returns the last output."""

    # Store the dimensions and create the cell
    def __init__(self, max_len, input_dim, output_dim):
        self.max_len = max_len
        self.input_dim = input_dim
        self.output_dim = output_dim
        # The peephole cell that performs the per-step computation
        self.cell = LSTMCell_P(input_dim, output_dim)
        # "Previous output" and "previous state" used at the first time step
        self.h0 = np.zeros(output_dim)
        self.s0 = np.zeros(output_dim)

    def forward(self, x):
        """Return h after max_len steps; x has shape (#data, max_len, input_dim)."""
        batch_shape = (x.shape[0], self.output_dim)
        # Repeat h0 / s0 once per sample to build the initial h and s
        h = np.broadcast_to(self.h0, batch_shape)
        s = np.broadcast_to(self.s0, batch_shape)
        # Apply the single-step computation once per time step
        for step in range(self.max_len):
            h, s = self.cell.forward(x[:, step, :], h, s)
        return h

## 系列を出力するLSTM

In [0]:
class LSTM_Seq:
    """Runs an LSTMCell over a sequence and returns the output of every step."""

    # Store the dimensions and create the cell
    def __init__(self, max_len, input_dim, output_dim):
        self.max_len = max_len
        self.input_dim = input_dim
        self.output_dim = output_dim
        # The cell that performs the per-step computation
        self.cell = LSTMCell(input_dim, output_dim)
        # "Previous output" and "previous state" used at the first time step
        self.h0 = np.zeros(output_dim)
        self.s0 = np.zeros(output_dim)

    def forward(self, x):
        """Return all outputs, shape (#data, max_len, output_dim).

        x has shape (#data, max_len, input_dim).
        """
        n_data = x.shape[0]
        # Repeat h0 / s0 once per sample to build the initial h and s
        h = np.broadcast_to(self.h0, (n_data, self.output_dim))
        s = np.broadcast_to(self.s0, (n_data, self.output_dim))
        # Buffer that collects the output of each step
        outputs = np.empty((n_data, self.max_len, self.output_dim))
        for step in range(self.max_len):
            h, s = self.cell.forward(x[:, step, :], h, s)
            outputs[:, step, :] = h
        return outputs

## 系列を出力するGRU

In [0]:
class GRU_Seq:
    """Runs a GRUCell over a sequence and returns the output of every step."""

    # Store the dimensions and create the cell
    def __init__(self, max_len, input_dim, output_dim):
        self.max_len = max_len
        self.input_dim = input_dim
        self.output_dim = output_dim
        # The cell that performs the per-step computation
        self.cell = GRUCell(input_dim, output_dim)
        # "Previous output" used at the first time step (i.e. the initial state)
        self.h0 = np.zeros(output_dim)

    def forward(self, x):
        """Return all outputs, shape (#data, max_len, output_dim).

        x has shape (#data, max_len, input_dim).
        """
        n_data = x.shape[0]
        # Repeat h0 once per sample to build the initial h
        h = np.broadcast_to(self.h0, (n_data, self.output_dim))
        # Buffer that collects the output of each step
        outputs = np.empty((n_data, self.max_len, self.output_dim))
        for step in range(self.max_len):
            h = self.cell.forward(x[:, step, :], h)
            outputs[:, step, :] = h
        return outputs

In [0]:
# Fix the seed, then run the peephole LSTM (max_len=5, input_dim=10,
# output_dim=2) forward on x.
np.random.seed(1)
lstm_p = LSTM_P(5,10,2)
lstm_p.forward(x)

array([[0.9997627 , 0.99989612],
       [0.9997627 , 0.99989612]])

`array([[0.9997627 , 0.99989612],
       [0.9997627 , 0.99989612]])`

In [0]:
# Fix the seed, then run the sequence-returning LSTM (max_len=5, input_dim=10,
# output_dim=2) forward on x.
np.random.seed(1)
lstm_seq = LSTM_Seq(5,10,2)
lstm_seq.forward(x)

array([[[0.73940966, 0.75362175],
        [0.95555568, 0.9611083 ],
        [0.99036505, 0.99312864],
        [0.9953357 , 0.99755416],
        [0.99603789, 0.99815691]],

       [[0.73940966, 0.75362175],
        [0.95555568, 0.9611083 ],
        [0.99036505, 0.99312864],
        [0.9953357 , 0.99755416],
        [0.99603789, 0.99815691]]])

`array([[[0.73940966, 0.75362175],
        [0.95555568, 0.9611083 ],
        [0.99036505, 0.99312864],
        [0.9953357 , 0.99755416],
        [0.99603789, 0.99815691]],
       [[0.73940966, 0.75362175],
        [0.95555568, 0.9611083 ],
        [0.99036505, 0.99312864],
        [0.9953357 , 0.99755416],
        [0.99603789, 0.99815691]]])`

In [0]:
# Fix the seed, then run the sequence-returning GRU (max_len=5, input_dim=10,
# output_dim=2) forward on x.
np.random.seed(1)
gru_seq = GRU_Seq(5,10,2)
gru_seq.forward(x)

array([[[0.99387413, 0.99828587],
        [0.99998052, 0.99994929],
        [0.99999882, 0.99995054],
        [0.99999887, 0.99995055],
        [0.99999887, 0.99995055]],

       [[0.99387413, 0.99828587],
        [0.99998052, 0.99994929],
        [0.99999882, 0.99995054],
        [0.99999887, 0.99995055],
        [0.99999887, 0.99995055]]])

`array([[[0.00562019, 0.00150819],
        [0.01119976, 0.00300842],
        [0.01673909, 0.00450079],
        [0.02223855, 0.00598539],
        [0.0276985 , 0.00746229]],
       [[0.00562019, 0.00150819],
        [0.01119976, 0.00300842],
        [0.01673909, 0.00450079],
        [0.02223855, 0.00598539],
        [0.0276985 , 0.00746229]]])`

# Seq2Seq
学習可能なseq2seqのサンプル実装

In [0]:
import os
import time

# Module-level vocabulary tables filled in by _update_vocab():
# id_to_char maps an integer id to its character, char_to_id is the inverse.
id_to_char = {}
char_to_id = {}


def _update_vocab(txt):
    """Register every not-yet-seen character of txt in the vocabulary tables.

    New characters receive the next free integer id (the current vocabulary
    size) and are recorded in both char_to_id and id_to_char.
    """
    for ch in txt:
        if ch in char_to_id:
            continue
        new_id = len(char_to_id)
        char_to_id[ch] = new_id
        id_to_char[new_id] = ch


def load_data():
    """Load 'addition.txt' and return ((x_train, t_train), (x_test, t_test)).

    Each line is split at the first '_' into a question (text before it) and
    an answer (text from '_' onwards, without the line break). Characters are
    mapped to integer ids through the module-level vocabulary, the samples
    are shuffled with a fixed seed (1984, which reseeds NumPy's global RNG),
    and the last 10% are held out as the validation set.

    Every question is assumed to have the same length as the first one, and
    likewise for answers; a line of different length makes the row
    assignment fail.

    Raises FileNotFoundError if 'addition.txt' is not in the working directory.
    """
    questions, answers = [], []

    # The context manager closes the file handle even if parsing fails.
    with open('addition.txt', 'r') as f:
        for line in f:
            idx = line.find('_')
            questions.append(line[:idx])
            # Strip only the line break, so a final line without a trailing
            # newline does not lose its last real character.
            answers.append(line[idx:].rstrip('\n'))

    # create vocab dict
    for q, a in zip(questions, answers):
        _update_vocab(q)
        _update_vocab(a)

    # create numpy array (builtin int: the np.int alias was removed in NumPy 1.24)
    x = np.zeros((len(questions), len(questions[0])), dtype=int)
    t = np.zeros((len(questions), len(answers[0])), dtype=int)

    for i, sentence in enumerate(questions):
        x[i] = [char_to_id[c] for c in sentence]
    for i, sentence in enumerate(answers):
        t[i] = [char_to_id[c] for c in sentence]

    # shuffle
    indices = np.arange(len(x))
    np.random.seed(1984)
    np.random.shuffle(indices)
    x = x[indices]
    t = t[indices]

    # 10% for validation set
    split_at = len(x) - len(x) // 10
    (x_train, x_test) = x[:split_at], x[split_at:]
    (t_train, t_test) = t[:split_at], t[split_at:]

    return (x_train, t_train), (x_test, t_test)


class Adam:
    '''
    Adam optimizer (http://arxiv.org/abs/1412.6980v8).

    Keeps one first-moment (m) and one second-moment (v) buffer per parameter
    and updates the parameter arrays in place.
    '''
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0   # number of update() calls so far
        self.m = None   # first-moment buffers, created on the first update
        self.v = None   # second-moment buffers, created on the first update

    def update(self, params, grads):
        '''Apply one Adam step to each params[i] using grads[i].'''
        if self.m is None:
            # Lazily allocate zero buffers shaped like the parameters
            self.m = [np.zeros_like(p) for p in params]
            self.v = [np.zeros_like(p) for p in params]

        self.iter += 1
        # Step size with the bias correction folded in
        scaled_lr = self.lr * np.sqrt(1.0 - self.beta2**self.iter)
        lr_t = scaled_lr / (1.0 - self.beta1**self.iter)

        for i, grad in enumerate(grads[:len(params)]):
            self.m[i] += (1 - self.beta1) * (grad - self.m[i])
            self.v[i] += (1 - self.beta2) * (grad**2 - self.v[i])

            params[i] -= lr_t * self.m[i] / (np.sqrt(self.v[i]) + 1e-7)


def eval_seq2seq(model, question, correct, id_to_char,
                 verbos=False):
    """Return 1 if the model reproduces the expected answer, otherwise 0.

    The first id of correct is the delimiter used as the start token; the
    remaining ids are the expected answer. With verbos=True the question,
    the expected answer and the marked guess are printed.
    """
    flat_correct = correct.flatten()
    # Leading delimiter character
    start_id = flat_correct[0]
    target_ids = flat_correct[1:]
    guess_ids = model.generate(question, start_id, len(target_ids))

    # Convert id sequences to strings
    def to_text(ids):
        return ''.join(id_to_char[int(c)] for c in ids)

    question = to_text(question.flatten())
    correct = to_text(target_ids)
    guess = to_text(guess_ids)
    matched = guess == correct

    if verbos:
        print('Q', question)
        print('T', correct)

        if os.name == 'nt':
            # Plain ASCII marks on Windows
            mark = 'O' if matched else 'X'
        else:
            colors = {'ok': '\033[92m', 'fail': '\033[91m', 'close': '\033[0m'}
            if matched:
                mark = colors['ok'] + '☑' + colors['close']
            else:
                mark = colors['fail'] + '☒' + colors['close']
        print(mark + ' ' + guess)
        print('---')

    return 1 if matched else 0


def sigmoid(x):
    """Return the element-wise logistic sigmoid, 1 / (1 + exp(-x))."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

def softmax(x):
    """Return the softmax of x taken along axis 1.

    The row maximum is subtracted before exponentiating; the input array is
    not modified.
    """
    shifted = x - x.max(axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=1, keepdims=True)


class Embedding:
    """Lookup layer that selects rows of the weight matrix W by index."""

    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None   # indices used in the most recent forward pass

    def forward(self, idx):
        """Return W[idx] and remember idx for the backward pass."""
        [weight] = self.params
        self.idx = idx
        return weight[idx]

    def backward(self, dout):
        """Accumulate dout into the gradient rows chosen in forward; returns None."""
        [grad_w] = self.grads
        # Start from zero, then add with np.add.at so that repeated indices
        # are accumulated instead of overwritten.
        grad_w[...] = 0
        np.add.at(grad_w, self.idx, dout)
        return None



class LSTMCell:
    """Single-step LSTM cell with forward and backward passes.

    The four internal transforms share one weight matrix each; their columns
    are laid out in the order (f, g, i, o).
    """

    def __init__(self, Wx, Wh, b):
        '''

        Parameters
        ----------
        Wx: weight parameters for the input `x` (the four weights stacked together)
        Wh: weight parameters for the hidden state `h` (the four weights stacked together)
        b: biases (the four biases stacked together)
        '''
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(p) for p in (Wx, Wh, b)]
        self.cache = None   # intermediate values saved by forward for backward

    def forward(self, x, h_prev, c_prev):
        """Return (h_next, c_next) for one step and cache the intermediates."""
        Wx, Wh, b = self.params
        _, H = h_prev.shape

        A = np.dot(x, Wx) + np.dot(h_prev, Wh) + b

        # Column blocks of A in the order f, g, i, o
        f = sigmoid(A[:, :H])
        g = np.tanh(A[:, H:2*H])
        i = sigmoid(A[:, 2*H:3*H])
        o = sigmoid(A[:, 3*H:])

        c_next = f * c_prev + g * i
        h_next = o * np.tanh(c_next)

        self.cache = (x, h_prev, c_prev, i, f, g, o, c_next)
        return h_next, c_next

    def backward(self, dh_next, dc_next):
        """Back-propagate one step.

        Writes the parameter gradients into self.grads and returns
        (dx, dh_prev, dc_prev).
        """
        Wx, Wh, _ = self.params
        x, h_prev, c_prev, i, f, g, o, c_next = self.cache

        tanh_c = np.tanh(c_next)

        # Total gradient reaching the cell state
        ds = dc_next + (dh_next * o) * (1 - tanh_c ** 2)
        dc_prev = ds * f

        # Gradient w.r.t. each activation, then through its nonlinearity
        df = ds * c_prev
        df *= f * (1 - f)
        dg = ds * i
        dg *= (1 - g ** 2)
        di = ds * g
        di *= i * (1 - i)
        do = dh_next * tanh_c
        do *= o * (1 - o)

        # Same column order as in forward: f, g, i, o
        dA = np.hstack((df, dg, di, do))

        param_grads = (np.dot(x.T, dA), np.dot(h_prev.T, dA), dA.sum(axis=0))
        for slot, value in zip(self.grads, param_grads):
            slot[...] = value

        return np.dot(dA, Wx.T), np.dot(dA, Wh.T), dc_prev


class LSTM:
    """LSTM layer over a whole sequence, with forward and backward passes.

    With stateful=True the hidden and cell states are carried over between
    forward calls; otherwise they restart from zero on every call.
    """

    def __init__(self, Wx, Wh, b, stateful=False):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(p) for p in (Wx, Wh, b)]
        self.layers = None   # one LSTMCell per time step of the last forward

        self.h, self.c = None, None
        self.dh = None       # gradient w.r.t. the initial hidden state
        self.stateful = stateful

    def forward(self, xs):
        """Return the hidden states of all steps, shape (N, T, H); xs is (N, T, D)."""
        _, Wh, _ = self.params
        N, T, D = xs.shape
        H = Wh.shape[0]

        # Restart from zeros unless a state is being carried over
        if self.h is None or not self.stateful:
            self.h = np.zeros((N, H), dtype='f')
        if self.c is None or not self.stateful:
            self.c = np.zeros((N, H), dtype='f')

        self.layers = []
        hs = np.empty((N, T, H), dtype='f')
        for t in range(T):
            cell = LSTMCell(*self.params)
            self.h, self.c = cell.forward(xs[:, t, :], self.h, self.c)
            hs[:, t, :] = self.h
            self.layers.append(cell)

        return hs

    def backward(self, dhs):
        """Back-propagate through time; returns dxs with shape (N, T, D).

        Parameter gradients summed over all steps are written into
        self.grads, and the gradient reaching the initial hidden state is
        stored in self.dh.
        """
        Wx, _, _ = self.params
        N, T, H = dhs.shape
        D = Wx.shape[0]

        dxs = np.empty((N, T, D), dtype='f')
        dh, dc = 0, 0

        totals = [0, 0, 0]
        for t in range(T - 1, -1, -1):
            cell = self.layers[t]
            dx, dh, dc = cell.backward(dhs[:, t, :] + dh, dc)
            dxs[:, t, :] = dx
            totals = [acc + g for acc, g in zip(totals, cell.grads)]

        for slot, total in zip(self.grads, totals):
            slot[...] = total
        self.dh = dh
        return dxs

    def set_state(self, h, c=None):
        """Set the hidden state h and the cell state c."""
        self.h, self.c = h, c

    def reset_state(self):
        """Forget both the hidden state and the cell state."""
        self.h, self.c = None, None


class TimeEmbedding:
    """Applies an Embedding lookup at every time step of an (N, T) id array."""

    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.layers = None   # one Embedding per time step of the last forward
        self.W = W

    def forward(self, xs):
        """Return the embedded sequence, shape (N, T, D), for ids xs of shape (N, T)."""
        N, T = xs.shape
        _, D = self.W.shape

        self.layers = []
        out = np.empty((N, T, D), dtype='f')
        for t in range(T):
            step_layer = Embedding(self.W)
            out[:, t, :] = step_layer.forward(xs[:, t])
            self.layers.append(step_layer)

        return out

    def backward(self, dout):
        """Sum the per-step weight gradients into self.grads[0]; returns None."""
        N, T, D = dout.shape

        total = 0
        for t in range(T):
            step_layer = self.layers[t]
            step_layer.backward(dout[:, t, :])
            total = total + step_layer.grads[0]

        self.grads[0][...] = total
        return None


class TimeAffine:
    """Affine transform (x @ W + b) applied independently at every time step."""

    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]
        self.x = None   # input of the most recent forward pass

    def forward(self, x):
        """Return scores of shape (N, T, -1) for an input x of shape (N, T, D)."""
        N, T, _ = x.shape
        W, b = self.params

        # Fold batch and time into one axis so a single matmul suffices
        flat_x = x.reshape(N*T, -1)
        self.x = x
        return (np.dot(flat_x, W) + b).reshape(N, T, -1)

    def backward(self, dout):
        """Write dW and db into self.grads and return dx shaped like the input."""
        x = self.x
        N, T, _ = x.shape
        W, _ = self.params

        flat_dout = dout.reshape(N*T, -1)
        flat_x = x.reshape(N*T, -1)

        self.grads[0][...] = np.dot(flat_x.T, flat_dout)
        self.grads[1][...] = np.sum(flat_dout, axis=0)

        return np.dot(flat_dout, W.T).reshape(*x.shape)


class TimeSoftmaxWithLoss:
    """Softmax followed by cross-entropy loss, averaged over all time steps.

    Positions whose label equals ignore_label (-1) contribute neither to the
    loss nor to the gradient.
    """

    def __init__(self):
        self.params, self.grads = [], []
        self.cache = None
        self.ignore_label = -1

    def forward(self, xs, ts):
        """Return the mean loss for scores xs (N, T, V) and labels ts."""
        N, T, V = xs.shape

        if ts.ndim == 3:  # labels given as one-hot vectors
            ts = ts.argmax(axis=2)

        mask = (ts != self.ignore_label)

        # Fold batch and time into one axis
        flat_scores = xs.reshape(N * T, V)
        flat_labels = ts.reshape(N * T)
        flat_mask = mask.reshape(N * T)

        ys = softmax(flat_scores)
        # Log-probability of each target; masked positions are zeroed out
        log_probs = np.log(ys[np.arange(N * T), flat_labels]) * flat_mask
        loss = -np.sum(log_probs) / flat_mask.sum()

        self.cache = (flat_labels, ys, flat_mask, (N, T, V))
        return loss

    def backward(self, dout=1):
        """Return the gradient w.r.t. the scores, shape (N, T, V).

        The cached probability array is reused as the output buffer and is
        therefore modified in place.
        """
        labels, probs, valid, (N, T, V) = self.cache

        grad = probs
        grad[np.arange(N * T), labels] -= 1
        grad *= dout
        grad /= valid.sum()
        grad *= valid[:, np.newaxis]  # zero the gradient at ignored positions

        return grad.reshape((N, T, V))





class Encoder:
    """Seq2seq encoder: TimeEmbedding followed by a non-stateful LSTM."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        randn = np.random.randn

        # Weights are created in the order embed, Wx, Wh (this fixes the
        # values obtained under a given seed) and stored as float32.
        embed_W = (randn(V, D) / 100).astype('f')
        lstm_Wx = (randn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (randn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = LSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)

        self.params = self.embed.params + self.lstm.params
        self.grads = self.embed.grads + self.lstm.grads
        self.hs = None   # all hidden states of the most recent forward pass

    def forward(self, xs):
        """Encode xs and return the hidden state of the last time step."""
        embedded = self.embed.forward(xs)
        self.hs = self.lstm.forward(embedded)
        return self.hs[:, -1, :]

    def backward(self, dh):
        """Back-propagate dh, the gradient w.r.t. the last hidden state."""
        # Only the last time step receives a gradient from outside
        dhs = np.zeros_like(self.hs)
        dhs[:, -1, :] = dh

        return self.embed.backward(self.lstm.backward(dhs))


class Decoder:
    """Seq2seq decoder: TimeEmbedding, stateful LSTM, then TimeAffine."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        randn = np.random.randn

        # Weights are created in the order embed, Wx, Wh, affine (this fixes
        # the values obtained under a given seed) and stored as float32.
        embed_W = (randn(V, D) / 100).astype('f')
        lstm_Wx = (randn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (randn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (randn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = LSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params, self.grads = [], []
        for sublayer in (self.embed, self.lstm, self.affine):
            self.params += sublayer.params
            self.grads += sublayer.grads

    def forward(self, xs, h):
        """Return scores for input ids xs, starting from hidden state h."""
        self.lstm.set_state(h)

        embedded = self.embed.forward(xs)
        hidden = self.lstm.forward(embedded)
        return self.affine.forward(hidden)

    def backward(self, dscore):
        """Back-propagate dscore and return the gradient w.r.t. the initial h."""
        dout = self.affine.backward(dscore)
        dout = self.lstm.backward(dout)
        self.embed.backward(dout)
        return self.lstm.dh

    def generate(self, h, start_id, sample_size):
        """Greedily generate sample_size ids, starting from start_id and state h."""
        self.lstm.set_state(h)

        sampled = []
        current_id = start_id
        for _ in range(sample_size):
            # A (1, 1) batch holding the previously chosen id
            x = np.array(current_id).reshape((1, 1))
            score = self.affine.forward(self.lstm.forward(self.embed.forward(x)))

            current_id = np.argmax(score.flatten())
            sampled.append(int(current_id))

        return sampled


class Seq2seq:
    """Encoder-decoder model trained with a time-distributed softmax loss."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        dims = (vocab_size, wordvec_size, hidden_size)
        self.encoder = Encoder(*dims)
        self.decoder = Decoder(*dims)
        self.softmax = TimeSoftmaxWithLoss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads

    def forward(self, xs, ts):
        """Return the loss for questions xs and target sequences ts."""
        # The decoder input drops the last id; its targets drop the first id
        decoder_xs = ts[:, :-1]
        decoder_ts = ts[:, 1:]

        h = self.encoder.forward(xs)
        score = self.decoder.forward(decoder_xs, h)
        return self.softmax.forward(score, decoder_ts)

    def backward(self, dout=1):
        """Back-propagate through the loss, the decoder and then the encoder."""
        dscore = self.softmax.backward(dout)
        dh = self.decoder.backward(dscore)
        return self.encoder.backward(dh)

    def generate(self, xs, start_id, sample_size):
        """Encode xs and let the decoder generate sample_size ids from start_id."""
        h = self.encoder.forward(xs)
        return self.decoder.generate(h, start_id, sample_size)


def main():
    """Train the seq2seq model on the addition data and print progress.

    For each epoch the training set is reshuffled and processed in
    mini-batches; the running average loss is printed every eval_interval
    iterations, and after the epoch the exact-match accuracy on the held-out
    set is printed (the first 10 samples are shown verbosely).
    """
    (x_train, t_train), (x_test, t_test) = load_data()

    # Hyper-parameter settings
    vocab_size = len(char_to_id)
    wordvec_size = 16
    hidden_size = 128
    max_epoch = 25
    eval_interval = 20
    batch_size = 128

    model = Seq2seq(vocab_size, wordvec_size, hidden_size)
    optimizer = Adam()

    acc_list = []   # per-epoch validation accuracy (collected, not returned)
    x, t = x_train, t_train
    data_size = len(x)
    max_iters = data_size // batch_size
    for epoch in range(max_epoch):
        total_loss = 0
        loss_count = 0

        idx = np.random.permutation(np.arange(data_size))
        x = x[idx]
        t = t[idx]

        for iters in range(max_iters):
            batch = slice(iters * batch_size, (iters + 1) * batch_size)
            batch_x = x[batch]
            batch_t = t[batch]

            # Compute the gradients and update the parameters
            loss = model.forward(batch_x, batch_t)
            model.backward()
            optimizer.update(model.params, model.grads)
            total_loss += loss
            loss_count += 1

            # Report the average loss since the previous report
            if iters % eval_interval == 0:
                avg_loss = total_loss / loss_count
                print('| epoch %d |  iter %d / %d | loss %.2f'
                        % (epoch + 1, iters + 1, max_iters, avg_loss))
                total_loss, loss_count = 0, 0

        # NOTE: the original `epoch += 1` here was dead code, because the
        # for loop rebinds `epoch` at the start of every iteration.

        correct_num = 0
        for i in range(len(x_test)):
            # Index with a list so the batch axis is kept
            question, correct = x_test[[i]], t_test[[i]]
            verbose = i < 10
            correct_num += eval_seq2seq(model, question, correct,
                                        id_to_char, verbose)

        acc = float(correct_num) / len(x_test)
        acc_list.append(acc)
        print('val acc %.3f%%' % (acc * 100))

# Run training. load_data() opens 'addition.txt' from the current working
# directory, so this fails with FileNotFoundError when the file is absent.
main()

FileNotFoundError: ignored