### 文書生成の実装

In [1]:
import numpy as np

from common.functions import softmax
from rnnlm import Rnnlm
from better_rnnlm import BetterRnnlm

In [2]:
class RnnlmGen(Rnnlm):
    """Rnnlm subclass that adds sampling-based sequence generation."""

    def generate(self, start_id, skip_ids=None, sample_size=100):
        """Sample a sequence of word ids, starting from ``start_id``.

        Each step feeds the previously accepted id to ``self.predict``
        (not defined in this class; presumably inherited from ``Rnnlm``),
        turns the scores into probabilities with ``softmax`` and draws
        the next id from that distribution.

        Args:
            start_id: id of the first word; it is also the first element
                of the returned list.
            skip_ids: optional collection of ids that must never be
                emitted. A draw that hits one of them is discarded and
                another prediction step is run.
            sample_size: total length of the returned list, including
                ``start_id``.

        Returns:
            list of ids. Every sampled element is a plain Python ``int``.
        """
        word_ids = [start_id]

        x = start_id
        while len(word_ids) < sample_size:
            x = np.array(x).reshape(1, 1)
            score = self.predict(x)
            p = softmax(score.flatten())

            # choice(..., size=1) yields a one-element array. Pull the scalar
            # out explicitly: int() on a non-0-d array is deprecated in NumPy,
            # and a plain int makes the skip_ids membership test unambiguous.
            sampled = int(np.random.choice(len(p), size=1, p=p)[0])
            if (skip_ids is None) or (sampled not in skip_ids):
                x = sampled
                word_ids.append(sampled)

        return word_ids

In [3]:
from dataset import ptb

In [4]:
# Load the PTB 'train' split together with its word->id and id->word tables.
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
corpus_size = len(corpus)

In [5]:
# Build the text generator with whatever weights its constructor sets up.
model = RnnlmGen()
# Loading saved parameters is disabled below.
# NOTE(review): presumably this is why the sample text printed by this
# notebook looks like random words — confirm.
# model.load_params('./Rnnlm.pkl')

In [6]:
# Word that seeds the generation, converted to its vocabulary id.
start_word = 'you'
start_id = word_to_id[start_word]
# Tokens that should never appear in the output; generate() discards a draw
# that hits one of these ids and samples again.
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[w] for w in skip_words]

In [7]:
# Sample a sequence of ids, map them back to words and print the text with
# each ' <eos>' marker turned into a full stop plus a line break.
word_ids = model.generate(start_id, skip_ids)
txt = ' '.join(id_to_word[i] for i in word_ids).replace(' <eos>', '.\n')
print(txt)

you portraying weil uniform libel tracks nekoosa modify abused cox hampered chaos sellers neighbors nixon evident stayed adviser reasonable detrex ignoring partly personally canceled affairs eurodollar he legislator suitable announced institution provoked unit reverse traffickers tied constitute surge higher on-site polled jolted actions barney sight televised suffering outlined balked decisions predict edward stated effect pfizer madison shortly corporations crest sanford barbara analysis yeargin bob mass lauder harold honduras wendy turned cray respondents obliged helping air trips pump tree maker leery free booked redevelopment industries orange substance marketplace expectation oddly streets survival introduction background waters tide killing operators slumped mae wanting


### 足し算データセット

In [8]:
from dataset import sequence

In [9]:
# Load the train/test split of the addition dataset (passing seed=1984)
# and fetch the char->id / id->char vocabulary tables.
(x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt', seed=1984)
char_to_id, id_to_char = sequence.get_vocab()

In [10]:
# Show the array shapes of the training and test splits (inputs, targets).
print(x_train.shape, t_train.shape)
print(x_test.shape, t_test.shape)

(45000, 7) (45000, 5)
(5000, 7) (5000, 5)


In [11]:
# Show the first training example as raw character ids: input, then target.
print(x_train[0])
print(t_train[0])

[ 3  0  2  0  0 11  5]
[ 6  0 11  7  5]


In [12]:
# Decode that same example back into characters through id_to_char.
print(''.join([id_to_char[c] for c in x_train[0]]))
print(''.join([id_to_char[c] for c in t_train[0]]))

71+118 
_189 


### Seq2Seqの実装

In [13]:
from common.time_layers import *

In [14]:
class Encoder:
    """Embedding followed by a non-stateful TimeLSTM.

    ``forward`` hands back only the slice at the last index of axis 1 of
    the LSTM output; ``backward`` routes the incoming gradient to that
    same slice and leaves every other position at zero.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create randomly initialised float32 weights and both layers."""
        randn = np.random.randn
        gate_width = 4 * hidden_size

        # The draw order (embedding, Wx, Wh) is kept fixed so that a seeded
        # RNG produces the same initial weights.
        w_embed = (randn(vocab_size, wordvec_size) / 100).astype('f')
        w_x = (randn(wordvec_size, gate_width) / np.sqrt(wordvec_size)).astype('f')
        w_h = (randn(hidden_size, gate_width) / np.sqrt(hidden_size)).astype('f')
        bias = np.zeros(gate_width).astype('f')

        self.embed = TimeEmbedding(w_embed)
        self.lstm = TimeLSTM(w_x, w_h, bias, stateful=False)

        self.params = self.embed.params + self.lstm.params
        self.grads = self.embed.grads + self.lstm.grads
        # Full LSTM output of the latest forward pass; backward needs its shape.
        self.hs = None

    def forward(self, xs):
        """Encode ``xs`` and return the last slice along axis 1."""
        self.hs = self.lstm.forward(self.embed.forward(xs))
        return self.hs[:, -1, :]

    def backward(self, dh):
        """Back-propagate ``dh``, the gradient of the value ``forward`` returned."""
        # Only the last position contributed to the output, so it is the only
        # one that receives a non-zero gradient.
        grad_hs = np.zeros_like(self.hs)
        grad_hs[:, -1, :] = dh
        return self.embed.backward(self.lstm.backward(grad_hs))

In [15]:
class Decoder:
    """Embedding -> stateful TimeLSTM -> TimeAffine producing per-step scores."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create randomly initialised float32 weights and the three layers."""
        randn = np.random.randn
        gate_width = 4 * hidden_size

        # Draw order is kept fixed so a seeded RNG gives the same weights.
        w_embed = (randn(vocab_size, wordvec_size) / 100).astype('f')
        w_x = (randn(wordvec_size, gate_width) / np.sqrt(wordvec_size)).astype('f')
        w_h = (randn(hidden_size, gate_width) / np.sqrt(hidden_size)).astype('f')
        bias = np.zeros(gate_width).astype('f')
        w_out = (randn(hidden_size, vocab_size) / np.sqrt(hidden_size)).astype('f')
        b_out = np.zeros(vocab_size).astype('f')

        self.embed = TimeEmbedding(w_embed)
        self.lstm = TimeLSTM(w_x, w_h, bias, stateful=True)
        self.affine = TimeAffine(w_out, b_out)

        # Flatten the per-layer parameter/gradient lists, in layer order.
        layers = (self.embed, self.lstm, self.affine)
        self.params = [p for layer in layers for p in layer.params]
        self.grads = [g for layer in layers for g in layer.grads]

    def forward(self, xs, h):
        """Return scores for ``xs`` after setting the LSTM state to ``h``."""
        self.lstm.set_state(h)

        embedded = self.embed.forward(xs)
        hidden = self.lstm.forward(embedded)
        return self.affine.forward(hidden)

    def backward(self, dscore):
        """Back-propagate ``dscore`` and return the LSTM's ``dh`` attribute."""
        grad = dscore
        for layer in (self.affine, self.lstm, self.embed):
            grad = layer.backward(grad)
        return self.lstm.dh

    def generate(self, h, start_id, sample_size):
        """Greedily decode ``sample_size`` ids.

        Starts from ``start_id`` with the LSTM state set to ``h``; at each
        step the arg-max of the scores becomes both the next output and the
        next input. Returns a list of Python ints.
        """
        self.lstm.set_state(h)

        sampled = []
        current_id = start_id
        for _ in range(sample_size):
            x = np.array(current_id).reshape((1, 1))
            score = self.affine.forward(
                self.lstm.forward(self.embed.forward(x)))

            current_id = np.argmax(score.flatten())
            sampled.append(int(current_id))

        return sampled

In [16]:
from common.base_model import BaseModel

In [17]:
class Seq2seq(BaseModel):
    """Encoder/decoder pair trained with a TimeSoftmaxWithLoss head."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Build the encoder, decoder and loss layer; merge their params/grads."""
        self.encoder = Encoder(vocab_size, wordvec_size, hidden_size)
        self.decoder = Decoder(vocab_size, wordvec_size, hidden_size)
        self.softmax = TimeSoftmaxWithLoss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads

    def forward(self, xs, ts):
        """Return the loss for inputs ``xs`` and target sequences ``ts``.

        The decoder is fed ``ts`` without its last column and is scored
        against ``ts`` without its first column (shift-by-one targets).
        """
        decoder_xs = ts[:, :-1]
        decoder_ts = ts[:, 1:]

        h = self.encoder.forward(xs)
        score = self.decoder.forward(decoder_xs, h)
        return self.softmax.forward(score, decoder_ts)

    def backward(self, dout=1):
        """Back-propagate through loss, decoder and encoder, in that order."""
        dscore = self.softmax.backward(dout)
        dh = self.decoder.backward(dscore)
        return self.encoder.backward(dh)

    def generate(self, xs, start_id, sample_size):
        """Encode ``xs`` and let the decoder generate ``sample_size`` ids."""
        return self.decoder.generate(
            self.encoder.forward(xs), start_id, sample_size)

In [18]:
from dataset import sequence
from common.optimizer import Adam
from common.trainer import Trainer
from common.util import eval_seq2seq

In [19]:
# Reload the addition dataset (no seed argument this time) and the
# char->id / id->char vocabulary tables.
(x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt')
char_to_id, id_to_char = sequence.get_vocab()

In [20]:
# Hyperparameters for the plain Seq2seq run.
vocab_size = len(char_to_id)
wordvec_size = 16
hidden_size = 128
batch_size = 128
max_epoch = 25
# Forwarded to Trainer.fit as max_grad (presumably a gradient-clipping
# threshold — confirm against Trainer).
max_grad = 5.0

In [21]:
# Build the plain Seq2seq model and hand it, with a default-constructed Adam
# optimizer, to the Trainer.
model = Seq2seq(vocab_size, wordvec_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

In [22]:
# Train one epoch at a time; after each epoch score every test example with
# eval_seq2seq and record the fraction answered correctly.
acc_list = []
n_test = len(x_test)
for epoch in range(max_epoch):
    trainer.fit(x_train, t_train, max_epoch=1,
                batch_size=batch_size, max_grad=max_grad)

    correct_num = 0
    for i in range(n_test):
        # Index with a one-element list so the leading axis is kept.
        question = x_test[[i]]
        correct = t_test[[i]]
        # Only the first ten examples are evaluated in verbose mode.
        correct_num += eval_seq2seq(model, question, correct, id_to_char, i < 10)

    acc = float(correct_num) / n_test
    acc_list.append(acc)
    print('val acc %.3f%%' % (acc * 100))

| epoch 1 |  iter 1 / 351 | time 0[s] | loss 2.56
| epoch 1 |  iter 21 / 351 | time 0[s] | loss 2.53
| epoch 1 |  iter 41 / 351 | time 1[s] | loss 2.17
| epoch 1 |  iter 61 / 351 | time 1[s] | loss 1.96
| epoch 1 |  iter 81 / 351 | time 2[s] | loss 1.92
| epoch 1 |  iter 101 / 351 | time 3[s] | loss 1.87
| epoch 1 |  iter 121 / 351 | time 3[s] | loss 1.85
| epoch 1 |  iter 141 / 351 | time 4[s] | loss 1.83
| epoch 1 |  iter 161 / 351 | time 5[s] | loss 1.79
| epoch 1 |  iter 181 / 351 | time 6[s] | loss 1.77
| epoch 1 |  iter 201 / 351 | time 7[s] | loss 1.77
| epoch 1 |  iter 221 / 351 | time 7[s] | loss 1.76
| epoch 1 |  iter 241 / 351 | time 8[s] | loss 1.76
| epoch 1 |  iter 261 / 351 | time 9[s] | loss 1.76
| epoch 1 |  iter 281 / 351 | time 9[s] | loss 1.75
| epoch 1 |  iter 301 / 351 | time 10[s] | loss 1.74
| epoch 1 |  iter 321 / 351 | time 11[s] | loss 1.75
| epoch 1 |  iter 341 / 351 | time 11[s] | loss 1.74
Q 77+85  
T 162 
[91m☒[0m 100 
---
Q 975+164
T 1139
[91m☒[0m 10

| epoch 7 |  iter 121 / 351 | time 4[s] | loss 1.08
| epoch 7 |  iter 141 / 351 | time 4[s] | loss 1.07
| epoch 7 |  iter 161 / 351 | time 5[s] | loss 1.08
| epoch 7 |  iter 181 / 351 | time 6[s] | loss 1.07
| epoch 7 |  iter 201 / 351 | time 7[s] | loss 1.06
| epoch 7 |  iter 221 / 351 | time 7[s] | loss 1.06
| epoch 7 |  iter 241 / 351 | time 8[s] | loss 1.06
| epoch 7 |  iter 261 / 351 | time 9[s] | loss 1.06
| epoch 7 |  iter 281 / 351 | time 9[s] | loss 1.09
| epoch 7 |  iter 301 / 351 | time 10[s] | loss 1.05
| epoch 7 |  iter 321 / 351 | time 11[s] | loss 1.05
| epoch 7 |  iter 341 / 351 | time 12[s] | loss 1.04
Q 77+85  
T 162 
[91m☒[0m 156 
---
Q 975+164
T 1139
[91m☒[0m 1160
---
Q 582+84 
T 666 
[91m☒[0m 665 
---
Q 8+155  
T 163 
[91m☒[0m 146 
---
Q 367+55 
T 422 
[91m☒[0m 418 
---
Q 600+257
T 857 
[91m☒[0m 856 
---
Q 761+292
T 1053
[91m☒[0m 1039
---
Q 830+597
T 1427
[91m☒[0m 1409
---
Q 26+838 
T 864 
[92m☑[0m 864 
---
Q 143+93 
T 236 
[91m☒[0m 222 
---
val

| epoch 13 |  iter 221 / 351 | time 7[s] | loss 0.89
| epoch 13 |  iter 241 / 351 | time 8[s] | loss 0.88
| epoch 13 |  iter 261 / 351 | time 9[s] | loss 0.90
| epoch 13 |  iter 281 / 351 | time 10[s] | loss 0.91
| epoch 13 |  iter 301 / 351 | time 10[s] | loss 0.96
| epoch 13 |  iter 321 / 351 | time 11[s] | loss 0.92
| epoch 13 |  iter 341 / 351 | time 12[s] | loss 0.91
Q 77+85  
T 162 
[91m☒[0m 161 
---
Q 975+164
T 1139
[91m☒[0m 1118
---
Q 582+84 
T 666 
[91m☒[0m 669 
---
Q 8+155  
T 163 
[91m☒[0m 167 
---
Q 367+55 
T 422 
[91m☒[0m 419 
---
Q 600+257
T 857 
[91m☒[0m 859 
---
Q 761+292
T 1053
[91m☒[0m 1039
---
Q 830+597
T 1427
[91m☒[0m 1418
---
Q 26+838 
T 864 
[91m☒[0m 859 
---
Q 143+93 
T 236 
[91m☒[0m 239 
---
val acc 7.420%
| epoch 14 |  iter 1 / 351 | time 0[s] | loss 0.89
| epoch 14 |  iter 21 / 351 | time 0[s] | loss 0.89
| epoch 14 |  iter 41 / 351 | time 1[s] | loss 0.89
| epoch 14 |  iter 61 / 351 | time 2[s] | loss 0.90
| epoch 14 |  iter 81 / 351 | tim

| epoch 19 |  iter 301 / 351 | time 10[s] | loss 0.81
| epoch 19 |  iter 321 / 351 | time 11[s] | loss 0.83
| epoch 19 |  iter 341 / 351 | time 12[s] | loss 0.85
Q 77+85  
T 162 
[91m☒[0m 164 
---
Q 975+164
T 1139
[91m☒[0m 1160
---
Q 582+84 
T 666 
[91m☒[0m 672 
---
Q 8+155  
T 163 
[91m☒[0m 167 
---
Q 367+55 
T 422 
[91m☒[0m 424 
---
Q 600+257
T 857 
[92m☑[0m 857 
---
Q 761+292
T 1053
[91m☒[0m 1049
---
Q 830+597
T 1427
[91m☒[0m 1424
---
Q 26+838 
T 864 
[91m☒[0m 872 
---
Q 143+93 
T 236 
[91m☒[0m 237 
---
val acc 8.760%
| epoch 20 |  iter 1 / 351 | time 0[s] | loss 0.80
| epoch 20 |  iter 21 / 351 | time 0[s] | loss 0.82
| epoch 20 |  iter 41 / 351 | time 1[s] | loss 0.83
| epoch 20 |  iter 61 / 351 | time 2[s] | loss 0.83
| epoch 20 |  iter 81 / 351 | time 2[s] | loss 0.81
| epoch 20 |  iter 101 / 351 | time 3[s] | loss 0.85
| epoch 20 |  iter 121 / 351 | time 4[s] | loss 0.82
| epoch 20 |  iter 141 / 351 | time 5[s] | loss 0.79
| epoch 20 |  iter 161 / 351 | time

val acc 10.500%


In [23]:
class PeekyDecoder:
    """Decoder variant that shows the encoder state ``h`` to every step.

    ``h`` is concatenated in front of both the LSTM input and the Affine
    input at each position, which is why those weight matrices have
    ``H + D`` and ``H + H`` rows respectively.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create randomly initialised float32 weights and the three layers."""
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        # LSTM input is [h, embedding], hence H + D rows.
        lstm_Wx = (rn(H + D, 4 * H) / np.sqrt(H + D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        # Affine input is [h, lstm output], hence H + H rows.
        affine_W = (rn(H + H, V) / np.sqrt(H + H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads
        # Holds H (the width of h) between forward and backward.
        self.cache = None

    def forward(self, xs, h):
        """Return scores for ``xs`` (N, T) given the encoder state ``h`` (N, H)."""
        N, T = xs.shape
        N, H = h.shape

        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        # Tile h along a new axis 1 so that hs[n, t] == h[n] for every t.
        hs = np.repeat(h, T, axis=0).reshape(N, T, H)
        out = np.concatenate((hs, out), axis=2)

        out = self.lstm.forward(out)
        out = np.concatenate((hs, out), axis=2)

        score = self.affine.forward(out)
        self.cache = H
        return score

    def backward(self, dscore):
        """Back-propagate ``dscore`` and return the total gradient w.r.t. ``h``."""
        H = self.cache

        dout = self.affine.backward(dscore)
        # Undo the second concatenation: the first H channels belong to hs.
        dout, dhs0 = dout[:, :, H:], dout[:, :, :H]
        dout = self.lstm.backward(dout)
        # Undo the first concatenation in the same way.
        dembed, dhs1 = dout[:, :, H:], dout[:, :, :H]
        self.embed.backward(dembed)

        # h was copied to every position, so its gradient is the sum over
        # axis 1 of both peek paths, plus the gradient of the initial state.
        dhs = dhs0 + dhs1
        dh = self.lstm.dh + np.sum(dhs, axis=1)
        return dh

    def generate(self, h, start_id, sample_size):
        """Greedily decode ``sample_size`` ids for a single sequence.

        Args:
            h: encoder state; reshaped to (1, 1, H), so it must hold
                exactly one row.
            start_id: id fed to the first step.
            sample_size: number of ids to produce.

        Returns:
            list of Python ints (same element type as ``Decoder.generate``).
        """
        sampled = []
        char_id = start_id
        self.lstm.set_state(h)

        H = h.shape[1]
        peeky_h = h.reshape(1, 1, H)
        for _ in range(sample_size):
            x = np.array([char_id]).reshape((1, 1))
            out = self.embed.forward(x)

            out = np.concatenate((peeky_h, out), axis=2)
            out = self.lstm.forward(out)
            out = np.concatenate((peeky_h, out), axis=2)
            score = self.affine.forward(out)

            char_id = np.argmax(score.flatten())
            # Store a plain int rather than a NumPy scalar so the result
            # matches what Decoder.generate returns.
            sampled.append(int(char_id))

        return sampled


In [24]:
class PeekySeq2seq(Seq2seq):
    """Seq2seq whose decoder is a PeekyDecoder; everything else is inherited."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Build the encoder, peeky decoder and loss layer; merge params/grads."""
        self.encoder = Encoder(vocab_size, wordvec_size, hidden_size)
        self.decoder = PeekyDecoder(vocab_size, wordvec_size, hidden_size)
        self.softmax = TimeSoftmaxWithLoss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads

In [25]:
from dataset import sequence
from common.optimizer import Adam
from common.trainer import Trainer
from common.util import eval_seq2seq

In [26]:
# Reload the addition dataset from scratch so the reversal in the next cell
# starts from un-reversed inputs; also fetch the vocabulary tables.
(x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt')
char_to_id, id_to_char = sequence.get_vocab()

In [27]:
# Reverse every input sequence along axis 1; the targets are left untouched.
x_train, x_test = x_train[:, ::-1], x_test[:, ::-1]

In [28]:
# Hyperparameters for the PeekySeq2seq run (same values as the plain run).
vocab_size = len(char_to_id)
wordvec_size = 16
hidden_size = 128
batch_size = 128
max_epoch = 25
# Forwarded to Trainer.fit as max_grad (presumably a gradient-clipping
# threshold — confirm against Trainer).
max_grad = 5.0

In [29]:
# Build the PeekySeq2seq model and hand it, with a default-constructed Adam
# optimizer, to the Trainer.
model = PeekySeq2seq(vocab_size, wordvec_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

In [30]:
# Train one epoch at a time; after each epoch score every test example with
# eval_seq2seq and record the fraction answered correctly.
# NOTE(review): x_test was reversed earlier, so the questions printed in
# verbose mode appear reversed; if eval_seq2seq offers a flag to undo that
# for display, consider passing it — confirm against its signature.
acc_list = []
n_test = len(x_test)
for epoch in range(max_epoch):
    trainer.fit(x_train, t_train, max_epoch=1,
                batch_size=batch_size, max_grad=max_grad)

    correct_num = 0
    for i in range(n_test):
        # Index with a one-element list so the leading axis is kept.
        question = x_test[[i]]
        correct = t_test[[i]]
        # Only the first ten examples are evaluated in verbose mode.
        correct_num += eval_seq2seq(model, question, correct, id_to_char, i < 10)

    acc = float(correct_num) / n_test
    acc_list.append(acc)
    print('val acc %.3f%%' % (acc * 100))

| epoch 1 |  iter 1 / 351 | time 0[s] | loss 2.57
| epoch 1 |  iter 21 / 351 | time 0[s] | loss 2.48
| epoch 1 |  iter 41 / 351 | time 1[s] | loss 2.20
| epoch 1 |  iter 61 / 351 | time 2[s] | loss 1.99
| epoch 1 |  iter 81 / 351 | time 2[s] | loss 1.89
| epoch 1 |  iter 101 / 351 | time 3[s] | loss 1.82
| epoch 1 |  iter 121 / 351 | time 4[s] | loss 1.82
| epoch 1 |  iter 141 / 351 | time 4[s] | loss 1.80
| epoch 1 |  iter 161 / 351 | time 5[s] | loss 1.79
| epoch 1 |  iter 181 / 351 | time 6[s] | loss 1.78
| epoch 1 |  iter 201 / 351 | time 7[s] | loss 1.77
| epoch 1 |  iter 221 / 351 | time 7[s] | loss 1.76
| epoch 1 |  iter 241 / 351 | time 8[s] | loss 1.76
| epoch 1 |  iter 261 / 351 | time 9[s] | loss 1.75
| epoch 1 |  iter 281 / 351 | time 9[s] | loss 1.74
| epoch 1 |  iter 301 / 351 | time 10[s] | loss 1.74
| epoch 1 |  iter 321 / 351 | time 11[s] | loss 1.73
| epoch 1 |  iter 341 / 351 | time 12[s] | loss 1.73
Q   58+77
T 162 
[91m☒[0m 100 
---
Q 461+579
T 1139
[91m☒[0m 10

| epoch 7 |  iter 121 / 351 | time 4[s] | loss 0.65
| epoch 7 |  iter 141 / 351 | time 5[s] | loss 0.64
| epoch 7 |  iter 161 / 351 | time 6[s] | loss 0.63
| epoch 7 |  iter 181 / 351 | time 6[s] | loss 0.61
| epoch 7 |  iter 201 / 351 | time 7[s] | loss 0.61
| epoch 7 |  iter 221 / 351 | time 8[s] | loss 0.60
| epoch 7 |  iter 241 / 351 | time 9[s] | loss 0.57
| epoch 7 |  iter 261 / 351 | time 10[s] | loss 0.57
| epoch 7 |  iter 281 / 351 | time 10[s] | loss 0.57
| epoch 7 |  iter 301 / 351 | time 11[s] | loss 0.55
| epoch 7 |  iter 321 / 351 | time 12[s] | loss 0.54
| epoch 7 |  iter 341 / 351 | time 13[s] | loss 0.53
Q   58+77
T 162 
[92m☑[0m 162 
---
Q 461+579
T 1139
[92m☑[0m 1139
---
Q  48+285
T 666 
[91m☒[0m 665 
---
Q   551+8
T 163 
[91m☒[0m 156 
---
Q  55+763
T 422 
[92m☑[0m 422 
---
Q 752+006
T 857 
[91m☒[0m 858 
---
Q 292+167
T 1053
[91m☒[0m 1052
---
Q 795+038
T 1427
[91m☒[0m 1428
---
Q  838+62
T 864 
[92m☑[0m 864 
---
Q  39+341
T 236 
[91m☒[0m 235 
---
v

| epoch 13 |  iter 201 / 351 | time 7[s] | loss 0.06
| epoch 13 |  iter 221 / 351 | time 8[s] | loss 0.06
| epoch 13 |  iter 241 / 351 | time 9[s] | loss 0.06
| epoch 13 |  iter 261 / 351 | time 10[s] | loss 0.06
| epoch 13 |  iter 281 / 351 | time 10[s] | loss 0.06
| epoch 13 |  iter 301 / 351 | time 11[s] | loss 0.05
| epoch 13 |  iter 321 / 351 | time 12[s] | loss 0.05
| epoch 13 |  iter 341 / 351 | time 13[s] | loss 0.06
Q   58+77
T 162 
[92m☑[0m 162 
---
Q 461+579
T 1139
[92m☑[0m 1139
---
Q  48+285
T 666 
[92m☑[0m 666 
---
Q   551+8
T 163 
[92m☑[0m 163 
---
Q  55+763
T 422 
[92m☑[0m 422 
---
Q 752+006
T 857 
[92m☑[0m 857 
---
Q 292+167
T 1053
[92m☑[0m 1053
---
Q 795+038
T 1427
[92m☑[0m 1427
---
Q  838+62
T 864 
[92m☑[0m 864 
---
Q  39+341
T 236 
[92m☑[0m 236 
---
val acc 94.420%
| epoch 14 |  iter 1 / 351 | time 0[s] | loss 0.05
| epoch 14 |  iter 21 / 351 | time 0[s] | loss 0.05
| epoch 14 |  iter 41 / 351 | time 1[s] | loss 0.05
| epoch 14 |  iter 61 / 351 | 

| epoch 19 |  iter 261 / 351 | time 10[s] | loss 0.03
| epoch 19 |  iter 281 / 351 | time 10[s] | loss 0.03
| epoch 19 |  iter 301 / 351 | time 11[s] | loss 0.02
| epoch 19 |  iter 321 / 351 | time 12[s] | loss 0.03
| epoch 19 |  iter 341 / 351 | time 13[s] | loss 0.02
Q   58+77
T 162 
[92m☑[0m 162 
---
Q 461+579
T 1139
[92m☑[0m 1139
---
Q  48+285
T 666 
[92m☑[0m 666 
---
Q   551+8
T 163 
[92m☑[0m 163 
---
Q  55+763
T 422 
[92m☑[0m 422 
---
Q 752+006
T 857 
[92m☑[0m 857 
---
Q 292+167
T 1053
[92m☑[0m 1053
---
Q 795+038
T 1427
[92m☑[0m 1427
---
Q  838+62
T 864 
[92m☑[0m 864 
---
Q  39+341
T 236 
[92m☑[0m 236 
---
val acc 97.220%
| epoch 20 |  iter 1 / 351 | time 0[s] | loss 0.03
| epoch 20 |  iter 21 / 351 | time 0[s] | loss 0.02
| epoch 20 |  iter 41 / 351 | time 1[s] | loss 0.04
| epoch 20 |  iter 61 / 351 | time 2[s] | loss 0.03
| epoch 20 |  iter 81 / 351 | time 3[s] | loss 0.04
| epoch 20 |  iter 101 / 351 | time 3[s] | loss 0.03
| epoch 20 |  iter 121 / 351 | t

| epoch 25 |  iter 321 / 351 | time 12[s] | loss 0.01
| epoch 25 |  iter 341 / 351 | time 13[s] | loss 0.01
Q   58+77
T 162 
[92m☑[0m 162 
---
Q 461+579
T 1139
[92m☑[0m 1139
---
Q  48+285
T 666 
[92m☑[0m 666 
---
Q   551+8
T 163 
[92m☑[0m 163 
---
Q  55+763
T 422 
[92m☑[0m 422 
---
Q 752+006
T 857 
[92m☑[0m 857 
---
Q 292+167
T 1053
[92m☑[0m 1053
---
Q 795+038
T 1427
[92m☑[0m 1427
---
Q  838+62
T 864 
[92m☑[0m 864 
---
Q  39+341
T 236 
[92m☑[0m 236 
---
val acc 97.760%
