In [163]:
import math
import numpy as np
import tensorflow as tf

In [164]:
def get_mean_std(x: tf.Tensor):
    mean = tf.reduce_mean(x, axis=-1, keepdims=True)
    squared = tf.square(x - mean)
    variance = tf.reduce_mean(squared, axis=-1, keepdims=True)
    std = tf.sqrt(variance)

    return mean, std

In [200]:
def layer_norm(layer: tf.Tensor):

#

    with tf.variable_scope("norm",reuse=tf.AUTO_REUSE):
        scale = tf.get_variable("scale", shape=layer.shape[-1], dtype=tf.float32)
        base = tf.get_variable("base", shape=layer.shape[-1], dtype=tf.float32)
        mean, std = get_mean_std(layer)
#
        norm = (layer - mean) / (std + 1e-6)
#
        return norm * scale + base

In [166]:
def attention(query: tf.Tensor, key: tf.Tensor, value: tf.Tensor, *,mask: tf.Tensor,keep_prob: float):
    d_k = query.shape[-1].value
    scores = tf.matmul(query, tf.transpose(key, perm=[0, 1, 3, 2]))
    scores = scores / tf.constant(math.sqrt(d_k))
    mask_add = ((scores * 0) - 1e9) * (tf.constant(1.) - mask)
    scores = scores * mask + mask_add
    attn = tf.nn.softmax(scores, axis=-1)
    attn = tf.nn.dropout(attn, keep_prob)
    return tf.matmul(attn, value), attn

In [167]:
def prepare_for_multi_head_attention(x: tf.Tensor, heads: int, name: str):
    n_batches, seq_len, d_model = x.shape
    assert d_model % heads == 0
    d_k = d_model // heads
    x = tf.layers.dense(x, units=d_model, name=name)
    x = tf.reshape(x, shape=[n_batches, seq_len, heads, d_k])
    x = tf.transpose(x, perm=[0, 2, 1, 3])
    return x

In [168]:
def multi_head_attention(query: tf.Tensor, key: tf.Tensor, value: tf.Tensor,mask: tf.Tensor,heads: int,keep_prob: float):
    with tf.variable_scope("multi_head"):
        

        n_batches, seq_len, d_model = query.shape

        query = prepare_for_multi_head_attention(query, heads, "query")
        key = prepare_for_multi_head_attention(key, heads, "key")
        value = prepare_for_multi_head_attention(value, heads, "value")
        
        mask = tf.expand_dims(mask, axis=1)

        out, _ = attention(query, key, value, mask=mask, keep_prob=keep_prob)
        out = tf.transpose(out, perm=[0, 2, 1, 3])
        out = tf.reshape(out, shape=[n_batches, seq_len, d_model])
        return tf.layers.dense(out, units=d_model, name="attention")



In [205]:
def feed_forward(x: tf.Tensor,d_model: int, d_ff: int, keep_prob: float):
    with tf.variable_scope("feed_forward",reuse=tf.AUTO_REUSE):
        hidden = tf.layers.dense(x, units=d_ff, name="hidden")
        hidden = tf.nn.relu(hidden)
        hidden = tf.nn.dropout(hidden, keep_prob=keep_prob)
        return tf.layers.dense(hidden, units=d_model, name="out")



In [206]:
def encoder_layer(x: tf.Tensor, *,mask: tf.Tensor, index: int, heads: int, keep_prob: float, d_ff: int):
    d_model = x.shape[-1]
    with tf.variable_scope(f"attention_{index}",reuse=tf.AUTO_REUSE):
        attention_out = multi_head_attention(x, x, x,mask=mask, heads=heads, keep_prob=keep_prob)
        added = x + tf.nn.dropout(attention_out, keep_prob)
        
        x = layer_norm(added)
        with tf.variable_scope(f"ff_{index}"):
            ff_out = feed_forward(x, d_model, d_ff, keep_prob)
        
            added = x + tf.nn.dropout(ff_out, keep_prob)
    return layer_norm(added)

        
        

In [207]:
def encoder(x: tf.Tensor, *, mask: tf.Tensor, n_layers: int, heads: int, keep_prob: float, d_ff: int):
    with tf.variable_scope("encoder",reuse=tf.AUTO_REUSE):
        for i in range(n_layers):
            x = encoder_layer(x,
                              mask=mask, index=i,
                              heads=heads, keep_prob=keep_prob, d_ff=d_ff)

        return x



In [208]:
def decoder_layer(encoding: tf.Tensor, x: tf.Tensor, *, enc_mask: tf.Tensor, mask: tf.Tensor, index: int, heads: int, keep_prob: float, d_ff: int):
    d_model = encoding.shape[-1]

    with tf.variable_scope(f"{index}_self_attention",reuse=tf.AUTO_REUSE):
        attention_out = multi_head_attention(x, x, x,
                                             mask=mask, heads=heads, keep_prob=keep_prob)
        added = x + tf.nn.dropout(attention_out, keep_prob=keep_prob)
        x = layer_norm(added)
        
    with tf.variable_scope(f"{index}_encoding_attention",reuse=tf.AUTO_REUSE):
        attention_out = multi_head_attention(x, encoding, encoding,
                                             mask=enc_mask, heads=heads, keep_prob=keep_prob)
        added = x + tf.nn.dropout(attention_out, keep_prob=keep_prob)
        
        x = layer_norm(added)
            
    with tf.variable_scope(f"{index}_ff"):
        ff_out = feed_forward(x, d_model, d_ff, keep_prob)         
            
    return layer_norm(added)     
        

In [209]:
def decoder(encoding: tf.Tensor, x: tf.Tensor, *,
            enc_mask: tf.Tensor, mask: tf.Tensor,
            n_layers: int,
            heads: int, keep_prob: float, d_ff: int):

    with tf.variable_scope("decoder",reuse=tf.AUTO_REUSE):
        for i in range(n_layers):
            x = decoder_layer(encoding, x,
                              enc_mask=enc_mask, mask=mask, index=i,
                              heads=heads, keep_prob=keep_prob, d_ff=d_ff)
            return x

In [210]:
def get_embeddings(input_ids: tf.Tensor, output_ids: tf.Tensor,
                   vocab_size: int, d_model: int):
    with tf.variable_scope("word_embds",reuse=tf.AUTO_REUSE):
        word_embeddings = tf.get_variable("word_embeddings",
                                      shape=[vocab_size, d_model],
                                      dtype=tf.float32,
                                      initializer=tf.initializers.random_normal())

    

        in_emb = tf.nn.embedding_lookup(word_embeddings, input_ids)

        out_emb = tf.nn.embedding_lookup(word_embeddings, output_ids)
    
        return word_embeddings, in_emb, out_emb


In [211]:
def generate_positional_encodings(d_model: int, max_len: int = 5000):
    
    encodings = np.zeros((max_len, d_model), dtype=float)
    position = np.arange(0, max_len).reshape((max_len, 1))
    two_i = np.arange(0, d_model, 2)
    div_term = np.exp(-math.log(10000.0) * two_i / d_model)
    encodings[:, 0::2] = np.sin(position * div_term)   
    encodings[:, 1::2] = np.cos(position * div_term)
    return tf.constant(encodings.reshape((1, max_len, d_model)), dtype=tf.float32, name="positional_encodings")



In [212]:
def prepare_embeddings(x: tf.Tensor, *,
                       positional_encodings: tf.Tensor,
                       keep_prob: float, is_input: bool):

    name = "prepare_input" if is_input else "prepare_output"
    with tf.variable_scope(name):
        _, seq_len, _ = x.shape  
        x = x + positional_encodings[:, :seq_len, :]
        x = tf.nn.dropout(x, keep_prob)
                
    
    return layer_norm(x)


In [213]:
def generator(x: tf.Tensor, *, vocab_size: int):
    res = tf.layers.dense(x, units=vocab_size, name="generator")
    return tf.nn.log_softmax(res, axis=-1)



In [214]:
def label_smoothing_loss(results: tf.Tensor, expected: tf.Tensor, *,
                         vocab_size: int, smoothing: float):



    results = tf.reshape(results, shape=(-1, vocab_size))
    expected = tf.reshape(expected, shape=[-1])

    confidence = 1 - smoothing
    smoothing = smoothing / (vocab_size - 1)
    expected = tf.one_hot(expected, depth=vocab_size) * (confidence - smoothing)
    expected += smoothing
    results = tf.distributions.Categorical(logits=results)
    expected = tf.distributions.Categorical(logits=expected)
    return tf.reduce_mean(tf.distributions.kl_divergence(results, expected))

In [215]:
def generate_data(batch_size: int, seq_len: int, vocab_size: int):
    start_token = vocab_size - 1
    repeat_token = vocab_size - 2
    vocab_size -= 2

    inputs = np.random.randint(0, vocab_size, size=(batch_size, seq_len))
    
    outputs = np.zeros((batch_size, seq_len + 1), dtype=int)
    outputs[:, 1:] = np.flip(inputs, 1)
    outputs[:, 0] = start_token

    for i in range(batch_size):
        v = np.zeros(vocab_size, dtype=bool)
        for j in range(seq_len):
            word = inputs[i, j]
            
            if v[word]:
                v[word] = False
                outputs[i][seq_len - j] = repeat_token
            else:
                v[word] = True
                
    return inputs, outputs

In [216]:
def noam_learning_rate(step: int, warm_up: float, d_model: int):

    return (d_model ** -.5) * min(step ** -.5, step * warm_up ** -1.5)

In [217]:
def output_subsequent_mask(seq_len: int):
    mask = np.zeros((seq_len, seq_len), dtype=float)
    for i in range(seq_len):
        for j in range(i + 1):
            mask[i, j] = 1.

    return mask

In [218]:
def train():
    seq_length = 10
    vocab_size = 10 + 1 + 1
    vocab_str = [f"{i}" for i in range(10)]
    vocab_str += ['X', 'S']
    batch_size = 32  # 12000
    d_model = 128  # 512
    heads = 8
    keep_prob = 0.9
    n_layers = 2  # 6
    d_ff = 256  # 2048
    positional_encodings = generate_positional_encodings(d_model)
    
    inputs = tf.placeholder(dtype=tf.int32,
                            shape=(batch_size, seq_length), name="input")
    outputs = tf.placeholder(dtype=tf.int32,
                             shape=(batch_size, seq_length), name="output")
    expected = tf.placeholder(dtype=tf.int32,
                              shape=(batch_size, seq_length), name="expected")
    inputs_mask = tf.placeholder(dtype=tf.float32,
                                 shape=(1, 1, seq_length),
                                 name="input_mask")
    output_mask = tf.placeholder(dtype=tf.float32,
                                 shape=(1, seq_length, seq_length),
                                 name="output_mask")
    
    learning_rate = tf.placeholder(dtype=tf.float32, name="learning_rate")

    

    w_embed, input_embeddings, output_embeddings = get_embeddings(inputs, outputs, vocab_size,
                                                                  d_model)
    input_embeddings = prepare_embeddings(input_embeddings,
                                          positional_encodings=positional_encodings,
                                          keep_prob=keep_prob,
                                          is_input=True)
    output_embeddings = prepare_embeddings(output_embeddings,
                                           positional_encodings=positional_encodings,
                                           keep_prob=keep_prob,
                                           is_input=False)

    encoding = encoder(input_embeddings, mask=inputs_mask, n_layers=n_layers, heads=heads,
                       keep_prob=keep_prob, d_ff=d_ff)
    decoding = decoder(encoding, output_embeddings,
                       enc_mask=inputs_mask, mask=output_mask,
                       n_layers=n_layers, heads=heads, keep_prob=keep_prob, d_ff=d_ff)
    log_results = generator(decoding, vocab_size=vocab_size)
    results = tf.exp(log_results)

    loss = label_smoothing_loss(log_results, expected, vocab_size=vocab_size, smoothing=0.0)
    
    adam = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-5)
    params = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, params), 5.)
    grads_and_vars = list(zip(grads, params))
    train_op = adam.apply_gradients(grads_and_vars, name="apply_gradients")
    
    warm_up = 400
    batch_in_mask = np.ones((1, 1, seq_length), dtype=float)
    batch_out_mask = output_subsequent_mask(seq_length)
    batch_out_mask = batch_out_mask.reshape(1, seq_length, seq_length)
    def __print_seq(seq):
        return ' '.join([vocab_str[i] for i in seq])
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        
        for i in range(100_000):
            lr = noam_learning_rate(i + 1, warm_up, d_model)
            batch_in, batch_out = generate_data(batch_size, seq_length, vocab_size)
            _, batch_loss, batch_res = session.run([train_op, loss, results],
                                                   feed_dict={
                                                       learning_rate: lr,
                                                       inputs: batch_in,
                                                       outputs: batch_out[:, :-1],
                                                       expected: batch_out[:, 1:],
                                                       inputs_mask: batch_in_mask,
                                                       output_mask: batch_out_mask
                                                   })
            
            if i % 100 == 0:
                            print(f"step={i}\tloss={batch_loss: .6f}")
                            print(f"inp=  {__print_seq(batch_in[0])}")
                            print(f"exp={__print_seq(batch_out[0])}")
                            print(f"res=  {__print_seq(np.argmax(batch_res[0], -1))}")


In [219]:
if __name__ == '__main__':
    train()

Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
step=0	loss= 0.064823
inp=  9 6 2 2 0 5 1 6 5 2
exp=S 2 X X 1 5 0 X 2 6 9
res=  2 2 2 2 2 2 2 2 2 2
step=100	loss= 0.048373
inp=  5 9 4 4 1 9 5 1 5 8
exp=S 8 5 X X X 1 X 4 9 5
res=  X X X X X X X X 

step=6400	loss= 0.002392
inp=  0 7 7 8 1 4 9 3 1 1
exp=S 1 X 3 9 4 1 8 X 7 0
res=  1 X 3 9 4 8 8 X 7 0
step=6500	loss= 0.003317
inp=  9 2 0 2 4 2 0 4 6 2
exp=S X 6 X X 2 4 X 0 2 9
res=  X 6 X X 2 4 X 0 2 9
step=6600	loss= 0.001807
inp=  3 4 5 7 2 0 1 8 3 0
exp=S X X 8 1 0 2 7 5 4 3
res=  X X 8 1 0 2 7 5 4 3
step=6700	loss= 0.002367
inp=  0 3 6 8 0 8 8 0 2 9
exp=S 9 2 0 8 X X 8 6 3 0
res=  9 2 0 8 X X 8 6 3 0
step=6800	loss= 0.002516
inp=  0 0 4 0 0 8 8 3 1 8
exp=S 8 1 3 X 8 X 0 4 X 0
res=  8 1 3 X 8 0 0 4 X 0
step=6900	loss= 0.004135
inp=  9 3 3 0 1 1 5 3 2 6
exp=S 6 2 3 5 X 1 0 X 3 9
res=  6 2 X 5 X 1 0 3 3 9
step=7000	loss= 0.002498
inp=  6 7 8 2 1 7 2 2 1 9
exp=S 9 X 2 X X 1 2 8 7 6
res=  9 X 2 X X 1 2 8 7 6
step=7100	loss= 0.001548
inp=  6 3 1 9 7 2 0 5 2 2
exp=S 2 X 5 0 2 7 9 1 3 6
res=  2 X 5 0 2 7 9 1 3 6
step=7200	loss= 0.002678
inp=  8 7 9 4 4 9 8 7 7 7
exp=S X 7 X X X X 4 9 7 8
res=  X 7 X X X X 4 9 7 8
step=7300	loss= 0.002645
inp=  2 3 4 6 5 1 7 1 8 8
exp=S X 8 X 7 1 5 6 4 

step=14400	loss= 0.000737
inp=  8 4 6 3 7 8 4 7 1 0
exp=S 0 1 X X X 7 3 6 4 8
res=  0 1 X X X 7 3 6 4 8
step=14500	loss= 0.001352
inp=  0 6 1 0 6 6 8 8 0 1
exp=S X 0 X 8 6 X X 1 6 0
res=  X 0 X 8 6 X X 1 6 0
step=14600	loss= 0.001781
inp=  6 9 7 9 6 9 1 0 0 2
exp=S 2 X 0 1 9 X X 7 9 6
res=  2 X 0 1 9 X X 7 9 6
step=14700	loss= 0.002861
inp=  7 3 5 6 8 6 2 4 6 9
exp=S 9 6 4 2 X 8 6 5 3 7
res=  9 6 4 2 X 8 6 5 3 7
step=14800	loss= 0.001693
inp=  4 9 7 2 3 9 5 6 5 5
exp=S 5 X 6 5 X 3 2 7 9 4
res=  5 X 6 5 X 3 2 7 9 4
step=14900	loss= 0.000877
inp=  2 1 9 6 3 3 5 4 7 0
exp=S 0 7 4 5 X 3 6 9 1 2
res=  0 7 4 5 X 3 6 9 1 2
step=15000	loss= 0.001893
inp=  9 1 7 9 1 1 1 3 1 3
exp=S X 1 3 X 1 X X 7 1 9
res=  X X 3 X X X X 7 1 9
step=15100	loss= 0.001906
inp=  3 3 7 2 5 7 6 8 5 6
exp=S X X 8 6 X 5 2 7 X 3
res=  X X 8 6 X 5 2 7 X 3
step=15200	loss= 0.001368
inp=  4 1 2 6 2 5 4 7 2 9
exp=S 9 2 7 X 5 X 6 2 1 4
res=  9 2 7 X 5 X 6 2 1 4
step=15300	loss= 0.001140
inp=  6 3 5 3 0 8 1 3 3 6
exp=S X X 3 

step=22300	loss= 0.001374
inp=  5 9 1 2 5 3 8 3 4 5
exp=S 5 4 X 8 3 X 2 1 9 5
res=  5 4 X 8 3 X 2 1 X 5
step=22400	loss= 0.000291
inp=  1 9 0 8 1 2 6 9 6 2
exp=S X X X 6 2 X 8 0 9 1
res=  X X X 6 2 X 8 0 9 1
step=22500	loss= 0.000709
inp=  8 7 5 0 5 3 5 0 3 6
exp=S 6 X X 5 3 X 0 5 7 8
res=  6 X X 5 3 X 0 5 7 8
step=22600	loss= 0.001829
inp=  8 0 0 7 7 2 4 1 8 1
exp=S X X 1 4 2 X 7 X 0 8
res=  X X 1 4 2 X 7 X 0 8
step=22700	loss= 0.001730
inp=  9 1 5 6 4 1 6 3 5 3
exp=S X X 3 X X 4 6 5 1 9
res=  X X 3 X X 4 6 5 1 9
step=22800	loss= 0.000879
inp=  0 9 1 3 3 0 9 8 6 0
exp=S 0 6 8 X X X 3 1 9 0
res=  0 6 8 X X X 3 1 9 0
step=22900	loss= 0.000433
inp=  9 8 7 4 3 5 5 0 8 4
exp=S X X 0 X 5 3 4 7 8 9
res=  X X 0 X 5 3 4 7 8 9
step=23000	loss= 0.001069
inp=  7 3 7 8 2 5 7 2 6 0
exp=S 0 6 X 7 5 2 8 X 3 7
res=  0 6 X 7 5 2 8 X 3 7
step=23100	loss= 0.001102
inp=  9 7 0 5 0 4 3 3 5 9
exp=S X X X 3 4 X 5 0 7 9
res=  X X X 3 4 X 5 0 7 9
step=23200	loss= 0.000920
inp=  8 6 4 6 9 3 5 8 9 7
exp=S 7 X X 

step=30200	loss= 0.000747
inp=  2 0 3 4 7 8 3 7 0 6
exp=S 6 X X X 8 7 4 3 0 2
res=  6 X X X 8 7 4 3 0 2
step=30300	loss= 0.000711
inp=  6 4 5 3 6 9 2 4 4 9
exp=S X 4 X 2 9 X 3 5 4 6
res=  X 4 X 2 9 X 3 5 4 6
step=30400	loss= 0.000697
inp=  9 4 9 7 3 8 6 7 5 4
exp=S X 5 X 6 8 3 7 X 4 9
res=  X 5 X 6 8 3 7 X 4 9
step=30500	loss= 0.000640
inp=  2 2 3 7 4 4 2 8 3 0
exp=S 0 X 8 2 X 4 7 3 X 2
res=  0 X 8 2 X 4 7 3 X 2
step=30600	loss= 0.000268
inp=  6 5 0 4 5 0 0 3 4 8
exp=S 8 X 3 0 X X 4 0 5 6
res=  8 X 3 0 X X 4 0 5 6
step=30700	loss= 0.000784
inp=  3 3 7 6 7 4 7 6 5 8
exp=S 8 5 X 7 4 X 6 7 X 3
res=  8 5 X 7 4 X 6 7 X 3
step=30800	loss= 0.000428
inp=  9 7 3 8 1 7 4 8 0 0
exp=S X 0 X 4 X 1 8 3 7 9
res=  X 0 X 4 X 1 8 3 7 9
step=30900	loss= 0.000867
inp=  0 2 6 5 6 5 9 7 6 9
exp=S X 6 7 9 X X 5 6 2 0
res=  X 6 7 9 X X 5 6 2 0
step=31000	loss= 0.002324
inp=  6 0 9 4 0 0 9 4 0 8
exp=S 8 X X X 0 X 4 9 0 6
res=  8 0 X X X X 4 9 0 6
step=31100	loss= 0.000741
inp=  8 3 7 4 0 8 0 9 5 0
exp=S 0 5 9 

step=38100	loss= 0.000516
inp=  7 7 6 1 5 2 2 4 4 5
exp=S X X 4 X 2 5 1 6 X 7
res=  X X 4 X 2 5 1 6 X 7
step=38200	loss= 0.001013
inp=  0 4 7 6 4 7 7 2 6 5
exp=S 5 X 2 7 X X 6 7 4 0
res=  5 X 2 7 X X 6 7 4 0
step=38300	loss= 0.000626
inp=  9 6 6 8 0 5 1 1 5 4
exp=S 4 X X 1 5 0 8 X 6 9
res=  4 X X 1 5 0 8 X 6 9
step=38400	loss= 0.000260
inp=  9 2 1 9 7 7 7 0 1 0
exp=S X X 0 7 X 7 X 1 2 9
res=  X X 0 7 X 7 X 1 2 9
step=38500	loss= 0.000389
inp=  3 8 9 1 7 7 9 2 3 3
exp=S 3 X 2 X X 7 1 9 8 3
res=  3 X 2 X X 7 1 9 8 3
step=38600	loss= 0.000264
inp=  9 6 2 3 8 4 0 3 3 0
exp=S X 3 X 0 4 8 3 2 6 9
res=  X 3 X 0 4 8 3 2 6 9
step=38700	loss= 0.000816
inp=  1 2 0 0 9 3 2 5 0 1
exp=S X 0 5 X 3 9 X 0 2 1
res=  X 0 5 X 3 9 X 0 2 1
step=38800	loss= 0.000645
inp=  7 4 1 2 8 6 4 4 9 2
exp=S X 9 4 X 6 8 2 1 4 7
res=  X 9 4 X 6 8 2 1 4 7
step=38900	loss= 0.000926
inp=  5 3 9 2 9 1 4 4 8 7
exp=S 7 8 X 4 1 X 2 9 3 5
res=  7 8 X 4 1 X 2 9 3 5
step=39000	loss= 0.000747
inp=  9 2 3 9 9 4 9 8 5 0
exp=S 0 5 8 

step=46000	loss= 0.000418
inp=  8 6 8 2 2 1 7 1 8 9
exp=S 9 8 X 7 1 X 2 X 6 8
res=  9 8 X 7 1 X 2 X 6 8
step=46100	loss= 0.000822
inp=  7 3 2 9 8 0 6 2 1 4
exp=S 4 1 X 6 0 8 9 2 3 7
res=  4 1 X 6 0 8 9 2 3 7
step=46200	loss= 0.000667
inp=  9 6 2 0 9 6 6 8 4 0
exp=S X 4 8 6 X X 0 2 6 9
res=  X 4 8 6 X X 0 2 6 9
step=46300	loss= 0.000620
inp=  9 5 2 5 4 1 2 8 9 8
exp=S X X 8 X 1 4 X 2 5 9
res=  X X 8 X 1 4 X 2 5 9
step=46400	loss= 0.000209
inp=  1 7 1 1 6 0 5 2 7 1
exp=S X X 2 5 0 6 1 X 7 1
res=  X X 2 5 0 6 1 X 7 1
step=46500	loss= 0.000488
inp=  3 9 6 6 8 9 6 5 3 4
exp=S 4 X 5 6 X 8 X 6 9 3
res=  4 X 5 6 X 8 X 6 9 3
step=46600	loss= 0.000612
inp=  7 9 7 4 1 5 7 1 4 1
exp=S 1 X X 7 5 1 4 X 9 7
res=  1 X X 7 5 1 4 X 9 7
step=46700	loss= 0.000457
inp=  2 6 6 9 0 1 8 1 0 8
exp=S X X X 8 1 0 9 X 6 2
res=  X X X 8 1 0 9 X 6 2
step=46800	loss= 0.001337
inp=  5 7 7 9 7 2 7 7 0 6
exp=S 6 0 7 X 2 7 9 X 7 5
res=  6 0 X X 2 7 9 X 7 5
step=46900	loss= 0.000937
inp=  3 7 1 8 8 4 9 8 2 1
exp=S X 2 8 

step=53900	loss= 0.000540
inp=  3 6 6 0 9 6 9 8 8 2
exp=S 2 X 8 X 6 9 0 X 6 3
res=  2 X 8 X 6 9 0 X 6 3
step=54000	loss= 0.000434
inp=  1 9 0 2 9 0 1 1 0 0
exp=S X 0 1 X X X 2 0 9 1
res=  X 0 X X X X 2 0 9 1
step=54100	loss= 0.000269
inp=  6 0 0 5 1 0 1 4 8 0
exp=S X 8 4 X 0 1 5 X 0 6
res=  X 8 4 X 0 1 5 X 0 6
step=54200	loss= 0.000519
inp=  5 1 9 3 0 2 8 8 2 4
exp=S 4 X X 8 2 0 3 9 1 5
res=  4 X X 8 2 0 3 9 1 5
step=54300	loss= 0.001013
inp=  7 8 3 4 2 6 0 4 9 1
exp=S 1 9 X 0 6 2 4 3 8 7
res=  1 9 X 0 6 2 4 3 8 7
step=54400	loss= 0.000347
inp=  1 6 9 2 8 9 2 3 4 4
exp=S X 4 3 X X 8 2 9 6 1
res=  X 4 3 X X 8 2 9 6 1
step=54500	loss= 0.001008
inp=  1 8 8 9 0 9 2 5 7 5
exp=S X 7 5 2 X 0 9 X 8 1
res=  X 7 5 2 X 0 9 X 8 1
step=54600	loss= 0.000268
inp=  2 3 1 3 4 6 6 1 0 3
exp=S 3 0 X X 6 4 X 1 3 2
res=  3 0 X X 6 4 X 1 3 2
step=54700	loss= 0.000612
inp=  9 3 5 6 6 9 5 4 3 0
exp=S 0 X 4 X X X 6 5 3 9
res=  0 X 4 X X X 6 5 3 9
step=54800	loss= 0.000908
inp=  3 0 3 4 7 3 6 7 0 0
exp=S 0 X X 

step=61800	loss= 0.000525
inp=  6 8 8 3 4 4 4 9 9 6
exp=S X X 9 4 X 4 3 X 8 6
res=  X X 9 4 X 4 3 X 8 6
step=61900	loss= 0.000254
inp=  0 4 2 6 7 7 6 0 9 5
exp=S 5 9 X X X 7 6 2 4 0
res=  5 9 X X X 7 6 2 4 0
step=62000	loss= 0.000433
inp=  0 6 8 4 8 8 4 9 0 2
exp=S 2 X 9 X 8 X 4 8 6 0
res=  2 X 9 X 8 X 4 8 6 0
step=62100	loss= 0.000190
inp=  4 4 3 2 4 8 3 4 6 6
exp=S X 6 X X 8 4 2 3 X 4
res=  X 6 X X 8 4 2 3 X 4
step=62200	loss= 0.000209
inp=  6 1 2 3 0 4 6 6 3 4
exp=S X X 6 X 4 0 3 2 1 6
res=  X X 6 X 4 0 3 2 1 6
step=62300	loss= 0.000162
inp=  1 5 5 4 0 2 6 0 0 2
exp=S X 0 X 6 2 0 4 X 5 1
res=  X 0 X 6 2 0 4 X 5 1
step=62400	loss= 0.000545
inp=  2 0 9 5 0 5 0 5 3 5
exp=S X 3 5 0 X X 5 9 0 2
res=  X 3 5 0 X X 5 9 0 2
step=62500	loss= 0.000557
inp=  6 1 4 4 6 1 3 0 6 9
exp=S 9 6 0 3 X X X 4 1 6
res=  9 6 0 3 X X X 4 1 6
step=62600	loss= 0.000320
inp=  0 6 1 1 2 1 5 0 3 0
exp=S 0 3 X 5 1 2 X 1 6 0
res=  0 3 X 5 1 2 X 1 6 0
step=62700	loss= 0.000233
inp=  0 5 4 4 5 0 0 5 1 5
exp=S X 1 5 

step=69700	loss= 0.000587
inp=  9 4 7 3 3 1 1 0 1 9
exp=S X 1 0 X 1 X 3 7 4 9
res=  X 1 0 X 1 X 3 7 4 9
step=69800	loss= 0.000248
inp=  6 7 5 8 6 5 6 0 8 4
exp=S 4 X 0 6 X X 8 5 7 6
res=  4 X 0 6 X X 8 5 7 6
step=69900	loss= 0.000207
inp=  3 9 6 9 1 5 1 9 5 1
exp=S 1 X 9 X 5 1 X 6 9 3
res=  1 X 9 X 5 1 X 6 9 3
step=70000	loss= 0.000225
inp=  5 1 2 9 0 9 5 1 0 8
exp=S 8 X X X X 0 9 2 1 5
res=  8 X X X X 0 9 2 1 5
step=70100	loss= 0.000210
inp=  3 9 4 9 4 8 0 9 0 7
exp=S 7 X 9 0 8 X X 4 9 3
res=  7 X 9 0 8 X X 4 9 3
step=70200	loss= 0.000601
inp=  5 7 5 3 9 7 6 4 9 1
exp=S 1 X 4 6 X 9 3 X 7 5
res=  1 X 4 6 X 9 3 X 7 5
step=70300	loss= 0.000480
inp=  8 8 7 8 7 6 9 1 7 3
exp=S 3 7 1 9 6 X 8 7 X 8
res=  3 7 1 9 6 X 8 7 X 8
step=70400	loss= 0.000245
inp=  1 2 6 5 3 4 4 0 5 4
exp=S 4 X 0 X 4 3 5 6 2 1
res=  4 X 0 X 4 3 5 6 2 1
step=70500	loss= 0.001337
inp=  0 0 3 9 0 4 5 9 0 9
exp=S 9 X X 5 4 0 9 3 X 0
res=  9 X X 5 4 0 9 3 X 0
step=70600	loss= 0.000186
inp=  1 7 0 8 5 3 0 9 9 3
exp=S X X 9 

step=77600	loss= 0.000235
inp=  6 0 9 3 8 6 0 8 5 7
exp=S 7 5 X X X 8 3 9 0 6
res=  7 5 X X X 8 3 9 0 6
step=77700	loss= 0.000779
inp=  7 4 0 5 4 6 2 8 5 2
exp=S X X 8 2 6 X 5 0 4 7
res=  X X 8 2 6 X 5 0 4 7
step=77800	loss= 0.000137
inp=  1 9 3 8 7 9 4 0 7 4
exp=S X X 0 4 X 7 8 3 9 1
res=  X X 0 4 X 7 8 3 9 1
step=77900	loss= 0.000467
inp=  6 0 4 4 8 0 7 6 7 5
exp=S 5 X X 7 X 8 X 4 0 6
res=  5 X X 7 X 8 X 4 0 6
step=78000	loss= 0.000289
inp=  4 7 1 9 4 8 5 1 0 7
exp=S X 0 X 5 8 X 9 1 7 4
res=  X 0 X 5 8 X 9 1 7 4
step=78100	loss= 0.000233
inp=  6 4 3 2 2 9 9 5 4 7
exp=S 7 X 5 X 9 X 2 3 4 6
res=  7 X 5 X 9 X 2 3 4 6
step=78200	loss= 0.000201
inp=  7 0 1 5 9 2 1 7 4 6
exp=S 6 4 X X 2 9 5 1 0 7
res=  6 4 X X 2 9 5 1 0 7
step=78300	loss= 0.000288
inp=  3 1 3 6 6 3 5 8 4 9
exp=S 9 4 8 5 3 X 6 X 1 3
res=  9 4 8 5 3 X 6 X 1 3
step=78400	loss= 0.000180
inp=  9 5 6 3 4 5 4 6 2 4
exp=S 4 2 X X X 4 3 6 5 9
res=  4 2 X X X 4 3 6 5 9
step=78500	loss= 0.000955
inp=  7 6 3 5 5 4 4 0 8 0
exp=S X 8 0 

step=85500	loss= 0.000115
inp=  6 4 5 1 7 3 7 6 0 6
exp=S 6 0 X X 3 7 1 5 4 6
res=  6 0 X X 3 7 1 5 4 6
step=85600	loss= 0.000645
inp=  2 2 7 1 8 1 1 5 2 9
exp=S 9 2 5 1 X 8 1 7 X 2
res=  9 2 5 1 X 8 1 7 X 2
step=85700	loss= 0.000244
inp=  5 6 0 2 1 8 6 2 2 5
exp=S X 2 X X 8 1 2 0 6 5
res=  X 2 X X 8 1 2 0 6 5
step=85800	loss= 0.000503
inp=  4 1 3 1 3 1 2 4 6 2
exp=S X 6 X 2 1 X X 3 1 4
res=  X 6 X 2 1 X X 3 1 4
step=85900	loss= 0.000457
inp=  8 7 6 0 8 0 5 2 0 9
exp=S 9 0 2 5 X X 0 6 7 8
res=  9 0 2 5 X X 0 6 7 8
step=86000	loss= 0.000138
inp=  5 4 5 3 8 0 0 6 9 1
exp=S 1 9 6 X 0 8 3 X 4 5
res=  1 9 6 X 0 8 3 X 4 5
step=86100	loss= 0.000236
inp=  3 0 7 4 4 7 5 1 1 5
exp=S X X 1 5 X X 4 7 0 3
res=  X X 1 5 X X 4 7 0 3
step=86200	loss= 0.000213
inp=  4 8 0 9 8 0 7 5 7 3
exp=S 3 X 5 7 X X 9 0 8 4
res=  3 X 5 7 X X 9 0 8 4
step=86300	loss= 0.000252
inp=  4 0 7 7 6 9 0 3 9 5
exp=S 5 X 3 X 9 6 X 7 0 4
res=  5 X 3 X 9 6 X 7 0 4
step=86400	loss= 0.000252
inp=  3 2 1 2 0 4 2 6 8 7
exp=S 7 8 6 

step=93400	loss= 0.000452
inp=  1 0 2 0 3 8 8 0 2 8
exp=S 8 X 0 X 8 3 X 2 0 1
res=  8 X 0 X 8 3 X 2 0 1
step=93500	loss= 0.000322
inp=  3 1 5 0 1 1 8 0 3 5
exp=S X X X 8 1 X 0 5 1 3
res=  X X X 8 1 X 0 5 1 3
step=93600	loss= 0.000501
inp=  5 6 5 3 4 2 3 7 9 4
exp=S X 9 7 X 2 4 3 X 6 5
res=  X 9 7 X 2 4 3 X 6 5
step=93700	loss= 0.000461
inp=  3 0 8 5 7 6 0 5 9 0
exp=S 0 9 X X 6 7 5 8 0 3
res=  0 9 X X 6 7 5 8 0 3
step=93800	loss= 0.000147
inp=  7 0 2 6 2 6 2 8 1 5
exp=S 5 1 8 2 X X 6 2 0 7
res=  5 1 8 2 X X 6 2 0 7
step=93900	loss= 0.000274
inp=  7 4 8 0 8 5 0 7 5 8
exp=S 8 X X X 5 X 0 8 4 7
res=  8 X X X 5 X 0 8 4 7
step=94000	loss= 0.000159
inp=  0 5 4 0 7 2 5 2 4 8
exp=S 8 X X X 2 7 X 4 5 0
res=  8 X X X 2 7 X 4 5 0
step=94100	loss= 0.000421
inp=  5 3 2 3 0 6 9 6 4 0
exp=S X 4 X 9 6 0 X 2 3 5
res=  X 4 X 9 6 0 X 2 3 5
step=94200	loss= 0.000220
inp=  9 6 0 9 2 9 6 0 5 3
exp=S 3 5 X X 9 2 X 0 6 9
res=  3 5 X X 9 2 X 0 6 9
step=94300	loss= 0.000161
inp=  5 3 8 6 2 0 3 0 3 7
exp=S 7 3 X 