<a href="https://colab.research.google.com/github/felipeserna/holbertonschool-machine_learning/blob/master/supervised_learning/0x12-transformer_apps/transformer_applications.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# 3-dataset.py
#!/usr/bin/env python3
"""
Class that loads and preps a dataset for machine translation.
Portuguese-English translation dataset.
Approximately 50000 training examples, 1100 validation examples,
and 2000 test examples.
https://www.programmersought.com/article/38506277799/
"""
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds


class Dataset():
    """
    Input pipeline for the 'ted_hrlr_translate/pt_to_en' translation corpus.

    Attributes set by the constructor:
        metadata: the dataset info object returned by tfds.load
        tokenizer_pt: Portuguese sub-word tokenizer built from the train split
        tokenizer_en: English sub-word tokenizer built from the train split
        data_train: tokenized, length-filtered, cached, shuffled, padded and
            batched training split with prefetching
        data_valid: tokenized, length-filtered, padded and batched
            validation split
    """
    def __init__(self, batch_size, max_len):
        """
        Loads the corpus, builds both tokenizers and prepares the pipelines.

        batch_size: number of sentence pairs per padded batch
        max_len: maximum number of tokens allowed on either side of a pair;
            longer pairs are dropped
        """
        splits, info = tfds.load('ted_hrlr_translate/pt_to_en',
                                 with_info=True,
                                 as_supervised=True)

        self.metadata = info
        self.data_train = splits['train']
        self.data_valid = splits['validation']

        # Both vocabularies are learned from the raw training split only
        self.tokenizer_pt, self.tokenizer_en = self.tokenize_dataset(
            self.data_train)

        def short_enough(pt, en):
            """
            Predicate for .filter(): keeps pairs within max_len tokens
            """
            return tf.logical_and(tf.size(pt) <= max_len,
                                  tf.size(en) <= max_len)

        # Pad every sequence to the longest one of its batch
        pad_to_longest = ([None], [None])
        # The shuffle buffer spans the whole (unfiltered) training split
        train_examples = self.metadata.splits['train'].num_examples

        self.data_train = (
            self.data_train
            .map(self.tf_encode)
            .filter(short_enough)
            .cache()
            .shuffle(train_examples)
            .padded_batch(batch_size, padded_shapes=pad_to_longest)
            .prefetch(tf.data.experimental.AUTOTUNE))

        self.data_valid = (
            self.data_valid
            .map(self.tf_encode)
            .filter(short_enough)
            .padded_batch(batch_size, padded_shapes=pad_to_longest))

    def tokenize_dataset(self, data):
        """
        Builds one sub-word tokenizer per language from a dataset of
        (pt, en) sentence pairs, each with a target vocabulary of 2**15.
        Returns: tokenizer_pt, tokenizer_en
        """
        build = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus
        vocab_target = 2**15

        tokenizer_pt = build((source.numpy() for source, _ in data),
                             target_vocab_size=vocab_target)
        tokenizer_en = build((target.numpy() for _, target in data),
                             target_vocab_size=vocab_target)

        return tokenizer_pt, tokenizer_en

    def encode(self, pt, en):
        """
        Converts one sentence pair into token id lists.
        Each list is wrapped with a start id (vocab_size) and an end id
        (vocab_size + 1) of the matching tokenizer.
        Returns: pt_tokens, en_tokens
        """
        pt_start = self.tokenizer_pt.vocab_size
        pt_body = self.tokenizer_pt.encode(pt.numpy())
        pt_tokens = [pt_start] + pt_body + [pt_start + 1]

        en_start = self.tokenizer_en.vocab_size
        en_body = self.tokenizer_en.encode(en.numpy())
        en_tokens = [en_start] + en_body + [en_start + 1]

        return pt_tokens, en_tokens

    def tf_encode(self, pt, en):
        """
        Graph-compatible wrapper around 'encode' so it can be used in map().
        Returns: two int64 tensors of rank 1 and unknown length
        """
        pt_ids, en_ids = tf.py_function(func=self.encode, inp=[pt, en],
                                        Tout=[tf.int64, tf.int64])
        # py_function loses static shape information: restore the rank
        for ids in (pt_ids, en_ids):
            ids.set_shape([None])

        return pt_ids, en_ids

In [5]:
# 4-create_masks.py
# https://www.tensorflow.org/text/tutorials/transformer
def create_padding_mask(seq):
    """
    Flags the padded positions (token id 0) of a batch of sequences.
    Returns: a float32 tensor holding 1.0 where seq == 0 and 0.0 elsewhere,
    with two singleton axes inserted after the first axis so it can be
    broadcast against the attention logits
    """
    is_padding = tf.math.equal(seq, 0)
    mask = tf.cast(is_padding, tf.float32)

    # (batch_size, seq_len) -> (batch_size, 1, 1, seq_len)
    return mask[:, tf.newaxis, tf.newaxis, :]

In [6]:
# 4-create_masks.py
# https://www.tensorflow.org/text/tutorials/transformer
def create_look_ahead_mask(size):
    """
    Builds the mask that hides future tokens from each position.
    Returns: a (size, size) tensor with 1 strictly above the main diagonal
    and 0 on and below it
    """
    # Lower triangle (diagonal included) marks the positions that stay visible
    visible = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - visible

In [7]:
# 4-create_masks.py
# https://www.tensorflow.org/text/tutorials/transformer
def create_masks(inputs, target):
    """
    Builds every mask needed for one training/validation step.

    inputs: batch of input token ids, padded with 0
    target: batch of target token ids, padded with 0

    Returns: encoder_mask, combined_mask, decoder_mask
        encoder_mask: padding mask of the inputs, for the encoder
        combined_mask: element-wise maximum of the target padding mask and
            the look-ahead mask, for the decoder's 1st attention block
        decoder_mask: padding mask of the inputs, for the decoder's 2nd
            attention block (it masks the encoder outputs)
    """
    # Decoder self-attention: hide both padding and future target tokens
    future_mask = create_look_ahead_mask(tf.shape(target)[1])
    target_padding = create_padding_mask(target)
    combined_mask = tf.maximum(target_padding, future_mask)

    # Both input-side masks flag the padded input positions
    encoder_mask = create_padding_mask(inputs)
    decoder_mask = create_padding_mask(inputs)

    return encoder_mask, combined_mask, decoder_mask

In [8]:
# 5-transformer.py
"""
Transformer from project 0x11. Attention.
You may need to make slight adjustments to this model
to get it to functionally train.
https://www.tensorflow.org/text/tutorials/transformer
"""
# import tensorflow.compat.v2 as tf
import numpy as np


def positional_encoding(max_seq_len, dm):
    """
    Calculates the positional encoding for a transformer.

    max_seq_len: number of positions to encode
    dm: model depth (number of columns); odd values are supported, the last
        column then only receives the sine term

    Returns: a numpy.ndarray of shape (max_seq_len, dm)
    containing the positional encoding vectors
    """
    # Column vector of positions and row vector of the even column indices
    positions = np.arange(max_seq_len, dtype=np.float64)[:, np.newaxis]
    even_dims = np.arange(0, dm, 2, dtype=np.float64)

    # angles[pos, k] = pos / 10000 ** (2k / dm)
    angles = positions / np.power(10000.0, even_dims / dm)

    PE = np.zeros((max_seq_len, dm))
    # sin to even indices
    PE[:, 0::2] = np.sin(angles)
    # cos to odd indices; with an odd dm there is one odd column fewer,
    # so the last angle has no cosine slot
    PE[:, 1::2] = np.cos(angles[:, :dm // 2])

    return PE

In [9]:
# 5-transformer.py
def sdp_attention(Q, K, V, mask=None):
    """
    Calculates the scaled dot product attention.

    Q: query tensor, (..., seq_len_q, depth)
    K: key tensor, (..., seq_len_k, depth)
    V: value tensor, (..., seq_len_k, depth_v)
    mask: optional tensor broadcastable to (..., seq_len_q, seq_len_k);
        positions holding 1 are pushed towards zero attention weight

    Returns: output, weights
        output: (..., seq_len_q, depth_v)
        weights: (..., seq_len_q, seq_len_k), summing to 1 on the last axis
    """
    # Dot products of queries and keys, scaled by sqrt of the key depth
    key_depth = tf.cast(tf.shape(K)[-1], tf.float32)
    logits = tf.matmul(Q, K, transpose_b=True) / tf.math.sqrt(key_depth)

    # A large negative value makes masked positions vanish after softmax
    if mask is not None:
        logits = logits + mask * -1e9

    # Normalize over the keys axis
    weights = tf.nn.softmax(logits, axis=-1)

    return tf.matmul(weights, V), weights

In [10]:
# 5-transformer.py
class MultiHeadAttention(tf.keras.layers.Layer):
    """
    Multi head attention layer.

    Attributes:
        h: number of heads
        dm: dimensionality of the model
        depth: depth of each attention head (dm // h)
        Wq, Wk, Wv: Dense layers with dm units producing the query, key and
            value matrices
        linear: Dense layer with dm units producing the attention output
    """
    def __init__(self, dm, h):
        """
        dm: dimensionality of the model
        h: number of heads
        """
        super().__init__()
        self.h = h
        self.dm = dm
        self.depth = dm // h

        projection = tf.keras.layers.Dense
        # Query, key and value projections
        self.Wq = projection(units=dm)
        self.Wk = projection(units=dm)
        self.Wv = projection(units=dm)
        # Projection applied to the concatenated heads
        self.linear = projection(units=dm)

    def split_heads(self, x, batch):
        """
        Reshapes x from (batch, seq_len, dm) to (batch, seq_len, h, depth)
        and swaps the two middle axes.
        Returns: a tensor of shape (batch, h, seq_len, depth)
        """
        per_head = tf.reshape(x, (batch, -1, self.h, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, Q, K, V, mask):
        """
        Q, K, V: inputs used to generate the query, key and value matrices
        mask: mask forwarded to sdp_attention (may be None)

        Returns: output, weights
            output: (batch, seq_len_q, dm)
            weights: (batch, h, seq_len_q, seq_len_k)
        """
        # Dynamic batch size, usable whether or not it is statically known
        batch = tf.shape(Q)[0]

        # Project each input, then split it: (batch, h, seq_len, depth)
        q_heads = self.split_heads(self.Wq(Q), batch)
        k_heads = self.split_heads(self.Wk(K), batch)
        v_heads = self.split_heads(self.Wv(V), batch)

        # attended: (batch, h, seq_len_q, depth)
        attended, weights = sdp_attention(q_heads, k_heads, v_heads, mask)

        # Heads back next to their depth: (batch, seq_len_q, h, depth)
        attended = tf.transpose(attended, perm=[0, 2, 1, 3])
        # Merge the heads: (batch, seq_len_q, dm)
        merged = tf.reshape(attended, (batch, -1, self.dm))

        return self.linear(merged), weights

In [11]:
# 5-transformer.py
class EncoderBlock(tf.keras.layers.Layer):
    """
    One encoder block of a transformer: self-attention followed by a
    position-wise feed forward network, each wrapped with dropout, a
    residual connection and layer normalization.
    """
    def __init__(self, dm, h, hidden, drop_rate=0.1):
        """
        dm: dimensionality of the model
        h: number of attention heads
        hidden: number of units of the hidden feed forward layer
        drop_rate: dropout rate
        """
        super().__init__()
        layers = tf.keras.layers

        # Self-attention
        self.mha = MultiHeadAttention(dm, h)
        # Feed forward network: relu hidden layer, then projection to dm
        self.dense_hidden = layers.Dense(units=hidden, activation='relu')
        self.dense_output = layers.Dense(units=dm)
        # One normalization (epsilon=1e-6) and one dropout per sub-layer
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate=drop_rate)
        self.dropout2 = layers.Dropout(rate=drop_rate)

    def call(self, x, training, mask=None):
        """
        x: tensor of shape (batch, input_seq_len, dm), the block's input
        training: boolean, whether dropout is active
        mask: mask applied by the attention layer, if any

        Returns: a tensor of shape (batch, input_seq_len, dm)
        containing the block's output
        """
        # Sub-layer 1: self-attention + residual + normalization
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        normed_attention = self.layernorm1(x + attended)

        # Sub-layer 2: feed forward + residual + normalization
        projected = self.dense_output(self.dense_hidden(normed_attention))
        projected = self.dropout2(projected, training=training)

        return self.layernorm2(normed_attention + projected)

In [12]:
# 5-transformer.py
class DecoderBlock(tf.keras.layers.Layer):
    """
    One decoder block of a transformer: masked self-attention, attention
    over the encoder output and a position-wise feed forward network, each
    wrapped with dropout, a residual connection and layer normalization.
    """
    def __init__(self, dm, h, hidden, drop_rate=0.1):
        """
        dm: dimensionality of the model
        h: number of attention heads
        hidden: number of units of the hidden feed forward layer
        drop_rate: dropout rate
        """
        super().__init__()
        layers = tf.keras.layers

        # Self-attention, then attention over the encoder output
        self.mha1 = MultiHeadAttention(dm, h)
        self.mha2 = MultiHeadAttention(dm, h)
        # Feed forward network: relu hidden layer, then projection to dm
        self.dense_hidden = layers.Dense(units=hidden, activation='relu')
        self.dense_output = layers.Dense(units=dm)
        # One normalization (epsilon=1e-6) per sub-layer
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        # One dropout per sub-layer
        self.dropout1 = layers.Dropout(rate=drop_rate)
        self.dropout2 = layers.Dropout(rate=drop_rate)
        self.dropout3 = layers.Dropout(rate=drop_rate)

    def call(self, x, encoder_output, training, look_ahead_mask, padding_mask):
        """
        x: tensor of shape (batch, target_seq_len, dm), the block's input
        encoder_output: tensor of shape (batch, input_seq_len, dm)
        training: boolean, whether dropout is active
        look_ahead_mask: mask for the first attention layer
        padding_mask: mask for the second attention layer

        Returns: a tensor of shape (batch, target_seq_len, dm)
        containing the block's output
        """
        # Sub-layer 1: masked self-attention over the target
        self_attended, _ = self.mha1(x, x, x, look_ahead_mask)
        self_attended = self.dropout1(self_attended, training=training)
        normed_self = self.layernorm1(self_attended + x)

        # Sub-layer 2: queries from the target, keys/values from the encoder
        cross_attended, _ = self.mha2(normed_self, encoder_output,
                                      encoder_output, padding_mask)
        cross_attended = self.dropout2(cross_attended, training=training)
        normed_cross = self.layernorm2(cross_attended + normed_self)

        # Sub-layer 3: feed forward network
        projected = self.dense_output(self.dense_hidden(normed_cross))
        projected = self.dropout3(projected, training=training)

        return self.layernorm3(projected + normed_cross)

In [13]:
# 5-transformer.py
class Encoder(tf.keras.layers.Layer):
    """
    Creates the encoder for a transformer
    """
    def __init__(self, N, dm, h, hidden, input_vocab, max_seq_len,
                 drop_rate=0.1):
        """
        Class constructor

        N: number of blocks in the encoder
        dm: dimensionality of the model
        h: number of attention heads
        hidden: number of units of the hidden feed forward layers
        input_vocab: size of the input vocabulary
        max_seq_len: maximum sequence length possible
        drop_rate: dropout rate
        """
        super().__init__()
        # number of blocks in the encoder
        self.N = N
        # dimensionality of the model
        self.dm = dm
        # the embedding layer for the inputs
        self.embedding = tf.keras.layers.Embedding(input_dim=input_vocab,
                                                   output_dim=dm)
        # numpy.ndarray (max_seq_len, dm) containing the positional encodings
        self.positional_encoding = positional_encoding(max_seq_len, self.dm)
        # a list of length N containing all of the EncoderBlock's
        self.blocks = [EncoderBlock(dm, h, hidden, drop_rate)
                       for _ in range(N)]
        # the dropout layer, to be applied to the positional encodings
        self.dropout = tf.keras.layers.Dropout(rate=drop_rate)

    def call(self, x, training, mask):
        """
        x: tensor of shape (batch, input_seq_len) containing the token ids
        training: boolean, whether dropout is active
        mask: mask applied by the attention layers

        Returns: a tensor of shape (batch, input_seq_len, dm)
        containing the encoder output
        """
        # Dynamic length: unlike x.shape[1] it is never None, so the slice
        # below also works in graph mode with variable-length batches
        input_seq_len = tf.shape(x)[1]

        # Compute the embeddings
        # (batch, input_seq_len, dm)
        embeddings = self.embedding(x)
        # Scale the embeddings
        embeddings *= tf.math.sqrt(tf.cast(self.dm, tf.float32))
        # A numpy array cannot be sliced with a tensor index, so the
        # encodings are turned into a tensor of the embeddings' dtype first
        encodings = tf.cast(self.positional_encoding, embeddings.dtype)
        # Sum the positional encodings with the embeddings
        embeddings += encodings[:input_seq_len]

        output = self.dropout(embeddings, training=training)

        for block in self.blocks:
            output = block(output, training, mask)

        return output

In [14]:
# 5-transformer.py
class Decoder(tf.keras.layers.Layer):
    """
    Creates the decoder for a transformer
    """
    def __init__(self, N, dm, h, hidden, target_vocab, max_seq_len,
                 drop_rate=0.1):
        """
        Class constructor

        N: number of blocks in the decoder
        dm: dimensionality of the model
        h: number of attention heads
        hidden: number of units of the hidden feed forward layers
        target_vocab: size of the target vocabulary
        max_seq_len: maximum sequence length possible
        drop_rate: dropout rate
        """
        super().__init__()
        # number of blocks in the decoder
        self.N = N
        # dimensionality of the model
        self.dm = dm
        # the embedding layer for the targets
        self.embedding = tf.keras.layers.Embedding(input_dim=target_vocab,
                                                   output_dim=dm)
        # numpy.ndarray (max_seq_len, dm) containing the positional encodings
        self.positional_encoding = positional_encoding(max_seq_len, dm)
        # a list of length N containing all of the DecoderBlock's
        self.blocks = [DecoderBlock(dm, h, hidden, drop_rate)
                       for _ in range(N)]
        # the dropout layer, to be applied to the positional encodings
        self.dropout = tf.keras.layers.Dropout(rate=drop_rate)

    def call(self, x, encoder_output, training, look_ahead_mask,
             padding_mask):
        """
        x: tensor of shape (batch, target_seq_len) containing the token ids
        encoder_output: tensor of shape (batch, input_seq_len, dm)
        training: boolean, whether dropout is active
        look_ahead_mask: mask for the first attention layer of each block
        padding_mask: mask for the second attention layer of each block

        Returns: a tensor of shape (batch, target_seq_len, dm)
        containing the decoder output
        """
        # Dynamic length: unlike x.shape[1] it is never None, so the slice
        # below also works in graph mode with variable-length batches
        target_seq_len = tf.shape(x)[1]

        # Compute the embeddings
        # (batch, target_seq_len, dm)
        embeddings = self.embedding(x)
        # Scale the embeddings
        embeddings *= tf.math.sqrt(tf.cast(self.dm, tf.float32))
        # A numpy array cannot be sliced with a tensor index, so the
        # encodings are turned into a tensor of the embeddings' dtype first
        encodings = tf.cast(self.positional_encoding, embeddings.dtype)
        # Sum the positional encodings with the embeddings
        embeddings += encodings[:target_seq_len]

        output = self.dropout(embeddings, training=training)

        for block in self.blocks:
            output = block(output, encoder_output, training,
                           look_ahead_mask, padding_mask)

        return output

In [15]:
# 5-transformer.py
class Transformer(tf.keras.Model):
    """
    Full transformer network: an encoder, a decoder and a final linear
    projection onto the target vocabulary.
    """
    def __init__(self, N, dm, h, hidden, input_vocab, target_vocab,
                 max_seq_input, max_seq_target, drop_rate=0.1):
        """
        N: number of blocks in the encoder and in the decoder
        dm: dimensionality of the model
        h: number of attention heads
        hidden: number of units of the hidden feed forward layers
        input_vocab: size of the input vocabulary
        target_vocab: size of the target vocabulary
        max_seq_input: maximum sequence length possible for the input
        max_seq_target: maximum sequence length possible for the target
        drop_rate: dropout rate
        """
        super().__init__()
        self.encoder = Encoder(N, dm, h, hidden, input_vocab, max_seq_input,
                               drop_rate)
        self.decoder = Decoder(N, dm, h, hidden, target_vocab, max_seq_target,
                               drop_rate)
        # Projects the decoder output onto the target vocabulary
        self.linear = tf.keras.layers.Dense(units=target_vocab)

    def call(self, inputs, target, training,
             encoder_mask, look_ahead_mask, decoder_mask):
        """
        inputs: input token ids, forwarded to the encoder
        target: target token ids, forwarded to the decoder
        training: boolean, whether dropout is active
        encoder_mask: mask applied inside the encoder
        look_ahead_mask: mask for the decoder's first attention layers
        decoder_mask: mask for the decoder's second attention layers

        Returns: a tensor of shape (batch, target_seq_len, target_vocab)
        containing the transformer output
        """
        # (batch, input_seq_len, dm)
        encoded = self.encoder(inputs, training, encoder_mask)

        # The decoder returns a single tensor: (batch, target_seq_len, dm)
        decoded = self.decoder(target, encoded, training,
                               look_ahead_mask, decoder_mask)

        return self.linear(decoded)

In [16]:
# 5-train.py
"""
Creates and trains a transformer model
for machine translation of Portuguese to English
using our previously created dataset.
https://www.tensorflow.org/text/tutorials/transformer
"""
# import tensorflow.compat.v2 as tf
# Dataset = __import__('3-dataset').Dataset
# create_masks = __import__('4-create_masks').create_masks
# Transformer = __import__('5-transformer').Transformer


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """
    Learning rate schedule with a warmup phase:
    rsqrt(d_model) * min(rsqrt(step), step * warmup_steps ** -1.5)
    """
    def __init__(self, d_model, warmup_steps=4000):
        """
        Class constructor

        d_model: dimensionality of the model, stored as a float32 tensor
        warmup_steps: number of steps used by the second term of the
            schedule (step * warmup_steps ** -1.5)
        """
        super().__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """
        Computes the learning rate for the given step.

        step: current optimizer step; integer or float, scalar or tensor

        Returns: the learning rate as a float32 tensor
        """
        # Optimizers may pass their integer iteration counter, while rsqrt
        # only accepts floating point input
        step = tf.cast(step, tf.float32)

        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [17]:
# 5-train.py
def train_transformer(N, dm, h, hidden, max_len, batch_size, epochs):
    """
    Creates and trains a transformer on the Portuguese to English dataset.

    N: number of blocks in the encoder and decoder
    dm: dimensionality of the model
    h: number of attention heads
    hidden: number of hidden units in the fully connected layers
    max_len: maximum number of tokens per sequence
    batch_size: batch size for training
    epochs: number of passes over the training data

    Prints the running loss and accuracy every 50 batches and at the end
    of every epoch.

    Returns the trained model
    """
    data = Dataset(batch_size, max_len)

    # Adam driven by the warmup schedule
    optimizer = tf.keras.optimizers.Adam(CustomSchedule(dm),
                                         beta_1=0.9,
                                         beta_2=0.98,
                                         epsilon=1e-9)

    # Per-token loss (no reduction) so that padding can be masked out
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    def loss_function(real, pred):
        """
        Mean cross entropy over the non-padding (non-zero) target tokens
        """
        per_token = loss_object(real, pred)
        keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_token.dtype)

        return tf.reduce_sum(per_token * keep) / tf.reduce_sum(keep)

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')

    # Two extra ids per vocabulary: the start and end tokens
    transformer = Transformer(
        N=N,
        dm=dm,
        h=h,
        hidden=hidden,
        input_vocab=data.tokenizer_pt.vocab_size + 2,
        target_vocab=data.tokenizer_en.vocab_size + 2,
        max_seq_input=max_len,
        max_seq_target=max_len)

    def train_step(inp, tar):
        """
        Runs one optimization step on a batch and updates the metrics
        """
        # The decoder reads tar[:-1] and must predict tar[1:]
        decoder_input = tar[:, :-1]
        expected = tar[:, 1:]

        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            inp, decoder_input)

        with tf.GradientTape() as tape:
            predictions = transformer(inp, decoder_input,
                                      True,
                                      enc_padding_mask,
                                      combined_mask,
                                      dec_padding_mask)
            loss = loss_function(expected, predictions)

        weights = transformer.trainable_variables
        optimizer.apply_gradients(zip(tape.gradient(loss, weights), weights))

        train_loss(loss)
        train_accuracy(expected, predictions)

    for epoch in range(epochs):
        # Metrics are accumulated per epoch
        train_loss.reset_states()
        train_accuracy.reset_states()

        for batch, (inp, tar) in enumerate(data.data_train):
            train_step(inp, tar)

            if batch % 50 == 0:
                print('Epoch {}, batch {}: loss {} accuracy {}'.format(
                    epoch + 1, batch,
                    train_loss.result(), train_accuracy.result()))

        print('Epoch {}: loss {} accuracy {}'.format(
            epoch + 1, train_loss.result(), train_accuracy.result()))

    return transformer

In [18]:
# 5-main.py
import tensorflow as tf


# Seed the global TensorFlow generator, then train a 4-block transformer
# (dm=128, 8 heads, 512 hidden units) for 2 epochs
tf.compat.v1.set_random_seed(0)
transformer = train_transformer(N=4, dm=128, h=8, hidden=512,
                                max_len=32, batch_size=40, epochs=2)
print(type(transformer))

[1mDownloading and preparing dataset ted_hrlr_translate/pt_to_en/1.0.0 (download: 124.94 MiB, generated: Unknown size, total: 124.94 MiB) to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…









HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0.incomplete9FA4OT/ted_hrlr_translate-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=51785.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0.incomplete9FA4OT/ted_hrlr_translate-validation.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1193.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0.incomplete9FA4OT/ted_hrlr_translate-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1803.0), HTML(value='')))

[1mDataset ted_hrlr_translate downloaded and prepared to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0. Subsequent calls will reuse this data.[0m
Epoch 1, batch 0: loss 10.247076034545898 accuracy 0.0
Epoch 1, batch 50: loss 10.212627410888672 accuracy 0.0012723066611215472
Epoch 1, batch 100: loss 10.133011817932129 accuracy 0.011149213649332523
Epoch 1, batch 150: loss 10.015902519226074 accuracy 0.015218124724924564
Epoch 1, batch 200: loss 9.859915733337402 accuracy 0.016002701595425606
Epoch 1, batch 250: loss 9.667802810668945 accuracy 0.01869341917335987
Epoch 1, batch 300: loss 9.447186470031738 accuracy 0.022725222632288933
Epoch 1, batch 350: loss 9.203824996948242 accuracy 0.02688455581665039
Epoch 1, batch 400: loss 8.951216697692871 accuracy 0.031244704499840736
Epoch 1, batch 450: loss 8.70673942565918 accuracy 0.03479441627860069
Epoch 1, batch 500: loss 8.490659713745117 accuracy 0.03763918951153755
Epoch 1, batch 550: loss 8.303115844726562 accuracy 0.