In [1]:
import os
import csv
import numpy as np
import tensorflow as tf

from model.seq2seq.Seq2Seq import *
from model.seq2seq_attn.Seq2Seq_Attn import *
from utils.utils import *

Loading JIT Compiled ChatSpace Model


In [2]:
# Determine what kind of model type we will use
# e.g. seq2seq, seq2seq_attn
model_type = 'seq2seq_attn'
attn_type = 'luong'
method = 'dot'

# GPU:1 allocation
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
@tf.function
def loss_function(true, pred, loss_obj):
    mask = tf.math.logical_not(tf.math.equal(true, 0))

    loss = loss_obj(true, pred)

    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask
    
    return tf.reduce_mean(loss)

In [4]:
def train():
    # Load data
    dataset = load_dataset(data_dir)
    
    num_batches_per_epoch = len(dataset) // batch_size
    
    # Load tokenizer
    enc_tokenizer = load_tokenizer('enc-tokenizer', (x for x, y in dataset), target_vocab_size=2**13)
    dec_tokenizer = load_tokenizer('dec-tokenizer', (y for x, y in dataset), target_vocab_size=2**13)
    enc_vocab_size = enc_tokenizer.vocab_size + 1
    dec_vocab_size = dec_tokenizer.vocab_size + 2
    print(f'enc_vocab_size: {enc_vocab_size}\tdec_vocab_size: {dec_vocab_size}')
    
    # Define the optimizer and the loss function
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    
    if model_type == 'seq2seq':
        # Set a configuration
        config = {'batch_size': batch_size,
                  'enc_max_len': enc_max_len+1,
                  'dec_max_len': dec_max_len+2,
                  'enc_unit': enc_unit,
                  'dec_unit': dec_unit,
                  'embed_dim': embed_dim,
                  'dropout_rate': dropout_rate,
                  'enc_vocab_size': enc_vocab_size,
                  'dec_vocab_size': dec_vocab_size,
                  'dec_sos_token': dec_tokenizer.vocab_size
                  }

        # Define the seq2seq model
        model = seq2seq(config)

        # Set a checkpoint directory
        checkpoint_dir = 'checkpoint/daily-korean/seq2seq'
        
    elif model_type == 'seq2seq_attn':
        # Set a configuration
        config = {'batch_size': batch_size,
                  'enc_max_len': enc_max_len+1,
                  'dec_max_len': dec_max_len+2,
                  'enc_unit': enc_unit,
                  'dec_unit': dec_unit,
                  'embed_dim': embed_dim,
                  'dropout_rate': dropout_rate,
                  'enc_vocab_size': enc_vocab_size,
                  'dec_vocab_size': dec_vocab_size,
                  'dec_sos_token': dec_tokenizer.vocab_size,
                  'attn_type': attn_type,
                  'method': method
                  }

        # Define the seq2seq model
        model = seq2seq_attn(config)

        # Set a checkpoint directory
        checkpoint_dir = 'checkpoint/daily-korean/seq2seq_{}_attn'.format(attn_type)
        
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
        
    checkpoint_prefix = os.path.join(checkpoint_dir, 'checkpoint')
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
    
    epoch_loss = tf.keras.metrics.Mean()
    
    for epoch in range(epochs):
        epoch_loss.reset_states()
        
        train_batches = batch_dataset(dataset, batch_size, enc_tokenizer, dec_tokenizer, enc_max_len, dec_max_len)
        
        for batch_idx, (batch_x, batch_y) in enumerate(train_batches):
            loss = 0.
            with tf.GradientTape() as tape:
                outputs = model(batch_x, batch_y, True)
                
                if model_type == 'seq2seq':
                    preds = outputs
                elif model_type == 'seq2seq_attn':
                    preds = outputs[0]
                    attn_weights = outputs[1]
                    
                loss = loss_function(batch_y[:, 1:], preds, loss_obj)
            
            variables = model.trainable_variables
            gradients = tape.gradient(loss, variables)
            optimizer.apply_gradients(zip(gradients, variables))
            
            epoch_loss(loss)
            
            if (batch_idx + 1) % log_interval == 0:
                print(f'[Epoch {epoch + 1}|Step {batch_idx + 1}/{num_batches_per_epoch}] loss: {loss.numpy()} (Avg. {epoch_loss.result()})')
        
        model.save_weights(filepath=checkpoint_prefix)
    
    print("Training is Done.")

In [5]:
if __name__ == '__main__':
    np.random.seed(1234)
    tf.random.set_seed(1234)
    
    train()

enc_vocab_size: 8633	dec_vocab_size: 7921


W0226 03:24:55.764105 140005697693504 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/util/random_seed.py:58: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


[Epoch 1|Step 50/369] loss: 1.4856469631195068 (Avg. 1.7386714220046997)
[Epoch 1|Step 100/369] loss: 1.5124869346618652 (Avg. 1.6123584508895874)
[Epoch 1|Step 150/369] loss: 1.4398281574249268 (Avg. 1.5446010828018188)
[Epoch 1|Step 200/369] loss: 1.3429992198944092 (Avg. 1.5131295919418335)
[Epoch 1|Step 250/369] loss: 1.3298581838607788 (Avg. 1.4850895404815674)
[Epoch 1|Step 300/369] loss: 1.2465062141418457 (Avg. 1.4575309753417969)
[Epoch 1|Step 350/369] loss: 1.1726826429367065 (Avg. 1.4339226484298706)
[Epoch 2|Step 50/369] loss: 1.149631381034851 (Avg. 1.2216838598251343)
[Epoch 2|Step 100/369] loss: 1.2455790042877197 (Avg. 1.2257002592086792)
[Epoch 2|Step 150/369] loss: 1.2269006967544556 (Avg. 1.2083889245986938)
[Epoch 2|Step 200/369] loss: 1.0734096765518188 (Avg. 1.2010623216629028)
[Epoch 2|Step 250/369] loss: 1.191190481185913 (Avg. 1.1984002590179443)
[Epoch 2|Step 300/369] loss: 1.0066502094268799 (Avg. 1.1980574131011963)
[Epoch 2|Step 350/369] loss: 1.05883944034