# Transformer for Classify

このNotebookは、Tensorflowのチュートリアルにある［言語理解のためのTransformerモデル］のコードを、  
分類タスク用に修正したものです。

URL「Transformer model for language understanding」  
https://www.tensorflow.org/tutorials/text/transformer?hl=ja#top_of_page　　


## Analysis base

OSのフォルダは、以下のような構成を想定しています。
* /tf
    * /input
        * /data
        * /encoder
    * /output/checkpoints
    * /notebook

In [None]:
import tensorflow_datasets as tfds
import tensorflow as tf

import tarfile 
import glob
import os
import pandas as pd
from urllib.request import urlretrieve

import re
import mojimoji as mojimoji
from pyknp import Juman
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import time
import numpy as np
import matplotlib.pyplot as plt

## Setup input pipeline

データは［livedoor ニュースコーパス］を利用します。  

URL「livedoor ニュースコーパス」  
https://www.rondhuit.com/download.html#ldcc

### ニュースコーパスをダウンロードしてファイルを展開しall.csvを作成

In [None]:
# URL of the livedoor news corpus archive that gets downloaded.
FILEURL = "https://www.rondhuit.com/download/ldcc-20140209.tar.gz"
# Local path the downloaded archive is written to.
FILEPATH = "../input/data/tmp/ldcc-20140209.tar.gz"
# Directory the archive is extracted into.
TEXTDIR = "../input/data/livedoor/"
# Directory the generated CSV files are written to / read from.
DATADIR = "../input/data/"

In [None]:
# Download the corpus archive and unpack it under TEXTDIR.
# urlretrieve does not create missing parent directories, so make sure the
# download and extraction targets exist before touching the network.
os.makedirs(os.path.dirname(FILEPATH), exist_ok=True)
os.makedirs(TEXTDIR, exist_ok=True)

urlretrieve(FILEURL, FILEPATH)

mode = "r:gz"
# The context manager closes the archive even when extraction raises.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only use it on archives from a trusted source.
with tarfile.open(FILEPATH, mode) as tar:
    tar.extractall(TEXTDIR)

In [None]:
def extract_txt(filename):
    """Return the body of one corpus article as a single string.

    The first two lines of the file (0: URL, 1: timestamp) are skipped.
    Every remaining line is stripped of surrounding whitespace, empty lines
    are dropped, and the rest is concatenated without a separator.

    Args:
        filename: Path of the article text file.

    Returns:
        The concatenated article text ('' when the file has two lines or less).
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale,
    # which is not UTF-8 on every system.
    with open(filename, encoding="utf-8") as text_file:
        body_lines = text_file.readlines()[2:]

    stripped = (line.strip() for line in body_lines)
    return ''.join(line for line in stripped if line != '')

In [None]:
# Category names are the sub-directories of <TEXTDIR>/text, in sorted order.
categories = sorted(
    entry
    for entry in os.listdir(os.path.join(TEXTDIR, "text"))
    if os.path.isdir(os.path.join(TEXTDIR, "text", entry)))

In [None]:
categories

In [None]:
# Translation table: a tab becomes a full-width space, while newlines and
# carriage returns are removed.
table = str.maketrans('\t', '　', '\n\r')

In [None]:
# Collect every article body together with the category it was found in.
all_text, all_label = [], []

for cat in categories:
    # Only files named "<category>*.txt" inside the category directory match.
    pattern = os.path.join(TEXTDIR, "text", cat, "{}*.txt".format(cat))
    for path in sorted(glob.glob(pattern)):
        all_text.append(extract_txt(path).translate(table))
        all_label.append(cat)

In [None]:
df = pd.DataFrame({'text' : all_text, 'label' : all_label})

In [None]:
df.to_csv(os.path.join(DATADIR, "all.csv"), header=True, index=False)

### 前処理をして、文章を単語に分け、トークン化用のencoderを作成

In [None]:
df = pd.read_csv(os.path.join(DATADIR, "all.csv"))

In [None]:
ENCPATH = "../input/encoder/livedoor"

In [None]:
def preprocessing(text):
    """Normalize raw article text before it is split into words.

    Steps, in order: remove newlines / carriage returns and turn tabs into a
    full-width space, lower-case, collapse every run of digits to a single
    "0", apply ``mojimoji.han_to_zen``, delete the listed punctuation
    characters, and finally cap the result at 4096 UTF-8 bytes.

    NOTE: this definition rebinds the module-level name ``preprocessing``,
    which this file also imports from sklearn.

    Args:
        text: The raw text.

    Returns:
        The normalized text, at most 4096 bytes long when UTF-8 encoded.
    """
    table = str.maketrans({
        '\n': '',
        '\t': '　',
        '\r': '',
    })

    text = text.translate(table)
    text = text.lower()
    text = re.sub(r'\d+', '0', text)
    text = mojimoji.han_to_zen(text)
    text = re.sub('[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％]', '', text)

    # Cap the text at 4096 UTF-8 bytes (presumably the analyzer's input
    # limit - TODO confirm). Encode once and cut the byte string; a multi-byte
    # character split by the cut is dropped by errors='ignore'. This yields
    # the same prefix as removing one character at a time, without
    # re-encoding the whole string on every step.
    encoded = text.encode('utf-8')
    if len(encoded) > 4096:
        text = encoded[:4096].decode('utf-8', errors='ignore')

    return text

In [None]:
def parse(text):
    """Split text into words with Juman and join them with full-width spaces.

    Args:
        text: The text to analyse.

    Returns:
        The ``midasi`` of every morpheme returned by the analysis, joined by
        '　'. An empty string when the analysis yields no morphemes.
    """
    # NOTE(review): a new Juman object is created on every call; reusing one
    # instance may be cheaper - confirm against pyknp before changing.
    jumanpp = Juman()
    result = jumanpp.analysis(text)

    # Join once after collecting all surface forms. Joining inside the loop
    # redid the work per morpheme and left the result unbound for an empty
    # morpheme list.
    return '　'.join(mrph.midasi for mrph in result.mrph_list())

In [None]:
df["pre_text"] = df["text"].apply(preprocessing)

In [None]:
df["tokens"] = df["pre_text"].apply(parse)

In [None]:
# The name `preprocessing` refers to the text-normalization function defined
# in this file by the time this runs, so it no longer gives access to
# sklearn's module. Import the encoder class directly instead.
from sklearn.preprocessing import LabelEncoder

lbl_enc = LabelEncoder()
y = lbl_enc.fit_transform(df.label.values)

# Integer class id for every row.
df["target"]  = y

In [None]:
df.head()

In [None]:
df.to_csv(os.path.join(DATADIR, "all_tokens.csv"), header=True, index=False)

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((df.tokens.values, df.target.values))

In [None]:
# Tokenize every document and collect the distinct tokens.
tokenizer = tfds.features.text.Tokenizer(alphanum_only=False)

vocabulary_set = set()
for text_tensor, _ in dataset:
  vocabulary_set.update(tokenizer.tokenize(text_tensor.numpy()))

len(vocabulary_set)

Note: PCスペックに応じてvocabularyリストのトークン数を削減（高PCスペックの場合は不要）

In [None]:
# Shrink the vocabulary to at most 32000 tokens.
# set.pop() removes an arbitrary element, so which tokens survive is not
# controlled here.
for _ in range(max(0, len(vocabulary_set) - 32000)):
    vocabulary_set.pop()

len(vocabulary_set)

In [None]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [None]:
encoder.save_to_file(ENCPATH)

##### target と label のリストを保存

In [None]:
target_list = df[['target', 'label']]

In [None]:
target_list = target_list.drop_duplicates()

In [None]:
target_list .head()

In [None]:
target_list.to_csv(os.path.join(DATADIR, "target_list.csv"), header=True, index=False)

##### encoder のテスト

In [None]:
encoder = tfds.features.text.TokenTextEncoder.load_from_file(ENCPATH)

In [None]:
example_text = next(iter(dataset))[0].numpy().decode('utf-8')
print(example_text)

In [None]:
encoded_example = encoder.encode(example_text)
print(encoded_example)

##### train.csv valid.csv を作成

In [None]:
df = pd.read_csv(os.path.join(DATADIR, "all_tokens.csv"))

In [None]:
df.head()

In [None]:
# Hold out 10% of the rows for validation. The split is shuffled with a fixed
# seed and stratified on the target column.
xtrain, xvalid, ytrain, yvalid = train_test_split(df.tokens.values, df.target.values, 
                                                  stratify=df.target.values, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

In [None]:
train_df = pd.DataFrame({'tokens' : xtrain, 'target' : ytrain})
valid_df = pd.DataFrame({'tokens' : xvalid, 'target' : yvalid})

In [None]:
train_df.to_csv(os.path.join(DATADIR, "train.csv"), header=True, index=False)
valid_df.to_csv(os.path.join(DATADIR, "valid.csv"), header=True, index=False)

##### tensorflow用のデータセット（train_dataset, val_dataset）を作成

In [None]:
train_data =  tf.data.Dataset.from_tensor_slices((train_df.tokens.values, train_df.target.values))
test_data = tf.data.Dataset.from_tensor_slices((valid_df.tokens.values, valid_df.target.values))

In [None]:
encoder.vocab_size

In [None]:
def encode(token, label):
  """Encode one document to ids, wrapped in start and end marker ids.

  The start marker is `encoder.vocab_size` and the end marker is
  `encoder.vocab_size + 1`. The label is passed through untouched.
  """
  start_id = [encoder.vocab_size]
  body_ids = encoder.encode(token.numpy())
  end_id = [encoder.vocab_size+1]
  return start_id + body_ids + end_id, label

In [None]:
def tf_encode(token, label):
    """Run `encode` through tf.py_function and restore the static shapes.

    Returns an int64 id vector of unknown length and a scalar int64 label.
    """
    ids, target = tf.py_function(
        encode, [token, label], [tf.int64, tf.int64])
    # py_function results carry no static shape; declare them explicitly.
    ids.set_shape([None])
    target.set_shape([])
    return ids, target

Note: To keep this example small and relatively fast, drop examples with a length of over 140 tokens.

In [None]:
MAX_LENGTH = 140

In [None]:
def filter_max_length(x, y, max_length=MAX_LENGTH):
  """Return a boolean tensor: True when both x and y have at most max_length elements."""
  x_fits = tf.size(x) <= max_length
  y_fits = tf.size(y) <= max_length
  return tf.logical_and(x_fits, y_fits)

In [None]:
BUFFER_SIZE = encoder.vocab_size+1

In [None]:
# Training data: encode, drop over-long examples, then cache the result in
# memory (speeds up repeated reads) and shuffle.
train_preprocessed = train_data.map(tf_encode).filter(filter_max_length)
train_preprocessed = train_preprocessed.cache().shuffle(BUFFER_SIZE)

# Validation data: encode and drop over-long examples only.
val_preprocessed = test_data.map(tf_encode).filter(filter_max_length)

In [None]:
BATCH_SIZE = 64

In [None]:
# Pad the id sequences within each batch to a common length; the labels are
# scalars and need no padding.
train_dataset = train_preprocessed.padded_batch(
    BATCH_SIZE, padded_shapes=([None], []))
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

val_dataset = val_preprocessed.padded_batch(
    BATCH_SIZE, padded_shapes=([None], []))

##  Positional encoding

In [None]:
def get_angles(pos, i, d_model):
  """Return pos / 10000 ** (2 * (i // 2) / d_model), broadcast over pos and i."""
  exponent = (2 * (i//2)) / np.float32(d_model)
  return pos * (1 / np.power(10000, exponent))

In [None]:
def positional_encoding(position, d_model):
  """Build the sinusoidal positional encoding table.

  Returns a float32 tensor of shape (1, position, d_model) with the sine of
  the angle in the even columns and the cosine in the odd columns.
  """
  positions = np.arange(position)[:, np.newaxis]
  dims = np.arange(d_model)[np.newaxis, :]
  angles = get_angles(positions, dims, d_model)

  # even columns (2i) -> sin, odd columns (2i+1) -> cos
  angles[:, 0::2] = np.sin(angles[:, 0::2])
  angles[:, 1::2] = np.cos(angles[:, 1::2])

  # Prepend a leading axis so the table broadcasts over the batch.
  return tf.cast(angles[np.newaxis, ...], dtype=tf.float32)

In [None]:
pos_encoding = positional_encoding(50, 512)
print (pos_encoding.shape)

plt.pcolormesh(pos_encoding[0], cmap='RdBu')
plt.xlabel('Depth')
plt.xlim((0, 512))
plt.ylabel('Position')
plt.colorbar()
plt.show()

## Masking

In [None]:
def create_padding_mask(seq):
  """Return a float mask that is 1.0 wherever seq equals 0 (padding).

  Two axes are inserted so the result, shaped (batch_size, 1, 1, seq_len),
  can be added to the attention logits.
  """
  is_padding = tf.math.equal(seq, 0)
  mask = tf.cast(is_padding, tf.float32)
  return mask[:, tf.newaxis, tf.newaxis, :]

In [None]:
x = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
create_padding_mask(x)

In [None]:
def create_look_ahead_mask(size):
  """Return a (size, size) mask that is 1 strictly above the diagonal, 0 elsewhere."""
  lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return 1 - lower_triangle

In [None]:
x = tf.random.uniform((1, 3))
temp = create_look_ahead_mask(x.shape[1])
temp

## Scaled dot product attention

In [None]:
def scaled_dot_product_attention(q, k, v, mask=None):
  """Calculate the attention weights.
  q, k, v must have matching leading dimensions.
  k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
  The mask has different shapes depending on its type(padding or look ahead) 
  but it must be broadcastable for addition.
  
  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable 
          to (..., seq_len_q, seq_len_k). Defaults to None, in which case
          no position is masked.
    
  Returns:
    output, attention_weights
  """

  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

  # Scale the logits by sqrt(depth of the keys).
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # Masked positions (mask == 1) get a large negative logit, so softmax
  # assigns them a weight of (almost) zero.
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)

  # softmax is normalized on the last axis (seq_len_k) so that the scores
  # add up to 1.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

In [None]:
def print_out(q, k, v):
  """Run unmasked attention on q, k, v and print the weights and the output."""
  attended, weights = scaled_dot_product_attention(q, k, v, None)
  for caption, value in (('Attention weights are:', weights),
                         ('Output is:', attended)):
    print (caption)
    print (value)

In [None]:
np.set_printoptions(suppress=True)

temp_k = tf.constant([[10,0,0],
                      [0,10,0],
                      [0,0,10],
                      [0,0,10]], dtype=tf.float32)  # (4, 3)

temp_v = tf.constant([[   1,0],
                      [  10,0],
                      [ 100,5],
                      [1000,6]], dtype=tf.float32)  # (4, 2)

# This `query` aligns with the second `key`,
# so the second `value` is returned.
temp_q = tf.constant([[0, 10, 0]], dtype=tf.float32)  # (1, 3)
print_out(temp_q, temp_k, temp_v)

In [None]:
# This query aligns with a repeated key (third and fourth), 
# so all associated values get averaged.
temp_q = tf.constant([[0, 0, 10]], dtype=tf.float32)  # (1, 3)
print_out(temp_q, temp_k, temp_v)

In [None]:
# This query aligns equally with the first and second key, 
# so their values get averaged.
temp_q = tf.constant([[10, 10, 0]], dtype=tf.float32)  # (1, 3)
print_out(temp_q, temp_k, temp_v)

In [None]:
temp_q = tf.constant([[0, 0, 10], [0, 10, 0], [10, 10, 0]], dtype=tf.float32)  # (3, 3)
print_out(temp_q, temp_k, temp_v)

## Multi-head attention

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention layer.

  q, k and v are projected with separate Dense layers, split into
  `num_heads` heads of size `d_model // num_heads`, attended with
  `scaled_dot_product_attention`, merged again and passed through a final
  Dense layer.
  """

  def __init__(self, d_model, num_heads):
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    # Every head receives an equal slice of the model dimension.
    assert d_model % self.num_heads == 0
    self.depth = d_model // self.num_heads

    # Input projections for query, key and value.
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    # Output projection applied to the concatenated heads.
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape (batch_size, seq_len, d_model) to (batch_size, num_heads, seq_len, depth)."""
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    """Return (output, attention_weights).

    output is (batch_size, seq_len_q, d_model) and attention_weights is
    (batch_size, num_heads, seq_len_q, seq_len_k).
    """
    batch_size = tf.shape(q)[0]

    # Project, then split into heads: (batch_size, num_heads, seq_len, depth)
    heads_q = self.split_heads(self.wq(q), batch_size)
    heads_k = self.split_heads(self.wk(k), batch_size)
    heads_v = self.split_heads(self.wv(v), batch_size)

    attended, attention_weights = scaled_dot_product_attention(
        heads_q, heads_k, heads_v, mask)

    # Back to (batch_size, seq_len_q, num_heads, depth), then merge the heads.
    attended = tf.transpose(attended, perm=[0, 2, 1, 3])
    merged = tf.reshape(attended, (batch_size, -1, self.d_model))

    return self.dense(merged), attention_weights

In [None]:
temp_mha = MultiHeadAttention(d_model=512, num_heads=4)
y = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
out, attn = temp_mha(y, k=y, q=y, mask=None)
out.shape, attn.shape

## Point wise feed forward network

In [None]:
def point_wise_feed_forward_network(d_model, dff):
  """Return a two-layer network: Dense(dff, relu) followed by Dense(d_model)."""
  expand = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
  project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  return tf.keras.Sequential([expand, project])

In [None]:
sample_ffn = point_wise_feed_forward_network(512, 2048)
sample_ffn(tf.random.uniform((64, 50, 512))).shape

## Encoder

Classify の場合にはDecoderは不要です。Encoderで得られた値をDenseレイヤーで識別したいクラスの次元数に変換します。  
識別次元数は後述のTransformerクラスで指定します。

### Encoder layer

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention and a feed-forward network.

  Each sub-layer is followed by dropout, a residual connection and layer
  normalization.

  Args:
    d_model: Width of the layer's input and output.
    num_heads: Number of attention heads.
    dff: Width of the hidden feed-forward layer.
    rate: Dropout rate.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    
  def call(self, x, training, mask):
    """Return the block output, shaped (batch_size, input_seq_len, d_model)."""
    # Self-attention: x is used as value, key and query.
    attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
    
    ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
    
    # The debug print of the whole output tensor was removed here.
    return out2

In [None]:
sample_encoder_layer = EncoderLayer(512, 4, 2048)

sample_encoder_layer_output = sample_encoder_layer(
    tf.random.uniform((64, 50, 512)), False, None)

sample_encoder_layer_output.shape  # (batch_size, input_seq_len, d_model)

###  Encoder

In [None]:
class Encoder(tf.keras.layers.Layer):
  """Token embedding + positional encoding followed by a stack of EncoderLayers.

  Args:
    num_layers: Number of EncoderLayer blocks.
    d_model: Embedding / model width.
    num_heads: Number of attention heads per block.
    dff: Width of the hidden feed-forward layer in each block.
    input_vocab_size: Number of rows in the embedding table.
    maximum_position_encoding: Number of positions in the precomputed
      positional encoding table.
    rate: Dropout rate.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, 
                                            self.d_model)
    
    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
  
    self.dropout = tf.keras.layers.Dropout(rate)
           
  def call(self, x, training, mask):
    """Encode a batch of id sequences.

    Returns a tensor of shape (batch_size, input_seq_len, d_model).
    """
    seq_len = tf.shape(x)[1]
    
    # Embed, scale by sqrt(d_model) and add the positional encoding for the
    # first seq_len positions.
    x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)
    
    for enc_layer in self.enc_layers:
      x = enc_layer(x, training, mask)
       
    return x  # (batch_size, input_seq_len, d_model)

In [None]:
sample_encoder = Encoder(num_layers=2, d_model=512, num_heads=8, 
                         dff=2048, input_vocab_size=8500,
                         maximum_position_encoding=10000)
temp_input = tf.random.uniform((64, 62), dtype=tf.int64, minval=0, maxval=200)

sample_encoder_output = sample_encoder(temp_input, training=False, mask=None)

print (sample_encoder_output.shape)  # (batch_size, input_seq_len, d_model)

## Create the Transformer

In [None]:
NUMLABELS = 9

In [None]:
class Transformer(tf.keras.Model):
  """Encoder-only Transformer used as a sequence classifier.

  The encoder output at the first sequence position is passed through a tanh
  Dense layer, dropout and a softmax Dense layer.

  Args:
    num_layers, d_model, num_heads, dff, input_vocab_size, rate: Forwarded
      to `Encoder`.
    pe_input: Forwarded to `Encoder` as maximum_position_encoding.
    target_vocab_size, pe_target: Accepted but not used by this model.
    num_labels: Number of output classes. Defaults to the module-level
      NUMLABELS when None.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               target_vocab_size, pe_input, pe_target, rate=0.1,
               num_labels=None):
    super(Transformer, self).__init__()

    if num_labels is None:
      num_labels = NUMLABELS

    self.encoder = Encoder(num_layers, d_model, num_heads, dff, 
                           input_vocab_size, pe_input, rate)
    self.dense1 = tf.keras.layers.Dense(d_model, activation='tanh')
    self.dropout1 = tf.keras.layers.Dropout(rate)   
    self.final_layer = tf.keras.layers.Dense(num_labels, activation='softmax')
    
  def call(self, inp, tar, training, enc_padding_mask):
    """Return class probabilities of shape (batch_size, num_labels).

    `tar` is accepted but not used in the computation.
    """
    enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

    # Only the encoder output at the first position feeds the classifier head.
    enc_output = self.dense1(enc_output[:,0])  # (batch_size, d_model)
    enc_output = self.dropout1(enc_output, training=training)
    final_output = self.final_layer(enc_output)  # (batch_size, num_labels)
    
    return final_output

In [None]:
# Smoke test: build a small model and run a random batch through it.
# target_vocab_size, pe_target and temp_target are passed along but the
# Transformer class does not use them in its computation.
sample_transformer = Transformer(
    num_layers=4, d_model=128, num_heads=8, dff=512, 
    input_vocab_size=8500, target_vocab_size=8000, 
    pe_input=10000, pe_target=6000)

temp_input = tf.random.uniform((64, 62), dtype=tf.int64, minval=0, maxval=200)
temp_target = tf.random.uniform((64, 62), dtype=tf.int64, minval=0, maxval=200)

fn_out = sample_transformer(temp_input, temp_target, training=False, enc_padding_mask=None)

fn_out.shape  # (batch_size, NUMLABELS)

## Set hyperparameters

In [None]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8

input_vocab_size = encoder.vocab_size + 2
target_vocab_size = encoder.vocab_size + 2
dropout_rate = 0.1

In [None]:
input_vocab_size

## Optimizer

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning-rate schedule with warm-up.

  lr(step) = rsqrt(d_model) * min(rsqrt(step), step * warmup_steps ** -1.5)

  Args:
    d_model: Model width; stored as a float32 tensor.
    warmup_steps: Constant used in the second term of the minimum.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    
    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps
    
  def __call__(self, step):
    # rsqrt is only defined for floating-point tensors, so cast in case the
    # caller passes an integer step counter.
    step = tf.cast(step, tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [None]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [None]:
temp_learning_rate_schedule = CustomSchedule(d_model)

plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")

In [None]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()

In [None]:
def loss_function(labels, pred):
  """Return `loss_object` evaluated on the labels and predictions."""
  return loss_object(labels, pred)

In [None]:
#Computes the (weighted) mean of the given values
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

## Training and checkpointing

In [None]:
# pe_input sets how many positions the positional-encoding table covers.
# Examples longer than MAX_LENGTH ids are filtered out of the datasets, so
# MAX_LENGTH positions are enough; the vocabulary size has no relation to
# sequence positions. pe_target is not used by the model and is set the same
# way for consistency.
transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, 
                          pe_input=MAX_LENGTH, 
                          pe_target=MAX_LENGTH,
                          rate=dropout_rate)

In [None]:
def create_masks(inp):
  """Return the padding mask for the input batch."""
  return create_padding_mask(inp)

In [None]:
CKPTPATH = "../output/checkpoints/"

In [None]:
# Track the model and the optimizer in one checkpoint object.
ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

# Keep at most the five most recent checkpoints under CKPTPATH.
ckpt_manager = tf.train.CheckpointManager(ckpt, CKPTPATH, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

In [None]:
EPOCHS = 100

In [None]:
train_loss_results = []
train_accuracy_results = []
val_loss_results = []
val_accuracy_results = []

In [None]:
# @tf.function compiles train_step into a TF graph, specialized to the shapes
# of its arguments. Sequence length and batch size vary (the last batch is
# smaller), so a generic input_signature is given to avoid re-tracing.
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None,), dtype=tf.int64),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
  """Run one optimization step on a batch and update the training metrics."""
  padding_mask = create_masks(inp)

  with tf.GradientTape() as tape:
    predictions = transformer(inp, tar, True, padding_mask)
    loss = loss_function(tar, predictions)

  variables = transformer.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  train_loss(loss)
  train_accuracy(tar, predictions)

In [None]:
# Generic shapes for the same reason as in the training step: variable
# sequence length and batch size.
val_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None,), dtype=tf.int64),
]

@tf.function(input_signature=val_step_signature)
def val_step(inp, tar):
  """Evaluate one batch (training=False) and update the validation metrics."""
  padding_mask = create_masks(inp)
  predictions = transformer(inp, tar, False, padding_mask)
  val_loss(loss_function(tar, predictions))
  val_accuracy(tar, predictions)

In [None]:
for epoch in range(EPOCHS):
  start = time.time()

  # The metric objects accumulate over batches; clear them for the new epoch.
  for metric in (train_loss, train_accuracy, val_loss, val_accuracy):
    metric.reset_states()

  # inp -> token ids, tar -> class targets
  for batch, (inp, tar) in enumerate(train_dataset):
    train_step(inp, tar)

    # Report progress every 50 batches.
    if batch % 50 != 0:
      continue
    print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
        epoch + 1, batch, train_loss.result(), train_accuracy.result()))

  for inp, tar in val_dataset:
    val_step(inp, tar)

  # Save a checkpoint every fifth epoch.
  if (epoch + 1) % 5 == 0:
    ckpt_save_path = ckpt_manager.save()
    print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                         ckpt_save_path))

  epoch_summary = 'Epoch {} Loss {:.4f} Accuracy {:.4f} VAL_Loss {:.4f} VAL_Accuracy {:.4f}'
  print (epoch_summary.format(epoch + 1,
                              train_loss.result(),
                              train_accuracy.result(),
                              val_loss.result(),
                              val_accuracy.result()))

  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

  # Keep the per-epoch history for plotting.
  train_loss_results.append(train_loss.result())
  train_accuracy_results.append(train_accuracy.result())
  val_loss_results.append(val_loss.result())
  val_accuracy_results.append(val_accuracy.result())

In [None]:
fig, axes = plt.subplots(2, sharex=True, figsize=(12, 8))
fig.suptitle('Training Metrics')

axes[0].set_ylabel("Loss", fontsize=14)
axes[0].plot(train_loss_results,label='train loss')
axes[0].plot(val_loss_results, label='val loss')
axes[0].legend()

axes[1].set_ylabel("Accuracy", fontsize=14)
axes[1].set_xlabel("Epoch", fontsize=14)
axes[1].plot(train_accuracy_results, label='train accuracy')
axes[1].plot(val_accuracy_results, label='val accuracy')
axes[1].legend()
plt.show()

In [None]:
transformer.summary()

## Evaluate

In [None]:
def evaluate(filename, tar):
  """Classify one article file and return the index of the predicted class.

  The file is read with `extract_txt`, normalized with `preprocessing`, split
  into words with `parse`, encoded with start/end marker ids and cut to
  MAX_LENGTH ids before it is fed to the model. Intermediate values and the
  prediction scores are printed.

  Args:
    filename: Path of the article text file.
    tar: Passed to the model as its `tar` argument; the model does not use
      it in its computation.

  Returns:
    The argmax over the model's prediction scores.
  """
  # Reuse the shared reader instead of duplicating its file-parsing logic.
  text = extract_txt(filename)
  print(text)

  # preprocessing() already removes newlines / carriage returns and replaces
  # tabs, so no separate translate step is needed here.
  text = parse(preprocessing(text))
  print(text)
  
  start_token = [encoder.vocab_size]
  end_token = [encoder.vocab_size + 1]
      
  tokens = start_token + encoder.encode(text) + end_token
  # Cut to the maximum length used for the training data. For longer
  # articles this also cuts off the end marker.
  inp = tokens[:MAX_LENGTH]
  encoder_input = tf.expand_dims(inp, 0)  # batch of one
  print(inp)

  enc_padding_mask = create_masks(encoder_input)
  # predictions holds one row of class scores for the single input.
  predictions = transformer(encoder_input, tar, False, enc_padding_mask)

  array_pred = predictions.numpy()
  print("predictions : {}".format(array_pred))
  print("sort by predictions : {}".format(np.sort(array_pred)))
  print("sort by target : {}".format(np.argsort(array_pred)))
  
  return np.argmax(array_pred)

In [None]:
evaluate("../input/data/livedoor/text/it-life-hack/it-life-hack-6296655.txt", "it-life-hack")