In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Environment setup: pick the TensorFlow major version where the notebook host
# supports it, then install the one extra package this cell pulls in.
try:
    # NOTE(review): %tensorflow_version looks like the Google Colab line magic
    # for selecting TF 2.x — confirm. The surrounding try/except is a
    # best-effort guard so the cell keeps going on hosts without that magic.
    %tensorflow_version 2.x
except Exception:
    pass
# Disabled install kept for reference; tensorflow_probability is not imported in this cell.
# !pip install tensorflow_probability==0.8.0rc0 --upgrade
# Installs the latest sentencepiece (no version pin) through the shell's pip.
!pip install sentencepiece
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-macosx_10_9_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m944.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


2023-09-22 15:19:27.820628: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Create the working directories next to the notebook. `-p` makes the command
# succeed when the directory already exists, so the cell is safe to re-run.
# Presumably used later for model checkpoints and downloaded data — confirm
# against the cells that write to them.
!mkdir -p checkpoints
!mkdir -p datasets

In [3]:
# Print the installed tensorflow-related package versions (every `pip freeze`
# line containing "tensorflow").
!pip freeze | grep tensorflow
# Query the NVIDIA driver CLI for GPU status; on machines without the tool the
# shell just reports that the command was not found.
!nvidia-smi

tensorflow==2.13.0
tensorflow-addons==0.21.0
tensorflow-datasets==4.9.3
tensorflow-estimator==2.13.0
tensorflow-io-gcs-filesystem==0.34.0
tensorflow-metadata==1.14.0
/bin/bash: nvidia-smi: command not found


In [4]:
# tf.test.is_gpu_available() is deprecated; its own warning points to
# tf.config.list_physical_devices('GPU'), which returns a (possibly empty)
# list of visible GPUs, so bool(...) keeps the printed True/False format.
print('is gpu available?: ', bool(tf.config.list_physical_devices('GPU')))
# Full inventory of local devices (CPU included), displayed as the cell output.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
is gpu available?:  False


[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 6935092614327954695
 xla_global_id: -1]

In [5]:
class Embeddinglayer(tf.keras.layers.Layer):
    """Token embedding scaled by sqrt(d_model) plus a sinusoidal positional encoding.

    Args:
        vocab_size: Size of the vocabulary passed to the Keras Embedding layer.
        d_model: Width of each embedding vector and of the positional encoding.
    """

    def __init__(self, vocab_size, d_model):
        # model hyper parameter variables
        super(Embeddinglayer, self).__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model

        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)

    def call(self, sequences):
        """Embed `sequences` and add the positional encoding.

        Args:
            sequences: Token ids; axis 1 is taken as the sequence length.

        Returns:
            The embedded sequences, scaled by sqrt(d_model), with the
            positional encoding broadcast-added over the batch axis.
        """
        max_sequence_len = sequences.shape[1]
        output = self.embedding(sequences) * tf.sqrt(tf.cast(self.d_model, dtype=tf.float32))

        if max_sequence_len is None:
            # The static length is unknown (e.g. the layer is traced with a
            # variable-length input). np.arange(0, None) would raise, so build
            # the table with TF ops from the runtime length instead.
            output += self._dynamic_positional_encoding(tf.shape(sequences)[1])
        else:
            output += self.positional_encoding(max_sequence_len)

        return output

    def positional_encoding(self, max_len):
        """Build the sinusoidal table with NumPy for a known integer length.

        Args:
            max_len: Python/NumPy integer number of positions.

        Returns:
            float32 tensor of shape (1, max_len, d_model): sine on even
            feature columns, cosine on odd ones.
        """
        pos = np.expand_dims(np.arange(0, max_len), axis=1)
        index = np.expand_dims(np.arange(0, self.d_model), axis=0)

        pe = self.angle(pos, index)

        pe[:, 0::2] = np.sin(pe[:, 0::2])
        pe[:, 1::2] = np.cos(pe[:, 1::2])

        pe = np.expand_dims(pe, axis=0)
        return tf.cast(pe, dtype=tf.float32)

    def _dynamic_positional_encoding(self, max_len):
        """TF-op twin of `positional_encoding` for a scalar integer tensor length.

        Used only when the static sequence length is unknown, so inputs that
        already worked keep going through the NumPy path unchanged.
        """
        # float64 positions mirror the NumPy path, so both paths agree after
        # the final cast to float32.
        pos = tf.expand_dims(tf.cast(tf.range(max_len), dtype=tf.float64), axis=1)
        index = np.expand_dims(np.arange(0, self.d_model), axis=0)

        angles = self.angle(pos, index)

        # Column parity picks sine (even) or cosine (odd); the (1, d_model)
        # condition broadcasts across positions.
        pe = tf.where(index % 2 == 0, tf.sin(angles), tf.cos(angles))

        return tf.cast(tf.expand_dims(pe, axis=0), dtype=tf.float32)

    def angle(self, pos, index):
        """Return pos / 10000^((index - index % 2) / d_model).

        `index - index % 2` rounds each feature index down to the nearest even
        number, so every (even, odd) column pair shares one frequency.
        """
        return pos / np.power(10000, (index - index % 2) / np.float32(self.d_model))
     


In [6]:
class ScaledDotProductAttention(tf.keras.layers.Layer):
    """softmax(Q·Kᵀ / sqrt(d_h)) · V, with an optional additive mask.

    Args:
        d_h: Value whose square root divides the raw attention scores.
    """

    def __init__(self, d_h):
        super().__init__()
        self.d_h = d_h

    def call(self, query, key, value, mask=None):
        """Attend over `value` using `query`/`key` similarity.

        Args:
            query: Per the original notes, (batch_size, attention_head_count, seq_length, d_h).
            key: Tensor whose last two axes are transposed before the product with `query`.
            value: Tensor multiplied by the attention weights.
            mask: Optional tensor; `mask * -1e9` is added to the scores before
                the softmax, so entries equal to 1 receive ~zero weight.

        Returns:
            Tuple `(attended_values, attention_weight)`.
        """
        # Raw similarities: (..., seq_length, seq_length)
        raw_scores = tf.matmul(query, key, transpose_b=True)
        scores = raw_scores / tf.sqrt(tf.cast(self.d_h, dtype=tf.float32))

        if mask is not None:
            scores = scores + mask * -1e9

        # Normalise over the key axis.
        weights = tf.nn.softmax(scores, axis=-1)
        attended = tf.matmul(weights, value)

        return attended, weights

In [None]:

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, attention_head_count, d_model, dropout_prob):
        """Create the projection layers used by multi-head attention.

        Args:
            attention_head_count: Number of heads; must divide `d_model` evenly.
            d_model: Output width of the query/key/value and final Dense layers.
            dropout_prob: Stored on the instance; this constructor builds no
                dropout layer from it.

        Raises:
            ValueError: If `d_model` is not a multiple of `attention_head_count`.
        """
        super(MultiHeadAttention, self).__init__()

        # model hyper parameter variables
        self.attention_head_count = attention_head_count
        self.d_model = d_model
        self.dropout_prob = dropout_prob

        if d_model % attention_head_count != 0:
            # The trailing space keeps the two sentences from running together
            # ("...is not zero.d_model must be...") when the literals are joined.
            raise ValueError(
                f"d_model({d_model}) % attention_head_count({attention_head_count}) is not zero. "
                f"d_model must be multiple of attention_head_count."
            )

        # Width handled by each individual head.
        self.d_h = d_model // attention_head_count

        self.w_query = tf.keras.layers.Dense(d_model)
        self.w_key = tf.keras.layers.Dense(d_model)
        self.w_value = tf.keras.layers.Dense(d_model)

        self.scaled_dot_product = ScaledDotProductAttention(self.d_h)

        # Output projection applied after the heads are concatenated.
        self.ff = tf.keras.layers.Dense(d_model)
    
    def call(self, query, key, value, mask=None):
        """Project the inputs, attend per head, then merge and project the heads.

        Args:
            query: Per the original notes, (batch_size, sentence_length, d_model).
            key: Tensor fed through the key projection.
            value: Tensor fed through the value projection.
            mask: Optional mask forwarded unchanged to the scaled dot-product layer.

        Returns:
            Tuple `(output, attention)`: the result of the final Dense layer and
            the attention weights returned by the scaled dot-product layer.
        """
        batch_size = tf.shape(query)[0]

        # Project, then split:
        # (batch_size, sentence_length, d_model) -> (batch_size, attention_head_count, sentence_length, d_h)
        query_heads = self.split_head(self.w_query(query), batch_size)
        key_heads = self.split_head(self.w_key(key), batch_size)
        value_heads = self.split_head(self.w_value(value), batch_size)

        attended, attention = self.scaled_dot_product(query_heads, key_heads, value_heads, mask)

        merged = self.concat_head(attended, batch_size)
        return self.ff(merged), attention
        
    
    def split_head(self, tensor, batch_size):
        """Split the last axis into heads and move the head axis ahead of the sequence axis.

        Args:
            tensor: Per the original notes, (batch_size, seq_len, d_model).
            batch_size: Size used for the leading axis of the reshape.

        Returns:
            Tensor of shape (batch_size, attention_head_count, seq_len, d_h).
        """
        # (batch_size, seq_len, d_model) -> (batch_size, seq_len, attention_head_count, d_h)
        per_head = tf.reshape(
            tensor, (batch_size, -1, self.attention_head_count, self.d_h)
        )
        # Swap axes 1 and 2 -> (batch_size, attention_head_count, seq_len, d_h)
        return tf.transpose(per_head, [0, 2, 1, 3])
    
    def concat_head(self, tensor, batch_size):
        return tf.reshape(
            tf.transpose(tensor, [0, 2, 1, 3]), 
            (batch_size, -1, self.attention_head_count * self.d_h)