# Env

In [None]:
# imports
import argparse
import os
import random
import shutil
import json
import zipfile
import math
import copy
import collections
import re

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# import sentencepiece as spm
import tensorflow as tf
import tensorflow.keras.backend as K

from tqdm.notebook import tqdm, trange

In [None]:
# Environment configuration: a Namespace that later cells extend with model settings
args = argparse.Namespace(
    seed=1234,  # random seed value
)

print(args)

Namespace(seed=1234)


In [None]:
# Seed the Python, NumPy and TensorFlow random number generators with the configured seed
random.seed(args.seed)
np.random.seed(args.seed)
tf.random.set_seed(args.seed)

In [None]:
# Check GPU usage (shell command; reports an error when no NVIDIA driver is reachable)
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



# Config

In [None]:
args.d_model = 8  # d_model: model hidden dim
args.n_head = 2  # n_head: number of heads in multi-head attention
args.d_head = 3  # d_head: per-head dim (note: n_head * d_head = 6, which differs from d_model = 8)
args.dropout = 0.1  # dropout: dropout rate
args.d_ff = 32  # d_ff: feed forward dim
args.norm_eps = 1e-9  # norm_eps: layer normalization epsilon
args.n_layer = 6  # n_layer: number of layers
args.n_seq = 16  # n_seq: maximum sequence length
args.n_vocab = 16  # n_vocab: vocab size
args.i_pad = 0  # i_pad: vocab pad id

args

Namespace(d_ff=32, d_head=3, d_model=8, dropout=0.1, i_pad=0, n_head=2, n_layer=6, n_seq=16, n_vocab=16, norm_eps=1e-09, seed=1234)

# Inputs

In [None]:
# Input sentences: each entry is a [source, target] pair
sentences = [
    ['나는 오늘 행복해', '나도 기분이 매우 좋아'],
    # ['나는 오늘 기분이 좋아', '나도 매우 행복하다'],
]

In [None]:
# Split every sentence on whitespace and keep each word once, in first-seen order
words = list(dict.fromkeys(
    token
    for pair in sentences
    for sentence in pair
    for token in sentence.split()
))

# Special tokens take ids 0-3; every word then gets the next free id
word_to_id = {'[PAD]': 0, '[UNK]': 1, '[BOS]': 2, '[EOS]': 3}
for token in words:
    word_to_id[token] = len(word_to_id)

# Reverse lookup: id -> word
id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))

word_to_id, id_to_word

({'[BOS]': 2,
  '[EOS]': 3,
  '[PAD]': 0,
  '[UNK]': 1,
  '기분이': 8,
  '나는': 4,
  '나도': 7,
  '매우': 9,
  '오늘': 5,
  '좋아': 10,
  '행복해': 6},
 {0: '[PAD]',
  1: '[UNK]',
  2: '[BOS]',
  3: '[EOS]',
  4: '나는',
  5: '오늘',
  6: '행복해',
  7: '나도',
  8: '기분이',
  9: '매우',
  10: '좋아'})

In [None]:
# Encode the source (pair[0]) and target (pair[1]) sentence of every pair as word ids.
# NOTE: the name `tarin_tgt_ids` (sic) is kept because a later cell reads it.
train_src_ids = [[word_to_id[word] for word in pair[0].split()] for pair in sentences]
tarin_tgt_ids = [[word_to_id[word] for word in pair[1].split()] for pair in sentences]
train_src_ids, tarin_tgt_ids

([[4, 5, 6]], [[7, 8, 9, 10]])

In [None]:
# Build encoder inputs, decoder inputs ([BOS] + target) and decoder labels (target + [EOS])
bos_id, eos_id = word_to_id['[BOS]'], word_to_id['[EOS]']
id_pairs = list(zip(train_src_ids, tarin_tgt_ids))

train_enc_inputs = [source_id for source_id, _ in id_pairs]
train_dec_inputs = [[bos_id] + target_id for _, target_id in id_pairs]
train_dec_labels = [target_id + [eos_id] for _, target_id in id_pairs]
train_enc_inputs, train_dec_inputs, train_dec_labels

([[4, 5, 6]], [[2, 7, 8, 9, 10]], [[7, 8, 9, 10, 3]])

In [None]:
# Pad every sequence to a fixed length with the [PAD] id.
# New lists are built instead of extending the rows in place: the encoder rows are the
# very same list objects stored in train_src_ids, so `row += ...` also padded
# train_src_ids as a hidden side effect.
pad_id = word_to_id['[PAD]']
n_enc_seq = 4  # encoder max length
n_dec_seq = 6  # decoder max length (inputs and labels)

# Rows already at or above the target length are left unchanged (no truncation).
train_enc_inputs = [row + [pad_id] * (n_enc_seq - len(row)) for row in train_enc_inputs]
train_dec_inputs = [row + [pad_id] * (n_dec_seq - len(row)) for row in train_dec_inputs]
train_dec_labels = [row + [pad_id] * (n_dec_seq - len(row)) for row in train_dec_labels]

train_enc_inputs, train_dec_inputs, train_dec_labels

([[4, 5, 6, 0]], [[2, 7, 8, 9, 10, 0]], [[7, 8, 9, 10, 3, 0]])

In [None]:
# Convert the padded Python lists to NumPy arrays
train_enc_inputs, train_dec_inputs, train_dec_labels = map(
    np.array, (train_enc_inputs, train_dec_inputs, train_dec_labels)
)

train_enc_inputs, train_dec_inputs, train_dec_labels

(array([[4, 5, 6, 0]]),
 array([[ 2,  7,  8,  9, 10,  0]]),
 array([[ 7,  8,  9, 10,  3,  0]]))

In [None]:
# Replace the placeholder config values with ones derived from the toy data:
# the vocab size comes from the built vocabulary; 6 is the padded decoder length used above
args.n_vocab = len(word_to_id)
args.n_seq = 6

args

Namespace(d_ff=32, d_head=3, d_model=8, dropout=0.1, i_pad=0, n_head=2, n_layer=6, n_seq=6, n_vocab=11, norm_eps=1e-09, seed=1234)

In [None]:
# Embedding with random weights: integers drawn from [-90, 100) and scaled by 1/100,
# giving one d_model-sized row per vocab id
embed_weight = np.random.randint(-90, 100, (args.n_vocab, args.d_model)) / 100

embed = tf.keras.layers.Embedding(args.n_vocab, args.d_model, weights=[embed_weight])
embed_weight

array([[-0.06,  0.15, -0.9 ,  0.31,  0.08,  0.  ,  0.71,  0.24],
       [ 0.31, -0.69,  0.29,  0.9 ,  0.26,  0.15,  0.24,  0.79],
       [-0.89,  0.52, -0.87, -0.6 ,  0.5 ,  0.19, -0.71, -0.64],
       [-0.22,  0.33, -0.26,  0.6 ,  0.94,  0.46,  0.82,  0.62],
       [-0.75, -0.18,  0.4 ,  0.54,  0.17,  0.4 , -0.11, -0.44],
       [-0.33,  0.93, -0.54, -0.02,  0.71,  0.31,  0.8 , -0.88],
       [ 0.19, -0.55, -0.72, -0.14,  0.27, -0.09, -0.82, -0.15],
       [-0.75, -0.7 , -0.74, -0.29,  0.23,  0.06,  0.36, -0.33],
       [-0.41, -0.28,  0.95,  0.65, -0.03,  0.73,  0.74, -0.06],
       [-0.29, -0.62, -0.89, -0.79,  0.55, -0.6 , -0.52,  0.08],
       [ 0.63, -0.34,  0.62, -0.78,  0.06, -0.82, -0.43, -0.03]])

In [None]:
# Encoder hidden states: embedding lookup of the encoder input ids
hidden_enc = embed(train_enc_inputs)
hidden_enc

<tf.Tensor: shape=(1, 4, 8), dtype=float32, numpy=
array([[[-0.75, -0.18,  0.4 ,  0.54,  0.17,  0.4 , -0.11, -0.44],
        [-0.33,  0.93, -0.54, -0.02,  0.71,  0.31,  0.8 , -0.88],
        [ 0.19, -0.55, -0.72, -0.14,  0.27, -0.09, -0.82, -0.15],
        [-0.06,  0.15, -0.9 ,  0.31,  0.08,  0.  ,  0.71,  0.24]]],
      dtype=float32)>

In [None]:
# Decoder hidden states: embedding lookup of the decoder input ids (same embedding layer as the encoder)
hidden_dec = embed(train_dec_inputs)
hidden_dec

<tf.Tensor: shape=(1, 6, 8), dtype=float32, numpy=
array([[[-0.89,  0.52, -0.87, -0.6 ,  0.5 ,  0.19, -0.71, -0.64],
        [-0.75, -0.7 , -0.74, -0.29,  0.23,  0.06,  0.36, -0.33],
        [-0.41, -0.28,  0.95,  0.65, -0.03,  0.73,  0.74, -0.06],
        [-0.29, -0.62, -0.89, -0.79,  0.55, -0.6 , -0.52,  0.08],
        [ 0.63, -0.34,  0.62, -0.78,  0.06, -0.82, -0.43, -0.03],
        [-0.06,  0.15, -0.9 ,  0.31,  0.08,  0.  ,  0.71,  0.24]]],
      dtype=float32)>

# Mask

## PAD Mask

In [None]:
def get_pad_mask(tokens, i_pad=0):
    """
    Build the padding mask for a batch of token ids.
    :param tokens: tokens (bs, n_seq)
    :param i_pad: id of pad
    :return mask: pad mask with an extra axis inserted at position 1 (pad: 1, other: 0)
    """
    # 1.0 where the token equals the pad id, 0.0 everywhere else
    is_pad = tf.cast(tf.math.equal(tokens, i_pad), tf.float32)
    # add an axis at position 1 so the mask can broadcast over the query positions
    return tf.expand_dims(is_pad, axis=1)

In [None]:
# Pad mask for the encoder inputs (1 marks the padded position)
enc_pad_mask = get_pad_mask(train_enc_inputs)
enc_pad_mask

<tf.Tensor: shape=(1, 1, 4), dtype=float32, numpy=array([[[0., 0., 0., 1.]]], dtype=float32)>

## Causal Mask

In [None]:
def get_causal_mask(tokens, i_pad=0):
    """
    Build the combined causal (look-ahead) and padding mask.
    :param tokens: tokens (bs, n_seq)
    :param i_pad: id of pad
    :return mask: causal and pad mask (causal or pad: 1, other: 0)
    """
    seq_len = tf.shape(tokens)[1]
    # lower triangle (diagonal included): the positions each step is allowed to see
    visible = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    # invert so that 1 marks future positions, then add a leading batch axis
    ahead_mask = tf.expand_dims(1 - visible, axis=0)
    # a position is masked when it lies in the future OR is padding
    return tf.maximum(ahead_mask, get_pad_mask(tokens, i_pad))

In [None]:
# Causal + pad mask for the decoder inputs
dec_causal_mask = get_causal_mask(train_dec_inputs)
dec_causal_mask

<tf.Tensor: shape=(1, 6, 6), dtype=float32, numpy=
array([[[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1.]]], dtype=float32)>

## Mask 생성

In [None]:
# Encoder self-attention mask: pad mask over the encoder inputs
enc_self_mask = get_pad_mask(train_enc_inputs)
enc_self_mask

<tf.Tensor: shape=(1, 1, 4), dtype=float32, numpy=array([[[0., 0., 0., 1.]]], dtype=float32)>

In [None]:
# Decoder self-attention mask: causal + pad mask over the decoder inputs
dec_self_mask = get_causal_mask(train_dec_inputs)
dec_self_mask

<tf.Tensor: shape=(1, 6, 6), dtype=float32, numpy=
array([[[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1.]]], dtype=float32)>

In [None]:
# Encoder-decoder attention mask: pad mask over the encoder inputs (the keys)
enc_dec_mask = get_pad_mask(train_enc_inputs)
enc_dec_mask

<tf.Tensor: shape=(1, 1, 4), dtype=float32, numpy=array([[[0., 0., 0., 1.]]], dtype=float32)>

# Scaled dot product attention

In [None]:
class ScaleDotProductAttention(tf.keras.layers.Layer):
    """
    Scaled dot-product attention: softmax(Q K^T / sqrt(d_k) - 1e9 * mask) V,
    where d_k is the size of K's last axis.
    """
    def __init__(self, name="scale_dot_product_attention"):
        """
        Constructor
        :param name: layer name
        """
        super().__init__(name=name)

    def call(self, inputs):
        """
        Run the layer
        :param inputs: (Q, K, V, attn_mask) tuple; attn_mask holds 1 at positions to hide
        :return attn_out: attention-weighted sum of V
        """
        query, key, value, mask = inputs
        # scaling factor: square root of the size of K's last axis
        scale = tf.math.sqrt(tf.cast(tf.shape(key)[-1], tf.float32))
        # similarity of every query with every key, scaled down
        logits = tf.matmul(query, key, transpose_b=True) / scale
        # subtract 1e9 at masked positions so softmax assigns them ~0 probability
        logits = logits - 1.e9 * mask
        # attention probabilities over the key axis
        weights = tf.nn.softmax(logits, axis=-1)
        # weighted sum of V
        return tf.matmul(weights, value)

In [None]:
# Encoder self-attention: Q, K and V all come from the encoder hidden states
Q = hidden_enc
K = hidden_enc
V = hidden_enc

attention = ScaleDotProductAttention()
attn_out = attention((Q, K, V, enc_self_mask))
attn_out

<tf.Tensor: shape=(1, 4, 8), dtype=float32, numpy=
array([[[-0.38933986,  0.07112095, -0.16520979,  0.20013933,
          0.36103696,  0.25173742, -0.00415151, -0.50428855],
        [-0.34047824,  0.44622278, -0.3593463 ,  0.0855836 ,
          0.5193899 ,  0.26613465,  0.3384088 , -0.6653293 ],
        [-0.17536503, -0.10861103, -0.38723904,  0.06453344,
          0.3467061 ,  0.13054141, -0.25767758, -0.39562   ],
        [-0.28988382,  0.19854634, -0.34112462,  0.09473331,
          0.43631238,  0.21673402,  0.08101988, -0.547999  ]]],
      dtype=float32)>

In [None]:
# Decoder self-attention: Q, K and V all come from the decoder hidden states
Q = hidden_dec
K = hidden_dec
V = hidden_dec

attn_out = attention((Q, K, V, dec_self_mask))
attn_out

<tf.Tensor: shape=(1, 6, 8), dtype=float32, numpy=
array([[[-0.89      ,  0.52      , -0.87      , -0.6       ,
          0.5       ,  0.19      , -0.71      , -0.64      ],
        [-0.81045854, -0.17314747, -0.7961401 , -0.4238724 ,
          0.34659854,  0.11614002, -0.10207558, -0.46387237],
        [-0.56144136, -0.25976264,  0.28146774,  0.2427778 ,
          0.11005872,  0.49197903,  0.43478104, -0.20985326],
        [-0.5747526 , -0.30629176, -0.7071965 , -0.5053049 ,
          0.41316307, -0.12226334, -0.2576017 , -0.22629057],
        [-0.09017085, -0.32786977, -0.06507014, -0.5238511 ,
          0.2422172 , -0.33344442, -0.24474701, -0.1312621 ],
        [-0.43553102, -0.3018549 , -0.30060858, -0.3476541 ,
          0.28297472, -0.0423666 , -0.08249234, -0.21670395]]],
      dtype=float32)>

In [None]:
# Encoder-decoder attention: queries from the decoder, keys and values from the encoder
Q = hidden_dec
K = hidden_enc
V = hidden_enc

attn_out = attention((Q, K, V, enc_dec_mask))
attn_out

<tf.Tensor: shape=(1, 6, 8), dtype=float32, numpy=
array([[[-0.26965153,  0.15192112, -0.35190547,  0.08786301,
          0.4234491 ,  0.2022787 ,  0.02853429, -0.5244904 ],
        [-0.28569198,  0.08716378, -0.30829674,  0.11374261,
          0.39413947,  0.20345482, -0.02774149, -0.49769774],
        [-0.43647614,  0.11320846, -0.11715899,  0.22951071,
          0.36516273,  0.27868778,  0.05409794, -0.52934116],
        [-0.16213647, -0.08165368, -0.41408706,  0.04850309,
          0.36060563,  0.12689751, -0.2367178 , -0.40590388],
        [-0.21452299, -0.07886577, -0.34552276,  0.08998163,
          0.34810677,  0.15240252, -0.21430917, -0.41410154],
        [-0.28988382,  0.19854634, -0.34112462,  0.09473331,
          0.43631238,  0.21673402,  0.08101988, -0.547999  ]]],
      dtype=float32)>

# Multi Head Attention

In [None]:
# Show the current config (n_head and d_head are read by MultiHeadAttention below)
args

Namespace(d_ff=32, d_head=3, d_model=8, dropout=0.1, i_pad=0, n_head=2, n_layer=6, n_seq=6, n_vocab=11, norm_eps=1e-09, seed=1234)

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """
    Multi Head Attention Class

    Projects Q, K and V to n_head * d_head features, runs scaled dot-product
    attention independently per head, concatenates the heads and projects the
    result back to d_model.
    """
    def __init__(self, args, name="multi_head_attention"):
        """
        Constructor
        :param args: Args object (reads d_model, n_head, d_head)
        :param name: layer name
        """
        super().__init__(name=name)

        self.d_model = args.d_model
        self.n_head = args.n_head
        self.d_head = args.d_head

        # Q, K, V input dense layers
        self.W_Q = tf.keras.layers.Dense(self.n_head * self.d_head)
        self.W_K = tf.keras.layers.Dense(self.n_head * self.d_head)
        self.W_V = tf.keras.layers.Dense(self.n_head * self.d_head)
        # Scale Dot Product Attention class
        self.attention = ScaleDotProductAttention(name="self_attention")
        # output dense layer
        self.W_O = tf.keras.layers.Dense(self.d_model)

    def _split_heads(self, x):
        """
        Split the last axis into heads.
        :param x: projected tensor (bs, len, n_head * d_head)
        :return: tensor (bs, n_head, len, d_head)
        """
        x = tf.reshape(x, [-1, tf.shape(x)[1], self.n_head, self.d_head])
        return tf.transpose(x, [0, 2, 1, 3])

    def call(self, inputs):
        """
        Run the layer
        :param inputs: Q, K, V, attn_mask tuple
        :return attn_out: attention result (bs, Q_len, d_model)
        """
        Q, K, V, attn_mask = inputs

        # build multihead Q, K, V
        Q_m = self._split_heads(self.W_Q(Q))  # (bs, n_head, Q_len, d_head)
        K_m = self._split_heads(self.W_K(K))  # (bs, n_head, K_len, d_head)
        V_m = self._split_heads(self.W_V(V))  # (bs, n_head, K_len, d_head)
        # add a head axis to the mask so it broadcasts across every head
        attn_mask_m = tf.expand_dims(attn_mask, axis=1)
        # Scale Dot Product Attention with multi head Q, K, V, attn_mask
        attn_out_m = self.attention((Q_m, K_m, V_m, attn_mask_m))  # (bs, n_head, Q_len, d_head)
        # move the head axis back next to d_head and concatenate the heads
        attn_out_t = tf.transpose(attn_out_m, perm=[0, 2, 1, 3])  # (bs, Q_len, n_head, d_head)
        attn_out_c = tf.reshape(attn_out_t, [-1, tf.shape(Q)[1], self.n_head * self.d_head])  # (bs, Q_len, n_head * d_head)
        # linear for output
        attn_out = self.W_O(attn_out_c)  # (bs, Q_len, d_model)
        return attn_out

In [None]:
# Encoder self-attention with multi-head attention
Q = hidden_enc
K = hidden_enc
V = hidden_enc

attention = MultiHeadAttention(args)
attn_out = attention((Q, K, V, enc_self_mask))
attn_out

(<tf.Tensor: shape=(1, 4, 8), dtype=float32, numpy=
array([[[-0.75, -0.18,  0.4 ,  0.54,  0.17,  0.4 , -0.11, -0.44],
        [-0.33,  0.93, -0.54, -0.02,  0.71,  0.31,  0.8 , -0.88],
        [ 0.19, -0.55, -0.72, -0.14,  0.27, -0.09, -0.82, -0.15],
        [-0.06,  0.15, -0.9 ,  0.31,  0.08,  0.  ,  0.71,  0.24]]],
      dtype=float32)>, <tf.Tensor: shape=(1, 4, 8), dtype=float32, numpy=
array([[[-0.75, -0.18,  0.4 ,  0.54,  0.17,  0.4 , -0.11, -0.44],
        [-0.33,  0.93, -0.54, -0.02,  0.71,  0.31,  0.8 , -0.88],
        [ 0.19, -0.55, -0.72, -0.14,  0.27, -0.09, -0.82, -0.15],
        [-0.06,  0.15, -0.9 ,  0.31,  0.08,  0.  ,  0.71,  0.24]]],
      dtype=float32)>, <tf.Tensor: shape=(1, 4, 8), dtype=float32, numpy=
array([[[-0.75, -0.18,  0.4 ,  0.54,  0.17,  0.4 , -0.11, -0.44],
        [-0.33,  0.93, -0.54, -0.02,  0.71,  0.31,  0.8 , -0.88],
        [ 0.19, -0.55, -0.72, -0.14,  0.27, -0.09, -0.82, -0.15],
        [-0.06,  0.15, -0.9 ,  0.31,  0.08,  0.  ,  0.71,  0.24]]],
  

<tf.Tensor: shape=(1, 4, 8), dtype=float32, numpy=
array([[[-0.49435332, -0.2405638 , -0.25250086,  0.38179183,
         -0.00297513, -0.05331107, -0.18338923,  0.37701666],
        [-0.6845567 , -0.00480074, -0.04757795,  0.45028195,
         -0.0696075 , -0.03893483, -0.17173499,  0.42125192],
        [-0.549003  , -0.22678691, -0.23868808,  0.42498147,
         -0.04111366, -0.02519875, -0.1796784 ,  0.39987102],
        [-0.6031012 , -0.02820833, -0.07853937,  0.38559395,
          0.00413214, -0.09124707, -0.16833101,  0.39549625]]],
      dtype=float32)>

In [None]:
# The decoder mask is a causal mask, so it differs from the encoder's pad mask
dec_self_mask

<tf.Tensor: shape=(1, 6, 6), dtype=float32, numpy=
array([[[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1.]]], dtype=float32)>

In [None]:
# Decoder self-attention with multi-head attention
Q = hidden_dec
K = hidden_dec
V = hidden_dec

attn_out = attention((Q, K, V, dec_self_mask)) # same call as the encoder case; only the mask differs
attn_out

(<tf.Tensor: shape=(1, 6, 8), dtype=float32, numpy=
array([[[-0.52,  0.46,  0.92,  0.55, -0.49, -0.43,  0.57,  0.81],
        [ 0.88, -0.39,  0.87,  0.16, -0.86, -0.87,  0.37, -0.02],
        [-0.44,  0.04,  0.85, -0.27, -0.06, -0.85,  0.34, -0.18],
        [-0.18,  0.94, -0.76, -0.09, -0.3 ,  0.12, -0.82, -0.49],
        [ 0.71, -0.21, -0.27,  0.19,  0.26, -0.73, -0.89, -0.89],
        [-0.65, -0.69,  0.6 ,  0.83, -0.71, -0.49,  0.85,  0.74]]],
      dtype=float32)>, <tf.Tensor: shape=(1, 6, 8), dtype=float32, numpy=
array([[[-0.52,  0.46,  0.92,  0.55, -0.49, -0.43,  0.57,  0.81],
        [ 0.88, -0.39,  0.87,  0.16, -0.86, -0.87,  0.37, -0.02],
        [-0.44,  0.04,  0.85, -0.27, -0.06, -0.85,  0.34, -0.18],
        [-0.18,  0.94, -0.76, -0.09, -0.3 ,  0.12, -0.82, -0.49],
        [ 0.71, -0.21, -0.27,  0.19,  0.26, -0.73, -0.89, -0.89],
        [-0.65, -0.69,  0.6 ,  0.83, -0.71, -0.49,  0.85,  0.74]]],
      dtype=float32)>, <tf.Tensor: shape=(1, 6, 8), dtype=float32, numpy=
arra

In [None]:
# Encoder-decoder attention with multi-head attention
Q = hidden_dec
K = hidden_enc
V = hidden_enc

attn_out = attention((Q, K, V, enc_dec_mask)) # the mask is built from the encoder inputs, which also supply K
attn_out

(<tf.Tensor: shape=(1, 6, 8), dtype=float32, numpy=
array([[[-0.89,  0.52, -0.87, -0.6 ,  0.5 ,  0.19, -0.71, -0.64],
        [-0.75, -0.7 , -0.74, -0.29,  0.23,  0.06,  0.36, -0.33],
        [-0.41, -0.28,  0.95,  0.65, -0.03,  0.73,  0.74, -0.06],
        [-0.29, -0.62, -0.89, -0.79,  0.55, -0.6 , -0.52,  0.08],
        [ 0.63, -0.34,  0.62, -0.78,  0.06, -0.82, -0.43, -0.03],
        [-0.06,  0.15, -0.9 ,  0.31,  0.08,  0.  ,  0.71,  0.24]]],
      dtype=float32)>, <tf.Tensor: shape=(1, 4, 8), dtype=float32, numpy=
array([[[-0.75, -0.18,  0.4 ,  0.54,  0.17,  0.4 , -0.11, -0.44],
        [-0.33,  0.93, -0.54, -0.02,  0.71,  0.31,  0.8 , -0.88],
        [ 0.19, -0.55, -0.72, -0.14,  0.27, -0.09, -0.82, -0.15],
        [-0.06,  0.15, -0.9 ,  0.31,  0.08,  0.  ,  0.71,  0.24]]],
      dtype=float32)>, <tf.Tensor: shape=(1, 4, 8), dtype=float32, numpy=
array([[[-0.75, -0.18,  0.4 ,  0.54,  0.17,  0.4 , -0.11, -0.44],
        [-0.33,  0.93, -0.54, -0.02,  0.71,  0.31,  0.8 , -0.88],
    