In [5]:
def get_positional_encoding(max_length, d_model):
    """Generate positional encodings using sine and cosine functions."""
    angle_rads = np.zeros((max_length, d_model))
    position = np.arange(max_length)[:, np.newaxis]
    i = np.arange(d_model)[np.newaxis, :]
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    angle_rads = position * angle_rates
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])  # Even indices: sin
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])  # Odd indices: cos
    return tf.constant(angle_rads, dtype=tf.float32)

print(get_positional_encoding(5, 10))

tf.Tensor(
[[ 0.0000000e+00  1.0000000e+00  0.0000000e+00  1.0000000e+00
   0.0000000e+00  1.0000000e+00  0.0000000e+00  1.0000000e+00
   0.0000000e+00  1.0000000e+00]
 [ 8.4147096e-01  5.4030228e-01  1.5782665e-01  9.8746681e-01
   2.5116222e-02  9.9968451e-01  3.9810613e-03  9.9999207e-01
   6.3095731e-04  9.9999982e-01]
 [ 9.0929741e-01 -4.1614684e-01  3.1169716e-01  9.5018148e-01
   5.0216600e-02  9.9873835e-01  7.9620592e-03  9.9996829e-01
   1.2619144e-03  9.9999923e-01]
 [ 1.4112000e-01 -9.8999250e-01  4.5775455e-01  8.8907862e-01
   7.5285293e-02  9.9716204e-01  1.1942931e-02  9.9992865e-01
   1.8928709e-03  9.9999821e-01]
 [-7.5680250e-01 -6.5364361e-01  5.9233773e-01  8.0568975e-01
   1.0030649e-01  9.9495661e-01  1.5923614e-02  9.9987322e-01
   2.5238267e-03  9.9999684e-01]], shape=(5, 10), dtype=float32)


In [6]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Hyperparameters
max_length = 500    # Maximum sequence length
num_words = 10000   # Vocabulary size (top 10,000 words)
d_model = 128       # Embedding dimension
num_heads = 8       # Number of attention heads
num_layers = 2      # Number of encoder layers
dff = 512           # Feed-forward network dimension
dropout_rate = 0.1  # Dropout rate

# --- Data Preparation ---
# Load IMDB dataset
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)

# Prepend [CLS] token (using 0) to each sequence
x_train_cls = [[0] + list(seq) for seq in x_train]
x_test_cls = [[0] + list(seq) for seq in x_test]

# Pad sequences to max_length, post-padding with 0s
x_train_padded = pad_sequences(x_train_cls, maxlen=max_length, padding='post', truncating='post')
x_test_padded = pad_sequences(x_test_cls, maxlen=max_length, padding='post', truncating='post')

print(x_train_padded[0])

[   0    1   14   22   16   43  530  973 1622 1385   65  458 4468   66
 3941    4  173   36  256    5   25  100   43  838  112   50  670    2
    9   35  480  284    5  150    4  172  112  167    2  336  385   39
    4  172 4536 1111   17  546   38   13  447    4  192   50   16    6
  147 2025   19   14   22    4 1920 4613  469    4   22   71   87   12
   16   43  530   38   76   15   13 1247    4   22   17  515   17   12
   16  626   18    2    5   62  386   12    8  316    8  106    5    4
 2223 5244   16  480   66 3785   33    4  130   12   16   38  619    5
   25  124   51   36  135   48   25 1415   33    6   22   12  215   28
   77   52    5   14  407   16   82    2    8    4  107  117 5952   15
  256    4    2    7 3766    5  723   36   71   43  530  476   26  400
  317   46    7    4    2 1029   13  104   88    4  381   15  297   98
   32 2071   56   26  141    6  194 7486   18    4  226   22   21  134
  476   26  480    5  144   30 5535   18   51   36   28  224   92   25
  104 

In [7]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Hyperparameters
max_length = 500    # Maximum sequence length
num_words = 10000   # Vocabulary size (top 10,000 words)
d_model = 128       # Embedding dimension
num_heads = 8       # Number of attention heads
num_layers = 2      # Number of encoder layers
dff = 512           # Feed-forward network dimension
dropout_rate = 0.1  # Dropout rate

# --- Data Preparation ---
# Load IMDB dataset
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)

# Prepend [CLS] token (using 0) to each sequence
x_train_cls = [[0] + list(seq) for seq in x_train]
x_test_cls = [[0] + list(seq) for seq in x_test]

# Pad sequences to max_length, post-padding with 0s
x_train_padded = pad_sequences(x_train_cls, maxlen=max_length, padding='post', truncating='post')
x_test_padded = pad_sequences(x_test_cls, maxlen=max_length, padding='post', truncating='post')

x_train_dataset = tf.data.Dataset.from_tensor_slices((x_train_padded, y_train)).batch(32)
x_test_dataset = tf.data.Dataset.from_tensor_slices((x_test_padded, y_test)).batch(32)

# --- Positional Encoding Function ---
def get_positional_encoding(max_length, d_model):
    """Generate positional encodings using sine and cosine functions."""
    angle_rads = np.zeros((max_length, d_model))
    position = np.arange(max_length)[:, np.newaxis]
    i = np.arange(d_model)[np.newaxis, :]
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    angle_rads = position * angle_rates
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])  # Even indices: sin
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])  # Odd indices: cos
    return tf.constant(angle_rads, dtype=tf.float32)

# --- Build the Model ---
# Input layer
inputs = tf.keras.Input(shape=(max_length,))

# Embedding layer (mask_zero=True to ignore padding)
embedding_layer = tf.keras.layers.Embedding(input_dim=num_words, output_dim=d_model, mask_zero=True)
embeddings = embedding_layer(inputs)

# Add positional encoding
pos_encoding = get_positional_encoding(max_length, d_model)
embeddings = embeddings + pos_encoding

# Transformer Encoder Layers
x = embeddings
for _ in range(num_layers):
    # Multi-Head Self-Attention
    attn_output = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=d_model // num_heads
    )(x, x)
    attn_output = tf.keras.layers.Dropout(dropout_rate)(attn_output)
    out1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x + attn_output)

    # Feed-Forward Network
    ffn_output = tf.keras.layers.Dense(dff, activation='relu')(out1)
    ffn_output = tf.keras.layers.Dense(d_model)(ffn_output)
    ffn_output = tf.keras.layers.Dropout(dropout_rate)(ffn_output)
    out2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(out1 + ffn_output)
    x = out2

# Extract [CLS] token output (first token)
cls_output = x[:, 0, :]

# Classification head
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(cls_output)

# Create the model
model = tf.keras.Model(inputs=inputs, outputs=outputs)

# --- Compile and Train ---
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    x_train_dataset,
    epochs=10,
    batch_size=32,
    validation_data=x_test_dataset
)

# --- Optional: Print model summary ---
model.summary()

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m657s[0m 827ms/step - accuracy: 0.5136 - loss: 0.7575 - val_accuracy: 0.7791 - val_loss: 0.4920
Epoch 2/10
[1m532/782[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m2:33[0m 614ms/step - accuracy: 0.7950 - loss: 0.4755

KeyboardInterrupt: 

In [3]:
import matplotlib.pyplot as plt

# Loss 그래프 그리기
plt.figure(figsize=(8, 6))
plt.plot(history.history['loss'], color='red', label='Training Loss')
plt.plot(history.history['val_loss'], color='blue', label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss per Epoch')
plt.legend()
plt.show()

# Accuracy 그래프 그리기
plt.figure(figsize=(8, 6))
plt.plot(history.history['accuracy'], color='red', label='Training Accuracy')
plt.plot(history.history['val_accuracy'], color='blue', label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy per Epoch')
plt.legend()
plt.show()


NameError: name 'history' is not defined

<Figure size 800x600 with 0 Axes>