In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding, MultiHeadAttention, Add, Dense, Input
import random

2024-09-04 13:22:19.994363: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


PRE_PROCESS

In [2]:
df = pd.read_csv("cards.csv", usecols=['name', 'type', 'desc', 'atk', 'def', 'level', 'race'])
df['atk'] = df['atk'].fillna(0)
df['def'] = df['def'].fillna(0)
df['level'] = df['level'].fillna(0)

NAME_START_TOKEN = "<NAME_START>"
TYPE_START_TOKEN = "<TYPE_START>"
ATK_START_TOKEN = "<ATK_START>"
DEF_START_TOKEN = "<DEF_START>"
LVL_START_TOKEN = "<LVL_START>"
RACE_START_TOKEN = "<RACE_START>"

NAME_END_TOKEN = "<NAME_END>"
TYPE_END_TOKEN = "<TYPE_END>"
ATK_END_TOKEN = "<ATK_END>"
DEF_END_TOKEN = "<DEF_END>"
LVL_END_TOKEN = "<LVL_END>"
RACE_END_TOKEN = "<RACE_END>"
MASK_TOKEN = "<UNK>"

SPECIAL_TOKENS = [NAME_START_TOKEN, TYPE_START_TOKEN, ATK_START_TOKEN, DEF_START_TOKEN, LVL_START_TOKEN, RACE_START_TOKEN, NAME_END_TOKEN, TYPE_END_TOKEN, ATK_END_TOKEN, DEF_END_TOKEN, LVL_END_TOKEN, RACE_END_TOKEN]

df['text'] = df.apply(lambda row: f"{NAME_START_TOKEN} {row['name']} {NAME_END_TOKEN} {TYPE_START_TOKEN} {row['type']} {TYPE_END_TOKEN} {ATK_START_TOKEN} {int(row['atk'])} {ATK_END_TOKEN} {DEF_START_TOKEN} {int(row['def'])} {DEF_END_TOKEN} {LVL_START_TOKEN} {int(row['level'])} {LVL_END_TOKEN} {RACE_START_TOKEN} {row['race']} {RACE_END_TOKEN}", axis=1)


# using all of them blows up my RAM
print(len(df['text']))
input_entries = df['text']
target_entries = input_entries.copy()
print(len(input_entries))
print(input_entries[:10])

13281
2500
10227    <NAME_START> Shinato, King of a Higher Plane <...
9456     <NAME_START> Red-Eyes Spirit <NAME_END> <TYPE_...
1868     <NAME_START> Cipher Spectrum <NAME_END> <TYPE_...
3329     <NAME_START> Dragunity Legion <NAME_END> <TYPE...
8119     <NAME_START> Number 3: Numeron Gate Trini <NAM...
1289     <NAME_START> Blue Medicine <NAME_END> <TYPE_ST...
6027     <NAME_START> Jester Confit <NAME_END> <TYPE_ST...
10309    <NAME_START> Silent Graveyard <NAME_END> <TYPE...
135      <NAME_START> Adamancipator Seeker <NAME_END> <...
3273     <NAME_START> Dragoncarnation <NAME_END> <TYPE_...
Name: text, dtype: object


In [3]:
masking_prob = 0.3
masked_entries = []
for entry in input_entries:
    tokens = entry.split(' ')
    masked_entry = []
    for token in tokens:
        if np.random.rand() < masking_prob:
            masked_entry.append(MASK_TOKEN)
        else:
            masked_entry.append(token)
    masked_entries.append(" ".join(masked_entry))

masked_entries = pd.Series(masked_entries)
print(masked_entries[:10])

0    <NAME_START> Shinato, <UNK> of a Higher <UNK> ...
1    <NAME_START> Red-Eyes <UNK> <NAME_END> <TYPE_S...
2    <NAME_START> Cipher <UNK> <NAME_END> <TYPE_STA...
3    <NAME_START> Dragunity Legion <NAME_END> <TYPE...
4    <UNK> Number <UNK> <UNK> <UNK> Trini <NAME_END...
5    <NAME_START> Blue Medicine <NAME_END> <TYPE_ST...
6    <NAME_START> <UNK> <UNK> <NAME_END> <TYPE_STAR...
7    <NAME_START> Silent Graveyard <NAME_END> <TYPE...
8    <NAME_START> <UNK> Seeker <NAME_END> <TYPE_STA...
9    <NAME_START> Dragoncarnation <UNK> <UNK> Trap ...
dtype: object


In [4]:
tokenizer = Tokenizer(oov_token=MASK_TOKEN, filters="")
tokenizer.fit_on_texts(input_entries)
total_words = len(tokenizer.word_index) + 1

print(total_words) # VOCAB SIZE
print(tokenizer.to_json())

3340


In [5]:
input_sequences_encoded = tokenizer.texts_to_sequences([ t for t in masked_entries])
target_sequences_encoded = tokenizer.texts_to_sequences([ t for t in target_entries])


max_sequence_len = max([len(x) for x in input_sequences_encoded])
print(max_sequence_len)

26


In [6]:
input_sequences_padded = np.array(pad_sequences(input_sequences_encoded, maxlen=max_sequence_len, padding='post'))
target_sequences_padded = np.array(pad_sequences(target_sequences_encoded, maxlen=max_sequence_len, padding='post'))

target_sequences_shifted = np.roll(target_sequences_padded, 1, axis=1)

target_sequences_shifted[:,0] = tokenizer.word_index['<name_start>']

print(input_sequences_padded.shape)
print(target_sequences_padded.shape)
print(target_sequences_shifted.shape)

(2500, 26)
(2500, 26)
(2500, 26)


# Transformer Model

In [7]:
# Hyperparameters
num_layers=1
embedding_dim = 300
num_heads = 12
ff_dim = 1024
dropout_rate=0.3

def positional_encoding(length, depth):
    # depth is dimensionality of encoding, length is input sequence length
    depth = depth/2

    positions = np.arange(length)[:, np.newaxis] # (length, 1)
    depths = np.arange(depth)[np.newaxis, :]/depth # (1, depth)

    angle_rates = 1 / (10000**depths) # (1, depth)
    angle_rads = positions * angle_rates # (pos, depth)

    pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=1)

    return tf.cast(pos_encoding, dtype=tf.float32)

#Encoder Embedding
encoder_inputs = Input(shape=(max_sequence_len,))
context = tf.keras.layers.Embedding(len(tokenizer.word_index) + 1, embedding_dim)(encoder_inputs)
context += positional_encoding(len(tokenizer.word_index) + 1, embedding_dim)[tf.newaxis, :max_sequence_len, :]

# Encoder
context = tf.keras.layers.Dropout(dropout_rate)(context)
for i in range(num_layers):
    attention_output = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim, dropout=dropout_rate)(query=context, key=context, value=context)
    context = tf.keras.layers.Add()([context, attention_output])
    context = tf.keras.layers.LayerNormalization()(context)

    sequential = tf.keras.Sequential([
        tf.keras.layers.Dense(ff_dim, activation='relu'),
        tf.keras.layers.Dense(embedding_dim),
        tf.keras.layers.Dropout(dropout_rate)
    ])

    context = tf.keras.layers.Add()([context, sequential(context)])
    context = tf.keras.layers.LayerNormalization()(context)

# Decoder Embedding
decoder_inputs = Input(shape=(max_sequence_len,))
x = tf.keras.layers.Embedding(len(tokenizer.word_index) + 1, embedding_dim)(decoder_inputs)
x = x + positional_encoding(len(tokenizer.word_index) + 1, embedding_dim)[tf.newaxis, :max_sequence_len, :]

#Decoder
x = tf.keras.layers.Dropout(dropout_rate)(x)
for i in range(num_layers):
    causal_attention_output = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim, dropout=dropout_rate)(query=x, key=x, value=x, use_causal_mask = True)
    x = tf.keras.layers.Add()([x, causal_attention_output])
    x = tf.keras.layers.LayerNormalization()(x)

    cross_attention_output = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim, dropout=dropout_rate)(query=x, key=context, value=context)
    x = tf.keras.layers.Add()([x, cross_attention_output])
    x = tf.keras.layers.LayerNormalization()(x)

    sequential = tf.keras.Sequential([
        tf.keras.layers.Dense(ff_dim, activation='relu'),
        tf.keras.layers.Dense(embedding_dim),
        tf.keras.layers.Dropout(dropout_rate)
    ])

    x = tf.keras.layers.Add()([x, sequential(x)])
    x = tf.keras.layers.LayerNormalization()(x)

# Final Out
out = tf.keras.layers.Dense(len(tokenizer.word_index) + 1, activation='softmax')(x)


model = Model([encoder_inputs, decoder_inputs], out)

optimizer = Adam(learning_rate=0.0001)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)
model.summary()

In [8]:
class TransformerPredictionCallback(tf.keras.callbacks.Callback):
    def __init__(self, input_sequences, target_sequences, target_tokenizer, transformer_model, max_sequence_len, sample_size=5):
        self.input_sequences = input_sequences
        self.target_sequences = target_sequences
        self.target_tokenizer = target_tokenizer
        self.transformer_model = transformer_model
        self.max_sequence_len = max_sequence_len
        self.sample_size = sample_size

    def decode_sequence(self, input_seq):
        # Generate empty target sequence of length 1
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = tokenizer.word_index['<name_start>']

        # Sampling loop
        decoded_sentence = '<name_start>'
        for i in range(max_sequence_len):
            output_tokens = model([input_seq, np.array(pad_sequences(target_seq, maxlen=self.max_sequence_len, padding='post'))], training=False)

            # Sample a token
            sampled_token_index = np.argmax(output_tokens[0, i, :])
            if sampled_token_index != 0:
              sampled_word = tokenizer.index_word[sampled_token_index]
            else:
              sampled_word = 'pad'
            decoded_sentence += ' ' + sampled_word

            # Exit condition: either hit max length or find stop token
            if sampled_word == '<race_end>' or len(decoded_sentence) > 500:
                stop_condition = True

            new_value = np.zeros((1, 1))
            new_value[0, 0] = sampled_token_index

            target_seq = np.concatenate([target_seq, new_value], axis=1)

        return decoded_sentence


    def on_epoch_end(self, epoch, logs=None):
        if epoch % 10 == 0:
          print(f'\nEpoch {epoch + 1} Predictions:')
          for i in range(self.sample_size):
              choice = random.randint(0, len(self.input_sequences))
              input_seq = self.input_sequences[choice:choice+1]
              decoded_sentence = self.decode_sequence(input_seq)
              actual_sentence = ' '.join([self.target_tokenizer.index_word[index] if index > 0 else "pad" for index in self.target_sequences[choice]])
              print(f'Input {choice}: {input_seq}')
              print(f'Predicted: {decoded_sentence}')
              print(f'Actual: {actual_sentence}\n')

In [9]:
prediction_callback = TransformerPredictionCallback(
    input_sequences=input_sequences_padded,
    target_sequences=target_sequences_padded,
    target_tokenizer=tokenizer,
    transformer_model=model,
    max_sequence_len=max_sequence_len,
    sample_size=3  # Number of samples to display each epoch
)

model.fit([input_sequences_padded, target_sequences_padded], target_sequences_shifted, epochs=30, batch_size=32, validation_split=0.1, callbacks=[prediction_callback])

Epoch 1/30
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - loss: 4.4110
Epoch 1 Predictions:
Input 1568: [[   3  477   23    1 2463    4    5    1   17    6    1    2    8    9
     2   10    1    2   12    1   85    1    0    0    0    0]]
Predicted:  <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start> <name_start>
Actual: <name_start> trial of the princesses <name_end> <type_start> spell card <type_end> <atk_start> 0 <atk_end> <def_start> 0 <def_end> <lvl_start> 0 <lvl_end> <race_start> equip <race_end> pad pad pad pad

Inpu


KeyboardInterrupt



GENERATE

In [None]:
test_example = ["<NAME_START> King <UNK> <NAME_END> <TYPE_START> Ritual Effect <UNK> <TYPE_END> <ATK_START> 5000 <ATK_END> <DEF_START> 3000 <DEF_END> <LVL_START> 10 <LVL_END> <RACE_START> Monster <RACE_END>"]
encoded_test_exmaple = tokenizer.texts_to_sequences([ t for t in test_example])
input_test_example = np.array(pad_sequences(encoded_test_exmaple, maxlen=max_sequence_len, padding='post'))

target_seq = np.zeros((1, 1))
target_seq[0, 0] = tokenizer.word_index['<name_start>']

# Sampling loop
decoded_sentence = '<name_start>'
for i in range(max_sequence_len):
    output_tokens = model([input_test_example, np.array(pad_sequences(target_seq, maxlen=max_sequence_len, padding='post'))], training=False)

    # Sample a token
    sampled_token_index = np.argmax(output_tokens[0, i, :])
    if sampled_token_index != 0:
      sampled_word = tokenizer.index_word[sampled_token_index]
    else:
      sampled_word = 'pad'
    decoded_sentence += ' ' + sampled_word

    # Exit condition: either hit max length or find stop token
    if sampled_word == '<race_end>' or len(decoded_sentence) > 500:
        stop_condition = True

    new_value = np.zeros((1, 1))
    new_value[0, 0] = sampled_token_index

    target_seq = np.concatenate([target_seq, new_value], axis=1)


print(decoded_sentence)
