In [73]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
import random

PRE_PROCESS

In [76]:
# Load only the card attributes that are needed from the CSV.
df = pd.read_csv("cards.csv", usecols=['name', 'type', 'desc', 'atk', 'def', 'level', 'race'])

# Missing numeric stats become 0 so the int() casts below never see NaN.
for stat_column in ('atk', 'def', 'level'):
    df[stat_column] = df[stat_column].fillna(0)

# Field delimiters: every serialised field is wrapped in a START/END pair.
NAME_START_TOKEN, NAME_END_TOKEN = "<NAME_START>", "<NAME_END>"
TYPE_START_TOKEN, TYPE_END_TOKEN = "<TYPE_START>", "<TYPE_END>"
ATK_START_TOKEN, ATK_END_TOKEN = "<ATK_START>", "<ATK_END>"
DEF_START_TOKEN, DEF_END_TOKEN = "<DEF_START>", "<DEF_END>"
LVL_START_TOKEN, LVL_END_TOKEN = "<LVL_START>", "<LVL_END>"
RACE_START_TOKEN, RACE_END_TOKEN = "<RACE_START>", "<RACE_END>"
# Placeholder written in place of a masked word; also used as the tokenizer's OOV token.
MASK_TOKEN = "<UNK>"

# All START tokens followed by all END tokens.
SPECIAL_TOKENS = [
    NAME_START_TOKEN, TYPE_START_TOKEN, ATK_START_TOKEN,
    DEF_START_TOKEN, LVL_START_TOKEN, RACE_START_TOKEN,
    NAME_END_TOKEN, TYPE_END_TOKEN, ATK_END_TOKEN,
    DEF_END_TOKEN, LVL_END_TOKEN, RACE_END_TOKEN,
]


def _card_to_text(row):
    """Serialise one card row as space-separated '<X_START> value <X_END>' fields."""
    fields = [
        (NAME_START_TOKEN, row['name'], NAME_END_TOKEN),
        (TYPE_START_TOKEN, row['type'], TYPE_END_TOKEN),
        (ATK_START_TOKEN, int(row['atk']), ATK_END_TOKEN),
        (DEF_START_TOKEN, int(row['def']), DEF_END_TOKEN),
        (LVL_START_TOKEN, int(row['level']), LVL_END_TOKEN),
        (RACE_START_TOKEN, row['race'], RACE_END_TOKEN),
    ]
    return " ".join(f"{start} {value} {end}" for start, value, end in fields)


df['text'] = df.apply(_card_to_text, axis=1)

# Train on a fixed random subset only: per the original author's note,
# using every card ran out of RAM.
card_texts = df['text']
print(len(card_texts))
input_entries = card_texts.sample(n=2500, random_state=1)
# The target starts as an unmasked copy of the same texts.
target_entries = input_entries.copy()
print(len(input_entries))
print(input_entries.head(10))

1000
10227    <NAME_START> Shinato, King of a Higher Plane <...
9456     <NAME_START> Red-Eyes Spirit <NAME_END> <TYPE_...
1868     <NAME_START> Cipher Spectrum <NAME_END> <TYPE_...
3329     <NAME_START> Dragunity Legion <NAME_END> <TYPE...
8119     <NAME_START> Number 3: Numeron Gate Trini <NAM...
1289     <NAME_START> Blue Medicine <NAME_END> <TYPE_ST...
6027     <NAME_START> Jester Confit <NAME_END> <TYPE_ST...
10309    <NAME_START> Silent Graveyard <NAME_END> <TYPE...
135      <NAME_START> Adamancipator Seeker <NAME_END> <...
3273     <NAME_START> Dragoncarnation <NAME_END> <TYPE_...
Name: text, dtype: object


In [77]:
# Build the corrupted encoder inputs: each maskable word of every card text is
# replaced by MASK_TOKEN with probability `masking_prob`.
masking_prob = 0.3

# Seeded, local generator so the same words are masked on every
# Restart & Run All (the global NumPy RNG was never seeded).
mask_rng = np.random.default_rng(1)

# Field delimiters are never masked, so the field structure stays visible
# to the encoder.
protected_tokens = set(SPECIAL_TOKENS)

masked_entries = []
for entry in input_entries:
    masked_tokens = []
    for token in entry.split(' '):
        # Empty strings appear when the text contains consecutive spaces. The
        # Keras tokenizer discards them, so turning one into MASK_TOKEN would
        # add a token and misalign the masked input with its target.
        maskable = token != '' and token not in protected_tokens
        if maskable and mask_rng.random() < masking_prob:
            masked_tokens.append(MASK_TOKEN)
        else:
            masked_tokens.append(token)
    masked_entries.append(" ".join(masked_tokens))

# Note: the resulting Series has a fresh 0..n-1 index, not the index of
# `input_entries`; the two are aligned by position only.
masked_entries = pd.Series(masked_entries)
print(masked_entries[:10])

0    <NAME_START> <UNK> King <UNK> a Higher <UNK> <...
1    <NAME_START> Red-Eyes <UNK> <NAME_END> <TYPE_S...
2    <NAME_START> Cipher Spectrum <NAME_END> <TYPE_...
3    <NAME_START> <UNK> Legion <NAME_END> <TYPE_STA...
4    <NAME_START> Number <UNK> Numeron Gate Trini <...
5    <NAME_START> Blue <UNK> <NAME_END> <TYPE_START...
6    <NAME_START> Jester <UNK> <NAME_END> <TYPE_STA...
7    <NAME_START> Silent Graveyard <NAME_END> <TYPE...
8    <NAME_START> <UNK> <UNK> <NAME_END> <TYPE_STAR...
9    <NAME_START> <UNK> <NAME_END> <TYPE_START> Tra...
dtype: object


In [78]:
# Word-level vocabulary learned from the unmasked card texts.
# filters="" disables punctuation stripping, so tokens such as "<name_start>"
# keep their angle brackets. MASK_TOKEN doubles as the out-of-vocabulary token.
tokenizer = Tokenizer(filters="", oov_token=MASK_TOKEN)
tokenizer.fit_on_texts(input_entries)

# +1 because index 0 is reserved by the tokenizer and never assigned to a word.
total_words = 1 + len(tokenizer.word_index)

print(total_words) # VOCAB SIZE
print(tokenizer.to_json())

1690
{"class_name": "Tokenizer", "config": {"num_words": null, "filters": "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n", "lower": true, "split": " ", "char_level": false, "oov_token": "<UNK>", "document_count": 1000, "word_counts": "{\"name\": 2000, \"start\": 6000, \"shinato\": 1, \"king\": 14, \"of\": 108, \"a\": 9, \"higher\": 1, \"plane\": 1, \"end\": 6000, \"type\": 2000, \"ritual\": 22, \"effect\": 406, \"monster\": 621, \"atk\": 2000, \"3300\": 4, \"def\": 2000, \"3000\": 30, \"lvl\": 2000, \"8\": 59, \"race\": 2000, \"fairy\": 46, \"red\": 9, \"eyes\": 13, \"spirit\": 7, \"trap\": 159, \"card\": 370, \"0\": 1308, \"normal\": 235, \"cipher\": 1, \"spectrum\": 1, \"dragunity\": 3, \"legion\": 1, \"number\": 10, \"3\": 73, \"numeron\": 2, \"gate\": 7, \"trini\": 1, \"xyz\": 42, \"1000\": 100, \"100\": 37, \"1\": 62, \"machine\": 73, \"blue\": 8, \"medicine\": 1, \"spell\": 211, \"jester\": 1, \"confit\": 1, \"spellcaster\": 61, \"silent\": 3, \"graveyard\": 2, \"quick\": 33, \"play\": 3

In [79]:
# Convert the masked inputs and the unmasked targets to lists of word indices.
input_sequences_encoded = tokenizer.texts_to_sequences(list(masked_entries))
target_sequences_encoded = tokenizer.texts_to_sequences(list(target_entries))

# The longest encoded input defines the padded length used everywhere below.
max_sequence_len = max(map(len, input_sequences_encoded))
print(max_sequence_len)

38


In [80]:
# Right-pad every sequence with 0 up to max_sequence_len.
# pad_sequences already returns a NumPy array, so no extra np.array() is needed.
input_sequences_padded = pad_sequences(input_sequences_encoded, maxlen=max_sequence_len, padding='post')
target_sequences_padded = pad_sequences(target_sequences_encoded, maxlen=max_sequence_len, padding='post')

# Teacher-forcing target: position t holds the token at position t + 1 of the
# decoder input. The last position is filled with padding (0). np.roll must not
# be used here because it wraps the first token of each row around to the end.
target_sequences_shifted = np.zeros_like(target_sequences_padded)
target_sequences_shifted[:, :-1] = target_sequences_padded[:, 1:]

print(input_sequences_padded.shape)
print(target_sequences_padded.shape)
print(target_sequences_shifted.shape)

(1000, 38)
(1000, 38, 1690)


LSTM Model

In [81]:
# Encoder-decoder LSTM trained with teacher forcing.
latent_dim = 1024     # size of the LSTM hidden and cell state
embedding_dim = 150   # size of the word embeddings
# +1 because index 0 is reserved by the tokenizer (used here as padding).
vocab_size = len(tokenizer.word_index) + 1

# ENCODER: reads the masked, padded card text and keeps only its final states.
encoder_inputs = Input(shape=(max_sequence_len,))
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)

_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]


# DECODER: starts from the encoder states and predicts the next token at each step.
# The time dimension is left unspecified (None) so the same layers accept
# full-length padded sequences during training and single-token (1, 1)
# inputs during step-by-step decoding.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# Per-timestep probability distribution over the vocabulary.
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Sparse loss: targets are integer token ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')
model.summary()

In [None]:
# Inference-time encoder: maps an input sequence to the [h, c] states that seed the decoder.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Inference-time decoder: the LSTM states are explicit inputs and outputs so
# the caller can carry them from one decoding step to the next.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuses the trained decoder layers (embedding output, LSTM, dense).
decoder_lstm_output, state_h, state_c = decoder_lstm(
    decoder_embedding,
    initial_state=decoder_state_inputs,
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_lstm_output)

decoder_model = Model(
    inputs=[decoder_inputs, *decoder_state_inputs],
    outputs=[decoder_outputs, *decoder_states],
)

In [None]:
class Seq2SeqPredictionCallback(tf.keras.callbacks.Callback):
    """Print greedy decodes of random samples at regular intervals during training.

    At the end of every 20th epoch (skipping the first one), `sample_size`
    random rows of `input_sequences` are decoded with the inference models and
    printed next to the corresponding row of `target_sequences`.
    """

    def __init__(self, input_sequences, target_sequences, target_tokenizer, encoder_model, decoder_model, sample_size=5):
        """Store the data and models used for the periodic previews.

        Args:
            input_sequences: Encoder inputs, indexable by row and sliceable.
            target_sequences: Rows of token ids aligned with `input_sequences`.
            target_tokenizer: Tokenizer exposing `word_index` and `index_word`.
            encoder_model: Model whose `predict` returns the decoder's initial states.
            decoder_model: Model whose `predict` returns token probabilities plus new states.
            sample_size: Number of samples printed each time the callback fires.
        """
        # Let the Keras base class initialise its own attributes.
        super().__init__()
        self.input_sequences = input_sequences
        self.target_sequences = target_sequences
        self.target_tokenizer = target_tokenizer
        self.encoder_model = encoder_model
        self.decoder_model = decoder_model
        self.sample_size = sample_size

    def decode_sequence(self, input_seq):
        """Greedily decode one encoder input into a string.

        Decoding starts from '<name_start>' and stops once '<race_end>' is
        produced or the decoded text exceeds 500 characters.

        Args:
            input_seq: Encoder input passed to `encoder_model.predict`.

        Returns:
            The decoded words, each preceded by a single space; padding
            predictions (index 0) are rendered as 'pad'.
        """
        # Encode the input as state vectors
        states_value = self.encoder_model.predict(input_seq, verbose=0)

        # Seed the decoder with a length-1 sequence holding the start token
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = self.target_tokenizer.word_index['<name_start>']

        # Sampling loop
        stop_condition = False
        decoded_sentence = ''
        while not stop_condition:
            output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value, verbose=0)

            # Greedy choice: most probable token at the last timestep
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            if sampled_token_index != 0:
                sampled_word = self.target_tokenizer.index_word[sampled_token_index]
            else:
                # Index 0 has no entry in index_word
                sampled_word = 'pad'
            decoded_sentence += ' ' + sampled_word

            # Exit condition: either hit max length or find stop token
            if sampled_word == '<race_end>' or len(decoded_sentence) > 500:
                stop_condition = True

            # Feed the sampled token and the new states into the next step
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index
            states_value = [h, c]

        return decoded_sentence

    def on_epoch_end(self, epoch, logs=None):
        """Print predictions for random samples on every 20th epoch except epoch 0."""
        if epoch % 20 != 0 or epoch == 0:
            return

        sample_count = len(self.input_sequences)
        if sample_count == 0:
            # Nothing to sample from.
            return

        print(f'\nEpoch {epoch + 1} Predictions:')
        for _ in range(self.sample_size):
            # randrange excludes the upper bound; randint(0, n) could return n,
            # which yields an empty input slice and an out-of-range target index.
            choice = random.randrange(sample_count)
            # Slice (not index) to keep the batch dimension expected by predict
            input_seq = self.input_sequences[choice:choice+1]
            decoded_sentence = self.decode_sequence(input_seq)
            actual_sentence = ' '.join([self.target_tokenizer.index_word[index] if index > 0 else "pad" for index in self.target_sequences[choice]])
            print(f'Input {choice}: {input_seq}')
            print(f'Predicted: {decoded_sentence}')
            print(f'Actual: {actual_sentence}\n')

In [None]:
# Periodic qualitative check: decodes a few random training samples.
prediction_callback = Seq2SeqPredictionCallback(
    input_sequences=input_sequences_padded,
    target_sequences=target_sequences_padded,
    target_tokenizer=tokenizer,
    encoder_model=encoder_model,
    decoder_model=decoder_model,
    sample_size=3  # Number of samples to display each time the callback fires
)

# Encoder sees the masked text and the decoder is teacher-forced with the clean
# text. The target is the clean text shifted one step ahead, given as integer
# token ids because the model is compiled with sparse_categorical_crossentropy.
# (The previously referenced `target_data_one_hot` is never defined.)
model.fit(
    [input_sequences_padded, target_sequences_padded],
    target_sequences_shifted,
    epochs=100,
    batch_size=64,
    callbacks=[prediction_callback],  # the callback was built but never passed in
)

GENERATE

In [None]:
# Hand-written masked card to complete.
test_example = ["<NAME_START> king of a <UNK> <NAME_END> <TYPE_START> Ritual Effect <UNK> <TYPE_END> <ATK_START> 5000 <ATK_END> <DEF_START> 3000 <DEF_END> <LVL_START> 10 <LVL_END> <RACE_START> Monster <RACE_END>"]
encoded_test_example = tokenizer.texts_to_sequences(test_example)
# Pad on the right ('post') exactly like the training inputs; 'pre' padding
# would present the encoder with a layout it never saw during training.
input_test_example = pad_sequences(encoded_test_example, maxlen=max_sequence_len, padding='post')

# Encode the masked card, then seed the decoder with the start token.
states_value = encoder_model.predict(input_test_example, verbose=0)
target_seq = np.zeros((1, 1))
target_seq[0, 0] = tokenizer.word_index[NAME_START_TOKEN.lower()]

# The tokenizer lower-cases text, so compare against the lower-cased end token.
end_word = RACE_END_TOKEN.lower()
# Hard cap on decoding steps: the original loop only checked its stop
# condition after a non-padding token, so a decoder stuck on index 0 never
# terminated. No training sequence is longer than max_sequence_len.
max_decode_steps = 2 * max_sequence_len

decoded_card = ''
for _ in range(max_decode_steps):
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

    # Greedy choice: most probable token at the last timestep.
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    if sampled_token_index != 0:
        sampled_word = tokenizer.index_word[sampled_token_index]
        decoded_card += " " + sampled_word

        if sampled_word == end_word or len(decoded_card) > 1000:
            break

    # Feed the sampled token and the updated states into the next step.
    states_value = [h, c]
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = sampled_token_index

print(decoded_card)
