In [1]:
import numpy as np
import random
from neuralnetlib.layers import Input, Embedding, LSTM, Dense
from neuralnetlib.model import Model
from neuralnetlib.preprocessing import one_hot_encode
from neuralnetlib.callbacks import EarlyStopping

In [2]:
with open('dinos.txt', 'r', encoding='utf-8') as f:
    names = [line.strip() for line in f]

print(names[:5])  # we display the first 5 names of the list to see if they were correctly loaded

['Aachenosaurus', 'Aardonyx', 'Abdallahsaurus', 'Abelisaurus', 'Abrictosaurus']


In [3]:
lengths = [len(name) for name in names]
max_length = max(lengths)
print(f"Maximum length: {max_length}")

Maximum length: 26


In [4]:
# Constants
PAD_TOKEN = ''  # Padding token (index 0)
EOS_TOKEN = '$'  # End of sequence token (index 1)
max_length = 15  # Maximum sequence length

# Mapping dictionaries
char_to_index = {PAD_TOKEN: 0, EOS_TOKEN: 1}
index_to_char = {0: PAD_TOKEN, 1: EOS_TOKEN}

# Extract unique characters and sort them
unique_chars = sorted(set(''.join(names)))

# Build character <-> index mappings starting at 2
for idx, char in enumerate(unique_chars, start=2):
    char_to_index[char] = idx
    index_to_char[idx] = char

vocab_size = len(char_to_index)
print(f"Vocab size: {vocab_size}")

Vocab size: 54


In [5]:
# Training sequences
sequences = []
next_chars = []

# Create sequences and their next characters
for name in names:
    name = name.lower()
    name_chars = list(name) + [EOS_TOKEN]

    for i in range(len(name_chars) - 1):
        # Extract sequence
        seq = name_chars[max(0, i - max_length + 1):i + 1]

        # Padding and conversion to indices
        padded_seq = [0] * (max_length - len(seq)) + [char_to_index[char] for char in seq]

        sequences.append(padded_seq)
        next_chars.append(char_to_index[name_chars[i + 1]])

# Convert to numpy arrays
X = np.array(sequences)
y = one_hot_encode(np.array(next_chars), vocab_size)

print(f"Vocabulary size: {vocab_size}")
print(f"Shape of X data: {X.shape}")
print(f"Shape of y labels: {y.shape}")

# Display an example for verification
print(f"\nExample for {names[0]}:")
print(f"Input sequence: {X[5]}")
print(f"Expected output: {y[5]}")

# Visualize tokens for the first example
print("\nDecoding example sequence:")
print([index_to_char[idx] for idx in X[5]])
print(f"Next character: {index_to_char[next_chars[5]]}")

Vocabulary size: 54
Shape of X data: (18374, 15)
Shape of y labels: (18374, 54)

Example for Aachenosaurus:
Input sequence: [ 0  0  0  0  0  0  0  0  0 28 28 30 35 32 41]
Expected output: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]

Decoding example sequence:
['', '', '', '', '', '', '', '', '', 'a', 'a', 'c', 'h', 'e', 'n']
Next character: o


In [6]:
# Model creation
embedding_dim = 32
lstm_units = 128

model = Model()
model.add(Input(max_length))
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(LSTM(units=lstm_units))
model.add(Dense(units=vocab_size, activation='softmax'))

model.compile(loss_function='categorical_crossentropy', optimizer='adam')
model.summary()

Model
-------------------------------------------------
Layer 1: Input(input_shape=(15,))
Layer 2: Embedding(input_dim=54, output_dim=32)
Layer 3: LSTM(units=128, return_sequences=False, return_state=False)
Layer 4: Dense(units=54)
Layer 5: Activation(Softmax)
-------------------------------------------------
Loss function: CategoricalCrossentropy
Optimizer: Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, clip_norm=None, clip_value=None)
-------------------------------------------------


In [7]:
# Create EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='loss',
    patience=5,
    restore_best_weights=True
)

# Train the model
history = model.fit(
    X, y,
    epochs=100,
    batch_size=64,
    callbacks=[early_stopping],
    validation_data=(X, y),
)



KeyboardInterrupt: 

In [None]:
# Generate new names
def generate_name(model, min_length=5):
    current_sequence = [0] * max_length
    generated_name = ""

    while len(generated_name) < max_length:
        x = np.array([current_sequence])
        preds = model.predict(x)[0]

        # Select next character using random.choices
        next_char_idx = random.choices(range(vocab_size), weights=preds, k=1)[0]
        next_char = index_to_char[next_char_idx]

        # STOP if minimum length reached and EOS encountered
        if len(generated_name) >= min_length and next_char == EOS_TOKEN:
            break

        # Add character if it's neither PAD nor EOS
        if next_char not in [PAD_TOKEN, EOS_TOKEN]:
            generated_name += next_char

        # Update current sequence
        current_sequence = current_sequence[1:] + [next_char_idx]

    return generated_name.capitalize() if len(generated_name) >= min_length else None

# Generate multiple names
generated_names = []
number_of_names = 5
min_length = 5

while len(generated_names) < number_of_names:
    name = generate_name(model, min_length)
    if name is not None and name not in generated_names:
        generated_names.append(name)

# Display results
print("\nGenerated names:")
for name in generated_names:
    print(f"{name} ({len(name)} characters)")

# Check originality
print("\nAre all names original?", all(name.lower() not in [n.lower() for n in names] for name in generated_names))

# Length statistics
lengths = [len(name) for name in generated_names]
print(f"\nAverage length: {sum(lengths)/len(lengths):.1f} characters")
print(f"Minimum length: {min(lengths)} characters")
print(f"Maximum length: {max(lengths)} characters")