In [1]:
import numpy as np

from neuralnetlib.layers import Input, Embedding, LSTM, Dense
from neuralnetlib.models import Sequential
from neuralnetlib.preprocessing import one_hot_encode
from neuralnetlib.callbacks import EarlyStopping

In [2]:
with open('dinos.txt', 'r', encoding='utf-8') as f:
    names = [line.strip() for line in f]

print(names[:5])  # display the first 5 names of the list to check if they were loaded correctly

['Aachenosaurus', 'Aardonyx', 'Abdallahsaurus', 'Abelisaurus', 'Abrictosaurus']


In [3]:
lengths = [len(name) for name in names]
max_length = max(lengths)
print(f"Maximum length: {max_length}")

Maximum length: 26


In [4]:
# Constants
PAD_TOKEN = ''   # Padding token (index 0)
EOS_TOKEN = '$'  # End of sequence token (index 1)
max_length = 15  # Maximum sequence length

# Mapping dictionaries
char_to_index = {PAD_TOKEN: 0, EOS_TOKEN: 1}
index_to_char = {0: PAD_TOKEN, 1: EOS_TOKEN}

# Extract unique characters and sort them
unique_chars = sorted(set(''.join(names)))

# Build character <-> index mappings starting at index 2
for idx, char in enumerate(unique_chars, start=2):
    char_to_index[char] = idx
    index_to_char[idx] = char

vocab_size = len(char_to_index)
print(f"Vocab size: {vocab_size}")

Vocab size: 54


In [5]:
# Training sequences
sequences = []
next_chars = []

# Create sequences and next characters
for name in names:
    name = name.lower()
    name_chars = list(name) + [EOS_TOKEN]

    for i in range(len(name_chars) - 1):
        # Extract sequence
        seq = name_chars[max(0, i - max_length + 1):i + 1]

        # Padding and conversion to indices
        padded_seq = [0] * (max_length - len(seq)) + [char_to_index[char] for char in seq]

        sequences.append(padded_seq)
        next_chars.append(char_to_index[name_chars[i + 1]])

# Convert to numpy arrays
X = np.array(sequences)
y = one_hot_encode(np.array(next_chars), vocab_size)

print(f"Vocabulary size: {vocab_size}")
print(f"X data shape: {X.shape}")
print(f"y labels shape: {y.shape}")

# Display an example for verification
print(f"\nExample for {names[0]}:")
print(f"Input sequence: {X[5]}")
print(f"Expected output: {y[5]}")

# Visualize tokens for the first example
print("\nDecoding example sequence:")
print([index_to_char[idx] for idx in X[5]])
print(f"Next character: {index_to_char[next_chars[5]]}")

Vocabulary size: 54
X data shape: (18374, 15)
y labels shape: (18374, 54)

Example for Aachenosaurus:
Input sequence: [ 0  0  0  0  0  0  0  0  0 28 28 30 35 32 41]
Expected output: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]

Decoding example sequence:
['', '', '', '', '', '', '', '', '', 'a', 'a', 'c', 'h', 'e', 'n']
Next character: o


In [6]:
# Model definition
embedding_dim = 32
lstm_units = 128

model = Sequential()
model.add(Input(max_length))
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(LSTM(units=lstm_units))
model.add(Dense(units=vocab_size, activation='softmax'))

model.compile(loss_function='categorical_crossentropy', optimizer='adam')
model.summary()

Sequential(gradient_clip_threshold=5.0, enable_padding=False, padding_size=32, random_state=1733490659107227600)
-------------------------------------------------
Layer 1: Input(input_shape=(15,))
Layer 2: Embedding(input_dim=54, output_dim=32)
Layer 3: LSTM(units=128, return_sequences=False, return_state=False, random_state=None, clip_value=5.0)
Layer 4: Dense(units=54)
Layer 5: Activation(Softmax)
-------------------------------------------------
Loss function: CategoricalCrossentropy
Optimizer: Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, clip_norm=None, clip_value=None)
-------------------------------------------------



In [7]:
# Early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# Model training
history = model.fit(
    X, y,
    epochs=100,
    batch_size=64,
    callbacks=[early_stopping],
    validation_split=0.1,
    metrics=['accuracy']
)


Early stopping triggered after epoch 19




In [8]:
# Generate multiple names
generated_names = []
number_of_names = 5
min_length = 5

while len(generated_names) < number_of_names:
    # Start sequence with a single padding token
    sequence_start = np.array([[char_to_index[PAD_TOKEN]]])  # shape: (1, 1)

    # Generate a sequence
    tokens = model.generate_sequence(
        sequence_start=sequence_start,
        max_length=max_length,
        stop_token=char_to_index[EOS_TOKEN],
        min_length=min_length,
        temperature=1.2
    )

    # Convert indices to characters (excluding padding and end tokens)
    name = ''.join(index_to_char[idx] for idx in tokens[0]
                   if idx not in [char_to_index[PAD_TOKEN], char_to_index[EOS_TOKEN]])
    name = name.capitalize()

    # Check if the name is long enough, unique, and contains at least one vowel
    if len(name) >= min_length and name not in generated_names and any(c in 'aeiou' for c in name.lower()):
        generated_names.append(name)

# Display results
print("\nGenerated names:")
for name in generated_names:
    print(f"{name} ({len(name)} characters)")

# Check originality
print("\nAre all names original?", all(name.lower() not in [n.lower() for n in names] for name in generated_names))

# Length statistics
lengths = [len(name) for name in generated_names]
print(f"\nAverage length: {sum(lengths)/len(lengths):.1f} characters")
print(f"Minimum length: {min(lengths)} characters")
print(f"Maximum length: {max(lengths)} characters")


Generated names:
Ganetgosaurus (13 characters)
Szhuiodondon (12 characters)
Aurus (5 characters)
Egpossraurus (12 characters)
Usium (5 characters)

Are all names original? True

Average length: 9.4 characters
Minimum length: 5 characters
Maximum length: 13 characters
