<a href="https://colab.research.google.com/github/oceanwaved/NameGen/blob/main/src/NameGen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Username Generator**

by Oceanwave

**How to use:** in Colab, choose Runtime -> "Run all", then scroll to the bottom of the notebook to see the generated usernames.

In [None]:
# --- Remote resources ---
# Raw GitHub locations of the bundled name list and the pre-trained weights.
model_url = 'https://raw.githubusercontent.com/oceanwaved/NameGen/main/models/model.pth'
data_url = 'https://raw.githubusercontent.com/oceanwaved/NameGen/main/data/63k_names.txt'

# --- Run switches ---
# True: fetch the name list from data_url; False: upload a file instead.
use_github_training_data = True
# True: fetch the pre-trained weights from model_url and load them into the model.
use_github_model = True
# True: run the training loop.
training = False
# True: save the model weights at the end and download them.
download_model = False

In [None]:
# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split

# Progress bar
from tqdm import tqdm

# Has array functions
import numpy as np

# For character array
import string

# To upload/download files
from google.colab import files
import requests

In [None]:
# Hyperparameters
# -- optimisation
learning_rate = 0.005
num_epochs = 5
batch_size = 32

# -- network shape
embedding_dim = 8    # width of each character embedding
hidden_size = 256    # width of the LSTM hidden state
num_layers = 1       # number of stacked LSTM layers

In [None]:
# Get training data
if use_github_training_data:
    # A timeout keeps "Run all" from hanging forever on a stalled connection.
    response = requests.get(data_url, timeout=30)
    if response.status_code != 200:
        # Fail loudly: every later cell needs `usernames`, so a failed download
        # must stop here instead of surfacing later as a NameError.
        raise RuntimeError(
            f'Failed to download the file. Status code: {response.status_code}'
        )
    raw_lines = response.text.split('\n')
else:
    # Upload data, format of one name per line
    uploaded = files.upload()
    file_name = next(iter(uploaded))
    with open(file_name, 'r') as file:
        raw_lines = file.readlines()

# Strip surrounding whitespace and drop blank lines. A blank line (for example
# the one produced by a trailing newline) would otherwise become an empty
# username, i.e. a training example that maps <SOS> straight to <EOS>.
usernames = [name for name in (line.strip() for line in raw_lines) if name]
if not usernames:
    raise ValueError('No usernames found in the training data.')

# Print 10 names
np.random.shuffle(usernames)
print("Example Names")
print(usernames[:10])

Example Names
['Elambo', 'ReDjionisuu', 'bustward', 'Avexster', 'MaFa', 'Mukuorbarius', 'Lineuzin', 'shaifman', 'Rosters', '4n4n']


In [None]:
# Prepare data
# Vocabulary: ASCII letters, then digits, then the three control tokens
chars = [*string.ascii_letters, *string.digits, '<SOS>', '<EOS>', '<PAD>']
# Longest username plus two extra positions
max_chars = max(map(len, usernames)) + 2

# Lookup tables (index -> token, and its inverse)
index_to_char = dict(enumerate(chars))
char_to_index = {char: index for index, char in index_to_char.items()}

In [None]:
# Encode function
def encode_sequence(sequence):
    """Map every token in `sequence` to its index in `char_to_index`.

    Returns a new list with one index per token, in order. A token that is
    not a key of `char_to_index` raises KeyError.
    """
    indices = []
    for token in sequence:
        indices.append(char_to_index[token])
    return indices

# Pad function
def pad_sequence(sequence, max_length):
    """Left-pad an encoded sequence with the <PAD> index up to `max_length`.

    The padding goes in front, so the real tokens end up at the tail of the
    result. If `sequence` already has `max_length` or more items, no padding
    is added and a copy of `sequence` is returned.
    """
    missing = max_length - len(sequence)
    padding = [char_to_index['<PAD>']] * missing
    return padding + sequence

In [None]:
# Prepare Dataset
class UsernameDataset(Dataset):
    """Next-character prediction examples built from a list of usernames.

    Each username is wrapped as <SOS> ... <EOS> and encoded. Every proper
    prefix of that sequence becomes one input (left-padded to `max_length`),
    and the token that follows the prefix becomes its target.
    """

    def __init__(self, usernames, max_length):
        # Parallel lists: X_data[k] is a padded prefix, y_data[k] its next token
        self.X_data = []
        self.y_data = []

        for name in usernames:
            tokens = encode_sequence(['<SOS>', *name, '<EOS>'])

            # One example per position after <SOS>: tokens[:cut] -> tokens[cut]
            for cut, target in enumerate(tokens[1:], start=1):
                self.X_data.append(pad_sequence(tokens[:cut], max_length))
                self.y_data.append(target)

    # Used for DataLoader
    def __len__(self):
        """Number of (prefix, next token) examples."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return example `index` as a pair of int64 tensors (input, target)."""
        features = torch.tensor(self.X_data[index], dtype=torch.long)
        label = torch.tensor(self.y_data[index], dtype=torch.long)
        return features, label

In [None]:
# Build the full example set, then hold out a fraction of it for evaluation
dataset = UsernameDataset(usernames, max_chars)

# Split sizes: the test share is truncated to an int, training gets the rest
test_ratio = 0.1
total_size = len(dataset)
test_size = int(total_size * test_ratio)
train_size = total_size - test_size

# Random, non-overlapping train/test subsets
train_dataset, test_dataset = random_split(dataset, lengths=[train_size, test_size])

# Both loaders use the same settings: shuffled, partial final batch dropped
loader_options = dict(batch_size=batch_size, shuffle=True, drop_last=True)
train_loader = DataLoader(train_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)

In [None]:
# Define input/output sizes
# The model reads and predicts over the same vocabulary, so both are equal
input_size = output_size = len(chars)

In [None]:
# Define model
class UsernameGenerator(nn.Module):
    """Embedding -> LSTM -> linear model that scores the next token.

    Args:
        input_size: number of distinct input tokens (embedding rows).
        hidden_size: width of the LSTM hidden state.
        output_size: number of scores produced per example.
        num_layers: number of stacked LSTM layers.
        embedding_dim: width of each token embedding.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, embedding_dim):
        super().__init__()

        # Keep the LSTM shape on the instance so init_hidden() always matches
        # this model, not whatever module-level globals happen to be set.
        # Plain attributes are not part of the state_dict, so saved weights
        # still load unchanged.
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Embedding layer
        self.embedding = nn.Embedding(input_size, embedding_dim)

        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True)

        # Fully Connected layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Score the next token for each sequence in the batch.

        Args:
            input: integer token indices laid out batch-first (batch, sequence).
            hidden: (h_0, c_0) tuple for the LSTM, e.g. from init_hidden().

        Returns:
            (scores, hidden): unnormalised scores of shape (batch, output_size),
            taken from the last time step only, and the LSTM's final state.
        """
        embedded = self.embedding(input)
        out, hidden = self.lstm(embedded, hidden)
        out = out[:, -1, :]  # keep only the last time step
        out = self.fc(out)
        return out, hidden

    # Reset memory for batch
    def init_hidden(self, batch_size):
        """Return a zeroed (h_0, c_0) pair for `batch_size` sequences.

        The tensors are created with the same dtype and device as the model's
        parameters, so they stay valid if the model is moved or cast.
        """
        reference = self.fc.weight
        shape = (self.num_layers, batch_size, self.hidden_size)
        return (reference.new_zeros(shape), reference.new_zeros(shape))

In [None]:
# Make model
model = UsernameGenerator(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    embedding_dim=embedding_dim,
)
# Loss and optimiser used by the training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Model Training
# ~1 hr with default hyperparameters and 63k name dataset
def train(model, train_loader, test_loader, num_epochs, batch_size, output_size):
    """Train `model` for `num_epochs` epochs, evaluating after each one.

    Relies on the module-level `optimizer` and `criterion`. Targets are passed
    to `criterion` as class indices, which is what nn.CrossEntropyLoss expects
    and gives the same loss as one-hot targets without building them.

    Args:
        model: network exposing `init_hidden(batch_size)` and a forward call
            `model(inputs, hidden) -> (scores, hidden)`.
        train_loader: iterable of (inputs, targets) batches used for training.
        test_loader: iterable of (inputs, targets) batches used for evaluation.
        num_epochs: number of passes over `train_loader`.
        batch_size: unused; kept so existing calls keep working. The size of
            each batch is read from the batch itself, so a smaller final
            batch is handled correctly.
        output_size: unused; kept so existing calls keep working.
    """
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_examples = 0

        for (X_batch, y_batch) in tqdm(train_loader, desc=f"epoch {epoch + 1}"):
            # Size the hidden state and the loss weighting from the real batch
            current_batch = X_batch.size(0)

            # Reset from last batch
            optimizer.zero_grad()
            hidden = model.init_hidden(current_batch)

            # Get output
            output, hidden = model(X_batch, hidden)

            # Calculate loss
            loss = criterion(output, y_batch)

            # Step
            loss.backward()
            optimizer.step()

            # Increment total loss (loss is a batch mean, so re-weight it)
            train_loss += loss.item() * current_batch
            train_examples += current_batch

        # Print total loss (nan rather than a crash if the loader was empty)
        mean_train_loss = train_loss / train_examples if train_examples else float('nan')
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {mean_train_loss}')

        # Test
        model.eval()
        val_loss = 0.0
        val_examples = 0
        with torch.no_grad():
            for (X_batch, y_batch) in tqdm(test_loader, desc="Validating"):
                current_batch = X_batch.size(0)
                hidden = model.init_hidden(current_batch)
                output, hidden = model(X_batch, hidden)
                loss = criterion(output, y_batch)
                val_loss += loss.item() * current_batch
                val_examples += current_batch

        if epoch + 1 == num_epochs:
            print()

        mean_val_loss = val_loss / val_examples if val_examples else float('nan')
        print(f'Epoch {epoch+1}/{num_epochs}, Test Loss: {mean_val_loss}')

In [None]:
# Upload model (if applicable, skip if no model)
if use_github_model:
    # A timeout keeps "Run all" from hanging forever on a stalled connection.
    response = requests.get(model_url, timeout=60)
    if response.status_code == 200:
        # Save the model file locally in Colab
        with open('model.pth', 'wb') as file:
            file.write(response.content)
        # NOTE(security): torch.load unpickles the downloaded file.
        # weights_only=True restricts unpickling to tensors and plain
        # containers, which is all a state_dict needs.
        # map_location='cpu' lets weights that were saved from a GPU load on a
        # CPU-only runtime; load_state_dict then copies them into the model.
        state_dict = torch.load('model.pth', map_location='cpu', weights_only=True)
        model.load_state_dict(state_dict)
        print('Model downloaded and saved successfully.')
    else:
        # Best effort: the notebook can still train a model from scratch.
        print('Failed to download the model. Status code:', response.status_code)

Model downloaded and saved successfully.


In [None]:
# Train Model
# Skipped unless the `training` switch is on
if training:
    train(
        model=model,
        train_loader=train_loader,
        test_loader=test_loader,
        num_epochs=num_epochs,
        batch_size=batch_size,
        output_size=output_size,
    )

In [None]:
# Username Generator
def generate_username(model, seed="", minimum_length=3):
    """Sample one username from `model`, one character at a time.

    Relies on the module-level `max_chars`, `char_to_index`, `index_to_char`,
    `encode_sequence` and `pad_sequence`.

    Args:
        model: network exposing `init_hidden(batch_size)` and a forward call
            `model(tokens, hidden) -> (scores, hidden)`.
        seed: characters the username starts with; every character must be a
            key of `char_to_index`.
        minimum_length: <EOS> cannot be sampled while the name has fewer than
            this many characters.

    Returns:
        The generated name without <SOS>/<EOS>. Generation stops at <EOS> or
        once the name has `max_chars - 2` characters; a seed that is already
        that long is returned unchanged.
    """
    # Remember the current mode so it can be restored afterwards
    was_training = model.training
    model.eval()

    # <SOS> at start of the sequence
    sequence = ['<SOS>'] + list(seed)

    try:
        # No gradients needed for sampling
        with torch.no_grad():
            # While username (minus <SOS>) is less than max chars - 2
            while len(sequence) - 1 < max_chars - 2:
                # Convert type
                encoded_sequence = encode_sequence(sequence)
                padded_sequence = pad_sequence(encoded_sequence, max_chars)
                X_tensor = torch.tensor(padded_sequence, dtype=torch.long).unsqueeze(0)

                # The whole padded prefix is fed again on every step, so the
                # LSTM must start from a fresh zero state each time. Carrying
                # the previous step's state over would make the model see the
                # prefix twice, which never happens during training.
                hidden = model.init_hidden(1)

                # Get distribution
                output, _ = model(X_tensor, hidden)
                probabilities = F.softmax(output.squeeze(), dim=-1)

                # Exclude SOS and PAD, and EOS while below minimum_length
                indices_to_zero = [char_to_index['<SOS>'], char_to_index['<PAD>']]
                if len(sequence) - 1 < minimum_length:
                    indices_to_zero.append(char_to_index['<EOS>'])
                probabilities[indices_to_zero] = 0.0

                # Renormalize (fall back to uniform over the allowed tokens
                # if all remaining probability mass was zeroed out)
                if probabilities.sum() == 0:
                    valid_indices = [i for i in range(len(probabilities)) if i not in indices_to_zero]
                    probabilities[valid_indices] = 1 / len(valid_indices)
                probabilities /= probabilities.sum()

                # Find next character
                next_char_index = torch.multinomial(probabilities, 1).item()
                next_char = index_to_char[next_char_index]

                # Break for EOS
                if next_char == '<EOS>':
                    break

                # Add character to sequence
                sequence.append(next_char)
    finally:
        # Restore the mode the model was in on entry, even if sampling failed
        model.train(was_training)

    # Exclude <SOS>
    return ''.join(sequence[1:])

In [None]:
# Make 50 usernames
# To force a prefix, call generate_username(model, seed="Seed") instead
username_list = [generate_username(model) for _ in range(50)]

# Print them, one per line
print(*username_list, sep='\n')

Nissegg
rowu01
Lolwoy8
Natatwock
LancyWaia
Bigbigpander1
PeMageblock
myry819
Lilitai
TheFowShaman
mrnrora
BlackBr1n
EJCIIIs
ElLordGuerren
Mejlvangth
FoxyBox2234
voloke
NurpllalieI
DarkSouls
Aquallemight
Loqnized
Archerer
WiWTRS1
CreatCittt
Hellen
MoRou
KebuKu
9gsylen
Illonrim133
SoulTV
KillerTHU
tptdug
WispbalfRor3
Laydos
Bugla
sharpua
Copet
Soulavini
Dytygned
ni6942032
IKbones
ESEROPBAA
CakavElo2
arucas
xR1bIo
Pedaitix
duskythpolator
kud0nwarrior
Goomph
Teomorgab


In [None]:
# Save model for later
# Only when the `download_model` switch is on: write the weights to disk,
# then pass the file to files.download
if download_model:
    weights_file = 'model.pth'
    torch.save(model.state_dict(), weights_file)
    files.download(weights_file)