Name AI
=======

Name AI is an machine learning application that can come up with brandable domain names.

How it works
============

For more understanding how it works, please refer to my blogpost: [Let AI come up with your next domain name!](https://codebuffet.co/2017/03/31/let-ai-come-up-with-your-next-domain-name/)

---

Copyright (C) 2017 Peter Willemsen

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [1]:
import os
os.environ['KERAS_BACKEND'] = 'theano'
from keras.models import Sequential
from keras.layers import Dense, Embedding, Activation, Dropout
from keras.layers import GRU
from keras.regularizers import l2, activity_l2
from keras.optimizers import RMSprop
from IPython.display import clear_output
import numpy as np
import math
from datetime import datetime
import time
import random
import sys
import string
import h5py
import csv

Using Theano backend.


In [12]:
seed = 80085
np.random.seed(seed)  # for reproducibility
random.seed(seed)
weights_path = './model.hdf5'

csv.register_dialect('eugene', delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)

Create character indices, so that each character is linked to a number.

In [5]:
mask = '_'
chars = None
maxlen = 3
domains = []
next_chars = []
maxlen2 = 0

with open('../syntethic_data/data.csv', 'r') as f:
    reader = csv.reader(f, 'eugene')
    text = ""
    for row in reader:
        to_append = row[0] + ":" + row[1]
        text += to_append
        maxlen2 = max(maxlen2, len(to_append))
    chars = sorted(list(set(text + mask)))
    
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
print(('total chars: %d, maxlen2: %d' % (len(chars), maxlen2)))

total chars: 48, maxlen2: 253


In [6]:
def pad_name(name, max_length, skip = 0):
    padded_name = mask * (max_length - len(name) - skip)
    padded_name += name
    
    return padded_name

We count the longest domain name in the corpus. This length will be used to scale the rest of the data, so that no domain will be cut off. The rest will be left-padded with '_'.

Put the data in a time-series window of 3 steps. A window of 3 characters means that the network will keep in mind the past 3 characters, while predicting the next single character.

In [7]:
step = 1
batch_size = 512
with open('../syntethic_data/data.csv', 'r') as f:
    padded_names = ""
    reader = csv.reader(f, 'eugene')
    for row in reader:
        line = row[0] + ":" + row[1]
        padded_names += pad_name(line, maxlen2)

for i in range(0, len(padded_names) - maxlen, step):
    domains.append(padded_names[i: i + maxlen])
    next_chars.append(padded_names[i + maxlen])

for i in range(0, maxlen2*2):
    print("%s -> %s" % (domains[i], next_chars[i]))

print('nb sequences:', len(domains), len(next_chars))
print("batch size: %d maxlen2: %d" % (batch_size, maxlen2))

___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
___ -> _
_

Vectorization - convert the time-series window from a list of characters to a list of numbers (the character embeddings)

In [8]:
X = np.zeros((len(domains), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(domains), len(chars)), dtype=np.bool)
for i, domain in enumerate(domains):
    for t, char in enumerate(domain):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Our model consists of a single GRU layer with 128 units (which is quite small). My goal was that Name AI would learn the linguistics of what makes a name brandable, in which case such a small model is a perfect fit. 

This model has a small dropout to make sure that we don't jolt our loss down to near-zero.

In [9]:
model = Sequential()
model.add(GRU(128, dropout_W=0.015, dropout_U=0.015, input_shape=(maxlen, len(chars)), return_sequences=False, stateful=False))

#for a hidden layer, uncomment this one
#model.add(GRU(128, dropout_W=0.015, dropout_U=0.015, return_sequences=False, stateful=False))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [10]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [13]:
if os.path.exists(weights_path):
    print("loading existing model..")
    model.load_weights(weights_path)

In [14]:
def generate_alphabet_names():
    seeds = string.ascii_lowercase
    generate_names_amount = len(seeds)
    diversity = random.uniform(0.05, 0.5)

    print("Name AI by Peter Willemsen <peter@codebuffet.co>\nCreating %d names with diversity %f" % (generate_names_amount, diversity))
    for i in range(0, generate_names_amount):
        seed = pad_name(seeds[i], maxlen)
        sentence = seed
        generated = seed
        domains = generated

        for i in range(maxlen2 * 1):
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            domains += next_char
        print(domains.replace("_", ""))

Train the model, output generated text after each iteration. We make sure we don't shuffle our data, otherwise the order of the 3-step window will be broken.

In [17]:
iteration = 0
while True:
    print('Iteration', iteration)
    model.fit(X, y, batch_size=batch_size, shuffle=False, nb_epoch=1)
    model.save_weights(weights_path, overwrite=True)
    clear_output()    
    generate_alphabet_names()      
        
    iteration += 1

Name AI by Peter Willemsen <peter@codebuffet.co>
Creating 26 names with diversity 0.275346


INFO (theano.gof.compilelock): Refreshing lock /home/peter/.theano/compiledir_Linux-4.8--generic-x86_64-with-Ubuntu-16.04-xenial-x86_64-3.5.2-64/lock_dir/lock


an


KeyError: 'b'

In [None]:
generate_alphabet_names()