In [1]:
import tensorflow as tf
import numpy as np
import os

In [2]:
# Collect the unique, lower-cased, non-empty names from both name lists.
names = set()
for filename in ['male.txt', 'female.txt']:
    # `with` closes each file once it has been read; the bare open() used
    # before left the handle open until garbage collection.
    with open(os.path.join('../data/names', filename)) as name_file:
        for line in name_file:
            name = line.strip().lower()
            if name:
                names.add(name)


In [3]:
print len(names), 'names'
print 'longest:', max(map(len, names))
by_len = sorted(names, key=len)
print '99th percentile longest:', len(by_len[int(len(names) * 0.95)])

7583 names
longest: 54
99th percentile longest: 9


In [4]:
# Vocabulary: the 26 lower-case letters followed by two special tokens.
chars = list('abcdefghijklmnopqrstuvwxyz')
chars.extend(['<END>', '<NULL>'])
# Reverse lookup from a symbol to its position in `chars`.
indices_for_chars = dict((symbol, position) for position, symbol in enumerate(chars))

# Length of every encoded name; this count includes the <END> slot.
NAME_MAX_LEN = 10

def name_to_vec(name, maxlen=NAME_MAX_LEN):
    """Encode `name` as a fixed-length integer vector of vocabulary indices.

    Positions are pre-filled with the <NULL> index. Characters missing from
    `indices_for_chars` are also stored as <NULL>. An <END> index is written
    right after the last character, or into the final slot when the name has
    `maxlen` or more characters (overwriting the character there).
    """
    pad = indices_for_chars['<NULL>']
    encoded = np.full(maxlen, pad, dtype=int)
    for position, letter in enumerate(name[:maxlen]):
        encoded[position] = indices_for_chars.get(letter, pad)
    end_position = min(len(name), maxlen - 1)
    encoded[end_position] = indices_for_chars['<END>']
    return encoded

def vec_to_name(vec):
    """Decode a sequence of vocabulary indices back into a string.

    Decoding stops at the first <END> index. Single-character symbols are
    appended; any other multi-character symbol (i.e. <NULL>) is skipped.
    """
    letters = []
    for index in vec:
        symbol = chars[index]
        if symbol == '<END>':
            break
        if len(symbol) == 1:
            letters.append(symbol)
    return ''.join(letters)

# Round-trip sanity checks for name_to_vec / vec_to_name.
print name_to_vec('nate')
assert vec_to_name(name_to_vec('nate')) == 'nate'
# A 12-letter name comes back as 9 letters: the 10th slot holds <END>.
assert vec_to_name(name_to_vec('aaaaaaaaaaaa')) == 'aaaaaaaaa'

[13  0 19  4 26 27 27 27 27 27]


In [5]:
# Encode every name; each row is the name_to_vec encoding of one name.
name_vecs = np.array([name_to_vec(n) for n in names])
print name_vecs.shape

(7583, 10)


In [6]:
def weight_var(shape, stddev=0.1, weight_decay=0, name=None):
    """Create a tf.Variable initialised from a truncated normal.

    shape: shape of the variable.
    stddev: standard deviation of the initial values.
    weight_decay: when > 0, `weight_decay * l2_loss(variable)` is added to
        the 'losses' collection.
    name: optional variable name.

    NOTE(review): the `loss` built later in this notebook does not appear to
    read the 'losses' collection -- confirm whether the penalty is intended
    to take effect.
    """
    variable = tf.Variable(tf.truncated_normal(shape, stddev=stddev), name=name)
    if weight_decay > 0:
        penalty = tf.nn.l2_loss(variable) * weight_decay
        tf.add_to_collection('losses', penalty)
    return variable

def leaky_relu(x, leak=0.2, name="lrelu"):
    """Leaky ReLU written as a blend of `x` and `abs(x)`.

    The two coefficients sum to 1 and differ by `leak`, so positive inputs
    pass through unchanged and negative inputs are scaled by `leak`.
    """
    with tf.variable_scope(name):
        linear_coeff = 0.5 * (1 + leak)
        abs_coeff = 0.5 * (1 - leak)
        linear_part = linear_coeff * x
        abs_part = abs_coeff * abs(x)
        return linear_part + abs_part

def relu(x):
    """Activation used by the layer helpers in this notebook.

    Delegates to leaky_relu with its default arguments; the plain
    tf.nn.relu variant is kept below, commented out.
    """
    # return tf.nn.relu(x)
    return leaky_relu(x)

def create_conv(input, out_channels, patch_size=5, stride=1, batch_norm=False, dropout=False):
    """Build a 2-D convolution layer followed by the notebook's activation.

    input: tensor whose last dimension is read as the channel count
        (presumably NHWC given the strides layout -- TODO confirm).
    out_channels: number of output channels.
    patch_size: side length of the square kernel.
    stride: stride applied to the two middle dimensions.
    batch_norm: when true, the conv output goes through create_batch_norm.
    dropout: when true, the activation goes through create_dropout.
    """
    in_channels = input.get_shape()[-1].value
    w = weight_var([patch_size, patch_size, in_channels, out_channels])
    # stddev=0 is passed so the bias initialiser has zero spread
    b = weight_var([out_channels], stddev=0)
    conv = tf.nn.conv2d(input, w, strides=[1,stride,stride,1], padding='SAME')
    # NOTE(review): create_batch_norm is not defined in the visible part of
    # this notebook; batch_norm=True would fail unless it exists elsewhere.
    if batch_norm: conv = create_batch_norm(conv)
    activation = relu(conv + b)
    if dropout: activation = create_dropout(activation)
    return activation
    
def text_conv(input, out_channels, patch_size=5, stride=1, dropout=False, pool_size=1):
    """Build a 1-D convolution layer followed by the notebook's activation.

    input: tensor whose last dimension is read as the channel count.
    out_channels: number of output channels.
    patch_size: width of the 1-D kernel.
    stride: convolution stride.
    dropout: when true, the activation goes through create_dropout.
    pool_size: currently unused (max pooling is still a TODO).
    """
    channels_in = input.get_shape()[-1].value
    kernel = weight_var([patch_size, channels_in, out_channels])
    bias = weight_var([out_channels], stddev=0)
    convolved = tf.nn.conv1d(input, kernel, stride=stride, padding='SAME')
    result = relu(convolved + bias)
    # TODO: max_pooling
    return create_dropout(result) if dropout else result

def create_dropout(units, keep_prob=0.5):
    """Apply dropout to `units`.

    units: tensor to apply dropout to.
    keep_prob: probability of keeping each unit, forwarded to
        tf.nn.dropout. Defaults to 0.5.

    The previous body read a module-level name `dropout` as the keep
    probability, but no such global is defined in this notebook, so any
    call raised NameError. The probability is now an explicit, defaulted
    parameter; existing one-argument calls keep working.
    """
    return tf.nn.dropout(units, keep_prob)

def create_fc(input, out_size):
    """Build a fully connected layer followed by the notebook's activation.

    input: 2-D tensor; its last dimension is read as the input width.
    out_size: number of output units.

    Both the weight matrix and the bias are created with weight_decay=0.004.
    """
    # input_dropped = tf.nn.dropout(input, dropout_keep_prob)
    fan_in = input.get_shape()[-1].value
    weights = weight_var([fan_in, out_size], weight_decay=0.004)
    biases = weight_var([out_size], weight_decay=0.004)
    pre_activation = tf.matmul(input, weights)
    return relu(pre_activation + biases)

In [7]:
# Batch of encoded names: one row of NAME_MAX_LEN vocabulary indices per name.
name_placeholder = tf.placeholder(shape=[None, NAME_MAX_LEN], dtype=tf.int32, name='names')

In [8]:
# Width of the latent vector z produced by the encoders and read by the decoder.
Z_SIZE = 64

def encoder_lstm(names):
    """Encode a batch of index-encoded names with a two-layer LSTM.

    names: integer tensor of vocabulary indices, one row per name.
    Returns (z_mean, z_stddev), each produced by its own create_fc layer of
    width Z_SIZE over the flattened per-timestep LSTM outputs.

    NOTE(review): z_stddev comes out of create_fc, i.e. through the leaky
    ReLU activation, so nothing here constrains it to be positive.
    """
    with tf.variable_scope('encoder'):
        layer_sizes = [len(chars), 64]
        cells = [tf.nn.rnn_cell.LSTMCell(width, state_is_tuple=True) for width in layer_sizes]
        stacked = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)
        one_hot = tf.one_hot(names, len(chars), dtype=tf.float32)
        outputs, state = tf.nn.dynamic_rnn(stacked, one_hot, dtype=tf.float32)
        # Flatten (time, features) into one feature axis per name.
        outputs_flat = tf.reshape(outputs, [-1, 64 * NAME_MAX_LEN])
        mean = create_fc(outputs_flat, Z_SIZE)
        stddev = create_fc(outputs_flat, Z_SIZE)
        return mean, stddev

def encoder_conv(names):
    """Encode a batch of index-encoded names with two stacked 1-D convolutions.

    names: integer tensor of vocabulary indices, one row per name.
    Returns (z_mean, z_stddev), each a create_fc layer of width Z_SIZE on
    top of a shared 128-unit fully connected layer.
    """
    with tf.variable_scope('encoder'):
        one_hot = tf.one_hot(names, len(chars), dtype=tf.float32)
        conv1 = text_conv(one_hot, 64)
        # Feed the first conv layer into the second. Previously conv2 was
        # also built from `one_hot`, which left conv1 unused.
        conv2 = text_conv(conv1, 64)
        fc1 = create_fc(tf.reshape(conv2, [-1, NAME_MAX_LEN * 64]), 128)
        z_mean = create_fc(fc1, Z_SIZE)
        z_stddev = create_fc(fc1, Z_SIZE)
        return z_mean, z_stddev
    
# def generator(noise, name='generator'):
#     with tf.variable_scope(name, reuse=None):
#         cells = [tf.nn.rnn_cell.LSTMCell(size, state_is_tuple=True) for size in [NOISE_SIZE, 256, len(chars)]]
#         lstm = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)
#         noise_repeated_over_time = tf.tile(tf.reshape(noise, [-1, 1, NOISE_SIZE]), [1, NAME_MAX_LEN, 1])
#         outputs, state = tf.nn.dynamic_rnn(lstm, noise_repeated_over_time, dtype=tf.float32)
#         output_chars = tf.reshape(tf.argmax(tf.nn.softmax(outputs), axis=2), [-1, NAME_MAX_LEN])
#         output_chars = tf.cast(output_chars, tf.int32)
#     return output_chars

# generated_names = generator(noise)

# The LSTM encoder is the one wired into the graph; encoder_conv is not called here.
z_mean, z_stddev = encoder_lstm(name_placeholder)

In [9]:
# Open a session and initialise the variables defined so far.
session = tf.Session()
session.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()))

In [10]:
# Smoke test: run the encoder on a single name and print its z_mean row.
print session.run(z_mean, feed_dict={name_placeholder: [name_to_vec('nate')]})

[[-0.00247853  0.02004284 -0.0181652  -0.04191967  0.05001117 -0.01602871
   0.02587532  0.13481548  0.07789069 -0.02169716  0.08349291 -0.0429353
  -0.03166916 -0.01963357  0.02664084 -0.00100241  0.03409065  0.08553927
   0.07906856  0.00153807  0.31431997 -0.02886212  0.03868855  0.25834271
  -0.03920429  0.0406871   0.04773729  0.029155    0.05626151 -0.00365246
  -0.00656544  0.02019122 -0.00066316 -0.02825847  0.0101981   0.18162087
   0.05051333  0.15712184  0.11209555 -0.01296621 -0.01769583 -0.01094536
   0.06383231  0.06080098  0.18247971  0.02820658  0.0322004   0.09901164
  -0.01733317 -0.01050556 -0.02658109 -0.03360737  0.07080974  0.04814442
  -0.00804588 -0.02840882 -0.02741902  0.01266014 -0.01211043  0.04052842
   0.09770438 -0.00944409  0.17789587  0.03068392]]


In [11]:
def sample_z(z_mean, z_stddev):
    """Draw a latent sample as z_mean + noise * z_stddev.

    The noise is standard-normal with the same shape as z_stddev, so each
    graph evaluation yields a fresh sample.
    """
    noise = tf.random_normal(tf.shape(z_stddev), mean=0, stddev=1, dtype=tf.float32)
    scaled_noise = noise * z_stddev
    return z_mean + scaled_noise

# Sampled latent vectors for the current batch.
z_vals = sample_z(z_mean, z_stddev)

In [12]:
def decoder_lstm(z):
    """Decode latent vectors into per-timestep vocabulary logits.

    z: tensor reshaped to (-1, 1, Z_SIZE) and repeated NAME_MAX_LEN times
       along the time axis, so every timestep receives the same latent input.
    Returns the outputs of a three-layer LSTM whose last layer has
    len(chars) units.
    """
    z_as_sequence = tf.reshape(z, [-1, 1, Z_SIZE])
    z_repeated_over_time = tf.tile(z_as_sequence, [1, NAME_MAX_LEN, 1])
    layer_sizes = [Z_SIZE, 256, len(chars)]
    cells = [tf.nn.rnn_cell.LSTMCell(width, state_is_tuple=True) for width in layer_sizes]
    stacked = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)
    outputs, state = tf.nn.dynamic_rnn(stacked, z_repeated_over_time, dtype=tf.float32)
    return outputs

# Placeholder that lets callers feed latent vectors directly.
z_input = tf.placeholder(tf.float32, [None, Z_SIZE], name='z_input')
# Scalar switch: a value > 0 routes z_input to the decoder, otherwise z_vals is used.
use_z_input = tf.placeholder(tf.int32, shape=[], name="use_z_input_condition")
decoder_input = tf.cond(use_z_input > 0, lambda: z_input, lambda: z_vals)

decoded = decoder_lstm(decoder_input)

In [13]:
# Reconstruction term: cross-entropy between decoder logits and the input
# indices, summed over every position of every name in the batch.
# Keyword arguments are used because later TensorFlow releases reject
# positional logits/labels for this op; the keywords also work with the
# (logits, labels) signature this notebook was written against.
diff_loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=decoded, labels=name_placeholder))
# KL term, summed over latent dimensions and averaged over the batch.
# NOTE(review): the reconstruction term is a batch sum while this is a batch
# mean, and tf.log(tf.square(z_stddev)) is unbounded when z_stddev is 0 --
# confirm both are intended.
kl_divergence = tf.reduce_mean(0.5 * tf.reduce_sum(tf.square(z_mean) + tf.square(z_stddev) - tf.log(tf.square(z_stddev)) - 1, 1))
loss = diff_loss + kl_divergence

In [14]:
# Highest-scoring vocabulary index at each position of each decoded name.
decoded_vecs = tf.argmax(decoded, axis=2)

In [15]:
# The learning rate is fed at run time rather than fixed in the graph.
learn_rate = tf.placeholder(tf.float32, name='learning_rate')
optimizer = tf.train.AdamOptimizer(learn_rate)
global_step = tf.contrib.framework.get_or_create_global_step()
train_step = optimizer.minimize(loss, global_step=global_step)
# Re-run the initialisers now that more of the graph has been built.
session.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()))

In [16]:
save_path = 'models/nva2'

session = tf.Session()
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
session.run(init_op)

saver = None
if save_path:
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(save_path)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(session, ckpt.model_checkpoint_path)
        print 'Restored from checkpoint', ckpt.model_checkpoint_path
    else:
        print 'Did not restore from checkpoint'
else:
    print 'Will not save progress'

Restored from checkpoint models/nva2/model.ckpt-60600


In [17]:
train = False
while train:
    names = name_vecs[np.random.randint(name_vecs.shape[0], size=64), :]
    feed_dict = {
        name_placeholder: names,
        z_input: np.zeros((64, Z_SIZE)),
        use_z_input: 0,
        learn_rate: 0.001
    }
    _, loss_, step_ = session.run([train_step, loss, global_step], feed_dict=feed_dict)
    if step_ % 200 == 0:
        output_ = session.run(decoded_vecs, feed_dict=feed_dict)
        print "Step: {0}; loss: {1}".format(step_, loss_)
        # print names[0]
        # print output_[0]
        print " example encoding: {} -> {}".format(vec_to_name(names[0]), vec_to_name(output_[0]))
        if step_ % 600 == 0:
            saver.save(session, save_path + '/model.ckpt', global_step=step_)
            print 'Saved'

In [18]:
def reconstruct(name):
    """Encode `name`, decode it again, and return the decoded string.

    use_z_input is 0, so the decoder reads the sampled z_vals rather than
    z_input; because the sample includes random noise, repeated calls with
    the same name may return different strings.
    """
    feed_dict = {
        name_placeholder: np.array([name_to_vec(name)]),
        # Dummy value sized to the single-name batch (it was 64 rows before);
        # the placeholder must be fed even though the cond ignores it.
        z_input: np.zeros((1, Z_SIZE)),
        use_z_input: 0
        # learn_rate is no longer fed: decoded_vecs does not depend on it.
    }
    output_ = session.run(decoded_vecs, feed_dict=feed_dict)
    return vec_to_name(output_[0])

# Quick check on a single name.
for name in ['nate']:
    print name, '->', reconstruct(name)

nate -> nate


In [19]:
# reconstruct is pretty good at reconstructing American names, even long ones:
for name in ['nate', 'will', 'chen', 'atty', 'arielle', 'nathaniel', 'kimberly', 'erica', 'zoe']:
    print name, '->', reconstruct(name)

# although, notably, it's bad at reconstructing names with less-frequent letters

nate -> nate
will -> will
chen -> chen
atty -> atty
arielle -> arielle
nathaniel -> nathaniel
kimberly -> kimberly
erica -> eriaa
zoe -> noe


In [20]:
# it's decent at some English words that 'sound' like names:
for name in ['word', 'happy', 'winter', 'candle', 'cherish']:
    print name, '->', reconstruct(name)

word -> word
happy -> happy
winter -> winter
candle -> candle
cherish -> cherish


In [21]:
# it's worse at longer, more "wordy" names:
for name in ['embedding', 'automobile', 'air', 'larynx']:
    print name, '->', reconstruct(name)

embedding -> emerenine
automobile -> attoooler
air -> arr
larynx -> laryne


In [22]:
# predictably, it's terrible at things that aren't even pronounceable strings of letters:
for name in ['ufhoe', 'xyzy', 'ihwrfoecoei']:
    print name, '->', reconstruct(name)
# it even seems to try to turn some into slightly more name-like strings:

ufhoe -> ueooo
xyzy -> eyyy
ihwrfoecoei -> shereodos


In [23]:
# so reconstruction quality seems like a pretty good measure of how "name-ish" a word is
# want to give your kid a cool, ~original~ name no one has, but that sounds good?
# what good english words sound like names, but aren't?

# first, let's build a 'nameliness' score:
def nameliness(word):
    """Score how well `word` survives an encode/decode round trip.

    Returns the fraction of positions at which the reconstruction matches
    `word`, relative to len(word). Positions beyond the end of the
    reconstruction count as mismatches because zip stops at the shorter
    string. An empty word scores 0.0 instead of dividing by zero.
    """
    if not word:
        return 0.0
    r = reconstruct(word)
    matches = sum(1 for a, b in zip(word, r) if a == b)
    return matches / float(len(word))

# Try the score on a mix of names, words and gibberish.
for name in ['nate', 'july', 'fridge', 'gienigoe', 'chzsiucf', 'xyxyzzy']:
    print name, ':', nameliness(name)

nate : 1.0
july : 1.0
fridge : 0.833333333333
gienigoe : 0.625
chzsiucf : 0.375
xyxyzzy : 0.0


In [24]:
# let's grab the top 10k english words, remove the things that are names, and see which can be reconstructed best:
# source: https://github.com/first20hours/google-10000-english

top_words = list(word.strip() for word in open('../data/google-10000-english.txt'))
top_words = list(word for word in top_words if word not in names)
print len(top_words)
top_words = top_words[:1000] # this is actually kinda slow, so let's stick with the top 1k
nameliness_scores = {word: nameliness(word) for word in top_words}
print [w for w in top_words if nameliness_scores[w] == 1]

9268
['the', 'and', 'to', 'a', 'in', 'for', 'on', 'that', 'this', 'i', 'not', 'or', 'are', 'at', 'all', 'more', 'an', 'was', 'we', 'can', 'us', 'has', 'free', 'one', 'other', 'do', 'no', 'time', 'they', 'site', 'he', 'any', 'there', 'so', 'get', 'e', 'am', 'were', 's', 'these', 'than', 'find', 'date', 'had', 'list', 'name', 'just', 'state', 'day', 'n', 'world', 're', 'go', 'b', 'last', 'most', 'buy', 'them', 'her', 't', 'add', 'best', 'then', 'good', 'well', 'd', 'm', 'she', 'r', 'many', 'de', 'set', 'mail', 'full', 'games', 'p', 'part', 'center', 'must', 'store', 'made', 'line', 'did', 'those', 'car', 'area', 'want', 'o', 'file', 'both', 'care', 'end', 'him', 'per', 'north', 'posts', 'shop', 'old', 'main', 'call', 'non', 'shall', 'class', 'still', 'money', 'man', 'card', 'jobs', 'food', 'press', 'sale', 'print', 'credit', 'join', 'men', 'sales', 'note', 'gallery', 'table', 'start', 'model', 'cost', 'better', 'july', 'come', 'cart', 'san', 'standard', 'less', 'got', 'let', 'stores', 's

In [25]:
# let's build a big lookup table of all the names and their embeddings:
def make_batches(list, size=128):
    """Split a sequence into consecutive chunks of at most `size` items.

    list: any sliceable sequence supporting len() (the parameter keeps its
        original name, which shadows the builtin inside this function).
    size: maximum number of items per chunk; must be positive.

    Returns a list of slices of `list`; empty input gives an empty list.
    Raises ValueError for size <= 0, which previously looped forever.
    """
    if size <= 0:
        raise ValueError('size must be a positive integer, got {!r}'.format(size))
    # Stepping an index avoids re-slicing the remainder on every iteration.
    return [list[start:start + size] for start in range(0, len(list), size)]

# Maps each name to the z_mean row the encoder produces for it.
embeddings = {}

for batch in make_batches(list(names)):
    feed_dict = {
        name_placeholder: np.array([name_to_vec(name) for name in batch]),
        # Dummy value: use_z_input is 0, so the decoder input is not taken from z_input.
        z_input: np.zeros((len(batch), Z_SIZE)),
        use_z_input: 0
    }
    output_ = session.run(z_mean, feed_dict=feed_dict)
    for name, vec in zip(batch, output_):
        embeddings[name] = vec
    # print 'processed {}/{}'.format(len(embeddings), len(names))

In [26]:
def embed(name):
    """Return the encoder's z_mean vector for a single name."""
    encoded_batch = np.array([name_to_vec(name)])
    z_mean_batch = session.run(z_mean, feed_dict={
        name_placeholder: encoded_batch,
        z_input: np.zeros((1, Z_SIZE)),
        use_z_input: 0,
    })
    return z_mean_batch[0]

def nearest(embedding):
    """Return the key of `embeddings` whose vector is closest to `embedding`.

    Closeness is the Euclidean norm of the difference.
    """
    return min(
        embeddings,
        key=lambda candidate: np.linalg.norm(embedding - embeddings[candidate]),
    )

def unembed(embedding):
    """Decode a single latent vector into a name string.

    use_z_input is 1, so the decoder reads the fed z_input; the zero-filled
    name_placeholder only satisfies the graph's required input.
    """
    decoded_batch = session.run(decoded_vecs, feed_dict={
        name_placeholder: np.zeros((1, NAME_MAX_LEN)),
        z_input: np.array([embedding]),
        use_z_input: 1,
    })
    return vec_to_name(decoded_batch[0])

# Decoding a name's own embedding should give the name back.
assert unembed(embed('nate')) == 'nate'

# print nearest(embed('nate'))
# Nearest known name for a few names, words and misspellings.
for name in ['nate', 'yikes', 'panda', 'ixzhxzi', 'justxn']:
    print name, 'is closest to', nearest(embed(name))


nate is closest to nate
yikes is closest to giles
panda is closest to hanna
ixzhxzi is closest to desirae
justxn is closest to justin


In [27]:
# what happens if we try to interpolate names?
def blend_names(name1, name2):
    e1 = embed(name1)
    e2 = embed(name2)
    for i in range(11):
        blend = i / 10.0
        print unembed(e1 * (1 - blend) + e2 * blend)

# Interpolate between a short and a long name.
blend_names('amy', 'francisco')

amy
amy
ammy
ammy
aarey
nareey
nariee
faanies
franiiss
fransisoo
fransisoo


In [28]:
# Interpolate from a long name down to a short one.
blend_names('nathaniel', 'chen')

nathaniel
nathaniel
nathaniel
kathnnie
cathnnee
cathnee
hathney
phanne
chene
chene
chen


In [29]:
# Interpolate between a name and a longer name that starts with it.
blend_names('will', 'william')

will
will
will
wille
wille
wille
willie
willie
willie
willia
william


In [30]:
# Interpolate between a name and the same letters reversed.
blend_names('nathaniel', 'leinahtan')

nathaniel
nathaniel
nathaniel
nathaniel
nnthaniil
nnthaniie
nntaaniia
innaaniia
iinaaatin
linntttnn
leinnttnn


In [31]:
# What sits at the origin of the latent space: nearest known name, then direct decoding.
print nearest(np.zeros(Z_SIZE))
print unembed(np.zeros(Z_SIZE))

selia
seeera


In [32]:
# what if we multiply names? (scale each embedding by 2 before decoding)
for name in ['nate', 'willy', 'sam', 'polly', 'jacob']:
    print name, '* 2 =', unembed(embed(name) * 2)

nate * 2 = naty
willy * 2 = willy
sam * 2 = sam
polly * 2 = pool
jacob * 2 = jaoo


In [33]:
# what's the opposite of a name? (negate each embedding before decoding)
for name in ['nancy', 'barry', 'chance', 'rachel', 'gloria']:
    print '-' + name, '=', unembed(-embed(name))

-nancy = hhhctttsss
-barry = jsuu
-chance = ddhhdddsrs
-rachel = hhhtrrrrrr
-gloria = sselaa


In [34]:
# can we do addition and subtraction?
# Each line adds the (alberta - albert) offset to another name's embedding.
print unembed(embed('alberta') - embed('albert') + embed('robert'))
print unembed(embed('alberta') - embed('albert') + embed('justin'))
print unembed(embed('alberta') - embed('albert') + embed('joseph'))

print unembed(embed('alberta') - embed('albert') + embed('nate')) # doesn't work so well with names ending in vowels

roberta
justina
josepha
nanee


In [35]:
# let's generate some random names:
def generate():
    """Decode one latent vector drawn from a standard normal of size Z_SIZE."""
    return unembed(np.random.normal(size=Z_SIZE))
# Print 30 randomly generated names.
for _ in range(30):
    print generate()

fooa
rssirrrrrr
ssio
eettte
gimarr
dedei
hharo
keran
iicuyyyyyy
purrlo
goroory
nanta
karia
dalor
um
ahallioa
jerlrili
hotron
marleaa
issi
samsooa
snnt
spuuzzzrro
alria
fuaro
keeaa
ju
pare
calonn
asilia


In [36]:
# what if we train a GAN to mimic the latent vectors produced by real names?

In [39]:
def unembed_multi(embeddings):
    """Decode several latent vectors in one session run.

    embeddings: sequence of latent vectors, converted with np.array.
    Returns a list with one decoded string per vector.
    """
    decoded_batch = session.run(decoded_vecs, feed_dict={
        name_placeholder: np.zeros((1, NAME_MAX_LEN)),
        z_input: np.array(embeddings),
        use_z_input: 1,
    })
    return [vec_to_name(row) for row in decoded_batch]

names = open('names.txt', 'w')
for i in xrange(10000):
    count = 32
    embeddings = [np.random.normal(size=Z_SIZE) for _ in xrange(count)]
    for name in unembed_multi(embeddings):
        names.write(name + '\n')
    if i % 100 == 0: print i / 10000.0

0.0
0.01
0.02
0.03
0.04
0.05
0.06
0.07
0.08
0.09
0.1
0.11
0.12
0.13
0.14
0.15
0.16
0.17
0.18
0.19
0.2
0.21
0.22
0.23
0.24
0.25
0.26
0.27
0.28
0.29
0.3
0.31
0.32
0.33
0.34
0.35
0.36
0.37
0.38
0.39
0.4
0.41
0.42
0.43
0.44
0.45
0.46
0.47
0.48
0.49
0.5
0.51
0.52
0.53
0.54
0.55
0.56
0.57
0.58
0.59
0.6
0.61
0.62
0.63
0.64
0.65
0.66
0.67
0.68
0.69
0.7
0.71
0.72
0.73
0.74
0.75
0.76
0.77
0.78
0.79
0.8
0.81
0.82
0.83
0.84
0.85
0.86
0.87
0.88
0.89
0.9
0.91
0.92
0.93
0.94
0.95
0.96
0.97
0.98
0.99
