In [1]:
import tensorflow as tf
import numpy as np
import os

In [2]:
# Collect every distinct, lower-cased, non-blank line from the two name lists.
names = set()
for filename in ['male.txt', 'female.txt']:
    # Use a context manager so each file handle is closed as soon as it is read.
    with open(os.path.join('../data/names', filename)) as name_file:
        for line in name_file:
            cleaned = line.strip().lower()
            if cleaned:
                names.add(cleaned)


In [3]:
# Summary of the name list: how many names, the longest one, and the length
# found at a high percentile of the length-sorted list.
# The printed label and the index are derived from the same constant so they
# cannot disagree (the label used to say "99th" while the index used 0.95).
LENGTH_PERCENTILE = 95

print('{} names'.format(len(names)))
print('longest: {}'.format(max(map(len, names))))
by_len = sorted(names, key=len)
percentile_index = len(names) * LENGTH_PERCENTILE // 100
print('{}th percentile longest: {}'.format(LENGTH_PERCENTILE, len(by_len[percentile_index])))

7583 names
longest: 54
99th percentile longest: 9


In [4]:
# Vocabulary: the 26 lowercase letters followed by two special tokens.
chars = list('abcdefghijklmnopqrstuvwxyz')
chars += ['<END>', '<NULL>']
# Reverse lookup from a token to its position in `chars`.
indices_for_chars = dict((token, position) for position, token in enumerate(chars))

NAME_MAX_LEN = 10 # include the <END> char

def name_to_vec(name, maxlen=NAME_MAX_LEN):
    """Encode `name` as a length-`maxlen` integer vector of indices into `chars`.

    Characters missing from the vocabulary are encoded as <NULL>. An <END>
    marker is written right after the name, or into the last slot when the
    name has `maxlen - 1` characters or more (overwriting whatever was there).
    Unused trailing slots hold <NULL>.
    """
    end_idx = indices_for_chars['<END>']
    null_idx = indices_for_chars['<NULL>']
    # Start fully padded, then overwrite the leading positions.
    encoded = np.full(maxlen, null_idx, dtype=int)
    for position, letter in enumerate(name[:maxlen]):
        encoded[position] = indices_for_chars.get(letter, null_idx)
    encoded[min(len(name), maxlen - 1)] = end_idx
    return encoded

def vec_to_name(vec):
    """Decode a vector of indices into `chars` back into a string.

    Decoding stops at the first <END> token. Any other multi-character token
    (i.e. <NULL>) is skipped.
    """
    letters = []
    for index in vec:
        token = chars[index]
        if token == '<END>':
            break
        if len(token) == 1:
            letters.append(token)
    return ''.join(letters)

# Show one encoding and check the round trip.
nate_vec = name_to_vec('nate')
print(nate_vec)
assert vec_to_name(nate_vec) == 'nate'
# A 12-letter name is cut to 9 letters: the last slot is taken by <END>.
assert vec_to_name(name_to_vec('aaaaaaaaaaaa')) == 'aaaaaaaaa'

[13  0 19  4 26 27 27 27 27 27]


In [5]:
# Encode every name; the result has one row per name and NAME_MAX_LEN columns.
name_vecs = np.array(list(map(name_to_vec, names)))
print(name_vecs.shape)

(7583, 10)


In [6]:
def weight_var(shape, stddev=0.1, weight_decay=0, name=None):
    """Create a variable initialised from a truncated normal distribution.

    When `weight_decay` is positive, `l2_loss(variable) * weight_decay` is
    added to the 'losses' graph collection.
    """
    variable = tf.Variable(tf.truncated_normal(shape, stddev=stddev), name=name)
    if not weight_decay > 0:
        return variable
    penalty = tf.nn.l2_loss(variable) * weight_decay
    tf.add_to_collection('losses', penalty)
    return variable

def leaky_relu(x, leak=0.2, name="lrelu"):
    """Leaky ReLU written as a blend of `x` and `abs(x)`.

    With a = 0.5 * (1 + leak) and b = 0.5 * (1 - leak), a*x + b*|x| equals
    x for x >= 0 and leak * x for x < 0.
    """
    with tf.variable_scope(name):
        linear_coeff = 0.5 * (1 + leak)
        abs_coeff = 0.5 * (1 - leak)
        blended = linear_coeff * x + abs_coeff * abs(x)
        return blended

def relu(x):
    """Activation shared by the layer helpers below; delegates to leaky_relu with its defaults."""
    # Swap in `tf.nn.relu(x)` here to use a plain ReLU instead.
    activated = leaky_relu(x)
    return activated

def create_conv(input, out_channels, patch_size=5, stride=1, batch_norm=False, dropout=False):
    """Build a 2-D convolution layer: conv2d ('SAME' padding) -> optional batch
    norm -> bias add -> activation -> optional dropout.

    The input channel count is read from the last dimension of `input`.
    """
    in_channels = input.get_shape()[-1].value
    kernel = weight_var([patch_size, patch_size, in_channels, out_channels])
    bias = weight_var([out_channels], stddev=0)
    output = tf.nn.conv2d(input, kernel, strides=[1, stride, stride, 1], padding='SAME')
    if batch_norm:
        # NOTE(review): create_batch_norm is not defined in the visible part of
        # this notebook -- confirm where it comes from before enabling this flag.
        output = create_batch_norm(output)
    output = relu(output + bias)
    if dropout:
        output = create_dropout(output)
    return output
    
def text_conv(input, out_channels, patch_size=5, stride=1, dropout=False, pool_size=1):
    """Build a 1-D convolution layer: conv1d ('SAME' padding) -> bias add ->
    activation -> optional dropout.

    The input channel count is read from the last dimension of `input`.
    `pool_size` is accepted but not used yet (pooling is still a TODO).
    """
    in_channels = input.get_shape()[-1].value
    kernel = weight_var([patch_size, in_channels, out_channels])
    bias = weight_var([out_channels], stddev=0)
    convolved = tf.nn.conv1d(input, kernel, stride=stride, padding='SAME')
    output = relu(convolved + bias)
    # TODO: max_pooling
    if dropout:
        output = create_dropout(output)
    return output

def create_dropout(units, keep_prob=0.5):
    """Apply dropout to `units`.

    The previous body read a module-level name `dropout` that is never defined
    in this notebook, so any call raised NameError. The keep probability is
    now an explicit, optional argument.

    Args:
        units: tensor to apply dropout to.
        keep_prob: probability of keeping each element (float or scalar
            tensor), passed straight to `tf.nn.dropout`.

    Returns:
        The tensor returned by `tf.nn.dropout`.
    """
    return tf.nn.dropout(units, keep_prob)

def create_fc(input, out_size):
    """Build a fully connected layer: activation(input . W + b).

    The input width is read from the last dimension of `input`. Both the
    weight matrix and the bias are created with weight_decay=0.004.
    """
    in_size = input.get_shape()[-1].value
    weights = weight_var([in_size, out_size], weight_decay=0.004)
    bias = weight_var([out_size], weight_decay=0.004)
    pre_activation = tf.matmul(input, weights) + bias
    return relu(pre_activation)

In [7]:
# Batch of encoded names: one row of NAME_MAX_LEN character indices per name.
name_placeholder = tf.placeholder(tf.int32, shape=[None, NAME_MAX_LEN], name='names')

In [8]:
# Width of the latent vector z (size of z_mean / z_stddev and of the z_input placeholder).
Z_SIZE = 64

def encoder_lstm(names):
    """LSTM encoder: map a batch of encoded names to (z_mean, z_stddev).

    The names are one-hot encoded, run through a two-layer LSTM (layer sizes
    len(chars) and 64), and the per-step outputs are flattened and fed to two
    separate fully connected layers of width Z_SIZE.
    """
    vocab_size = len(chars)
    with tf.variable_scope('encoder'):
        layers = []
        for units in [vocab_size, 64]:
            layers.append(tf.nn.rnn_cell.LSTMCell(units, state_is_tuple=True))
        stacked = tf.nn.rnn_cell.MultiRNNCell(layers, state_is_tuple=True)
        inputs = tf.one_hot(names, vocab_size, dtype=tf.float32)
        outputs, _ = tf.nn.dynamic_rnn(stacked, inputs, dtype=tf.float32)
        # 64 is the output size of the last LSTM layer above.
        flattened = tf.reshape(outputs, [-1, 64 * NAME_MAX_LEN])
        z_mean = create_fc(flattened, Z_SIZE)
        z_stddev = create_fc(flattened, Z_SIZE)
        return z_mean, z_stddev

def encoder_conv(names):
    """Convolutional encoder: map a batch of encoded names to (z_mean, z_stddev).

    The names are one-hot encoded and passed through two stacked 1-D
    convolutions (64 channels each), then a 128-unit fully connected layer,
    and finally two separate fully connected layers of width Z_SIZE.

    Args:
        names: integer tensor of character indices, NAME_MAX_LEN per row.

    Returns:
        Tuple (z_mean, z_stddev) of tensors with last dimension Z_SIZE.
    """
    with tf.variable_scope('encoder'):
        one_hot = tf.one_hot(names, len(chars), dtype=tf.float32)
        conv1 = text_conv(one_hot, 64)
        # Stack the second convolution on the first one. It used to read
        # `one_hot` again, which left conv1 unused.
        conv2 = text_conv(conv1, 64)
        fc1 = create_fc(tf.reshape(conv2, [-1, NAME_MAX_LEN * 64]), 128)
        z_mean = create_fc(fc1, Z_SIZE)
        z_stddev = create_fc(fc1, Z_SIZE)
        return z_mean, z_stddev
    
# def generator(noise, name='generator'):
#     with tf.variable_scope(name, reuse=None):
#         cells = [tf.nn.rnn_cell.LSTMCell(size, state_is_tuple=True) for size in [NOISE_SIZE, 256, len(chars)]]
#         lstm = tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)
#         noise_repeated_over_time = tf.tile(tf.reshape(noise, [-1, 1, NOISE_SIZE]), [1, NAME_MAX_LEN, 1])
#         outputs, state = tf.nn.dynamic_rnn(lstm, noise_repeated_over_time, dtype=tf.float32)
#         output_chars = tf.reshape(tf.argmax(tf.nn.softmax(outputs), axis=2), [-1, NAME_MAX_LEN])
#         output_chars = tf.cast(output_chars, tf.int32)
#     return output_chars

# generated_names = generator(noise)

# Build the encoder graph on the name placeholder (the LSTM variant is the one in use).
z_mean, z_stddev = encoder_lstm(name_placeholder)

In [9]:
# Open a session and initialise every variable created so far.
session = tf.Session()
session.run(
    tf.group(
        tf.global_variables_initializer(),
        tf.local_variables_initializer(),
    )
)

In [10]:
# Smoke test: encode a single name with the (still untrained) encoder.
nate_batch = [name_to_vec('nate')]
print(session.run(z_mean, feed_dict={name_placeholder: nate_batch}))

[[-0.01720497  0.06152692  0.01489044 -0.00332805  0.11789916 -0.0294213
  -0.00113157 -0.01073818  0.0393256   0.07124054  0.08411945 -0.02391899
  -0.00480187 -0.00608682 -0.00331387 -0.01766554 -0.00191268  0.09316398
   0.00934732 -0.0098442  -0.01982485  0.16940635  0.02345115 -0.01490644
  -0.02381536  0.18268323 -0.01417166  0.13008356 -0.01922372  0.14771619
  -0.03145097 -0.03391168  0.00539082 -0.01963281  0.00139857 -0.03431068
  -0.00906854  0.05548166  0.00848088 -0.01805742  0.03736436  0.00411171
   0.09423582  0.08919111 -0.00632786  0.16512939 -0.03048882  0.08582069
  -0.01247227  0.15202031  0.09132294  0.1232969  -0.02480896  0.09067419
   0.11355287  0.09660275  0.05903743 -0.03926057 -0.02319293 -0.03711216
   0.0961521   0.09397506  0.12297711 -0.0115314 ]]


In [11]:
def sample_z(z_mean, z_stddev):
    """Draw z = z_mean + eps * z_stddev, with eps sampled from a standard
    normal of the same shape as `z_stddev`."""
    noise = tf.random_normal(tf.shape(z_stddev), mean=0, stddev=1, dtype=tf.float32)
    scaled_noise = noise * z_stddev
    return z_mean + scaled_noise

# Latent sample drawn from the encoder's (z_mean, z_stddev) outputs.
z_vals = sample_z(z_mean, z_stddev)

In [12]:
def decoder_lstm(z):
    """LSTM decoder: turn latent vectors into per-position character scores.

    `z` is repeated NAME_MAX_LEN times along a new time axis and run through
    a three-layer LSTM (sizes Z_SIZE, 256, len(chars)). The per-step outputs
    of the last layer are returned, one score per vocabulary entry.
    """
    as_single_step = tf.reshape(z, [-1, 1, Z_SIZE])
    z_repeated_over_time = tf.tile(as_single_step, [1, NAME_MAX_LEN, 1])
    layers = []
    for units in [Z_SIZE, 256, len(chars)]:
        layers.append(tf.nn.rnn_cell.LSTMCell(units, state_is_tuple=True))
    stacked = tf.nn.rnn_cell.MultiRNNCell(layers, state_is_tuple=True)
    outputs, _ = tf.nn.dynamic_rnn(stacked, z_repeated_over_time, dtype=tf.float32)
    return outputs

# Externally supplied latent vectors, used instead of the encoder's sample
# when `use_z_input` is fed a value greater than 0.
z_input = tf.placeholder(tf.float32, shape=[None, Z_SIZE], name='z_input')
use_z_input = tf.placeholder(tf.int32, shape=[], name="use_z_input_condition")
decoder_input = tf.cond(
    use_z_input > 0,
    lambda: z_input,
    lambda: z_vals)

# Per-position character scores for whichever latent vectors were selected.
decoded = decoder_lstm(decoder_input)

In [13]:
# Reconstruction term: cross-entropy between the decoder's per-position scores
# and the input character indices, summed over every position in the batch.
# Arguments are passed by keyword: newer TensorFlow releases reject the
# positional (logits, labels) form, and keywords work on older ones too.
diff_loss = tf.reduce_sum(
    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=decoded, labels=name_placeholder))

# KL divergence between N(z_mean, z_stddev^2) and N(0, 1), summed over the
# latent dimensions and averaged over the batch. z_stddev comes out of a leaky
# ReLU and can be exactly 0, so a small epsilon keeps the log finite.
LOG_EPSILON = 1e-8
z_variance = tf.square(z_stddev)
kl_divergence = tf.reduce_mean(
    0.5 * tf.reduce_sum(tf.square(z_mean) + z_variance - tf.log(z_variance + LOG_EPSILON) - 1, 1))

# NOTE(review): diff_loss is a sum over the whole batch while kl_divergence is
# a batch mean, so the KL term's relative weight shrinks as batches grow; also,
# the L2 terms weight_var puts in the 'losses' collection are not added here.
# Both are left as-is to keep the training objective unchanged -- confirm intent.
loss = diff_loss + kl_divergence

In [14]:
# Highest-scoring vocabulary index at each position (argmax over the last axis of `decoded`).
decoded_vecs = tf.argmax(decoded, axis=2)

In [15]:
# The learning rate is fed at run time so it can change between steps.
learn_rate = tf.placeholder(tf.float32, name='learning_rate')
optimizer = tf.train.AdamOptimizer(learning_rate=learn_rate)
global_step = tf.contrib.framework.get_or_create_global_step()
train_step = optimizer.minimize(loss, global_step=global_step)
# Initialise again: the optimizer and global step added new variables.
session.run(
    tf.group(
        tf.global_variables_initializer(),
        tf.local_variables_initializer(),
    )
)

In [16]:
# Directory for checkpoints; leave as None to train without saving.
save_path = None

# Re-initialise every variable in the session opened earlier in the notebook.
# This cell used to open a second tf.Session() here, which dropped the first
# one without closing it.
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
session.run(init_op)

saver = None
if save_path:
    if not os.path.exists(save_path):
        # makedirs also creates missing parent directories.
        os.makedirs(save_path)
    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(save_path)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(session, ckpt.model_checkpoint_path)
        print('Restored from checkpoint {}'.format(ckpt.model_checkpoint_path))
    else:
        print('Did not restore from checkpoint')
else:
    print('Will not save progress')

Will not save progress


In [None]:
def reconstruct(name):
    """Encode `name`, sample a latent vector, decode it, and return the string.

    `use_z_input` is fed 0, so the decoder consumes the sampled `z_vals`
    rather than `z_input`; the result can therefore vary between calls.

    Args:
        name: the string to push through the autoencoder.

    Returns:
        The decoded string for the single-row batch.
    """
    feed_dict = {
        name_placeholder: np.array([name_to_vec(name)]),
        # Dummy value, sized to the one-row batch like in the other helpers
        # (it used to be 64 rows; the unused learn_rate feed was dropped too).
        z_input: np.zeros((1, Z_SIZE)),
        use_z_input: 0
    }
    output_ = session.run(decoded_vecs, feed_dict=feed_dict)
    return vec_to_name(output_[0])

# Reconstruction before any training has happened.
for name in ['nate']:
    print('{} -> {}'.format(name, reconstruct(name)))

nate -> yyyyymmmmm


In [None]:
# Training loop. It runs until the kernel is interrupted (or `train` is set to
# False); every REPORT_EVERY steps it prints the loss and a few sample
# reconstructions, and saves a checkpoint when a saver is configured.
train = True

test_names = ['nate', 'will', 'joe', 'justin']

BATCH_SIZE = 64
REPORT_EVERY = 600

while train:
    # Sample a random batch (with replacement). The batch gets its own name:
    # it used to be assigned to `names`, clobbering the global set of names
    # that later cells still rely on.
    batch_vecs = name_vecs[np.random.randint(name_vecs.shape[0], size=BATCH_SIZE), :]
    feed_dict = {
        name_placeholder: batch_vecs,
        z_input: np.zeros((BATCH_SIZE, Z_SIZE)),
        use_z_input: 0,
        learn_rate: 0.001
    }
    _, loss_, step_ = session.run([train_step, loss, global_step], feed_dict=feed_dict)
    if step_ % REPORT_EVERY == 0:
        output_ = session.run(decoded_vecs, feed_dict=feed_dict)
        print("Step: {0}; loss: {1}".format(step_, loss_))
        for n in test_names:
            print('{} -> {}'.format(n, reconstruct(n)))
        print(" example encoding: {} -> {}".format(vec_to_name(batch_vecs[0]), vec_to_name(output_[0])))
        # Checkpoint on the same schedule as reporting.
        if saver:
            saver.save(session, os.path.join(save_path, 'model.ckpt'), global_step=step_)
            print('Saved')

Step: 600; loss: 1566.54138184
nate -> aaaa
will -> aaaa
joe -> aaa
justin -> aaaaaa
 example encoding: bell -> aaaa
Step: 1200; loss: 1542.34008789
nate -> aaaa
will -> aaaa
joe -> aaa
justin -> aaaaaa
 example encoding: kingston -> aaaaaeeea
Step: 1800; loss: 1530.46826172
nate -> aaaa
will -> aaaa
joe -> aaa
justin -> aaaaaaa
 example encoding: erasmus -> aaaaaee
Step: 2400; loss: 1498.74853516
nate -> aaaa
will -> aaaa
joe -> aaa
justin -> aaaaaa
 example encoding: nona -> aaaa
Step: 3000; loss: 1488.49914551
nate -> aaaa
will -> aaaa
joe -> aaa
justin -> aaaaaa
 example encoding: les -> aaa
Step: 3600; loss: 1483.14575195
nate -> aaaa
will -> aaaa
joe -> aaa
justin -> aaaaaaa
 example encoding: berrie -> caaaaa
Step: 4200; loss: 1403.80224609
nate -> aaaa
will -> aaaa
joe -> aaa
justin -> aaaaaa
 example encoding: valida -> aaaaaa
Step: 4800; loss: 1425.92895508
nate -> baea
will -> aaaa
joe -> jaa
justin -> laeeeeh
 example encoding: augusto -> baaeeea
Step: 5400; loss: 1374.9265

In [None]:
# reconstruct is pretty good at reconstructing american names, even long ones:
sample_names = ['nate', 'will', 'chen', 'atty', 'arielle', 'nathaniel', 'kimberly', 'erica', 'zoe']
for name in sample_names:
    print('{} -> {}'.format(name, reconstruct(name)))

# although notably, it's bad at reconstructing names with less-frequent letters

In [None]:
# it's decent at some english words that 'sound' like names:
name_like_words = ['word', 'happy', 'winter', 'candle', 'cherish']
for name in name_like_words:
    print('{} -> {}'.format(name, reconstruct(name)))

In [None]:
# it's worse at longer, more "wordy" names:
wordy_inputs = ['embedding', 'automobile', 'air', 'larynx']
for name in wordy_inputs:
    print('{} -> {}'.format(name, reconstruct(name)))

In [None]:
# predictably, it's terrible at things that aren't even pronounceable strings of letters:
gibberish_inputs = ['ufhoe', 'xyzy', 'ihwrfoecoei']
for name in gibberish_inputs:
    print('{} -> {}'.format(name, reconstruct(name)))
# it even seems to try to turn some into slightly more name-like strings:

In [None]:
# so reconstruction quality seems like a pretty good measure of how "name-ish" a word is
# want to give your kid a cool, ~original~ name no one has, but that sounds good?
# what good english words sound like names, but aren't?

# first, let's build a 'nameliness' score:
def nameliness(word):
    """Score how well the autoencoder reproduces `word`, between 0.0 and 1.0.

    The score is the number of positions where `word` and its reconstruction
    hold the same character, divided by len(word). Positions beyond the end
    of the shorter string count as mismatches.

    Args:
        word: the string to score.

    Returns:
        A float in [0.0, 1.0]; 0.0 for an empty word.
    """
    # An empty word has nothing to match and would divide by zero below.
    if not word:
        return 0.0
    reconstruction = reconstruct(word)
    matches = sum(1 for original, decoded_char in zip(word, reconstruction)
                  if original == decoded_char)
    return matches / float(len(word))

scored_examples = ['nate', 'july', 'fridge', 'gienigoe', 'chzsiucf', 'xyxyzzy']
for name in scored_examples:
    print('{} : {}'.format(name, nameliness(name)))

In [None]:
# let's grab the top 10k english words, remove the things that are names, and see which can be reconstructed best:
# source: https://github.com/first20hours/google-10000-english

# Read the word list with a context manager so the file is closed, and drop
# blank lines (an empty word cannot be scored by nameliness).
with open('../data/google-10000-english.txt') as word_file:
    top_words = [line.strip() for line in word_file if line.strip()]
# Keep only the words that are not already known names.
top_words = [word for word in top_words if word not in names]
print(len(top_words))
top_words = top_words[:1000] # this is actually kinda slow, so let's stick with the top 1k
nameliness_scores = {word: nameliness(word) for word in top_words}
# Words that were reconstructed perfectly.
print([w for w in top_words if nameliness_scores[w] == 1])

In [None]:
# let's build a big lookup table of all the names and their embeddings:
def make_batches(list, size=128):
    """Split a sequence into consecutive slices of at most `size` items.

    Args:
        list: the sequence to split; it must support len() and slicing. The
            parameter keeps its original name (which shadows the builtin
            inside this function) so keyword callers keep working.
        size: maximum number of items per batch; must be positive.

    Returns:
        A Python list of slices of the input, in order. The last batch may be
        shorter; an empty input yields an empty result.

    Raises:
        ValueError: if `size` is not positive (this used to loop forever).
    """
    if size <= 0:
        raise ValueError('size must be positive, got {!r}'.format(size))
    # Step through the start offsets instead of repeatedly re-slicing the
    # remainder, which copied the tail of the sequence on every iteration.
    return [list[start:start + size] for start in range(0, len(list), size)]

# Lookup table from each known name to its z_mean vector, filled batch by batch.
embeddings = {}

for batch in make_batches(list(names)):
    encoded_batch = np.array([name_to_vec(name) for name in batch])
    feed_dict = {
        name_placeholder: encoded_batch,
        z_input: np.zeros((len(batch), Z_SIZE)),
        use_z_input: 0
    }
    output_ = session.run(z_mean, feed_dict=feed_dict)
    embeddings.update(zip(batch, output_))
    # print 'processed {}/{}'.format(len(embeddings), len(names))

In [None]:
def embed(name):
    """Return the encoder's z_mean vector for a single name."""
    single_name_batch = np.array([name_to_vec(name)])
    z_means = session.run(z_mean, feed_dict={
        name_placeholder: single_name_batch,
        z_input: np.zeros((1, Z_SIZE)),
        use_z_input: 0
    })
    return z_means[0]

def nearest(embedding):
    """Return the name in `embeddings` whose stored vector is closest to
    `embedding`, measured by the norm of the difference."""
    return min(embeddings, key=lambda name: np.linalg.norm(embedding - embeddings[name]))

def unembed(embedding):
    """Decode a single latent vector into a string.

    `use_z_input` is fed 1, so the decoder reads `z_input`; the name
    placeholder only receives an all-zero dummy row.
    """
    decoded_batch = session.run(decoded_vecs, feed_dict={
        name_placeholder: np.zeros((1, NAME_MAX_LEN)),
        z_input: np.array([embedding]),
        use_z_input: 1
    })
    return vec_to_name(decoded_batch[0])

# Decoding a name's own mean embedding should give the name back.
assert unembed(embed('nate')) == 'nate'

# Nearest known name for a mix of real names, words and gibberish.
probe_words = ['nate', 'yikes', 'panda', 'ixzhxzi', 'justxn']
for name in probe_words:
    print('{} is closest to {}'.format(name, nearest(embed(name))))


In [None]:
# what happens if we try to interpolate names?
def blend_names(name1, name2, steps=10):
    """Print the decodings along a straight line between two name embeddings.

    Prints `steps + 1` lines: the first decodes the embedding of `name1`, the
    last that of `name2`, and the ones in between decode evenly spaced linear
    blends of the two.

    Args:
        name1: name at the start of the interpolation.
        name2: name at the end of the interpolation.
        steps: number of intervals between the two endpoints (default 10,
            the value that used to be hard-coded).

    Raises:
        ValueError: if `steps` is smaller than 1.
    """
    if steps < 1:
        raise ValueError('steps must be at least 1, got {!r}'.format(steps))
    e1 = embed(name1)
    e2 = embed(name2)
    for i in range(steps + 1):
        blend = i / float(steps)
        print(unembed(e1 * (1 - blend) + e2 * blend))

# Interpolate between a short name and a long one.
blend_names('amy', 'francisco')

In [None]:
# Interpolate from a long name to a short one.
blend_names('nathaniel', 'chen')

In [None]:
# Interpolate between a name and the same letters reversed.
blend_names('nathaniel', 'leinahtan')

In [None]:
# Which known name sits closest to the origin of the latent space?
latent_origin = np.zeros(Z_SIZE)
print(nearest(latent_origin))

In [None]:
# what if we multiply names?:
names_to_double = ['nate', 'willy', 'sam', 'polly', 'jacob']
for name in names_to_double:
    print('{} * 2 = {}'.format(name, nearest(embed(name) * 2)))

In [None]:
# what's the opposite of a name?
names_to_negate = ['nancy', 'barry', 'chance', 'rachel', 'gloria']
for name in names_to_negate:
    print('-{} = {}'.format(name, nearest(-embed(name))))
# weird that rachel's opposite's opposite isn't rachel

In [None]:
# can we do addition and subtraction?
# Apply the albert -> alberta offset to other names and look up the nearest known name.
alberta_offset = embed('alberta') - embed('albert')
for base_name in ['robert', 'justin']:
    print(nearest(alberta_offset + embed(base_name)))

In [None]:
# what if, rather than looking for the nearest neighbors, we generate names from these arithmetic operations?
# Same offset as above, but decode the resulting vector directly.
alberta_offset = embed('alberta') - embed('albert')
for base_name in ['robert', 'justin', 'joseph']:
    print(unembed(alberta_offset + embed(base_name)))

print(unembed(alberta_offset + embed('nate'))) # doesn't work so well with names ending in vowels

In [None]:
# let's generate some random names:
def generate():
    """Decode a latent vector of Z_SIZE values drawn from a standard normal."""
    random_z = np.random.normal(size=Z_SIZE)
    return unembed(random_z)
# Print ten randomly generated names.
for _ in range(10):
    generated_name = generate()
    print(generated_name)

In [None]:
# what if we train a GAN to mimic the latent vectors produced by real names?