In [1]:
import os
import random
import string
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.framework import ops
ops.reset_default_graph()

model = __import__('siamese_similarity_model')

In [2]:
batch_size = 200     # How many addresses to train on in one batch
n_batches = 300      # How many batches to train on
max_address_len = 20 # How many character to crop/pad each address
margin = 0.25        # A sort of regularization parameter that allows for 'wiggle' room in bad predicted similarities.
num_features = 50    # RNN feature size
dropout_keep_prob = 0.8 # Dropout probability

In [3]:
# Generate data
street_names = ['abbey', 'baker', 'canal', 'donner', 'elm', 'fifth',
                'grandvia', 'hollywood', 'interstate', 'jay', 'kings']
street_types = ['rd', 'st', 'ln', 'pass', 'ave', 'hwy', 'cir', 'dr', 'jct']

# Define test addresses
test_queries = ['111 abbey ln', '271 doner cicle',
                '314 king avenue', 'tensorflow is fun']
test_references = ['123 abbey ln', '217 donner cir', '314 kings ave',
                   '404 hollywood st', 'tensorflow is so fun']

In [4]:
def create_typo(s):
    rand_ind = random.choice(range(len(s)))
    s_list = list(s)
    s_list[rand_ind] = random.choice(string.ascii_lowercase +  '0123456789')
    s = ''.join(s_list)
    return s

In [5]:
def get_batch(n):
    # Generate a list of reference addresses with similar addresses that have
    # a typo.
    numbers = [random.randint(1, 9999) for i in range(n)]
    streets = [random.choice(street_names) for i in range(n)]
    street_suffs = [random.choice(street_types) for i in range(n)]
    full_streets = [str(w) + ' ' + x + ' ' + y for w,x,y in zip(numbers, streets, street_suffs)]
    typo_streets = [create_typo(x) for x in full_streets]
    reference = [list(x) for x in zip(full_streets, typo_streets)]
    
    # Shuffle last half of them for training on dissimilar addresses
    half_ix = int(n/2)
    bottom_half = reference[half_ix:]
    true_address = [x[0] for x in bottom_half]
    typo_address = [x[1] for x in bottom_half]
    typo_address = list(np.roll(typo_address, 1))
    bottom_half = [[x,y] for x,y in zip(true_address, typo_address)]
    reference[half_ix:] = bottom_half
    
    # Get target similarities (1's for similar, -1's for non-similar)
    target = [1]*(n-half_ix) + [-1]*half_ix
    reference = [[x,y] for x,y in zip(reference, target)]
    return(reference)

In [6]:
get_batch(4)

[[['7156 hollywood pass', '7156 hollywood ptss'], 1],
 [['3307 fifth ln', '6307 fifth ln'], 1],
 [['836 canal ave', '2999 donn3r dr'], -1],
 [['2999 donner dr', '836 zanal ave'], -1]]

各バッチの最初半分が似ているペア（真のアドレス，真のアドレス+typo）後半半分が似ていないペア，（真のアドレス，真のアドレス+typo+shuffle(対応関係をなくしたもの)）

Define vocabulary dictionary (remember to save ' ' for badding)

In [7]:
vocab_chars = string.ascii_lowercase + '0123456789 '
vocab2ix_dict = {char:(ix+1) for ix, char in enumerate(vocab_chars)}
vocab_length = len(vocab_chars) + 1

Define vocab one-hot encoding. Here we get the actual indices for usage in embedding lookup.

In [8]:
def address2onehot(address,
                   vocab2ix_dict = vocab2ix_dict,
                   max_address_len = max_address_len):
    # translate address string into indices
    address_ix = [vocab2ix_dict[x] for x in list(address)]
    
    # Pad or crop to max_address_len
    address_ix = (address_ix + [0]*max_address_len)[0:max_address_len]
    return(address_ix)

In [9]:
address = test_queries[0]

In [10]:
address,  address2onehot(address)

('111 abbey ln',
 [28, 28, 28, 37, 1, 2, 2, 5, 25, 37, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0])

Now we define  model placeholders.

In [14]:
address1_ph = tf.placeholder(tf.int32, [None, max_address_len], name="address1_ph")
address2_ph = tf.placeholder(tf.int32, [None, max_address_len], name="address2_ph")

y_target_ph = tf.placeholder(tf.int32, [None], name="y_target_ph")
dropout_keep_prob_ph = tf.placeholder(tf.float32, name="dropout_keep_prob")

Create embedding lookup. Here we use the identity matrix so that we have a one-hot encoded lookup.

In [17]:
identity_mat = tf.diag(tf.ones(shape=[vocab_length]))
address1_embed = tf.nn.embedding_lookup(identity_mat, address1_ph)
address2_embed = tf.nn.embedding_lookup(identity_mat, address2_ph)

Now we define the model.

In [18]:
# Define Model
text_snn = model.snn(address1_embed, address2_embed, dropout_keep_prob_ph,
              vocab_length, num_features, max_address_len)

ValueError: Variable bidirectional_rnn/fw/basic_lstm_cell/kernel already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "/home/kei/Projects/spykesim/drafts/siamese_similarity_model.py", line 41, in siamese_nn
    dtype=tf.float32)
  File "/home/kei/Projects/spykesim/drafts/siamese_similarity_model.py", line 58, in snn
    output1 = siamese_nn(address1, num_features)
  File "<ipython-input-13-9aa1010548c2>", line 3, in <module>
    vocab_length, num_features, max_address_len)
