## 5.3 CNN DSSM example using tf.estimator

In [2]:
import tensorflow as tf
import numpy as np
import os

from sklearn.model_selection import train_test_split

import json

  return f(*args, **kwds)


In [7]:
# Directories for the preprocessed inputs and for everything this notebook writes.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

# File names (joined onto DATA_IN_PATH) of the training arrays and of the
# JSON file holding the preprocessing configuration.
TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'

# Fraction of the data held out by train_test_split, and the seed passed as
# its random_state so the split is reproducible.
TEST_SPLIT = 0.1
RNG_SEED = 13371447

In [8]:
# Load the preprocessed question arrays and their labels. Passing the path
# (instead of an `open(...)` handle that is never closed) lets NumPy open and
# close each file itself, so no file descriptors are leaked.
q1_data = np.load(DATA_IN_PATH + TRAIN_Q1_DATA_FILE)
q2_data = np.load(DATA_IN_PATH + TRAIN_Q2_DATA_FILE)
labels = np.load(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE)

# Preprocessing configuration saved as JSON alongside the arrays.
with open(DATA_IN_PATH + NB_WORDS_DATA_FILE, 'r') as f:
    prepro_configs = json.load(f)

In [9]:
# Show which entries the preprocessing configuration provides.
print(prepro_configs.keys())

dict_keys(['vocab', 'vocab_size'])


In [10]:
# Pair the two question arrays along a new axis 1 so that a single split keeps
# each question pair (and its label) together.
X = np.stack((q1_data, q2_data), axis=1)
y = labels

train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RNG_SEED,
)

# Unpack the paired axis again: index 0 is question 1, index 1 is question 2.
train_Q1, train_Q2 = train_X[:, 0], train_X[:, 1]
test_Q1, test_Q2 = test_X[:, 0], test_X[:, 1]

In [11]:
# X already holds np.stack((q1_data, q2_data), axis=1); reuse it instead of
# building a second copy of the whole dataset just to read its shape.
print(X.shape)

(298526, 2, 13)


In [12]:
# X already holds np.stack((q1_data, q2_data), axis=1); reuse it instead of
# building a second copy of the whole dataset just to read its shape.
print(X.shape)

(298526, 2, 13)


In [13]:
def rearrange(base, hypothesis, label):
    """Repackage a (base, hypothesis, label) triple as ``(features, label)``.

    The two inputs are placed in a dict under the keys ``'x1'`` (base) and
    ``'x2'`` (hypothesis); the label is passed through unchanged.
    """
    return dict(x1=base, x2=hypothesis), label

def train_input_fn():
    """Return the next ``(features, label)`` batch tensors for training.

    Slices the training arrays, shuffles with a 100-element buffer, groups
    the examples into batches of 16 and maps each batch through
    ``rearrange`` before exposing it via a one-shot iterator.
    """
    return (
        tf.data.Dataset.from_tensor_slices((train_Q1, train_Q2, train_y))
        .shuffle(buffer_size=100)
        .batch(16)
        .map(rearrange)
        .make_one_shot_iterator()
        .get_next()
    )

def eval_input_fn():
    """Return the next ``(features, label)`` batch tensors for evaluation.

    Slices the held-out arrays in their stored order (no shuffling), groups
    them into batches of 64 and maps each batch through ``rearrange``.
    """
    held_out = (test_Q1, test_Q2, test_y)
    batches = tf.data.Dataset.from_tensor_slices(held_out).batch(64)
    return batches.map(rearrange).make_one_shot_iterator().get_next()

In [15]:
# Vocabulary size from the preprocessing config; first argument of the Embedding layer.
VOCAB_SIZE = prepro_configs['vocab_size']
# Pool size of the MaxPool1D layer in model_fn.
# NOTE(review): hard-coded; presumably equals the token length of each question array - confirm.
MAX_SEQUENCE_LENGTH = 13

WORD_EMBEDDING_DIM = 100  # output dimension of the Embedding layer
CONV_FEATURE_DIM = 300  # number of Conv1D filters in each question tower
CONV_OUTPUT_DIM = 128  # units of the Dense layer closing each question tower
CONV_WINDOW_SIZE = 3  # Conv1D kernel size

# Units of the Dense layer applied to the concatenated question representations.
SIMILARITY_DENSE_FEATURE_DIM = 200

In [16]:
def model_fn(features, labels, mode):
    """Build the CNN question-pair similarity graph for ``tf.estimator``.

    Both questions go through one shared embedding layer, then through
    separate Conv1D -> max-pool -> Dense towers. The two tower outputs are
    concatenated and fed through a dense layer to a single logit.

    Args:
        features: dict with keys ``'x1'`` and ``'x2'`` holding the index
            tensors fed to the embedding layer (assumed to be shaped
            ``(batch, MAX_SEQUENCE_LENGTH)`` - confirm against the input fns).
        labels: target labels; not used in PREDICT mode.
        mode: a ``tf.estimator.ModeKeys`` value.

    Returns:
        A ``tf.estimator.EstimatorSpec`` carrying predictions (PREDICT),
        loss and accuracy metric (EVAL), or loss and train op (TRAIN).
    """
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT

    def basic_conv_semantic_network(inputs, name):
        """Encode one embedded question into a ``CONV_OUTPUT_DIM`` vector."""
        conv_layer = tf.keras.layers.Conv1D(CONV_FEATURE_DIM,
                                            CONV_WINDOW_SIZE,
                                            activation=tf.nn.relu,
                                            name=name + 'conv_1d',
                                            padding='same')(inputs)

        # `training` must be passed explicitly: inside an Estimator model_fn
        # nothing sets the Keras learning phase, so without it dropout would
        # silently stay disabled during training.
        conv_layer = tf.keras.layers.Dropout(0.2)(conv_layer, training=TRAIN)

        # The pooling window spans MAX_SEQUENCE_LENGTH steps; the squeeze
        # below expects the time axis to have collapsed to size 1.
        max_pool_layer = tf.keras.layers.MaxPool1D(MAX_SEQUENCE_LENGTH, 1)(conv_layer)

        output_layer = tf.keras.layers.Dense(CONV_OUTPUT_DIM,
                                             activation=tf.nn.relu,
                                             name=name + 'dense')(max_pool_layer)

        output_layer = tf.keras.layers.Dropout(0.2)(output_layer, training=TRAIN)

        output_layer = tf.squeeze(output_layer, 1)

        return output_layer

    # A single Embedding instance is applied to both questions, so they share
    # the same embedding weights.
    embedding = tf.keras.layers.Embedding(VOCAB_SIZE,
                                          WORD_EMBEDDING_DIM)

    base_embedded_matrix = embedding(features['x1'])
    hypothesis_embedded_matrix = embedding(features['x2'])

    base_embedded_matrix = tf.keras.layers.Dropout(0.2)(
        base_embedded_matrix, training=TRAIN)
    hypothesis_embedded_matrix = tf.keras.layers.Dropout(0.2)(
        hypothesis_embedded_matrix, training=TRAIN)

    base_semantic_matrix = basic_conv_semantic_network(base_embedded_matrix, 'base')
    hypothesis_semantic_matrix = basic_conv_semantic_network(hypothesis_embedded_matrix, 'hypothesis')

    merged_matrix = tf.concat([base_semantic_matrix, hypothesis_semantic_matrix], -1)

    similarity_dense_layer = tf.keras.layers.Dense(SIMILARITY_DENSE_FEATURE_DIM,
                                                   activation=tf.nn.relu)(merged_matrix)

    similarity_dense_layer = tf.keras.layers.Dropout(0.2)(
        similarity_dense_layer, training=TRAIN)

    # One logit per example; drop the trailing unit axis.
    logit_layer = tf.keras.layers.Dense(1)(similarity_dense_layer)
    logit_layer = tf.squeeze(logit_layer, 1)

    if PREDICT:
        # Returned before the loss is built, so labels are never touched here.
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  predictions={
                      'is_duplicate': tf.nn.sigmoid(logit_layer)
                  })

    loss = tf.losses.sigmoid_cross_entropy(labels, logit_layer)

    if EVAL:
        # Rounding the sigmoid output thresholds the probability at 0.5.
        accuracy = tf.metrics.accuracy(labels, tf.round(tf.nn.sigmoid(logit_layer)))
        eval_metric_ops = {'acc': accuracy}
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  eval_metric_ops=eval_metric_ops,
                  loss=loss)

    if TRAIN:
        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  train_op=train_op,
                  loss=loss)

In [17]:
# Create the output directory. exist_ok=True replaces the separate
# exists-check, which could race with another process creating the directory,
# and makes re-running this cell harmless.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Estimator built on model_fn; checkpoints go under DATA_OUT_PATH.
est = tf.estimator.Estimator(model_fn,
                             model_dir=DATA_OUT_PATH + 'checkpoint_5')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './data_out/checkpoint_5', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x123a89c50>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [18]:
# Train for a single step (steps=1) on batches from train_input_fn.
est.train(train_input_fn, steps=1)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./data_out/checkpoint_5/model.ckpt.
INFO:tensorflow:loss = 0.7177738, step = 1
INFO:tensorflow:Saving checkpoints for 1 into ./data_out/checkpoint_5/model.ckpt.
INFO:tensorflow:Loss for final step: 0.7177738.


<tensorflow.python.estimator.estimator.Estimator at 0x123a89ac8>

In [None]:
# Evaluate on the held-out split served by eval_input_fn.
est.evaluate(eval_input_fn)

In [19]:
# File names (joined onto DATA_IN_PATH) of the preprocessed test questions.
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'

# Pass the path rather than an `open(...)` handle that is never closed, so
# NumPy opens and closes each file itself.
test_q1_data = np.load(DATA_IN_PATH + TEST_Q1_DATA_FILE)
test_q2_data = np.load(DATA_IN_PATH + TEST_Q2_DATA_FILE)

In [20]:
# Input function over the test question arrays, keyed like the features that
# model_fn reads ('x1'/'x2'). shuffle=False keeps the examples in file order.
predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x1":test_q1_data, 
                                                         "x2":test_q2_data}, 
                                                      shuffle=False)

In [21]:
# Collect the 'is_duplicate' value (sigmoid of the logit) of every prediction into one array.
predictions = np.array([p['is_duplicate'] for p in est.predict(input_fn=predict_input_fn)])

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./data_out/checkpoint_5/model.ckpt-1
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [23]:
# Number of predictions produced; the submission built later needs one per test row.
len(predictions)

2345796

In [24]:
import pandas as pd

# Read the raw test set and drop fully duplicated rows in one chained call.
# NOTE(review): if any row were actually dropped here, test_df would no longer
# line up one-to-one with `predictions` - confirm this is a no-op for this file.
test_df = pd.read_csv(
    DATA_IN_PATH + "test.csv",
    encoding='utf-8',
).drop_duplicates()

In [25]:
# Preview the first five rows of the test set.
test_df.head(5)

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [26]:
# Assemble the submission: one predicted duplicate probability per test_id.
output = pd.DataFrame(
    data={
        "test_id": test_df["test_id"],
        "is_duplicate": list(predictions),
    }
)
# quoting=3 is the numeric value of csv.QUOTE_NONE.
output.to_csv(DATA_OUT_PATH + "cnn_predict.csv", index=False, quoting=3)