## 5.3 CNN DSSM example using tf.estimator

In [None]:
import tensorflow as tf
import numpy as np
import os

from sklearn.model_selection import train_test_split

import json

In [None]:
# Directories for the preprocessed inputs and for everything this notebook writes.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'

# File names (looked up under DATA_IN_PATH) of the training arrays and of the
# JSON file holding the preprocessing settings.
TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'

# Fraction of the training data held out for evaluation, and the seed passed to
# train_test_split so the split is reproducible.
TEST_SPLIT = 0.1
RNG_SEED = 13371447

In [None]:
# Load the question-1 / question-2 arrays and their labels.
# np.load is given the path directly so that it opens *and closes* the file
# itself; wrapping the path in a bare open(...) left the handles unclosed.
q1_data = np.load(DATA_IN_PATH + TRAIN_Q1_DATA_FILE)
q2_data = np.load(DATA_IN_PATH + TRAIN_Q2_DATA_FILE)
labels = np.load(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE)

# Settings written by the preprocessing step (read later for 'vocab_size').
with open(DATA_IN_PATH + NB_WORDS_DATA_FILE, 'r') as f:
    prepro_configs = json.load(f)

In [None]:
# Show which settings the preprocessing step stored ('vocab_size' is read further below).
print(prepro_configs.keys())

In [None]:
# Pair the two question arrays along a new axis 1 so that each (q1, q2) pair
# stays together, with its label, through the shuffled split.
X = np.stack((q1_data, q2_data), axis=1)
y = labels
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)

# Unpack the pairs again: index 0 on axis 1 is question 1, index 1 is question 2.
train_Q1, train_Q2 = train_X[:, 0], train_X[:, 1]
test_Q1, test_Q2 = test_X[:, 0], test_X[:, 1]

In [None]:
# X already holds the stacked (q1, q2) array built for the split; reuse it
# rather than allocating a second full copy just to read its shape.
print(X.shape)

In [None]:
# Shape of the two question arrays stacked along axis 1.
# NOTE(review): this prints the same shape as the preceding cell — presumably a leftover duplicate.
print(np.stack((q1_data, q2_data), axis=1).shape)

In [None]:
def rearrange(base, hypothesis, label):
    """Pack one (question 1, question 2, label) element into estimator form.

    Returns:
        A ``(features, label)`` tuple where ``features`` is a dict with the
        keys ``"base"`` and ``"hypothesis"`` — the names ``model_fn`` reads.
    """
    return {"base": base, "hypothesis": hypothesis}, label

def train_input_fn():
    """Input pipeline for training.

    Slices the training arrays into examples, shuffles them with a
    100-element buffer, groups them into batches of 16 and converts every
    batch to the ``(features, label)`` form via ``rearrange``.

    Returns:
        The ``(features, label)`` tensors of a one-shot iterator over the pipeline.
    """
    pipeline = (
        tf.data.Dataset
        .from_tensor_slices((train_Q1, train_Q2, train_y))
        .shuffle(buffer_size=100)
        .batch(16)
        .map(rearrange)
    )
    return pipeline.make_one_shot_iterator().get_next()

def eval_input_fn():
    """Input pipeline for evaluation on the held-out split.

    Returns:
        The ``(features, label)`` tensors of a one-shot iterator that walks the
        evaluation data once, in order, in batches of 16.
    """
    dataset = tf.data.Dataset.from_tensor_slices((test_Q1, test_Q2, test_y))
    # The model needs a leading batch axis (Conv1D takes 3-D input and the
    # logits are squeezed on axis 1), so batch exactly as train_input_fn does.
    # Without this every element is a single un-batched example.
    dataset = dataset.batch(16)
    dataset = dataset.map(rearrange)
    iterator = dataset.make_one_shot_iterator()

    return iterator.get_next()

In [None]:
# Vocabulary size comes from the preprocessing config; it sizes the embedding table.
VOCAB_SIZE = prepro_configs['vocab_size']
# Used as the max-pooling window in the model.
# Presumably the padded length of every question sequence — TODO confirm against preprocessing.
MAX_SEQUENCE_LENGTH = 25

# Embedding width, number of Conv1D filters, width of each tower's output
# Dense layer, and the Conv1D kernel size.
WORD_EMBEDDING_DIM = 100
CONV_FEATURE_DIM = 300
CONV_OUTPUT_DIM = 128
CONV_WINDOW_SIZE = 3

# Width of the dense layer applied to the concatenated pair representation.
SIMILARITY_DENSE_FEATURE_DIM = 200

In [None]:
# Lower-case aliases of the hyperparameters; model_fn reads these as module-level globals.
vocabulary_size = VOCAB_SIZE
embedding_size = WORD_EMBEDDING_DIM
conv_channel_size = CONV_FEATURE_DIM
conv_window_size = CONV_WINDOW_SIZE
conv_output_feature_size = CONV_OUTPUT_DIM
# The pooling window spans MAX_SEQUENCE_LENGTH steps.
max_pool_window_size = MAX_SEQUENCE_LENGTH

# (sic) the spelling of this name is what model_fn references.
similairiry_dense_dim = SIMILARITY_DENSE_FEATURE_DIM

def model_fn(features, labels, mode):
    """Build the CNN sentence-pair similarity model for ``tf.estimator``.

    Both questions are looked up in one shared embedding table. Each is then
    encoded by its own Conv1D -> max-pool -> Dense tower, the two encodings
    are concatenated, and a dense layer followed by a single-unit layer yields
    one logit per pair.

    Args:
        features: dict with the keys ``'base'`` and ``'hypothesis'`` holding
            batched token-id tensors. The sequence axis must have length
            ``max_pool_window_size``: the 'valid' max-pool with stride 1 then
            leaves exactly one time step, which is squeezed away.
        labels: batched 0/1 targets, one per pair; not used in PREDICT mode.
        mode: one of ``tf.estimator.ModeKeys``.

    Returns:
        A ``tf.estimator.EstimatorSpec``: predictions ``{'prob': ...}`` in
        PREDICT mode, loss plus the ``'acc'`` metric in EVAL mode, loss plus
        the Adam train op in TRAIN mode.
    """
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT

    def dropout(inputs):
        # Pass the mode explicitly so dropout is applied while training and
        # switched off for evaluation and prediction.
        return tf.keras.layers.Dropout(0.2)(inputs, training=TRAIN)

    def basic_conv_sementic_network(inputs, name):
        """Encode one embedded question into a fixed-size vector."""
        conv_layer = tf.keras.layers.Conv1D(conv_channel_size,
                                            conv_window_size,
                                            activation=tf.nn.relu,
                                            name=name + 'conv_1d',
                                            padding='same')(inputs)
        conv_layer = dropout(conv_layer)

        # Pool over the whole sequence: window == sequence length, stride 1.
        max_pool_layer = tf.keras.layers.MaxPool1D(max_pool_window_size, 1)(conv_layer)

        output_layer = tf.layers.Dense(conv_output_feature_size,
                                       activation=tf.nn.relu,
                                       name=name + 'dense')(max_pool_layer)
        output_layer = dropout(output_layer)

        # Drop the length-1 time axis left over from the pooling.
        return tf.squeeze(output_layer, 1)

    # One embedding layer instance, so both questions share the same table.
    embedding = tf.keras.layers.Embedding(vocabulary_size,
                                          embedding_size)

    base_embedded_matrix = dropout(embedding(features['base']))
    hypothesis_embedded_matrix = dropout(embedding(features['hypothesis']))

    base_sementic_matrix = basic_conv_sementic_network(base_embedded_matrix, 'base')
    hypothesis_sementic_matrix = basic_conv_sementic_network(hypothesis_embedded_matrix, 'hypothesis')

    merged_matrix = tf.concat([base_sementic_matrix, hypothesis_sementic_matrix], -1)

    similarity_dense_layer = tf.keras.layers.Dense(similairiry_dense_dim,
                                                   activation=tf.nn.relu)(merged_matrix)
    similarity_dense_layer = dropout(similarity_dense_layer)

    logit_layer = tf.keras.layers.Dense(1)(similarity_dense_layer)
    logit_layer = tf.squeeze(logit_layer, 1)

    # PREDICT has no labels, so return before the loss is built.
    if PREDICT:
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  predictions={
                      'prob': tf.nn.sigmoid(logit_layer)
                  })

    loss = tf.losses.sigmoid_cross_entropy(labels, logit_layer)

    if EVAL:
        # Accuracy compares class ids, so threshold the probability at 0.5
        # instead of comparing the raw probability with the 0/1 labels.
        predicted_classes = tf.round(tf.nn.sigmoid(logit_layer))
        accuracy = tf.metrics.accuracy(labels, predicted_classes)
        eval_metric_ops = {'acc': accuracy}
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  eval_metric_ops=eval_metric_ops,
                  loss=loss)

    if TRAIN:
        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  train_op=train_op,
                  loss=loss)

In [None]:
# Create the output directory on first use.
if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)

# The Estimator keeps its checkpoints under DATA_OUT_PATH + 'checkpoint'.
est = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir=DATA_OUT_PATH + 'checkpoint',
)

In [None]:
# Train for 100 steps on the batches produced by train_input_fn.
est.train(train_input_fn, steps=100)

In [None]:
# Evaluate on the held-out split fed by eval_input_fn (loss and the 'acc' metric from model_fn).
est.evaluate(eval_input_fn)

In [None]:
# File names (under DATA_IN_PATH) of the preprocessed test questions.
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'

# Hand np.load the path so that it opens and closes the file itself;
# passing a bare open(...) handle left the files unclosed.
test_q1_data = np.load(DATA_IN_PATH + TEST_Q1_DATA_FILE)
test_q2_data = np.load(DATA_IN_PATH + TEST_Q2_DATA_FILE)

In [None]:
# Feed the test arrays under the same feature names model_fn reads ('base' / 'hypothesis').
# shuffle=False keeps the predictions in the order of the input rows.
predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"base":test_q1_data, 
                                                         "hypothesis":test_q2_data}, 
                                                      shuffle=False)

In [None]:
# Collect the 'prob' output (sigmoid of the logit) for every test pair into one array.
predictions = np.array([p['prob'] for p in est.predict(input_fn=predict_input_fn)])

In [None]:
# Number of predictions produced; presumably one per row of the test arrays — verify.
len(predictions)

In [None]:
import pandas as pd

# Location of the raw competition files, relative to the user's home directory.
DEFAULT_PATH='~/.kaggle/competitions/quora-question-pairs/'

# The raw test set is read only for its 'test_id' column, used in the submission file.
test = pd.read_csv(DEFAULT_PATH + "test.csv", encoding='utf-8')
# NOTE(review): dropna() removes rows with missing values; the remaining row count
# has to equal len(predictions) when the two are combined — confirm the
# preprocessing that produced the test arrays dropped the same rows.
test = test.dropna()

In [None]:
# Peek at the first five rows of the test set.
test.head(5)

In [None]:
# Build the submission table: one predicted duplicate probability per test_id.
output = pd.DataFrame(data={"test_id":test["test_id"], "is_duplicate": list(predictions)} )
# quoting=3 is csv.QUOTE_NONE (fields are written without quotes); index=False omits the row index.
output.to_csv(DATA_OUT_PATH + "cnn_predict.csv", index=False, quoting=3 )