In [1]:
import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split

import json

In [2]:
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'

TEST_SPLIT = 0.1
RNG_SEED = 13371447

In [3]:
q1_data = np.load(open(Q1_TRAINING_DATA_FILE, 'rb'))
q2_data = np.load(open(Q2_TRAINING_DATA_FILE, 'rb'))
labels = np.load(open(LABEL_TRAINING_DATA_FILE, 'rb'))
prepro_configs = None

with open(NB_WORDS_DATA_FILE, 'r') as f:
    prepro_configs = json.load(f)

In [4]:
X = np.stack((q1_data, q2_data), axis=1)
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)

Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
Q1_test = X_test[:,0]
Q2_test = X_test[:,1]

In [5]:
def rearrange(base, hypothesis, label):
    features = {"base": base, "hypothesis": hypothesis}
    return features, label

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((Q1_train, Q2_train, y_train))
    dataset = dataset.shuffle(buffer_size=100)
    dataset = dataset.batch(16)
    dataset = dataset.map(rearrange)
#     dataset = dataset.repeat()
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((Q1_test, Q2_test, y_test))
    dataset = dataset.shuffle(buffer_size=100)
    dataset = dataset.batch(16)
    dataset = dataset.map(rearrange)
#     dataset = dataset.repeat()
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

In [6]:
VOCAB_SIZE = prepro_configs['vocab_size']
MAX_SEQUENCE_LENGTH = 25

WORD_EMBEDDING_DIM = 100
CONV_FEATURE_DIM = 300
CONV_OUTPUT_DIM = 128
CONV_WINDOW_SIZE = 3

SIMILARITY_DENSE_FEATURE_DIM = 200

In [7]:
vocabulary_size = VOCAB_SIZE
embedding_size = WORD_EMBEDDING_DIM
conv_channel_size = CONV_FEATURE_DIM
conv_window_size = CONV_WINDOW_SIZE
conv_output_feature_size = CONV_OUTPUT_DIM
max_pool_window_size = MAX_SEQUENCE_LENGTH

similairiry_dense_dim = SIMILARITY_DENSE_FEATURE_DIM

def model_fn(features, labels, mode):
    word_embeddings = tf.get_variable('word_embeddings', [vocabulary_size, embedding_size])
    
    def basic_conv_sementic_network(inputs, name, reuse=False):
        conv_layer = tf.layers.conv1d(inputs, conv_channel_size, conv_window_size, 
                                      activation=tf.nn.relu, 
                                      name=name + 'conv_1d',
                                      padding='same', 
                                      reuse=reuse)
        max_pool_layer = tf.layers.max_pooling1d(conv_layer, max_pool_window_size, 1)
        output_layer = tf.layers.dense(max_pool_layer, conv_output_feature_size, 
                                        activation=tf.nn.relu,
                                        reuse=reuse,
                                        name=name + 'dense')
        output_layer = tf.squeeze(output_layer, 1)
        
        return output_layer
    
    base_embedded_matrix = tf.nn.embedding_lookup(word_embeddings, features['base'])
    hypothesis_embedded_matrix = tf.nn.embedding_lookup(word_embeddings, features['hypothesis'])
    
    base_sementic_matrix = basic_conv_sementic_network(base_embedded_matrix, 'base')
    hypothesis_sementic_matrix = basic_conv_sementic_network(hypothesis_embedded_matrix, 'hypothesis')
    
    merged_matrix = tf.concat([base_sementic_matrix, hypothesis_sementic_matrix], -1)
    #norm_merged_matrix = tf.layers.batch_normalization(merged_matrix)
    similarity_dense_layer = tf.layers.dense(merged_matrix, similairiry_dense_dim,
                                             activation=tf.nn.relu)
    
    logit_layer = tf.layers.dense(similarity_dense_layer, 1)
    logit_layer = tf.squeeze(logit_layer, 1)
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  predictions={
                      'prob':tf.nn.sigmoid(logit_layer)
                  })
            
    loss =tf.losses.sigmoid_cross_entropy(labels, logit_layer)
    
    if mode == tf.estimator.ModeKeys.EVAL:
        accuracy = tf.metrics.accuracy(labels, tf.nn.sigmoid(logit_layer))
        eval_metric_ops = {'acc': accuracy}
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  eval_metric_ops= eval_metric_ops,
                  loss=loss)

    global_step = tf.train.get_global_step()
    #loss = tf.losses.mean_squared_error(labels, logit_layer)
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step)

    accuracy = tf.metrics.accuracy(labels, logit_layer)
    eval_metric_ops = {'acc': accuracy}

    return tf.estimator.EstimatorSpec(
              mode=mode,
              train_op=train_op,
              loss=loss)

In [8]:
est = tf.estimator.Estimator(model_fn, model_dir='train_model')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'train_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc4ef9a6828>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [9]:
est.train(train_input_fn, steps=100)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into train_model/model.ckpt.
INFO:tensorflow:loss = 0.69315964, step = 0
INFO:tensorflow:Saving checkpoints for 100 into train_model/model.ckpt.
INFO:tensorflow:Loss for final step: 0.71294993.


<tensorflow.python.estimator.estimator.Estimator at 0x7fc4ef9a6630>

In [10]:
est.evaluate(eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-09-17-03:39:40
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from train_model/model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-09-17-03:39:42
INFO:tensorflow:Saving dict for global step 100: acc = 0.63194734, global_step = 100, loss = 0.6421221
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 100: train_model/model.ckpt-100


{'acc': 0.63194734, 'loss': 0.6421221, 'global_step': 100}

In [11]:
FILE_DIR_PATH = './'
Q1_TEST_DATA_FILE = 'q1_test.npy'
Q2_TEST_DATA_FILE = 'q2_test.npy'

test_q1_data = np.load(open(FILE_DIR_PATH + Q1_TEST_DATA_FILE, 'rb'))
test_q2_data = np.load(open(FILE_DIR_PATH + Q2_TEST_DATA_FILE, 'rb'))

In [12]:
predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"base":test_q1_data, 
                                                         "hypothesis":test_q2_data}, 
                                                      shuffle=False)

In [13]:
predictions = np.array([p['prob'] for p in est.predict(input_fn=
predict_input_fn)])

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from train_model/model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [14]:
len(predictions)

3563466

In [19]:
import pandas as pd

DEFAULT_PATH='~/.kaggle/competitions/quora-question-pairs/'

test = pd.read_csv(DEFAULT_PATH + "test.csv", encoding='utf-8')
test = test.dropna() #drop empty

  interactivity=interactivity, compiler=compiler, result=result)


In [20]:
test.head(5)

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [21]:
len(list(test["test_id"]))

3563466

In [22]:
output = pd.DataFrame( data={"test_id":test["test_id"], "is_duplicate": list(predictions)} )
output.to_csv( "cnn_predict.csv", index=False, quoting=3 )