In [None]:
import pandas as pd
# Load the training CSV and shuffle its rows in a single chained expression.
# NOTE(review): sample(frac=1) is called without random_state, so the row
# order (and everything derived from it further down) differs between runs.
df_train = pd.read_csv('train.csv').sample(frac=1)

In [None]:
from collections import Counter
import itertools
def build_vocab(sentences, max_vocab=30000):
    """Build token<->index lookup tables from tokenized sentences.

    Index 0 is reserved for the padding token ``"<PAD/>"`` and index 1 for
    the placeholder token ``"<mino/>"``. The ``max_vocab`` most frequent
    remaining tokens follow in descending frequency order.

    Args:
        sentences: iterable of token sequences.
        max_vocab: maximum number of corpus tokens to keep, not counting the
            two reserved entries.

    Returns:
        ``[vocabulary, vocabulary_inv]`` where ``vocabulary`` maps
        token -> index and ``vocabulary_inv`` is the index -> token list.
    """
    reserved = ["<PAD/>", "<mino/>"]
    word_counts = Counter(itertools.chain.from_iterable(sentences))
    # A reserved token that also occurs in the corpus must not be appended a
    # second time: the duplicate would overwrite its dict entry and move it
    # away from index 0/1, which the padding and lookup code rely on.
    for token in reserved:
        word_counts.pop(token, None)
    vocabulary_inv = reserved + [word for word, _ in word_counts.most_common(max_vocab)]
    vocabulary = {word: index for index, word in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]

In [None]:
import unicodedata

In [None]:
import re
import nltk
def unicodeToAscii(s):
    """Return ``s`` NFD-decomposed with every character of category 'Mn' removed."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def nltk_tokenize(s):
    """Lowercase and trim ``s``, then tokenize it with ``nltk.word_tokenize``."""
    normalized = s.lower().strip()
    tokens = nltk.word_tokenize(normalized)
    return tokens
def normalizeString(s):
    """Lowercase, trim and split ``s`` into letter-only words and ``.!?`` marks.

    Steps:
      1. lowercase and strip surrounding whitespace;
      2. put a space in front of each of ``. , ! ?`` so punctuation becomes
         its own token;
      3. collapse every run of characters other than letters and ``.!?``
         (digits, commas, whitespace, ...) into a single space.

    Args:
        s: the raw string to normalize.

    Returns:
        list of non-empty tokens; ``[]`` when nothing survives normalization.
    """
    s = s.lower().strip()
    s = re.sub(r"([.,!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # split() rather than split(' '): a leading or trailing space left by the
    # substitutions above would otherwise yield empty-string tokens.
    return s.split()
# Tokenize every question pair; each entry of `pairs` is
# [tokens_of_question1, tokens_of_question2, is_duplicate].
def _tokenize_question(question):
    """Tokenize one question, tolerating missing values.

    A float value (how a missing cell shows up when iterating the column) is
    replaced by the placeholder string "<mino/>" before tokenizing. This is
    applied to both questions; previously only question2 was guarded, so a
    missing question1 raised inside the fallback tokenizer.
    """
    if isinstance(question, float):
        question = "<mino/>"
    try:
        return nltk_tokenize(question)
    except Exception:
        # Best-effort fallback to the regex tokenizer. `Exception` instead of
        # a bare except so KeyboardInterrupt/SystemExit still propagate.
        return normalizeString(question)


pairs = []
labels = []  # never filled in the visible cells; kept in case later code refers to it
for q1, q2, is_dup in zip(df_train.question1, df_train.question2, df_train.is_duplicate):
    pairs.append([_tokenize_question(q1), _tokenize_question(q2), is_dup])

In [None]:
import numpy as np
# Build the vocabulary over both questions of every pair. The two token lists
# of a pair are concatenated directly, which avoids materialising two ragged
# object ndarrays of `pairs` just to add them elementwise.
vocabulary, vocabulary_inv = build_vocab(
    [pair[0] + pair[1] for pair in pairs], max_vocab=40000)

In [None]:
import pickle
# Persist both lookup tables. `with` guarantees the files are closed even if
# pickling raises.
# NOTE(review): the 'vacabulary' spelling of the first file name is left
# untouched in case other code loads the file by that exact name.
with open('vacabulary_quora_adab.pkl', 'wb') as output:
    pickle.dump(vocabulary, output)
# The inverse table goes into the second file; previously `vocabulary` was
# written here a second time.
with open('vocabulary_inv_quora_adab.pkl', 'wb') as output:
    pickle.dump(vocabulary_inv, output)

In [None]:
def build_data(input_data, vocabulary, training=True):
    """Convert tokenized pairs into vocabulary-index sequences.

    Tokens missing from ``vocabulary`` are mapped to the index of
    ``"<mino/>"``.

    Args:
        input_data: iterable of items whose first two elements are token
            sequences and, when ``training`` is true, whose third element is
            the label.
        vocabulary: dict mapping token -> index; must contain ``"<mino/>"``.
        training: when true, each output row carries the label as a third
            element.

    Returns:
        list of ``[indices1, indices2, label]`` rows (training) or
        ``[indices1, indices2]`` rows (otherwise).
    """
    fallback_index = vocabulary["<mino/>"]

    def encode(tokens):
        return [vocabulary.get(token, fallback_index) for token in tokens]

    encoded_rows = []
    for item in input_data:
        row = [encode(item[0]), encode(item[1])]
        if training:
            row.append(item[2])
        encoded_rows.append(row)
    return encoded_rows
# Encode every tokenized pair as index sequences; `training` keeps its
# default of True, so each row also carries the pair's label.
data_index = build_data(input_data=pairs, vocabulary=vocabulary)

In [None]:
def get_batch(batch, training=True):
    """Zero-pad the two sentence columns of ``batch`` to rectangular arrays.

    Args:
        batch: 2-D array indexed as ``batch[:, 0]`` (first sentences),
            ``batch[:, 1]`` (second sentences) and, when ``training`` is
            true, ``batch[:, 2]`` (labels). Each sentence is a list of ints.
        training: whether to include the label column in the result.

    Returns:
        ``(s1, s2, labels, length1, length2)`` when ``training`` is true,
        otherwise ``(s1, s2, length1, length2)``. ``s1``/``s2`` are arrays
        padded with 0 up to the longest sentence of their column, and
        ``length1``/``length2`` are lists of the unpadded lengths.
    """
    length1 = [len(tokens) for tokens in batch[:, 0]]
    length2 = [len(tokens) for tokens in batch[:, 1]]
    max_length1, max_length2 = max(length1), max(length2)

    def pad(tokens, width):
        # Sentences already at full width are passed through unchanged.
        missing = width - len(tokens)
        return tokens if missing == 0 else tokens + missing * [0]

    s1_batch = [pad(row[0], max_length1) for row in batch]
    s2_batch = [pad(row[1], max_length2) for row in batch]
    padded = (np.array(s1_batch), np.array(s2_batch))
    if not training:
        return padded + (length1, length2)
    return padded + (batch[:, 2], length1, length2)

In [None]:
def linear(input, output_dim, scope=None, stddev=0.1):
    """Affine layer ``input @ w + b`` with variables under ``scope``.

    Args:
        input: 2-D tensor; its second static dimension sets the row count of
            ``w``.
        output_dim: number of output units.
        scope: variable scope name; ``'linear'`` when falsy.
        stddev: standard deviation of the normal initializer used for ``w``.

    Returns:
        Tensor ``tf.matmul(input, w) + b``; ``b`` is initialized to 0.0.
    """
    with tf.variable_scope(scope or 'linear'):
        input_dim = input.get_shape()[1]
        w = tf.get_variable(
            'w', [input_dim, output_dim],
            initializer=tf.random_normal_initializer(stddev=stddev))
        b = tf.get_variable(
            'b', [output_dim],
            initializer=tf.constant_initializer(0.0))
        return tf.matmul(input, w) + b

In [None]:
import tensorflow as tf
import os
class LSTM(object):
    """Sentence-pair classifier: shared bidirectional GRU + attention pooling.

    Despite the class name, the recurrent cells are ``GRUCell`` instances.
    Both sentences go through the same embedding table, the same
    bidirectional GRU (scope "rnn_m3") and the same attention weights
    (``Ws1``/``Ws2``), giving ``r`` attention-weighted sums of GRU outputs
    per sentence. The elementwise product and absolute difference of the two
    representations, the two representations themselves and an embedding of
    the sentence-length difference feed two dense layers ending in a 2-way
    softmax.

    Placeholders to feed: ``input_s1``, ``input_s2``, ``lengths1``,
    ``lengths2``, ``difflength`` and, for loss/accuracy, ``input_y``.
    Outputs: ``logits_m3`` (softmax probabilities, despite the name),
    ``predictions_m3``, ``cross_entropy_m3``, ``accuracy_m3``.
    """
    def __init__(self,embedding_size = 300,word_vocab_size=6726):
        self.word_vocab_size = word_vocab_size
        self.embedding_size = embedding_size
        
        # Token-index matrices for both sentences and the integer labels.
        self.input_s1 = tf.placeholder(tf.int64, [None, None], name='input_placeholder_x1')
        self.input_s2 = tf.placeholder(tf.int64, [None, None], name='input_placeholder_x2')
        self.input_y = tf.placeholder(tf.int64, [None], name='input_placeholder_label')
        
        # Labels as float one-hot vectors of depth 2 (used by the loss and accuracy).
        self.y_onehot = tf.to_float(tf.one_hot(tf.reshape(self.input_y,[-1]), 2, on_value=1, off_value=0, axis=-1,
                             dtype=tf.int32, name='y_oneHot'))
        
        # Both embedding tables are created under the CPU device.
        with tf.device('/cpu:0'),tf.name_scope("embedding"):
            embeddings = tf.Variable(tf.truncated_normal([self.word_vocab_size,self.embedding_size],
                                                              stddev=0.1),name = 'embedding_vocab')
            # 80 rows x 20 dims, indexed by self.difflength below; the callers
            # in this file clip the length difference to 79 before feeding it.
            embedding_diff_length = tf.Variable(tf.truncated_normal([80,20],
                                                              stddev=0.01),name = 'embedding_length')
        self.inputs1 = tf.nn.embedding_lookup(embeddings, self.input_s1)
        self.inputs2 = tf.nn.embedding_lookup(embeddings, self.input_s2)
        
        # Unpadded sentence lengths, passed to the RNNs as sequence_length.
        self.lengths1 =tf.placeholder(tf.int32, [None], name='length1')
        self.lengths2 =tf.placeholder(tf.int32, [None], name='length2')
        
        
        self.difflength = tf.placeholder(tf.int32, [None], name='diff_length')
        self.diff_len_embedding = tf.nn.embedding_lookup(embedding_diff_length, self.difflength)
        # Disabled earlier variant ("m2"): final GRU states instead of attention pooling.
#         with tf.name_scope('gru_m2'):
#             hidden_size_gru1 =300
#             lstm_fw_cell = tf.contrib.rnn.GRUCell(hidden_size_gru1)
#             lstm_bw_cell = tf.contrib.rnn.GRUCell(hidden_size_gru1)
#             outputs_1, states_1  = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fw_cell,
#                                                                     cell_bw=lstm_bw_cell,
#                                                                        dtype=tf.float32,
#                                                                        sequence_length=length1,
#                                                                        inputs=sentences1,
#                                                                        scope = "rnn1_m2")
#             state_fw1,state_bw1 = states_1
#             state_cat1 = tf.concat([state_fw1,state_bw1],1)
#             with tf.variable_scope(tf.get_variable_scope(), reuse=True):
#                 outputs_2, states_2  = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fw_cell,
#                                                                            cell_bw=lstm_bw_cell,
#                                                                            dtype=tf.float32,
#                                                                            sequence_length = length2,
#                                                                            inputs=sentences2,
#                                                                            scope = "rnn1_m2")
#                 state_fw2,state_bw2 = states_2
#                 state_cat2 = tf.concat([state_fw2,state_bw2],1)
#             mul_state = tf.multiply(state_cat2,state_cat1)
#             sub_state = tf.abs(tf.subtract(state_cat2,state_cat1))
#             all_state = linear(tf.concat([mul_state,
#                                           sub_state,
#                                           state_cat2,
#                                           state_cat1],axis= 1),600, scope='all_state_m2',stddev=0.1)
        with tf.name_scope('lstm_m3'):
            hidden_size =200
            # GRU cells (the lstm_* names are historical); reused for sentence 2 below.
            lstm_fw_cell1 = tf.contrib.rnn.GRUCell(hidden_size)
            lstm_bw_cell1 = tf.contrib.rnn.GRUCell(hidden_size)
            outputs_1, states_1  = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fw_cell1,
                                                                   cell_bw=lstm_bw_cell1,
                                                                   dtype=tf.float32,
                                                                   sequence_length=self.lengths1,
                                                                   inputs=self.inputs1,
                                                                   scope = "rnn_m3")
            output_fw_1,output_bw_1 = outputs_1
            # Forward and backward outputs joined on the feature axis: last dim is 2*hidden_size.
            out_puts_1= tf.concat([output_fw_1, output_bw_1],2)
            # da: hidden width of the attention scorer; r: number of attention rows per sentence.
            da = 100
            r=100
            # Flatten to 2-D so the attention scorer is applied to every time step.
            out1 =tf.reshape(out_puts_1,[-1,hidden_size*2])
            Ws1 = tf.Variable(tf.truncated_normal([hidden_size*2,da], stddev=0.1), name="Ws1")
            Ws2 = tf.Variable(tf.truncated_normal([da,r], stddev=0.1), name="Ws2")
            # Scores tanh(H Ws1) Ws2, reshaped back to one [time, r] matrix per example.
            A1 = tf.reshape(tf.matmul(tf.tanh(tf.matmul(out1,Ws1)),Ws2),[-1,tf.shape(out_puts_1)[1],r])
            # After the transpose the last axis is time, so the softmax normalizes over time steps.
            # NOTE(review): no sequence-length mask is applied before this softmax — confirm intended.
            transA1 = tf.nn.softmax(tf.transpose(A1,[0,2,1])) 
            # Disabled penalty on (A A^T - I) for the attention matrix.
#             P1 = tf.matmul(self.transA1,tf.transpose(self.transA1,[0,2,1]))
#             E1 = tf.eye(tf.shape(P1)[1],num_columns=tf.shape(P1)[2],batch_shape=[tf.shape(P1)[0]], dtype=tf.float32, name='E1')
#             self.Reg_A1 = tf.nn.l2_loss(tf.subtract(P1,E1))
            # r attention-weighted sums of the GRU outputs for sentence 1.
            represent1 =tf.matmul(transA1,out_puts_1)
            # Sentence 2: same cells and scope with reuse=True, plus the same Ws1/Ws2.
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                outputs_2, states_2  = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fw_cell1,
                                                                       cell_bw=lstm_bw_cell1,
                                                                       dtype=tf.float32,
                                                                       sequence_length=self.lengths2,
                                                                       inputs=self.inputs2,
                                                                       scope = "rnn_m3")
            
          
                output_fw_2,output_bw_2 = outputs_2
                # state_fw2/state_bw2 are not used further in this class.
                state_fw2,state_bw2 = states_2
                out_puts_2 = tf.concat([output_fw_2, output_bw_2],2)
                
                out2 =tf.reshape(out_puts_2,[-1,hidden_size*2])
                A2 = tf.reshape(tf.matmul(tf.tanh(tf.matmul(out2,Ws1)),Ws2),[-1,tf.shape(out_puts_2)[1],r])
                transA2 = tf.nn.softmax(tf.transpose(A2,[0,2,1]))
#                 P2 = tf.matmul(self.transA2,tf.transpose(self.transA2,[0,2,1]))
#                 E2 = tf.eye(tf.shape(P2)[1],num_columns=tf.shape(P2)[2],batch_shape=[tf.shape(P2)[0]], dtype=tf.float32, name='E2')
#                 self.Reg_A2 = tf.nn.l2_loss(tf.subtract(P2,E2))
#                 self.reg = self.Reg_A1+self.Reg_A2
                represent2 =tf.matmul(transA2,out_puts_2)
#             self.mul_state = tf.multiply(self.represent1,self.represent2)
#             self.sub_state = tf.abs(tf.subtract(self.represent1,self.represent2))
#             self.state_re = linear(tf.concat([self.mul_state,self.sub_state],axis= 1), 100, scope='state_repre', stddev=0.1)
            
        # Interaction features: elementwise product and absolute difference.
        mul_out2 = tf.multiply(represent2,represent1)
        sub_out2 = tf.abs(tf.subtract(represent2,represent1))
            
        # ReLU, then flatten each [r, 2*hidden_size] matrix into one vector per example.
        remul2 = tf.reshape(tf.nn.relu(mul_out2),[-1,r*hidden_size*2])
        resub2 = tf.reshape(tf.nn.relu(sub_out2),[-1,r*hidden_size*2])
            
        represent1_resh = tf.reshape(tf.nn.relu(represent1),[-1,r*hidden_size*2])
        represent2_resh = tf.reshape(tf.nn.relu(represent2),[-1,r*hidden_size*2])
            
        # Project every feature group down to 100 units.
        mul =linear(remul2, 100, scope='fc_mul', stddev=0.1)
        sub =linear(resub2, 100, scope='fc_sub', stddev=0.1)
        re1_h =linear(represent1_resh, 100, scope='fc_re', stddev=0.1)
        # reuse=True: sentence 2 goes through the same 'fc_re' weights as sentence 1.
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
            re2_h =linear(represent2_resh, 100, scope='fc_re', stddev=0.1)
            
            
        # 4 x 100 projected features + the 20-dim length-difference embedding.
        all_represent = tf.concat([mul,sub,re2_h,re1_h,self.diff_len_embedding],axis= 1)
        h2 = linear(tf.nn.relu(all_represent), 1200, scope='fc_h2_m3', stddev=0.1)
        with tf.name_scope("m3_prediction"):
            logits = linear(tf.nn.relu(h2),2,scope='m3_fc_out', stddev=0.1)
            # Holds softmax probabilities, not raw logits. The op is named 'm3_out'
            # inside this name scope; test_last_model fetches "m3_prediction/m3_out".
            self.logits_m3 = tf.nn.softmax(logits,name= 'm3_out')
        with tf.name_scope("m3_loss"):
            # Mean softmax cross-entropy computed from the raw (pre-softmax) logits.
            self.cross_entropy_m3 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.y_onehot,
                                                                                           logits=logits))
        with tf.name_scope("m3_accuracy"):
            self.predictions_m3 = tf.argmax(self.logits_m3, 1, name="m3_predictions")
            correct_predictions = tf.equal(self.predictions_m3, tf.argmax(self.y_onehot, 1))
            self.accuracy_m3 = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="m3_accuracy")

In [None]:
# Assemble one 64-example batch from the encoded data.
batch_s1, batch_s2, batch_label, length1, length2 = get_batch(np.array(data_index[:64]))
# Absolute sentence-length difference, capped at 79 (the length-difference
# embedding table in LSTM has 80 rows).
diff_length_Bat = np.minimum(np.abs(np.subtract(length1, length2)), 79)

In [None]:
# Smoke test: build the model in a fresh graph, initialise it and evaluate the
# loss on the single batch prepared above.
os.environ.setdefault('CUDA_VISIBLE_DEVICES','1')
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(gpu_options=gpu_options,
                                  allow_soft_placement=True,
                                  log_device_placement=False)
    # Using the session as a context manager installs it as the default
    # session and closes it on exit. Left open, this session would keep its
    # 90% GPU memory reservation while later cells create their own sessions.
    with tf.Session(config=session_conf) as sess:
        lstm = LSTM(word_vocab_size=len(vocabulary))
        # Replaces the deprecated tf.initialize_all_variables(); train_model
        # already uses this initializer.
        sess.run(tf.global_variables_initializer())
        feed_dict = {
            lstm.input_s1: batch_s1,
            lstm.input_s2: batch_s2,
            lstm.input_y: batch_label,
            lstm.lengths1: length1,
            lstm.lengths2: length2,
            lstm.difflength: diff_length_Bat
        }
        # Despite its name, `predictions` receives the scalar cross-entropy loss.
        predictions = sess.run(lstm.cross_entropy_m3, feed_dict)

In [None]:
# Scratch arithmetic: the value is only displayed as cell output, never assigned.
1+2+6

In [None]:
# Scratch arithmetic: the value is only displayed as cell output, never assigned.
0.5+1+3.0/4

In [None]:
# Hold out the last 15 dev-sized batches of the encoded data as the dev set;
# everything before them is the training set.
batch_dev_size = 256
dev_num = 15 * batch_dev_size
train_data, dev_data = data_index[:-dev_num], data_index[-dev_num:]
print('train_size: '+str(len(train_data))+' dev_size: '+str(len(dev_data)))

In [None]:
# def dev_validate(sess,step,saver,checkpoint_prefix,current_step):
#     best_loss = 10
#     all_dev_acc =[]
#     acc_pred_d =[]
#     pred_score = []
#     true_dev_labels = []
#     num_correct_pairs = 0
#     for j in range(len(dev_data)/batch_dev_size):
#         batch_s1_d,batch_s2_d,batch_label_d,length1_d, length2_d= get_batch(np.array(dev_data[j*batch_dev_size:(j+1)*batch_dev_size]))
#         diff_length_Bat_dev = np.abs(np.subtract(length1_d,length2_d))
#         diff_length_Bat_dev[diff_length_Bat_dev>79]=79
#         feed_dict = {
#             lstm.input_s1: batch_s1_d,
#             lstm.input_s2: batch_s2_d,
#             lstm.input_y: batch_label_d,
#             lstm.lengths1:length1_d,
#             lstm.lengths2:length2_d,
#             lstm.difflength:diff_length_Bat_dev
#         }
#         acc_d,acc_pd,p_score = sess.run([lstm.accuracy_m3,
#                                           lstm.predictions_m3,
#                                           lstm.logits_m3],feed_dict)
#         all_dev_acc.append(acc_d)
#         acc_pred_d.append(acc_pd)
#         pred_score.extend(p_score[:,1].astype(float))
#         true_dev_labels.extend(batch_label_d)
#         num_correct_pairs = num_correct_pairs+np.sum(np.equal(batch_label,np.argmax(pd).astype(int)))
#     loss_dev = log_loss(true_dev_labels, pred_score)
#     dev_acc = np.mean(all_dev_acc)
#     print('step: '+str(step)+' acc: '+str(dev_acc) +'  '+str(sum(sum(acc_pred_d)))+ ' loss: '+str(loss_dev))
#     if loss_dev < best_loss:
#         best_loss = loss_dev
#         path = saver.save(sess, checkpoint_prefix, global_step=current_step)
#     print("Saved model checkpoint to {}\n".format(path))
#     return path

In [None]:
def _evaluate_dev(sess, model):
    """Evaluate ``model`` on the module-level ``dev_data`` in full batches.

    Uses the module-level ``batch_dev_size``; a trailing partial batch is
    skipped.

    Returns:
        ``(mean_batch_accuracy, predicted_label_sum, log_loss)`` where
        ``predicted_label_sum`` is the sum of all predicted class ids (i.e.
        the number of examples predicted as class 1).
    """
    batch_accuracies = []
    predicted_label_sum = 0
    pred_score = []
    true_dev_labels = []
    for j in range(len(dev_data) // batch_dev_size):
        dev_batch = np.array(dev_data[j * batch_dev_size:(j + 1) * batch_dev_size])
        batch_s1_d, batch_s2_d, batch_label_d, length1_d, length2_d = get_batch(dev_batch)
        # The length-difference embedding table has 80 rows, so clip to 79.
        diff_length_Bat_dev = np.abs(np.subtract(length1_d, length2_d))
        diff_length_Bat_dev[diff_length_Bat_dev > 79] = 79
        feed_dict = {
            model.input_s1: batch_s1_d,
            model.input_s2: batch_s2_d,
            model.input_y: batch_label_d,
            model.lengths1: length1_d,
            model.lengths2: length2_d,
            model.difflength: diff_length_Bat_dev
        }
        acc_d, acc_pd, p_score = sess.run([model.accuracy_m3,
                                           model.predictions_m3,
                                           model.logits_m3], feed_dict)
        batch_accuracies.append(acc_d)
        predicted_label_sum += int(np.sum(acc_pd))
        # Column 1 of the softmax output is the probability of class 1.
        pred_score.extend(p_score[:, 1].astype(float))
        true_dev_labels.extend(batch_label_d)
    loss_dev = log_loss(true_dev_labels, pred_score)
    return np.mean(batch_accuracies), predicted_label_sum, loss_dev


def train_model(trainData):
    """Train a fresh ``LSTM`` model and return the best checkpoint path.

    Runs 2 epochs of Adam (learning rate 1e-3) with batch size 64 over
    ``trainData``. Every 50 batches the model is evaluated on the
    module-level ``dev_data``; whenever the dev log-loss improves, a
    checkpoint is written under ``./quora_train/<unix timestamp>/checkpoints``.

    Relies on module-level names: ``vocabulary``, ``dev_data``,
    ``batch_dev_size``, ``LSTM``, ``get_batch``, ``time``, ``os`` and
    ``log_loss``.

    Args:
        trainData: list of ``[indices1, indices2, label]`` rows. The caller's
            list is not modified; a shallow copy is shuffled instead.

    Returns:
        Path of the checkpoint with the lowest dev log-loss, or ``None`` if
        no checkpoint was written (e.g. fewer than 64 training rows).
    """
    batch_size = 64
    epoches = 2
    # inf (rather than a fixed 10) so the first evaluation always produces a
    # checkpoint; `path` starts as None so it is always bound on return.
    best_loss = float('inf')
    path = None
    # Shuffle a private copy instead of reordering the caller's list.
    trainData = list(trainData)
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False)
        # Context manager: installs the session as default and closes it on
        # exit, so repeated calls do not accumulate open sessions.
        with tf.Session(config=session_conf) as sess:
            lstm = LSTM(word_vocab_size=len(vocabulary))
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optim = tf.train.AdamOptimizer(learning_rate=1e-3) \
                .minimize(lstm.cross_entropy_m3, global_step=global_step)
            sess.run(tf.global_variables_initializer())

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(
                os.path.curdir, "quora_train", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Checkpoint location
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            # tf.global_variables() replaces the deprecated tf.all_variables().
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
            for epo in range(epoches):
                np.random.shuffle(trainData)
                # Floor division keeps the batch count an int on Python 3 too.
                for i in range(len(trainData) // batch_size):
                    batch_s1, batch_s2, batch_label, length1, length2 = get_batch(
                        np.array(trainData[i * batch_size:(i + 1) * batch_size]))
                    # Read before the update, so a checkpoint is labelled with
                    # the step count preceding this batch.
                    current_step = tf.train.global_step(sess, global_step)
                    diff_length_Bat = np.abs(np.subtract(length1, length2))
                    diff_length_Bat[diff_length_Bat > 79] = 79
                    feed_dict = {
                        lstm.input_s1: batch_s1,
                        lstm.input_s2: batch_s2,
                        lstm.input_y: batch_label,
                        lstm.lengths1: length1,
                        lstm.lengths2: length2,
                        lstm.difflength: diff_length_Bat
                    }
                    sess.run(optim, feed_dict)
                    if i % 50 == 0:
                        dev_acc, predicted_label_sum, loss_dev = _evaluate_dev(sess, lstm)
                        print('step: '+str(i)+' acc: '+str(dev_acc) +'  '+str(predicted_label_sum)+ ' loss: '+str(loss_dev))
                        if loss_dev < best_loss:
                            best_loss = loss_dev
                            path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                            # Announce only when a checkpoint was actually written.
                            print("Saved model checkpoint to {}\n".format(path))
    return path

In [None]:
def test_last_model(last_model, data_last):
    """Return the rows of ``data_last`` that a saved model misclassifies.

    Restores the checkpoint ``last_model`` (its ``.meta`` graph plus
    weights), runs it over ``data_last`` in batches of 256 and collects every
    row whose arg-max prediction differs from its label.

    Args:
        last_model: checkpoint path prefix as returned by ``saver.save``.
        data_last: list of ``[indices1, indices2, label]`` rows.

    Returns:
        List of ``[indices1, indices2, label]`` for misclassified rows, with
        the batch padding stripped again.
        NOTE(review): only full batches are evaluated, so up to 255 trailing
        rows are never checked — confirm this is intended.
    """
    batch_t_size = 256
    False_pairs = []
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False)
        # Context manager: installs the session as default and closes it on
        # exit; this function is called once per ensemble round.
        with tf.Session(config=session_conf) as sess:
            saver = tf.train.import_meta_graph("{}.meta".format(last_model))
            saver.restore(sess, last_model)
            # Get the placeholders and the softmax output from the graph by name
            input_t_s1 = graph.get_operation_by_name("input_placeholder_x1").outputs[0]
            input_t_s2 = graph.get_operation_by_name("input_placeholder_x2").outputs[0]
            lengtht_1 = graph.get_operation_by_name("length1").outputs[0]
            lengtht_2 = graph.get_operation_by_name("length2").outputs[0]
            diff_length = graph.get_operation_by_name("diff_length").outputs[0]
            predictions = graph.get_operation_by_name("m3_prediction/m3_out").outputs[0]
            # Floor division keeps the batch count an int on Python 3 too.
            for j in range(len(data_last) // batch_t_size):
                chunk = np.array(data_last[j * batch_t_size:(j + 1) * batch_t_size])
                batch_s1_d, batch_s2_d, batch_label, length1_d, length2_d = get_batch(chunk, training=True)
                # The length-difference embedding table has 80 rows, so clip to 79.
                diff_length_Bat = np.abs(np.subtract(length1_d, length2_d))
                diff_length_Bat[diff_length_Bat > 79] = 79
                feed_dict = {
                    input_t_s1: batch_s1_d,
                    input_t_s2: batch_s2_d,
                    lengtht_1: length1_d,
                    lengtht_2: length2_d,
                    diff_length: diff_length_Bat
                }
                # Named `probs`, not `pd`, so the pandas alias is not shadowed.
                probs = sess.run(predictions, feed_dict)
                predicted = np.argmax(probs, axis=1)
                index_False = np.flatnonzero(batch_label.astype(np.int64) != predicted)
                for id_f in index_False:
                    # Slice the padding off again using the true lengths.
                    False_pairs.append([batch_s1_d[id_f][:length1_d[id_f]].tolist(),
                                        batch_s2_d[id_f][:length2_d[id_f]].tolist(),
                                        batch_label[id_f]])
    if len(False_pairs) != 0:
        print('false_pairs_length: ' + str(len(False_pairs)))
    return False_pairs

In [None]:
import time
from sklearn.metrics import log_loss
# Train `num_base_clfs` further models. Each round evaluates the most recent
# checkpoint on train_data, adds the misclassified pairs `num_resample` extra
# times to the full training set, and trains a new model on the result.
# NOTE(review): the seed checkpoint is a hard-coded absolute path.
num_base_clfs = 10
ensem_files = ['/home/zhangjinbin/research/quora/quora_train/1493780013/checkpoints/model-11994']
num_c = 0
num_resample = 1
while num_c < num_base_clfs:
    false_pairs = test_last_model(ensem_files[-1], train_data)
    # Misclassified pairs first (repeated), then the whole training set.
    resample_pairs = false_pairs * num_resample + train_data
    num_c += 1
    best_model_epo = train_model(resample_pairs)
    ensem_files.append(best_model_epo)