## Import packages

In [19]:
import numpy as np
import pandas as pd
import gc

In [2]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation, Dropout, Reshape, Flatten, Permute, Lambda
from keras.activations import softmax
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import concatenate, Concatenate, Dot
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, Input, SpatialDropout1D, Bidirectional, TimeDistributed
from keras.layers.recurrent import LSTM, GRU
from keras import backend as K
from keras.preprocessing import sequence, text
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras import optimizers

Using TensorFlow backend.


In [3]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

## Hyperparameter settings

In [4]:
# Column of the question file to model: 'words' or 'chars'.
token = 'words'
# Size of each pretrained embedding vector.
embed_size = 300
# Vocabulary size handed to the tokenizer as num_words; the embedding
# matrix is allocated with max_features + 1 rows.
max_features = 20890
# Every question is padded / truncated to this many tokens.
maxlen = 15
# Units of each LSTM wrapped by a Bidirectional layer.
num_rnn_units = 256
# Width of the dense (feed-forward) layers.
num_hidden_units = 200
# Dropout rate used by every dropout in the model.
drop_prob = 0.2
# Gradient-norm clipping threshold given to the optimizer (clipnorm).
max_norm = 5.0

## File path

In [5]:
# Question-pair CSVs (train carries labels) and the question-text lookup table.
TRAIN_PATH, TEST_PATH = './train.csv', './test.csv'
QUESTION_PATH = './question.csv'
# Pretrained embedding file for each tokenisation level.
embed_files = dict(words='./word_embed.txt', chars='./char_embed.txt')

## Helper functions

In [6]:
# Turn question ids such as 'Q123' into integers by dropping the leading character
def get_ids(qids):
    """Return the numeric part of every question id as a NumPy array.

    The first character of each id is discarded and the remainder is parsed
    with ``int``. Output order matches the input order.
    """
    return np.asarray([int(qid[1:]) for qid in qids])

# Look up the token string of each requested question
def get_texts(q_list, question_path=QUESTION_PATH):
    """Return the text of every question id in ``q_list``, in the same order.

    The question CSV at ``question_path`` is read on each call; the column
    named by the module-level ``token`` setting is then indexed with the
    numeric part of each id.

    NOTE(review): the lookup uses the numeric id as an index label of the
    freshly read frame, i.e. it presumably relies on question N being stored
    in row N of the file - confirm against the data.
    """
    questions = pd.read_csv(question_path)
    numeric_ids = get_ids(q_list)
    column = questions[token]
    return [column[idx] for idx in numeric_ids]

## Read the text

#### Train data

In [7]:
# Load the labelled question pairs.
train = pd.read_csv(TRAIN_PATH)
x_train = list(zip(train['q1'], train['q2']))
y_train = train['label']

# Separate the two sides of every pair, then fetch the text behind each id.
q1_train = [first for first, _ in x_train]
q2_train = [second for _, second in x_train]
text1_train = get_texts(q1_train)
text2_train = get_texts(q2_train)

#### Test data

In [8]:
# Load the question pairs to be scored.
test = pd.read_csv(TEST_PATH)
x_test = list(zip(test['q1'], test['q2']))

# Separate the two sides of every pair, then fetch the text behind each id.
q1_test = [first for first, _ in x_test]
q2_test = [second for _, second in x_test]
text1_test = get_texts(q1_test)
text2_test = get_texts(q2_test)

## Tokenize

In [9]:
# Fit the vocabulary on every question in the question file.
# lower=False keeps the 'W' / 'L' characters of the tokens from being lower-cased.
tokenizer = Tokenizer(lower=False, num_words=max_features)
tokenizer.fit_on_texts(pd.read_csv(QUESTION_PATH)[token])

# train set: texts -> integer id sequences -> arrays padded / truncated to maxlen
tokenized1_train, tokenized2_train = (
    tokenizer.texts_to_sequences(texts) for texts in (text1_train, text2_train))
X1_train, X2_train = (
    pad_sequences(seqs, maxlen=maxlen) for seqs in (tokenized1_train, tokenized2_train))

# test set: same two steps
tokenized1_test, tokenized2_test = (
    tokenizer.texts_to_sequences(texts) for texts in (text1_test, text2_test))
X1_test, X2_test = (
    pad_sequences(seqs, maxlen=maxlen) for seqs in (tokenized1_test, tokenized2_test))

## Prepare the pretrained word embedding

In [10]:
def get_coefs(line):
    """Split one tokenised embedding line into ``(token, float32 vector)``.

    ``line`` is the list of whitespace-separated fields of a single line:
    the first field is the token, the rest are the vector components.
    """
    return line[0], np.asarray(line[1:], dtype='float32')

embed_file = embed_files[token]
# Read inside a context manager so the file handle is closed as soon as the
# dictionary is built, and skip blank lines, which would otherwise raise an
# IndexError in get_coefs (no first field).
with open(embed_file, encoding='utf-8') as embed_fh:
    embeddings_index = dict(
        get_coefs(fields)
        for fields in (raw.strip().split() for raw in embed_fh)
        if fields)

20891


In [11]:
# np.hstack expects a real sequence (list / tuple). Passing the dict view
# directly is deprecated since NumPy 1.16 and rejected by recent releases,
# so materialise the vectors as a list first.
all_embs = np.hstack(list(embeddings_index.values()))
# Global mean / std over every component of every pretrained vector; used to
# initialise rows that have no pretrained embedding.
emb_mean, emb_std = all_embs.mean(), all_embs.std()

(0.015683081, 1.1956546)

In [12]:
word_index = tokenizer.word_index
# Start from random rows drawn with the pretrained vectors' mean / std, so
# tokens without a pretrained vector still get a plausible embedding.
# The matrix has max_features + 1 rows, i.e. valid row indices are 0..max_features.
embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features+1, embed_size))

for word, i in word_index.items():
    # Skip (rather than stop at) out-of-range indices: `break` silently drops
    # every later entry and is only correct if word_index happens to iterate
    # in ascending index order.
    if i > max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [13]:
# Store the embedding table as float32 before it is passed to the Embedding layer.
embedding_matrix = np.asarray(embedding_matrix, dtype=np.float32)

## Build the model

In [14]:
# Reset the Keras backend session before any model is built, so state left
# over from earlier runs in this kernel is discarded.
K.clear_session()

In [None]:
def time_distributed(input_, layers):
    """Chain ``layers`` over ``input_``, wrapping each one in ``TimeDistributed``.

    The layers are applied in list order, each to the previous layer's
    output. Returns the output of the last layer, or ``input_`` itself when
    ``layers`` is empty.
    """
    current = input_
    for layer in layers:
        wrapped = TimeDistributed(layer)
        current = wrapped(current)
    return current

In [None]:
def single_model():
    """Build the (uncompiled) two-input question-pair model.

    Both questions go through the same shared layers in four stages:

    1. encode    - frozen pretrained embedding, spatial dropout, bidirectional LSTM;
    2. attend    - score every position pair and softly align each question
                   against the other;
    3. compose   - concatenate each encoding with the other question's aligned
                   summary, run a second bidirectional LSTM and a dense stack;
    4. aggregate - global max / average pooling over time followed by a
                   batch-normalised dense classifier.

    Returns:
        A Keras ``Model`` with two inputs of shape ``(maxlen,)`` and a single
        sigmoid output unit.
    """
    def dense_stack():
        # Two relu Dense layers, each followed by dropout; applied per time step.
        return [
            Dense(num_hidden_units, activation='relu'),
            Dropout(drop_prob),
            Dense(num_hidden_units, activation='relu'),
            Dropout(drop_prob)]

    q1_ids = Input(shape=(maxlen,))
    q2_ids = Input(shape=(maxlen,))

    # encode: one frozen embedding table shared by both questions
    embedding = Embedding(max_features+1, embed_size, weights=[embedding_matrix], trainable=False)
    q1_emb = embedding(q1_ids)
    q2_emb = embedding(q2_ids)

    spatial_drop = SpatialDropout1D(drop_prob)
    encoder = Bidirectional(LSTM(num_rnn_units, return_sequences=True, dropout=drop_prob, recurrent_dropout=drop_prob))
    q1_enc = encoder(spatial_drop(q1_emb))
    q2_enc = encoder(spatial_drop(q2_emb))

    # attend: project both encodings with the same layers, then take the dot
    # product over the feature axis -> scores[batch, q1 position, q2 position]
    attend_layers = dense_stack()
    q1_proj = time_distributed(q1_enc, attend_layers)
    q2_proj = time_distributed(q2_enc, attend_layers)
    scores = Dot(axes=-1)([q1_proj, q2_proj])
    # normalise over q1 positions (axis 1) ...
    weights_over_q1 = Lambda(lambda s: softmax(s, axis=1))(scores)
    # ... and over q2 positions (axis 2), transposed so the normalised axis comes first
    weights_over_q2 = Permute((2,1))(Lambda(lambda s: softmax(s, axis=2))(scores))
    # weighted sums contract the normalised axis with the encoding's time axis
    q1_aligned = Dot(axes=1)([weights_over_q1, q1_enc])
    q2_aligned = Dot(axes=1)([weights_over_q2, q2_enc])

    # compose: pair each encoding with the other question's aligned summary
    q1_joint = Concatenate()([q1_enc, q2_aligned])
    q2_joint = Concatenate()([q2_enc, q1_aligned])
    composer = Bidirectional(LSTM(num_rnn_units, return_sequences=True, dropout=drop_prob, recurrent_dropout=drop_prob))
    q1_cmp = composer(q1_joint)
    q2_cmp = composer(q2_joint)

    compare_layers = dense_stack()
    q1_cmp = time_distributed(q1_cmp, compare_layers)
    q2_cmp = time_distributed(q2_cmp, compare_layers)

    # aggregate: max and average pooling over time for each question
    pooled = [
        GlobalMaxPooling1D()(q1_cmp),
        GlobalAveragePooling1D()(q1_cmp),
        GlobalMaxPooling1D()(q2_cmp),
        GlobalAveragePooling1D()(q2_cmp)]
    out = concatenate(pooled)

    # classifier head: four BatchNorm -> Dense -> Dropout rounds, then the output unit
    for _ in range(4):
        out = BatchNormalization()(out)
        out = Dense(num_hidden_units, activation='relu')(out)
        out = Dropout(drop_prob)(out)
    out = BatchNormalization()(out)
    out = Dense(1, activation='sigmoid')(out)

    return Model(inputs=[q1_ids, q2_ids], outputs=out)

## Train the model and predict to build the first-layer features
Split the training set into 8 folds. For each fold, train the model on the other 7 folds.

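Predict the held-out fold and concatenate the predictions of all folds. The folds are not shuffled, so the concatenated out-of-fold predictions keep the original row order and line up with those of other models.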

Predict the test set with each fold's model and average the predictions. The row order of the test set is unchanged.

In [16]:
# Out-of-fold training: every fold's model predicts its held-out slice and the
# full test set. KFold is left unshuffled so the held-out slices, concatenated
# in fold order, reproduce the original training-row order.
kf = KFold(n_splits=8)
n_fold = 0
val_pred_list = []
test_pred_list = []
y_val_list = []
for train_index, valid_index in kf.split(X1_train):
    n_fold += 1
    print ("========== Fold %d: =========="%n_fold)

    # split samples
    X1_tra, X1_val = X1_train[train_index], X1_train[valid_index]
    X2_tra, X2_val = X2_train[train_index], X2_train[valid_index]
    y_tra, y_val = y_train[train_index], y_train[valid_index]

    # build the model
    # Reset the backend first: `del model` + gc.collect() alone does not
    # release the backend state of the previous fold's model, so memory
    # would otherwise grow with every fold.
    K.clear_session()
    adam = optimizers.Adam(clipnorm=max_norm)
    model = single_model()
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

    # train the model
    # Keep the weights of the best epoch (lowest validation loss) on disk.
    # Weights-only checkpoints avoid having to re-create the model's Lambda
    # layers from a saved architecture.
    ckpt_path = "my_model11-%d.h5"%n_fold
    cp = ModelCheckpoint(filepath=ckpt_path, save_best_only=True, save_weights_only=True)
    # Stop after 4 epochs without improvement; cut the learning rate after
    # every epoch that does not improve.
    es = EarlyStopping(patience=4)
    rp = ReduceLROnPlateau(patience = 0)
    hist = model.fit([X1_tra, X2_tra], y_tra, batch_size = 256, epochs=20, validation_data=([X1_val, X2_val], y_val), callbacks=[cp, es, rp])

    # load the best checkpoint and predict
    # Without this the predictions would come from the last epoch, which is
    # up to `patience` epochs past the best one when early stopping fires.
    model.load_weights(ckpt_path)
    val_pred = model.predict([X1_val, X2_val], batch_size=1024)
    val_pred_list.append(val_pred)
    y_val_list.append(y_val)
    test_pred = model.predict([X1_test, X2_test], batch_size=1024)
    test_pred_list.append(test_pred)

    # free per-fold objects before the next fold allocates its own
    del X1_tra, X1_val, X2_tra, X2_val, y_tra, y_val, model, val_pred, test_pred, hist
    gc.collect()

Train on 222587 samples, validate on 31799 samples
Epoch 1/1
2
3
4
5
Train on 222587 samples, validate on 31799 samples
Epoch 1/1
2
3
4
5
Train on 222588 samples, validate on 31798 samples
Epoch 1/1
2
3
4
5
Train on 222588 samples, validate on 31798 samples
Epoch 1/1
2
3
4
5
Train on 222588 samples, validate on 31798 samples
Epoch 1/1
2
3
4
5
Train on 222588 samples, validate on 31798 samples
Epoch 1/1
2
3
4
5
Train on 222588 samples, validate on 31798 samples
Epoch 1/1
2
3
4
5
Train on 222588 samples, validate on 31798 samples
Epoch 1/1
2
3
4
5


In [17]:
# Stitch the per-fold held-out predictions and labels back together in fold order.
# NOTE: the list names are rebound to arrays here, so this cell is meant to
# run once per training run.
val_pred_list = np.concatenate(val_pred_list)
y_val_list = np.concatenate(y_val_list)
# Stack the per-fold test predictions along a new leading axis and average over it.
test_pred_list = np.asarray(test_pred_list).mean(axis=0)

In [26]:
# Remove size-1 axes so each prediction array can be used as a DataFrame column.
val_pred_list, test_pred_list = (
    np.squeeze(preds) for preds in (val_pred_list, test_pred_list))

## Write the array into csv file

In [30]:
# Out-of-fold predictions with their labels, and the fold-averaged test predictions.
train = pd.DataFrame({'train':val_pred_list, 'label':y_val_list})
test = pd.DataFrame({'test':test_pred_list})
# Write both feature tables without the index column.
for frame, out_path in ((train, 'train16.csv'), (test, 'test16.csv')):
    frame.to_csv(out_path, index=False)