## Import packages

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
import os
# Append the local Graphviz bin directory to PATH.
# NOTE(review): hard-coded, Windows-specific install path — adjust for the machine running the notebook.
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'  # Graphviz install path, used for model visualization

In [3]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation, Dropout, Reshape, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import concatenate, Concatenate
from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, Input, SpatialDropout1D, Bidirectional
from keras.layers.recurrent import LSTM, GRU
from keras import backend as K
from keras.preprocessing import sequence, text
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.utils import plot_model
from keras import optimizers, initializers, regularizers, constraints
from keras.engine.topology import Layer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Hyperparameter settings

In [4]:
# Global hyperparameters shared by the tokenizer, the embedding matrix and the model.
token = 'words' # tokenization unit, 'words' or 'chars'; selects the question-file column and the embedding file
embed_size = 300 # number of columns of the embedding matrix (size of each token vector)
max_features = 20885 # vocabulary cap passed to the Tokenizer; the embedding matrix has max_features+1 rows
maxlen = 15 # length every question is padded/truncated to by pad_sequences
num_rnn_units = 128 # units of the LSTM and of the GRU (each is wrapped in Bidirectional)
num_hidden_units = 300 # width of the dense layer applied to the concatenated question encodings
drop_prob = 0.1 # rate used for SpatialDropout1D, the recurrent layers' dropout and the dense Dropout
max_norm = 5.0 # passed to the Adam optimizer as clipnorm
filter_sizes = [1,2,3,5] # kernel sizes of the four parallel Conv1D layers
num_filters = 128 # number of filters of each Conv1D layer

## File path

In [5]:
# Input files, given relative to the working directory.
TRAIN_PATH = './train.csv'  # columns read by this notebook: q1, q2, label
TEST_PATH = './test.csv'  # columns read by this notebook: q1, q2
QUESTION_PATH = './q_no_stopwords.csv'  # question texts; the column named by `token` is used
embed_files = {'words': './word_embed.txt', 'chars': './char_embed.txt'}  # embedding file per token type

## Helper functions

In [6]:
# Turn question ids into integers by dropping their one-character prefix (the "Q").
def get_ids(qids):
    """Return a NumPy array with the numeric part of every id in ``qids``.

    The first character of each id is discarded and the remainder is parsed
    with ``int``, so ``'Q12'`` becomes ``12``.
    """
    return np.asarray([int(qid[1:]) for qid in qids])

# Look up the text of each question id
def get_texts(q_list, question_path=QUESTION_PATH):
    """Return the texts of the questions in ``q_list``, in the same order.

    The question file at ``question_path`` is read on every call. The column
    selected by the global ``token`` is indexed with the numeric part of each
    id (as produced by ``get_ids``).
    """
    questions = pd.read_csv(question_path)
    numeric_ids = get_ids(q_list)
    column = questions[token]
    return [column[idx] for idx in numeric_ids]

## Read the text

#### Train data
Hold out 15% of the training pairs for validation.

In [7]:
# Read the labelled question pairs.
train = pd.read_csv(TRAIN_PATH)
list_train = list(zip(train['q1'], train['q2']))
label_train = train['label']

# 85% / 15% train / validation split with a fixed seed.
X_tra, X_val, y_tra, y_val = train_test_split(
    list_train,
    label_train,
    train_size=0.85,
    random_state=8,
    shuffle=True,
)

# Separate each (q1, q2) pair into two parallel id lists ...
q1_train = [pair[0] for pair in X_tra]
q2_train = [pair[1] for pair in X_tra]
q1_val = [pair[0] for pair in X_val]
q2_val = [pair[1] for pair in X_val]

# ... and look up the text of every question id.
text1_train = get_texts(q1_train)
text2_train = get_texts(q2_train)
text1_val = get_texts(q1_val)
text2_val = get_texts(q2_val)



#### Test data

In [8]:
# Read the unlabelled question pairs.
test = pd.read_csv(TEST_PATH)
list_test = list(zip(test['q1'], test['q2']))

# Separate each (q1, q2) pair into two parallel id lists ...
q1_test = [pair[0] for pair in list_test]
q2_test = [pair[1] for pair in list_test]

# ... and look up the text of every question id.
text1_test = get_texts(q1_test)
text2_test = get_texts(q2_test)

## Tokenize

In [9]:
# Fit the vocabulary on every question in the question file.
# Case is preserved (lower=False): don't lower the W or L!!!
tokenizer = Tokenizer(num_words=max_features, lower=False)
tokenizer.fit_on_texts(pd.read_csv(QUESTION_PATH)[token])

# train set: texts -> index sequences -> arrays padded/truncated to maxlen
tokenized1_train, tokenized2_train = (
    tokenizer.texts_to_sequences(texts) for texts in (text1_train, text2_train)
)
X1_train, X2_train = (
    pad_sequences(seqs, maxlen=maxlen) for seqs in (tokenized1_train, tokenized2_train)
)

# validation set
tokenized1_val, tokenized2_val = (
    tokenizer.texts_to_sequences(texts) for texts in (text1_val, text2_val)
)
X1_val, X2_val = (
    pad_sequences(seqs, maxlen=maxlen) for seqs in (tokenized1_val, tokenized2_val)
)

# test set
tokenized1_test, tokenized2_test = (
    tokenizer.texts_to_sequences(texts) for texts in (text1_test, text2_test)
)
X1_test, X2_test = (
    pad_sequences(seqs, maxlen=maxlen) for seqs in (tokenized1_test, tokenized2_test)
)

## Prepare the pretrained word embedding

In [10]:
def get_coefs(line):
    """Split one tokenized embedding-file row into ``(token, vector)``.

    ``line`` is a sequence of strings: its first item is returned unchanged
    and the remaining items are converted to a float32 NumPy array.
    """
    key = line[0]
    vector = np.asarray(line[1:], dtype='float32')
    return key, vector
# Parse the pretrained embedding file for the selected token type into a
# {token: float32 vector} dictionary.
embed_file = embed_files[token]
# A context manager closes the file once it has been parsed; blank lines are
# skipped because get_coefs needs at least one field per row.
with open(embed_file, encoding='utf-8') as embed_fh:
    embeddings_index = dict(
        get_coefs(row.strip().split()) for row in embed_fh if row.strip()
    )
print (len(embeddings_index.items()))
#print (list(embeddings_index.items())[20890])

20891


In [11]:
# Mean and standard deviation over every value of every pretrained vector;
# they parameterise the random initialisation of the embedding matrix.
# np.hstack needs a real sequence: a dict view is not one, so materialise it as a list.
all_embs = np.hstack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(0.015683081, 1.1956546)

In [12]:
word_index = tokenizer.word_index
# Start from random vectors drawn with the pretrained embeddings' mean/std;
# rows for which no pretrained vector is found keep their random draw.
embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features+1, embed_size))

for word, i in word_index.items():
    # Skip (rather than stop at) indices beyond the matrix so the result does
    # not depend on the iteration order of word_index.
    if i > max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [13]:
# Store the matrix as float32 and report its final (rows, embed_size) shape.
embedding_matrix = embedding_matrix.astype('float32')
print(embedding_matrix.shape)

(20886, 300)


## Build the model

In [14]:
from keras import backend as K
# Clear the Keras backend session before the model below is built.
K.clear_session()

In [15]:
# Attention layer
class FeedForwardAttention(Layer):
    """Feed-forward attention pooling over the time axis.

    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756].
    For an input ``x`` the layer computes one score per time step,
    ``e = tanh(x . W + b)``, normalises the scores with a softmax over the
    steps and returns the score-weighted sum of the time steps.

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    # Arguments
        step_dim: number of time steps of the input. When ``None`` it is taken
            from the input shape in ``build``.
        W_regularizer, b_regularizer: regularizers for the weight vector / bias.
        W_constraint, b_constraint: constraints for the weight vector / bias.
        bias: whether to add a per-step bias to the scores.

    Put it on top of a layer that outputs a sequence, e.g. an RNN with
    return_sequences=True.
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(FeedForwardAttention(step_dim))
    """

    def __init__(self, step_dim=None,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0  # set in build() from the input shape
        super(FeedForwardAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight vector (one entry per feature) and the optional per-step bias."""
        assert len(input_shape) == 3

        # Allows the layer to be rebuilt from a config that lacks step_dim.
        if self.step_dim is None:
            self.step_dim = input_shape[1]

        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten (samples, steps) so one dot product with W scores every step,
        # then restore the (samples, steps) layout.
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        # Attention weights: softmax over the steps of each sample.
        a = K.softmax(eij)

        # Add a trailing axis so the weights broadcast over the features.
        a = K.expand_dims(a)
        weighted_input = x * a

        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """The steps axis is summed out: ``(samples, features)``."""
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so a saved model can re-create the layer."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(FeedForwardAttention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [16]:
# One input per question of the pair; each takes `maxlen` token indices.
inp1 = Input(shape=(maxlen,))
inp2 = Input(shape=(maxlen,))

# basic rnn+cnn+attention model
# Encoder for a single question; it is wrapped in `base_model` below and applied to both inputs.
inp = Input(shape=(maxlen,))
# Embedding initialised from the pretrained matrix and kept frozen (trainable=False).
h = Embedding(max_features+1, embed_size, weights=[embedding_matrix], trainable=False)(inp)
h = SpatialDropout1D(drop_prob)(h)
# Bidirectional LSTM followed by a bidirectional GRU; both return the full sequence.
h = Bidirectional(LSTM(num_rnn_units, return_sequences=True, dropout=drop_prob, recurrent_dropout=drop_prob))(h)
h = Bidirectional(GRU(num_rnn_units, return_sequences=True, dropout=drop_prob, recurrent_dropout=drop_prob))(h)
    
# Four parallel convolutions over the recurrent output, one per kernel size in filter_sizes.
conv_0 = Conv1D(num_filters, kernel_size=filter_sizes[0], padding = "same", activation = 'relu')(h)
conv_1 = Conv1D(num_filters, kernel_size=filter_sizes[1], padding = "same", activation = 'relu')(h)
conv_2 = Conv1D(num_filters, kernel_size=filter_sizes[2], padding = "same", activation = 'relu')(h)
conv_3 = Conv1D(num_filters, kernel_size=filter_sizes[3], padding = "same", activation = 'relu')(h)

# Each convolution output is summarised three ways: global max, global average
# and the attention-weighted sum computed by FeedForwardAttention.
maxpool_0 = GlobalMaxPooling1D()(conv_0)
avgpool_0 = GlobalAveragePooling1D()(conv_0)
att_0 = FeedForwardAttention(maxlen)(conv_0)
maxpool_1 = GlobalMaxPooling1D()(conv_1)
avgpool_1 = GlobalAveragePooling1D()(conv_1)
att_1 = FeedForwardAttention(maxlen)(conv_1)
maxpool_2 = GlobalMaxPooling1D()(conv_2)
avgpool_2 = GlobalAveragePooling1D()(conv_2)
att_2 = FeedForwardAttention(maxlen)(conv_2)
maxpool_3 = GlobalMaxPooling1D()(conv_3)
avgpool_3 = GlobalAveragePooling1D()(conv_3)
att_3 = FeedForwardAttention(maxlen)(conv_3)

# Question encoding: the 12 pooled vectors concatenated (12 * num_filters values).
z = concatenate([maxpool_0, maxpool_1, maxpool_2, maxpool_3, avgpool_0, avgpool_1, avgpool_2, avgpool_3, att_0, att_1, att_2, att_3])

base_model = Model(inputs=inp, outputs=z)

# The same base_model instance encodes both questions, so its weights are shared.
o1 = base_model(inp1)
o2 = base_model(inp2)

# Classifier head on the concatenated pair of encodings.
conc = concatenate([o1,o2])
x = BatchNormalization()(conc)

x = Dense(num_hidden_units, activation='relu')(x)
x = Dropout(drop_prob)(x)
x = BatchNormalization()(x)

# Single sigmoid unit, trained with binary cross-entropy against `label`.
x = Dense(1, activation='sigmoid')(x)
model = Model(inputs=[inp1, inp2], outputs=x)

# Adam with clipnorm set to max_norm.
adam = optimizers.Adam(clipnorm=max_norm)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

model.summary()
# Writes a diagram of the shared encoder to model14.png.
plot_model(base_model, to_file='model14.png', show_shapes=True)

Tensor("input_1:0", shape=(?, 15), dtype=float32)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 15)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 15)           0                                            
__________________________________________________________________________________________________
model_1 (Model)                 (None, 1536)         7362308     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 3072)         0     

## Train the model

In [17]:
# Callbacks: checkpoint to my_model14.h5 (save_best_only), early stopping with
# patience 2, and learning-rate reduction on plateau with patience 0.
cp = ModelCheckpoint(filepath="my_model14.h5", save_best_only=True)
es = EarlyStopping(patience=2)
rp = ReduceLROnPlateau(patience=0)

# Train on the question pairs; the held-out split is used for validation.
hist = model.fit(
    [X1_train, X2_train],
    y_tra,
    batch_size=256,
    epochs=15,
    validation_data=([X1_val, X2_val], y_val),
    callbacks=[cp, es, rp],
)

Train on 216228 samples, validate on 38158 samples
Epoch 1/15
  6336/216228 [..............................] - ETA: 46:04 - loss: 0.8207 - acc: 0.6199

KeyboardInterrupt: 

## Check the loss curve

In [None]:
print (hist.history)
%matplotlib inline
plt.figure(1)
plt.plot (hist.history['loss'])
plt.plot (hist.history['val_loss'])

## Load the model

In [None]:
from keras.models import load_model
# Reload the checkpoint written during training. custom_objects maps the custom
# layer's name to the class defined in this notebook.
model = load_model('my_model14.h5', custom_objects={'FeedForwardAttention': FeedForwardAttention})

## Predict the test data

In [None]:
# Score the test question pairs; the model ends in a single sigmoid unit, so
# each row of y_pred is presumably a one-element array — confirm.
y_pred = model.predict([X1_test, X2_test], batch_size=1024)

In [None]:
# Write a submission file
def make_submission(predict_prob, path='sub14.csv'):
    """Write predictions to a one-column text file with the header ``y_pre``.

    Args:
        predict_prob: iterable of rows; the first element of each row is
            written on its own line.
        path: output file name. Defaults to ``'sub14.csv'``.
    """
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open(path, 'w') as out:
        out.write('y_pre' + '\n')
        for row in predict_prob:
            out.write(str(row[0]) + '\n')
    
# Write the test-set predictions to the submission file.
make_submission(y_pred)