## Import packages

In [1]:
import numpy as np
import pandas as pd
import gc, random, math, time

In [2]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation, Dropout, Reshape, Flatten, Lambda, Permute
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import concatenate, Concatenate, multiply, Dot, dot
from keras.layers import  GlobalMaxPooling1D, GlobalAveragePooling1D, Input, SpatialDropout1D, Bidirectional
from keras.layers import CuDNNLSTM, CuDNNGRU, LSTM, GRU, Softmax
from keras import backend as K
from keras.engine.topology import Layer
from keras.preprocessing import sequence, text
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras import optimizers
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


## Hyper parameter setting

In [3]:
# Global hyper-parameters read by the preprocessing, model-building and training cells below.
embed_size = 300 # dimensionality of each word vector (columns of the embedding matrix)
max_features = 160000 # vocabulary cap: passed to the Tokenizer as num_words and bounds the embedding rows
maxlen_p = 150 # passages are padded/truncated to this many tokens
maxlen_q = 15 # candidate answers (a0/a1/a2) are padded/truncated to this many tokens
batch_size = 256 # training batch size (prediction uses batch_size*4)
num_rnn_units = 64 # units per direction in every bidirectional GRU
num_hidden_units = 300 # width of the dense layer in the classifier head
drop_prob = 0.5 # dropout rate applied after the dense layer
max_norm = 5.0 # gradient-norm clipping threshold handed to the Adam optimizer (clipnorm)
features = 2 # NOTE(review): not referenced by any executed cell visible here - confirm it is still needed

## File path

In [4]:
# Locations of the tab-separated data splits and of the pretrained word vectors.
train_path = './data/train1.tsv' # train set
valid_path = './data/valid1.tsv' # validation set
test_path = './data/test1.tsv' # test set
#embed_file = './sgns.target.word-ngram.1-2.dynwin5.thr10.neg5.dim300.iter5' # pretrained word vectors
fasttext_file = './cc.zh.300.vec' # pretrained word vectors
# NOTE(review): the triple-quoted block below is a bare string literal, not a comment.
# None of the *_feature_*_path names inside it are defined, and because it is the last
# expression of the cell the notebook echoes it as the cell output.
'''
train_feature_p_path = './data/train_fea_p.npy' # train passage word feature
valid_feature_p_path = './data/valid_fea_p.npy' # validation passage word feature
test_feature_p_path = './data/test_fea_p.npy' # test passage word feature
train_feature_q_path = './data/train_fea_q.npy' # train passage word feature
valid_feature_q_path = './data/valid_fea_q.npy' # validation passage word feature
test_feature_q_path = './data/test_fea_q.npy' # test passage word feature
'''

"\ntrain_feature_p_path = './data/train_fea_p.npy' # train passage word feature\nvalid_feature_p_path = './data/valid_fea_p.npy' # validation passage word feature\ntest_feature_p_path = './data/test_fea_p.npy' # test passage word feature\ntrain_feature_q_path = './data/train_fea_q.npy' # train passage word feature\nvalid_feature_q_path = './data/valid_fea_q.npy' # validation passage word feature\ntest_feature_q_path = './data/test_fea_q.npy' # test passage word feature\n"

##  Read file

In [5]:
# Load the three tab-separated splits; the first row of each file holds the column names.
train, valid, test = (
    pd.read_csv(path, sep='\t', header=0)
    for path in (train_path, valid_path, test_path)
)
# Quick sanity check: row/column counts of every split and a peek at the training rows.
print (train.shape, valid.shape, test.shape)
print (train.head())

(250000, 6) (30000, 6) (10000, 5)
   id                                            passage  \
0   1  孩子 是 父母 的 一面镜子   由于 儿童 的 世界观 尚未 形成   他们 的 模仿 带...   
1   2  目前   中国 很多 地方   学生 火车票 磁条 都 已经 升级 了   在 磁条 里 已...   
2   3               引起 黑 便 的 原因 有 很多   不一 定都 是 癌症 引起 的     
3   4  芝士 味 超级 香 超级 浓       根本 停不下来   好 次 好 次   • ؔ ʶ...   
4   5  林 贝 儿 XO 认为   新政 的 实施 将 有效 地 整合 中国 的 奶粉 市场   那...   

                       a0                       a1                    a2  \
0       的 孩子 无法确定 保姆 带大           你 的 孩子 是 保姆 带大 的    你 的 孩子 不 是 保姆 带大 的   
1  不 是 一个 区间 刷 学生证 不 能 有票  不 是 一个 区间 刷 学生证 无法确定 有票  不 是 一个 区间 刷 学生证 能 有票   
2           拉   便 一定 是 胃癌         拉 黑 便 一定 无法确定 胃癌        拉 黑 便 一定 不 是     
3    早上 空腹吃 芝士   饼 不 会 发胖   早上 空腹吃 芝士 威化 饼 无法确定 发胖    早上 空腹吃 芝士   饼 会 发胖   
4       林 贝 儿 奶粉 无法确定   了         林 贝 儿 奶粉 通过 新政 了    林 贝 儿 奶粉 没 通过 新政 了   

   answer  
0       0  
1       0  
2       2  
3       1  
4       1  


train_feature_p = np.load(train_feature_p_path)
valid_feature_p = np.load(valid_feature_p_path)
test_feature_p = np.load(test_feature_p_path)
print (train_feature_p.shape, valid_feature_p.shape, test_feature_p.shape)
train_feature_q = np.load(train_feature_q_path)
valid_feature_q = np.load(valid_feature_q_path)
test_feature_q = np.load(test_feature_q_path)
print (train_feature_q.shape, valid_feature_q.shape, test_feature_q.shape)

## Build up the text input pipeline

####  Fit the tokenizer on train, valid and test set

In [6]:
# One shared vocabulary for passages and candidate answers, capped at max_features words.
tokenizer = Tokenizer(lower=True, num_words=max_features)

# Fit on every text column of every split. The documents are concatenated split by split
# (train, valid, test) and, within a split, in the order passage, a0, a1, a2.
tokenizer.fit_on_texts(pd.concat(
    [frame[column]
     for frame in (train, valid, test)
     for column in ('passage', 'a0', 'a1', 'a2')],
    ignore_index=True))

In [7]:
# Sanity check: how many texts the tokenizer was fitted on and how many distinct words it counted.
print (tokenizer.document_count, len(tokenizer.word_counts))

1160000 167703


#### text to seq

In [8]:
# Convert every text column into lists of word indices with the fitted tokenizer.
# Each split yields four sequence lists: the passage and the three candidate answers.
tra_p, tra_0, tra_1, tra_2 = (
    tokenizer.texts_to_sequences(train[column])
    for column in ('passage', 'a0', 'a1', 'a2')
)
val_p, val_0, val_1, val_2 = (
    tokenizer.texts_to_sequences(valid[column])
    for column in ('passage', 'a0', 'a1', 'a2')
)
te_p, te_0, te_1, te_2 = (
    tokenizer.texts_to_sequences(test[column])
    for column in ('passage', 'a0', 'a1', 'a2')
)

#### pad seq to maxlen

In [9]:
# Pad/truncate to fixed lengths: maxlen_p tokens for passages, maxlen_q for candidate answers.
train_p = pad_sequences(tra_p, maxlen=maxlen_p)
train_0, train_1, train_2 = (
    pad_sequences(sequences, maxlen=maxlen_q) for sequences in (tra_0, tra_1, tra_2)
)
valid_p = pad_sequences(val_p, maxlen=maxlen_p)
valid_0, valid_1, valid_2 = (
    pad_sequences(sequences, maxlen=maxlen_q) for sequences in (val_0, val_1, val_2)
)
test_p = pad_sequences(te_p, maxlen=maxlen_p)
test_0, test_1, test_2 = (
    pad_sequences(sequences, maxlen=maxlen_q) for sequences in (te_0, te_1, te_2)
)

In [10]:
# Sanity check: padded passages should be (rows, maxlen_p) and padded answers (rows, maxlen_q).
print (train_p.shape, train_0.shape, valid_p.shape, valid_1.shape, test_p.shape, test_2.shape)

(250000, 150) (250000, 15) (30000, 150) (30000, 15) (10000, 150) (10000, 15)


#### label

In [6]:
# One-hot encode the 'answer' column (three classes) for the categorical cross-entropy loss.
train_l = to_categorical(train['answer'], num_classes=3)
valid_l = to_categorical(valid['answer'], num_classes=3)

In [12]:
# Sanity check: one-hot label shapes plus a single encoded training label (row 79).
print (train_l.shape, valid_l.shape, train_l[79])

(250000, 3) (30000, 3) [0. 1. 0.]


## Prepare the pretrained word embedding

In [None]:
def get_coefs(word,*arr):
    """Turn the whitespace-split fields of one vector-file line into ``(word, float32 vector)``."""
    return word, np.asarray(arr, dtype='float32')

# Load the pretrained vectors into a {word: vector} dict.
# The file is opened in a `with` block so the handle is always closed.
fasttext_index = {}
with open(fasttext_file, encoding='utf-8') as vec_file:
    for line in vec_file:
        fields = line.strip().split()
        # Keep only well-formed rows (one word followed by embed_size numbers).
        # This skips a leading "<count> <dim>" header line and any row whose token was
        # lost to whitespace splitting. Previously such rows were stored as short vectors:
        # they skewed the mean/std below, and a length-1 vector would silently broadcast
        # into a whole embedding row.
        if len(fields) != embed_size + 1:
            continue
        word, vector = get_coefs(*fields)
        fasttext_index[word] = vector

# Statistics of all pretrained components, used to initialise words without a pretrained vector.
# dict.values() is a view, not a sequence, so it is materialised before being handed to NumPy.
all_ft = np.hstack(list(fasttext_index.values()))
ft_mean,ft_std = all_ft.mean(), all_ft.std()

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

# Row i holds the vector of the word with tokenizer index i; row 0 is the padding index.
# Every row starts as random noise matching the pretrained distribution and is overwritten
# when the word has a pretrained vector.
fasttext_matrix = np.random.normal(ft_mean, ft_std, (nb_words+1, embed_size))
for word, i in word_index.items():
    if i > nb_words:
        # index beyond the matrix: the word is outside the max_features cap
        continue
    fasttext_vector = fasttext_index.get(word)
    if fasttext_vector is not None:
        fasttext_matrix[i] = fasttext_vector
fasttext_matrix = np.asarray(fasttext_matrix, dtype='float32')

## Build the model

In [None]:
# Reset the Keras backend session before the model is built in the cells below.
K.clear_session()

In [21]:
def attention_flow (x):
    """Attention between a passage encoding and a query encoding.

    Written to be wrapped in a Keras ``Lambda`` layer. Shapes in the comments omit
    the batch axis and follow the notes at the call site: ``h`` is [t, 2d] and
    ``u`` is [j, 2d].

    Args:
        x: pair ``[h, u]`` of 3-D tensors (passage encoding, query encoding).

    Returns:
        The concatenation along the last axis of ``h``, the passage-to-query
        attended vectors, and the element-wise products of ``h`` with the
        passage-to-query and query-to-passage vectors ([t, 8d]).
    """
    passage, query = x[0], x[1]

    # Similarity of every passage position with every query position.
    query_t = K.permute_dimensions(query, (0,2,1))
    similarity = K.batch_dot(passage, query_t, axes=[2,1])  # [t, j]

    # Passage-to-query: for each passage position, a softmax-weighted sum of query vectors.
    p2q_weights = K.softmax(similarity, axis=-1)
    p2q = K.batch_dot(p2q_weights, query, axes=[2,1])  # [t, 2d]

    # Query-to-passage: weight passage positions by their best match with the query,
    # collapse the passage into one vector and repeat it for every position.
    best_match = K.max(similarity, axis=-1, keepdims=True)
    passage_weights = K.softmax(best_match, -2)  # [t, 1]
    passage_t = K.permute_dimensions(passage, (0,2,1))
    attended = K.batch_dot(passage_t, passage_weights, axes=[2,1])  # [2d, 1]
    q2p = K.tile(attended, [1, 1, K.int_shape(passage)[1]])  # [2d, t]

    passage_x_p2q = multiply([passage, p2q])  # [t, 2d]
    passage_x_q2p = multiply([passage, K.permute_dimensions(q2p, (0,2,1))])  # [t, 2d]
    merged = concatenate([passage, p2q, passage_x_p2q, passage_x_q2p])  # [t, 8d]

    # Disabled experiment (self-attention over the merged representation):
    #   sg = K.batch_dot(g, K.permute_dimensions(g, (0,2,1)), axes=[2,1]) # [t, t]
    #   g = K.batch_dot(K.softmax(sg, axis=-1), g, axes=[2,1]) # [t, 8d]
    return merged

In [22]:
def cos_sim (x):
    """Best cosine similarity between each position of ``x[0]`` and any position of ``x[1]``.

    Shapes in the comments omit the batch axis.

    Args:
        x: pair ``[p, q]`` of 3-D tensors, p being [t, 2d] and q being [j, 2d].

    Returns:
        Tensor [t, 1] holding, for every position of p, the maximum similarity
        over the positions of q.
    """
    passage, query = x[0], x[1]
    query_t = K.permute_dimensions(query, (0,2,1))  # [2d, j]
    # normalize=True makes the dot product a cosine similarity
    similarity = dot([passage, query_t], axes=(2,1), normalize=True)  # [t, j]
    return K.max(similarity, axis=-1, keepdims=True)  # [t, 1]

In [29]:
def single_model():
    """Build the three-way answer-selection network.

    A shared base network encodes one (passage, candidate answer) pair. It is applied
    to the same passage paired with each of the three candidates. The three encodings
    are concatenated and classified by a dense head with a 3-way softmax.

    Reads the module-level settings ``maxlen_p``, ``maxlen_q``, ``nb_words``,
    ``embed_size``, ``fasttext_matrix``, ``num_rnn_units``, ``num_hidden_units`` and
    ``drop_prob``. Shape comments omit the batch axis; ``d`` is ``num_rnn_units``.

    Returns:
        An uncompiled Keras ``Model`` with inputs ``[passage, a0, a1, a2]`` (padded
        word-index sequences) and a 3-way softmax output.
    """
    pa = Input(shape=(maxlen_p,))
    a0 = Input(shape=(maxlen_q,))
    a1 = Input(shape=(maxlen_q,))
    a2 = Input(shape=(maxlen_q,))
    
    # base model
    p = Input(shape=(maxlen_p,))
    q = Input(shape=(maxlen_q,))
    # Embedding layer: frozen pretrained vectors, shared by passage and candidate
    ft = Embedding(nb_words+1, embed_size, weights=[fasttext_matrix], trainable=False)
    pft = ft(p)
    qft = ft(q)
    pft = SpatialDropout1D(0.2)(pft)
    qft = SpatialDropout1D(0.2)(qft)
    
    # Contextual embedding layer
    h = Bidirectional(CuDNNGRU(num_rnn_units, return_sequences=True))(pft) # [t, 2d]
    u = Bidirectional(CuDNNGRU(num_rnn_units, return_sequences=True))(qft) # [j, 2d]
    
    # Attention flow layer
    g = Lambda(attention_flow)([h, u]) # [t, 8d]

    # Modelling layer; return_state=True also yields the final forward/backward states
    m, hf, hb = Bidirectional(CuDNNGRU(num_rnn_units, return_sequences=True, return_state=True))(g) # [t, 2d], d, d
    #m = Bidirectional(CuDNNLSTM(num_rnn_units, return_sequences=True))(m) # [t, 2d]
    
    um, uhf, uhb = Bidirectional(CuDNNGRU(num_rnn_units, return_sequences=True, return_state=True))(u) # [j, 2d], d, d

    # Output layer
    conc = Concatenate()([g, m]) # [t, 10d]
    gmp = GlobalMaxPooling1D()(conc) # [10d]
    gap = GlobalAveragePooling1D()(conc) # [10d]
    z1 = Concatenate()([gmp, gap, hf, hb]) # [22d]
    
    ugmp = GlobalMaxPooling1D()(um) # [2d]
    ugap = GlobalAveragePooling1D()(um) # [2d]
    z2 = Concatenate()([ugmp, ugap, uhf, uhb]) # [6d]

    y = Concatenate()([z1, z2]) # [28d]
    base_model = Model(inputs=[p, q], outputs=y)
    
    # Apply the shared base model to each (passage, candidate) pair. A Model is called
    # directly on its inputs like a layer; `base_model()` with no arguments raises.
    y0 = base_model([pa, a0])
    y1 = base_model([pa, a1])
    y2 = base_model([pa, a2])
    conc = Concatenate()([y0, y1, y2])
    x = BatchNormalization()(conc)
    x = Dense(num_hidden_units, activation='relu')(x)
    x = Dropout(drop_prob)(x)
    x = BatchNormalization()(x)
    #x = Dense(num_hidden_units, activation='relu')(x)
    #x = Dropout(drop_prob)(x)
    #x = BatchNormalization()(x)

    # One probability per candidate answer
    x = Dense(3, activation='softmax')(x)
    model = Model(inputs=[pa, a0, a1, a2], outputs=x)
    #print (model.summary())
    return model

## Train the model

In [30]:
# Instantiate the (still uncompiled) network.
model = single_model()

In [31]:
# Start from the weights stored at './model/my9.h5', the same path the checkpoint callback
# in the training cell writes to. NOTE(review): this needs the file to exist already - skip
# the cell when training from scratch.
model.load_weights('./model/my9.h5')

In [32]:
# Optimizer: Adam with the gradient norm clipped at max_norm.
adam = optimizers.Adam(lr=0.001, clipnorm=max_norm)
model.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

# Callbacks:
#   rp - lower the learning rate once val_loss has not improved for one epoch
#   cp - keep only the weights of the best epoch, judged by validation accuracy
#   es - stop as soon as validation accuracy fails to improve (patience=0)
rp = ReduceLROnPlateau(monitor='val_loss', patience=1)
cp = ModelCheckpoint(
    filepath='./model/my9.h5',
    monitor='val_categorical_accuracy',
    save_best_only=True,
    save_weights_only=True,
)
es = EarlyStopping(monitor='val_categorical_accuracy', patience=0)

# train the model
# NOTE(review): shuffle=False feeds the training rows in file order every epoch - confirm this is intended.
hist = model.fit(
    x=[train_p, train_0, train_1, train_2],
    y=train_l,
    batch_size=batch_size,
    epochs=4,
    shuffle=False,
    validation_data=([valid_p, valid_0, valid_1, valid_2], valid_l),
    callbacks=[rp, cp, es],
)

Train on 750000 samples, validate on 90000 samples
Epoch 1/4
Epoch 2/4


In [None]:
print (hist.history)
from matplotlib import pyplot as plt
%matplotlib inline
plt.figure(1)
plt.plot (hist.history['binary_accuracy'])
plt.plot (hist.history['val_binary_accuracy'])

## load the best weight

In [None]:
# Restore the checkpoint written by the ModelCheckpoint callback (save_best_only=True),
# i.e. the weights of the epoch with the best validation accuracy.
model.load_weights('./model/my9.h5')

## predict the test data

In [None]:
# Softmax scores for the test set; inference uses a 4x larger batch than training.
test_pred = model.predict([test_p, test_0, test_1, test_2], batch_size=batch_size*4)

In [None]:
# Drop any length-1 axes from the prediction array and report the resulting shape.
test_pred = np.squeeze(test_pred)
print(test_pred.shape)

## Write the array into csv file

In [None]:
# Write the test predictions next to the inputs they belong to.
# The test frame only has the columns id, passage, a0, a1, a2, so the former
# 'query' / 'option' lookups raised KeyError. The model emits one softmax score per
# candidate answer, so the predicted label is the index (0, 1 or 2) of the highest
# score; a 2-D score array cannot be stored as a single column.
res = pd.DataFrame({
    'id': test['id'],
    'passage': test['passage'],
    'a0': test['a0'],
    'a1': test['a1'],
    'a2': test['a2'],
    'label': np.argmax(test_pred, axis=-1),
})
res.to_csv('./result/test9_long.csv', index=False, encoding='utf-8_sig')

## check validation prediction result

val_pred = model.predict([valid_p, valid_q, valid_feature], batch_size=batch_size*4)
val_pred = np.squeeze(val_pred)
res = pd.DataFrame({'id':valid['id'], 'passage':valid['passage'], 'query':valid['query'], 'option':valid['option'], 'label':val_pred})
res.to_csv('./result/valid17_long.csv', index=False, encoding='utf-8_sig')