## Import packages

In [1]:
import numpy as np
import pandas as pd
import gc, random, math, time

In [2]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation, Dropout, Reshape, Flatten, Lambda, Permute
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import concatenate, Concatenate, multiply, Dot
from keras.layers import  GlobalMaxPooling1D, GlobalAveragePooling1D, Input, SpatialDropout1D, Bidirectional
from keras.layers import CuDNNLSTM, CuDNNGRU, LSTM, GRU
from keras import backend as K
from keras.engine.topology import Layer
from keras.preprocessing import sequence, text
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras import optimizers

Using TensorFlow backend.


In [3]:
from sklearn.model_selection import KFold

## Hyper parameter setting

In [4]:
embed_size = 300 # how big is each word vector
max_features = 150000 # how many unique words to use (i.e num rows in embedding vector)
maxlen_p = 100 # max number of words in a context to use
maxlen_q = 10 # max number of words in a question to use
batch_size = 256
num_rnn_units = 64
num_hidden_units = 200
drop_prob = 0.5
max_norm = 5.0

## File path

In [5]:
train_path = './data/train.tsv' # train set
valid_path = './data/valid.tsv' # validation set
test_path = './data/test.tsv' # test set
embed_file = './sgns.target.word-ngram.1-2.dynwin5.thr10.neg5.dim300.iter5' # 预训练词向量

##  Read file

In [6]:
train = pd.read_csv(train_path, sep='\t', header=0)
valid = pd.read_csv(valid_path, sep='\t', header=0)
test = pd.read_csv(test_path, sep='\t', header=0)
print (train.shape, valid.shape, test.shape)
print (train.head())

(750000, 4) (90000, 4) (29997, 4)
   id                                            passage  \
0   1  孩子 是 父母 的 一面镜子   由于 儿童 的 世界观 尚未 形成   他们 的 模仿 带...   
1   1  孩子 是 父母 的 一面镜子   由于 儿童 的 世界观 尚未 形成   他们 的 模仿 带...   
2   1    是 父母 的 一面镜子   由于 儿童   世界观 尚未 形成     的 模仿 带有 ...   
3   2  目前   中国 很多 地方   学生 火车票 磁条 都 已经 升级 了   在 磁条 里 已...   
4   2  目前   中国 很多 地方   学生   磁条 都 已经 升级 了   在 磁条 里    ...   

                   query  label  
0   你 的 孩子 无法确定 保姆 带 大 的      1  
1      你 的 孩子 是 保姆 带 大 的      0  
2     你 的 孩子 不是 保姆 带 大 的      0  
3  不是 一个 区间 刷 学生证 不能 有 票      1  
4   不是 一个 区间 刷 学生证 能 有 票      0  


## Buld up the text input pipeline

####  Fit the tokenizer on train, valid and test set

In [7]:
tokenizer = Tokenizer(num_words=max_features, lower=True) 

tokenizer.fit_on_texts(pd.concat([train['passage'], train['query'], valid['passage'], valid['query'], test['passage'], test['query']], ignore_index=True))

In [8]:
print (tokenizer.document_count, len(tokenizer.word_counts))

1739994 172760


#### text to seq

In [9]:
tra_p = tokenizer.texts_to_sequences(train['passage'])
tra_q = tokenizer.texts_to_sequences(train['query'])
val_p = tokenizer.texts_to_sequences(valid['passage'])
val_q = tokenizer.texts_to_sequences(valid['query'])
te_p = tokenizer.texts_to_sequences(test['passage'])
te_q = tokenizer.texts_to_sequences(test['query'])

#### pad seq to maxlen

In [10]:
train_p = pad_sequences(tra_p, maxlen=maxlen_p)
train_q = pad_sequences(tra_q, maxlen=maxlen_q)
valid_p = pad_sequences(val_p, maxlen=maxlen_p)
valid_q = pad_sequences(val_q, maxlen=maxlen_q)
test_p = pad_sequences(te_p, maxlen=maxlen_p)
test_q = pad_sequences(te_q, maxlen=maxlen_q)

In [11]:
print (train_p.shape, train_q.shape, valid_p.shape, valid_q.shape, test_p.shape, test_q.shape)

(750000, 100) (750000, 10) (90000, 100) (90000, 10) (29997, 100) (29997, 10)


#### label

In [12]:
train_l = train['label']
valid_l = valid['label']

In [13]:
print (train_l.shape, valid_l.shape)

(750000,) (90000,)


## Prepare the pretrained word embedding

In [14]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(embed_file, encoding='utf-8'))

In [15]:
all_embs = np.hstack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(0.014820942, 0.26983637)

In [16]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words+1, embed_size))
for word, i in word_index.items():
    if i > max_features: break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [17]:
embedding_matrix = np.asarray(embedding_matrix, dtype='float32')

## Build the model

In [18]:
K.clear_session()

In [19]:
class AttentionFlow(Layer):
    """Attention Flow Layer with self attention.
    refer to: https://github.com/shiningliang/MRC2018/blob/master/BiDAF%2BSelf%20Attention/dureader/layers/match_layer.py
    Add a FC for calculating similarity matrix, a FC for final fusion. Add self-attention for the match result.
    """

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        if isinstance(input_shape, list):
            input_shape = input_shape[0]
        d = input_shape[-1] // 2
        self.alpha = self.add_weight(name='alpha', 
                                      shape=(d * 6, 1),
                                      initializer='glorot_uniform',
                                      trainable=True)
        '''
        self.beta = self.add_weight(name='beta', 
                                      shape=(d * 8, d * 2),
                                      initializer='glorot_uniform',
                                      trainable=True)
        '''
        super(AttentionFlow, self).build(input_shape)  # Be sure to call this somewhere!

    def compute_output_shape(self, input_shape):
        if isinstance(input_shape, list):
            input_shape = input_shape[0]
        return (input_shape[0], input_shape[1], input_shape[-1])

    def call(self, inputs):
        # h: context representation of passage (batch_size, t, 2d)
        # u: context representation of query (batch_size, j, 2d)
        h = inputs[0]
        u = inputs[1]
        s = self.sim(h, u) # [t, j]
        
        p2q = Dot(axes=(2,1))([K.softmax(s, axis=-1), u]) # [t, 2d]
        b = K.softmax(K.max(s, axis=-1, keepdims=True), -2) # [t, 1]
        q2p = K.tile(Dot(axes=(2,1))([Permute((2,1))(h), b]), [1, 1, maxlen_p]) # [2d, t]
        h_p2q = multiply([h, p2q]) # [t, 2d]
        h_q2p = multiply([h, Permute((2,1))(q2p)]) # [t, 2d]
        g = concatenate([h, p2q, h_p2q, h_q2p]) # [t, 8d]
        g = K.dot(g, self.beta) # [t, 2d]
        
        # self-attention
        sg = self.sim(g, g) # [t, t]
        g = Dot(axes=(2,1))([K.softmax(sg, axis=-1), g]) # [t, 2d]
        return g    
        
    # calculate the similarity matrix  
    def sim(self, h, u):
        h_r = K.repeat_elements(K.expand_dims(h, axis=2), rep=K.int_shape(u)[1], axis=2) # [t,j,2d]
        u_r = K.repeat_elements(K.expand_dims(u, axis=1), rep=K.int_shape(h)[1], axis=1) # [t,j,2d]
        s = concatenate([h_r, u_r, multiply([h_r, u_r])], axis=-1) # [t, j, 6d]
        s = K.squeeze(K.dot(s, self.alpha), axis=-1) # [t, j]
        return s

In [20]:
def single_model():
    p = Input(shape=(maxlen_p,))
    q = Input(shape=(maxlen_q,))
    
    # Embedding layer
    embed = Embedding(nb_words+1, embed_size, weights=[embedding_matrix], trainable=False)
    pe = embed(p)
    pe = SpatialDropout1D(0.2)(pe)
    qe = embed(q)
    qe = SpatialDropout1D(0.2)(qe)
    
    # Contextual embedding layer
    h = Bidirectional(LSTM(num_rnn_units, return_sequences=True, unroll=True, dropout=0.1, recurrent_dropout=0.1))(pe) # [t, 2d]
    u = Bidirectional(LSTM(num_rnn_units, return_sequences=True, unroll=True, dropout=0.1, recurrent_dropout=0.1))(qe) # [j,2d]

    # Attention flow layer
    g = AttentionFlow()([h, u]) # [t, 2d]
    
    # Modelling layer
    m, hf, cf, hb, cb = Bidirectional(CuDNNLSTM(num_rnn_units, return_sequences=True, return_state=True))(g) # [t, 2d], d, d, d, d
    #m = Bidirectional(CuDNNLSTM(num_rnn_units, return_sequences=True))(m) # [t, 2d]

    # Output layer
    conc = Concatenate()([g, m]) # [t, 4d]
    gmp = GlobalMaxPooling1D()(conc) # [4d]
    gap = GlobalAveragePooling1D()(conc) # [4d]
    x = Concatenate()([gmp, gap, hf, hb]) # [10d]
    x = BatchNormalization()(x)
    x = Dense(num_hidden_units, activation='relu')(x)
    x = Dropout(drop_prob)(x)
    x = BatchNormalization()(x)
    #x = Dense(num_hidden_units, activation='relu')(x)
    #x = Dropout(drop_prob)(x)
    #x = BatchNormalization()(x)

    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=[p, q], outputs=x)
    print (model.summary())
    return model

## Train the model

In [21]:
adam = optimizers.Adam(lr=0.001, clipnorm=max_norm)
model = single_model()
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['binary_accuracy'])
    
# train the model
es = EarlyStopping(patience=5)
rp = ReduceLROnPlateau(patience = 1)
hist = model.fit(
    [train_p, train_q], 
    train_l,
    batch_size = batch_size,
    epochs = 12,
    shuffle = True,
    validation_data = ([valid_p, valid_q], valid_l), 
    callbacks=[es, rp])

ValueError: Dimensions must be equal, but are 1536 and 384 for 'attention_flow_1/MatMul_1' (op: 'MatMul') with input shapes: [?,1536], [384,1].

In [None]:
print (hist.history)
from matplotlib import pyplot as plt
%matplotlib inline
plt.figure(1)
plt.plot (hist.history['loss'])
plt.plot (hist.history['val_loss'])

## predict the test data

In [None]:
test_pred = model.predict([test_p, test_q], batch_size=batch_size*4)

In [None]:
test_pred = np.squeeze(test_pred)
print(test_pred.shape)

## Write the array into csv file

In [None]:
res = pd.DataFrame({'id':test['id'], 'passage':test['passage'], 'query':test['query'], 'option':test['option'], 'label':test_pred})
res.to_csv('/result/test6_long.csv', index=False, encoding='utf-8_sig')