<a href="https://colab.research.google.com/github/michaelwaheb/CIT690E-DeepLearning_Michael_Reda_191002/blob/main/chatbot/bilstm_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --user -U nltk
import nltk
from nltk.corpus import stopwords
# Download the NLTK stop-word corpus and keep the English stop words as a set.
# NOTE(review): `stops` is not read anywhere in the visible notebook; the
# stop-word filter inside make_w2v_embeddings is commented out.
nltk.download('stopwords')
stops = set(stopwords.words('english'))

In [None]:
from keras import backend as K
from keras.layers import Layer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import itertools

'''
This configuration file provides a series of predefined functions
'''


# ------------------custom function------------------ #

import re

# Ordered (pattern, replacement) cleaning rules for English text. Order matters:
# e.g. "what's" must be expanded before the generic "'s" rule strips the suffix,
# and "e - mail" only exists after the "-" rule has padded the hyphen.
_TEXT_CLEANING_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # NOTE(review): inside the class, "+-=" is a character *range* ('+' .. '='),
        # so ':', ';' and '<' also survive this filter. Kept as-is because the
        # ':' rule further down relies on colons getting through.
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # NOTE(review): "\0" is the NUL character in a regex, so this rule only
        # matches NUL followed by "s" and never fires after the filter above.
        # It presumably meant to turn "0s" into "0"; left unchanged until confirmed.
        (r"\0s", "0"),
        # Keep the surrounding spaces: replacing " 9 11 " with a bare "911"
        # fused the neighbouring words into one token ("on911attacks").
        (r" 9 11 ", " 911 "),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
    )
)


def text_to_word_list(text):
    """Normalise a piece of English text and split it into word tokens.

    The input is converted with ``str()``, lower-cased, passed through the
    ordered substitutions in ``_TEXT_CLEANING_RULES`` (contraction expansion,
    punctuation spacing/removal, a few hand-written normalisations) and finally
    split on whitespace.

    Args:
        text: Any object; non-strings are stringified first (so ``None`` yields
            the token ``"none"`` and a float NaN yields ``"nan"``).

    Returns:
        list[str]: The cleaned tokens, possibly empty.
    """
    cleaned = str(text).lower()
    for pattern, replacement in _TEXT_CLEANING_RULES:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.split()

def make_w2v_embeddings(flag, word2vec, df, embedding_dim):
    """Replace question text by word-id lists and build the embedding matrix.

    Every word produced by ``text_to_word_list`` for the ``question1`` and
    ``question2`` columns is assigned a 1-based integer id in order of first
    appearance. The id lists are written into the ``question1_n`` /
    ``question2_n`` cells of ``df`` (the frame is modified in place).

    Args:
        flag: Unused; kept so existing callers keep working.
        word2vec: Mapping-like object supporting ``word in word2vec`` and
            ``word2vec[word]`` (the notebook passes gensim ``KeyedVectors``).
        df: DataFrame with ``question1`` / ``question2`` columns. The progress
            message assumes integer index labels.
        embedding_dim: Width of each embedding row; vectors taken from
            ``word2vec`` must have this length.

    Returns:
        tuple: ``(df, embeddings)`` where ``embeddings`` has shape
        ``(number_of_words + 1, embedding_dim)``. Row 0 is all zeros because no
        word has id 0; rows of words found in ``word2vec`` hold the pre-trained
        vector, all other rows keep their random normal initialisation.
    """
    vocabs = {}  # word -> 1-based id

    for index, row in df.iterrows():
        # Progress report every 1000 rows.
        if index != 0 and index % 1000 == 0:
            print(str(index) + " sentences embedded.")

        for question in ['question1', 'question2']:
            # q2n -> "question to numbers" representation.
            # setdefault hands out the next free id the first time a word is seen.
            q2n = [vocabs.setdefault(word, len(vocabs) + 1)
                   for word in text_to_word_list(row[question])]
            df.at[index, question + '_n'] = q2n

    # Random matrix of shape [number of words + 1, embedding dimension].
    embeddings = np.random.randn(len(vocabs) + 1, embedding_dim)
    embeddings[0] = 0  # row 0 stays zero: there is no word with id 0

    # Overwrite the random rows with pre-trained vectors where available.
    # `vocabs` maps word -> id, so the *word* is the word2vec key and the *id*
    # is the row index (the previous loop had the two swapped).
    for word, word_id in vocabs.items():
        if word in word2vec:
            embeddings[word_id] = word2vec[word]

    return df, embeddings


def split_and_zero_padding(df, max_seq_length):
    """Return both question id-sequences padded/truncated to a fixed length.

    Args:
        df: Frame (or mapping) holding the ``question1_n`` and ``question2_n``
            id sequences.
        max_seq_length: Target length handed to ``pad_sequences`` as ``maxlen``.

    Returns:
        dict: ``{'left': ..., 'right': ...}`` with the results of
        ``pad_sequences`` for question 1 and question 2 respectively. Padding
        is added at the front (``padding='pre'``) and over-long sequences are
        cut at the end (``truncating='post'``).
    """
    side_to_column = (('left', 'question1_n'), ('right', 'question2_n'))
    return {
        side: pad_sequences(df[column], padding='pre', truncating='post', maxlen=max_seq_length)
        for side, column in side_to_column
    }


class ManDist(Layer):
    """Keras layer computing ``exp(-manhattan_distance)`` between two tensors.

    The layer has no weights and takes no constructor arguments beyond the
    standard ``Layer`` keyword arguments. It is called on a list of two
    tensors ``[a, b]``; the absolute difference is summed over axis 1 with
    ``keepdims=True``.
    """

    def __init__(self, **kwargs):
        super(ManDist, self).__init__(**kwargs)
        # Last tensor produced by call(); kept only for backward compatibility
        # with code that reads `layer.result`.
        self.result = None

    def build(self, input_shape):
        # Nothing to create: the layer is stateless.
        super(ManDist, self).build(input_shape)

    def call(self, x, **kwargs):
        """Return exp(-sum(|x[0] - x[1]|, axis=1)) with the summed axis kept."""
        self.result = K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True))
        return self.result

    def compute_output_shape(self, input_shape):
        """Derive the output shape from the first input's shape.

        ``input_shape`` is the list of the two input shapes. The output equals
        the first input's shape with axis 1 collapsed to size 1, matching the
        ``axis=1, keepdims=True`` reduction in ``call``. Computing it from
        ``input_shape`` (rather than from the tensor cached by ``call``) lets
        the shape be queried before the layer has been invoked.
        """
        output_shape = list(input_shape[0])
        output_shape[1] = 1
        return tuple(output_shape)

In [None]:
# Basic package
from time import time
import pandas as pd
from sklearn.model_selection import train_test_split
import keras
from gensim.models import KeyedVectors
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, Flatten, Activation, RepeatVector, Permute, Lambda, \
    Bidirectional, TimeDistributed, Dropout, Conv1D, GlobalMaxPool1D
from keras.layers.merge import multiply, concatenate
import keras.backend as K
from tensorflow.keras.optimizers import Adam

In [None]:
!pip install wget

import os

import wget

# Pre-trained GoogleNews word2vec vectors (a very large archive).
url = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'

# Re-running this cell should not fetch the archive again: reuse the copy in
# the working directory when one exists, otherwise download it.
filename = os.path.basename(url)
if not os.path.exists(filename):
    filename = wget.download(url)



In [None]:
#! pip install kaggle
#! mkdir -p ~/.kaggle
#! cp kaggle.json ~/.kaggle/
#! chmod 600 ~/.kaggle/kaggle.json
#! kaggle competitions download quora-question-pairs -f train.csv.zip
#! kaggle competitions download quora-question-pairs -f test.csv.zip

train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
#! unzip -o test.csv.zip
#! unzip -o train.csv.zip

Archive:  train.csv.zip
  inflating: train.csv               


In [None]:
# ------------------Preloading------------------ #

# Run configuration. The English Quora training set is used.
TRAIN_CSV = './train.csv'  # training CSV read into train_df below
flag = 'en'  # passed to make_w2v_embeddings, which does not read it
embedding_path = 'GoogleNews-vectors-negative300.bin.gz'  # pre-trained word2vec archive
embedding_dim = 300  # width of each embedding row
max_seq_length = 10  # every question is padded/truncated to this many ids
savepath = './en_SiameseLSTM.h5'  # NOTE(review): not used by the visible code; the model is never saved

In [None]:
# Load the pre-trained word2vec vectors from `embedding_path`
# (binary word2vec format) into `embedding_dict`.
print("Loading word2vec model(it may takes 2-3 mins) ...")
embedding_dict = KeyedVectors.load_word2vec_format(embedding_path, binary=True)


Loading word2vec model(it may takes 2-3 mins) ...


KeyboardInterrupt: ignored

In [None]:
# Read the training set and create the `question1_n` / `question2_n` columns
# as copies of the raw text. make_w2v_embeddings later overwrites each cell of
# these columns with a list of word ids.
train_df = pd.read_csv(TRAIN_CSV)
for q in ['question1', 'question2']:
    train_df[q + '_n'] = train_df[q]

In [None]:
# Convert the questions to word-id lists (stored in the `*_n` columns) and
# build the embedding matrix used to initialise the Embedding layer.
train_df, embeddings = make_w2v_embeddings(flag, embedding_dict, train_df, embedding_dim=embedding_dim)

In [None]:
# Length (in word ids) of the longest question across both question columns
max_seq_length_in_dataset = max(
    train_df[column].map(len).max() for column in ('question1_n', 'question2_n')
)
print("max_seq_length_in_dataset : ",max_seq_length_in_dataset)


In [None]:
# Preview the first rows to check that the `*_n` columns now hold id lists.
train_df.head()

The training data starts out as:
question1   question2   is_duplicate
borrow Repayment Information Borrow Repayment Date 0

After the question columns are copied into the `_n` columns it becomes:

question1 question2 is_duplicate question1_n question2_n
Borrow repayment information Borrow repayment date 0 Borrow repayment information Borrow repayment date

After the words are replaced by their ids:
question1 question2 is_duplicate question1_n question2_n
Borrow repayment information Borrow repayment date 0 [31, 639] [31, 255]

In [None]:
# Split into training and validation sets (80% / 20%).
# NOTE(review): no random_state is passed, so the split differs between runs.
X = train_df[['question1_n', 'question2_n']]
Y = train_df['is_duplicate']
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.2)

# Turn each split into {'left': ..., 'right': ...} with sequences padded or
# truncated to max_seq_length.
X_train = split_and_zero_padding(X_train, max_seq_length)
X_validation = split_and_zero_padding(X_validation, max_seq_length)

In [None]:
# Take the underlying arrays of the label Series (via `.values`) for model.fit.
Y_train = Y_train.values
Y_validation = Y_validation.values

In [None]:
# Confirm that the data is prepared and correct: both sides must have the same
# shape and there must be exactly one label per question pair.
print("X_train['left'].shape" , X_train['left'].shape)
# Report the right-hand side's own shape (this line used to print the left one).
print("X_train['right'].shape" , X_train['right'].shape)
print("len(X_train['left'])" , len(X_train['left']))
print("len(Y_train)" , len(Y_train))

assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

In [None]:
# Print the validation labels for a quick visual check.
print(Y_validation)

# -----------------base function------------------ #


In [None]:
def shared_model(_input):
    """Build one sentence-encoder tower on ``_input`` and return its output tensor.

    Pipeline: frozen embedding lookup -> two stacked bidirectional LSTMs ->
    dropout -> attention-weighted sum over the time axis -> dropout.

    Reads the module-level ``embeddings``, ``embedding_dim``,
    ``max_seq_length`` and ``n_hidden``.

    NOTE(review): every call constructs new layer objects, so the left and
    right towers built by two calls presumably do not share weights — confirm
    whether a truly shared (Siamese) encoder was intended.
    """
    # Word vectorization: look up the fixed (non-trainable) embedding rows.
    embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings],
                                input_shape=(max_seq_length,), trainable=False)
    hidden_states = embedding_layer(_input)

    # Two stacked Bi-LSTMs; forward and backward outputs are concatenated.
    for _ in range(2):
        hidden_states = Bidirectional(LSTM(n_hidden, return_sequences=True),
                                      merge_mode='concat')(hidden_states)

    hidden_states = Dropout(0.5)(hidden_states)

    # Attention: one tanh score per time step, softmax over the steps, then the
    # weights are repeated n_hidden * 2 times and permuted so they can be
    # multiplied element-wise with the LSTM outputs.
    step_scores = TimeDistributed(Dense(1, activation='tanh'))(hidden_states)
    step_scores = Flatten()(step_scores)
    step_weights = Activation('softmax')(step_scores)
    step_weights = RepeatVector(n_hidden * 2)(step_weights)
    step_weights = Permute([2, 1])(step_weights)
    weighted_states = multiply([hidden_states, step_weights])

    # Collapse the time axis into a single sentence vector.
    sentence_vector = Lambda(lambda xin: K.sum(xin, axis=1))(weighted_states)

    return Dropout(0.3)(sentence_vector)

In [None]:
if __name__ == '__main__':

    # Hyperparameters. n_hidden is read by shared_model as a module-level name.
    batch_size = 1024
    n_epoch = 15
    n_hidden = 100

    # One input per question; each carries max_seq_length word ids.
    left_input = Input(shape=(max_seq_length,), dtype='float32')
    right_input = Input(shape=(max_seq_length,), dtype='float32')

    # Encode both questions into sentence representations.
    left_sen_representation = shared_model(left_input)
    right_sen_representation = shared_model(right_input)

    # The Manhattan-distance similarity is concatenated with the two sentence
    # representations, the result is passed through a stack of Dense layers,
    # and a final sigmoid unit outputs the similarity.
    
    # Author's note: the Mahalanobis distance mentioned in
    # https://zhuanlan.zhihu.com/p/31638132 was not used. Manhattan distance,
    # dot product and cosine were tried, and Manhattan worked best.
    # NOTE(review): Dense(16), Dense(4) and Dense(2) are created without an
    # `activation` argument, so they appear to be linear layers — confirm
    # whether non-linearities were intended.
    
    man_distance = ManDist()([left_sen_representation, right_sen_representation])
    sen_representation = concatenate([left_sen_representation, right_sen_representation, man_distance])
    similarity = Dense(1, activation='sigmoid')(Dense(2)(Dense(4)(Dense(16)(sen_representation))))
    model = Model(inputs=[left_input, right_input], outputs=[similarity])

    # Mean squared error between the sigmoid output and the 0/1 label.
    model.compile(loss='mean_squared_error', optimizer= Adam(), metrics=['accuracy'])
    model.summary()

    # Train, timing the whole fit; the returned History is used for plotting.
    training_start_time = time()
    malstm_trained = model.fit([X_train['left'], X_train['right']], Y_train,
                               batch_size=batch_size, epochs=n_epoch,
                               validation_data=([X_validation['left'], X_validation['right']], Y_validation))
    training_end_time = time()
    print("Training time finished.\n%d epochs in %12.2f" % (n_epoch, training_end_time - training_start_time))

In [None]:
# matplotlib is imported here because no earlier cell binds `plt`.
import matplotlib.pyplot as plt

# Plot the training and validation curves recorded by model.fit, one figure
# per metric: accuracy first, then loss.
for metric, label, legend_loc in (('accuracy', 'Accuracy', 'upper left'),
                                  ('loss', 'Loss', 'upper right')):
    plt.plot(malstm_trained.history[metric])
    plt.plot(malstm_trained.history['val_' + metric])
    plt.title('Model ' + label)
    plt.ylabel(label)
    plt.xlabel('Epoch')
    # The legend tells the training curve apart from the validation curve.
    plt.legend(['Train', 'Validation'], loc=legend_loc)
    plt.show()