In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#import necessary libraries
import numpy as np 
import pandas as pd
import json
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import re
import gc
import seaborn as sns

import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

from tensorflow.keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, SpatialDropout1D, Dense, Dropout, Input, concatenate, Conv1D, Activation, Flatten

In [None]:
# Load the competition's sample submission file and preview its first 20 rows.
sample_submission = pd.read_csv("../input/tensorflow2-question-answering/sample_submission.csv")
sample_submission.head(20)

In [None]:
# Paths to the train and test data files; read_data parses them as JSON lines
# (one JSON record per line).
train_path = '../input/tensorflow2-question-answering/simplified-nq-train.jsonl'
test_path = '../input/tensorflow2-question-answering/simplified-nq-test.jsonl'

In [None]:
# Number of records read when sampling a data file (default of read_data).
chunksize=2000

In [None]:
#read a sample of data files
def read_data(path, sample = True, chunksize = chunksize):
    """Load a JSON-lines file into a DataFrame.

    Parameters
    ----------
    path : str
        Path of a file holding one JSON record per line.
    sample : bool
        When True, read at most the first `chunksize` records; otherwise load
        the whole file with `pd.read_json`.
    chunksize : int
        Maximum number of records read in sample mode.

    Returns
    -------
    pandas.DataFrame
        One row per record. The shape is printed as a side effect.
    """
    if sample == True:
        records = []
        with open(path, 'r') as reader:
            for _ in range(chunksize):
                raw_line = reader.readline()
                if not raw_line:
                    # End of file reached before `chunksize` records were read;
                    # json.loads('') would raise, so stop with what we have.
                    break
                records.append(json.loads(raw_line))
        df = pd.DataFrame(records)
        print('Sample data have {} rows and {} columns'.format(df.shape[0], df.shape[1]))
    else:
        df = pd.read_json(path, orient = 'records', lines = True)
        # This branch loads the complete file, so do not label it as a sample.
        print('Full data have {} rows and {} columns'.format(df.shape[0], df.shape[1]))
        gc.collect()
    return df

# Load a sample of the training file and the complete test file, then preview the sample.
train = read_data(train_path, sample = True)
test = read_data(test_path, sample = False)
train.head()

# Exploratory Data Analysis

In [None]:
# Re-read the training sample into `df` for exploration and print the second
# question together with its annotations.
df = read_data(train_path, sample = True)
print(df['question_text'][1])
print(df['annotations'][1])

In [None]:
# Distribution of space-delimited word counts over the sampled documents
# (the sample holds up to `chunksize` documents, not 10000).
doc_text_words = df['document_text'].apply(lambda x: len(x.split(' ')))
plt.figure(figsize=(12,6))
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
sns.kdeplot(doc_text_words.values, fill=True, color="r").set_title('Distribution of text word count of ' + str(chunksize) + ' documents')

In [None]:
# First five long-answer candidates of the first sampled question.
df.long_answer_candidates[0][:5]

In [None]:
# Annotation entries of the second sampled question (at most the first 100 items).
df.annotations[1][:100]

In [None]:
# First 100 characters of the second sampled question.
df.question_text[1][:100]

In [None]:
# First 100 characters of the fourth sampled document.
df.document_text[3][:100]

In [None]:
#check long_answers_distribution
def preprocess_data():
    """Read the training sample and flatten its first annotation into columns.

    Adds three columns taken from the first annotation of every question:
    'yes_no' (the yes/no answer), 'long' (a [start_token, end_token] list for
    the long answer) and 'short' (the list of short answers).
    """
    sample_df = read_data(train_path, sample = True)
    first_annotation = sample_df.annotations.apply(lambda ann: ann[0])
    sample_df['yes_no'] = first_annotation.apply(lambda ann: ann['yes_no_answer'])
    sample_df['long'] = first_annotation.apply(
        lambda ann: [ann['long_answer']['start_token'], ann['long_answer']['end_token']]
    )
    sample_df['short'] = first_annotation.apply(lambda ann: ann['short_answers'])
    return sample_df
# Replace `df` with the sample enriched by the 'yes_no', 'long' and 'short' columns and display it.
df = preprocess_data()
df

In [None]:
# Frequency of each yes/no answer value in the sample.
df['yes_no'].value_counts()

In [None]:
# Boolean mask of the questions whose first annotation has a long answer.
# `df.long` holds [start_token, end_token] lists, so the "no answer" marker (-1)
# has to be read from the first element; comparing the whole list with -1 is
# always False and would mark every question as answerable.
df_ans_exists = df.long.apply(lambda x: x[0] != -1)
# Sum the mask to count answerable questions (`.count()` counts every row).
df_ans_exists.sum()

In [None]:
# Share of questions with and without a long answer; a start token of -1 marks "no answer".
display(df.long.apply(lambda x: "Answer Doesn't exist" if x[0] == -1 else "Answer Exists").value_counts(normalize=True))

In [None]:
# Check the distribution of yes/no answers among the rows selected by `df_ans_exists`.
yes_no_dist = df[df_ans_exists].yes_no.value_counts(normalize=True)
display(yes_no_dist)

In [None]:
# Share of the rows selected by `df_ans_exists` that have at least one short answer,
# shown as a bar plot.
short_dist = df[df_ans_exists].short.apply(lambda x: "Short answer exists" if len(x) > 0 else "Short answer doesn't exist").value_counts(normalize=True)
plt.figure(figsize=(8,6))
sns.barplot(x=short_dist.index,y=short_dist.values,hue=short_dist.index,palette = "Reds").set_title("Distribution of short answers in answerable questions")

In [None]:
# Share of the selected questions by number of short answers, in three buckets:
# none, one or two, and three or more.
short_counts = df[df_ans_exists].short.apply(len)
# Bucket with boolean masks instead of label slices on the value_counts index:
# that index is ordered by frequency, so `.loc[3:]` is not "three or more", and
# the previous selection dropped questions with exactly two short answers.
short_ans_dist = pd.Series({
    'No Short answer': (short_counts == 0).mean(),
    '1 or 2 Short answers': short_counts.between(1, 2).mean(),
    'More than 2 short answers': (short_counts >= 3).mean(),
})
plt.figure(figsize=(12,6))
sns.barplot(x=short_ans_dist.index,y=short_ans_dist.values,hue=short_ans_dist.index,palette = "Reds").set_title("Distribution of Number of Short Answers in answerable questions")

In [None]:
#check if there are missing data in the datasets
def check_missing_data(df):
    """Summarise the missing values of every column of `df`.

    Returns
    -------
    bool or pandas.DataFrame
        False when `df` has no missing values. Otherwise a transposed summary
        with one column per column of `df` and the rows 'Total' (number of
        missing values), 'Percent' (missing values as a percentage of the
        rows) and 'Types' (the column dtype as a string).
    """
    missing_per_column = df.isnull().sum()
    if not missing_per_column.any():
        return False

    # Percentage = missing / rows * 100. The previous expression divided by
    # (rows * 100), which made every percentage 10,000 times too small.
    percent = missing_per_column / len(df) * 100
    output = pd.concat([missing_per_column, percent], axis=1, keys=['Total', 'Percent'])
    # Read the dtypes from the frame being inspected rather than from the
    # global `train`, so frames with other columns are reported correctly.
    output['Types'] = [str(df[col].dtype) for col in df.columns]
    return np.transpose(output)
    
# Print the missing-value summary (or False when nothing is missing) for both frames.
print("Missing data exists in train set: ",check_missing_data(train))
print("Missing data exists in test set: ",check_missing_data(test))

In [None]:
#check count values in each column
def count_values_in_column(data,feature):
    """Tabulate how often each value of `data[feature]` occurs.

    Missing values are counted as their own category. The result has a 'Total'
    column (absolute counts) and a 'Percentage' column (share in percent,
    rounded to two decimals).
    """
    column = data.loc[:, feature]
    counts = column.value_counts(dropna=False)
    shares = (column.value_counts(dropna=False, normalize=True) * 100).round(2)
    return pd.concat([counts, shares], axis=1, keys=['Total', 'Percentage'])

In [None]:
# Counts and percentages of each yes/no answer value.
count_values_in_column(df,'yes_no')

In [None]:
#count unique values in each column
def unique_values_in_column(data,feature):
    """Return the distinct values of `data[feature]` as a one-column frame named 'Unique Values'."""
    distinct = pd.Series(data.loc[:, feature].unique())
    frames = [distinct]
    return pd.concat(frames, axis=1, keys=['Unique Values'])

In [None]:
# Distinct document texts in the sample.
unique_values_in_column(df,'document_text')

In [None]:
#find duplicated values in each column
def duplicated_values_data(data):
    """Count the duplicated entries of every column of `data`.

    Returns a frame with the columns 'Columns' (column name) and
    'Duplicate count' (number of entries that repeat an earlier entry).
    """
    def _count_duplicates(column):
        try:
            return int(column.duplicated().sum())
        except TypeError:
            # Cells such as lists or dicts are unhashable, which makes
            # `duplicated()` raise; compare their string form instead.
            return int(column.astype(str).duplicated().sum())

    columns = data.columns
    dup = [_count_duplicates(data[name]) for name in columns]
    return pd.concat([pd.Series(columns),pd.Series(dup)],axis=1,keys=['Columns','Duplicate count'])

In [None]:
# Duplicate counts for every column of the exploration frame.
duplicated_values_data(df)

In [None]:
# Summary statistics of the exploration frame.
df.describe()

In [None]:
#remove hash texts in questions
def find_hash(text):
    """Return the words that directly follow a '#' in `text`, joined by single spaces.

    The '#' itself is not part of the result; an empty string is returned when
    `text` contains no such word.
    """
    tags = (match.group(0) for match in re.finditer(r'(?<=#)\w+', text))
    return " ".join(tags)

In [None]:
# Extract the hashtag words of every question into a new 'hash' column.
df['hash']=df['question_text'].apply(lambda x: find_hash(x))

In [None]:
# Frequency of each extracted hashtag string.
df['hash'].value_counts()

In [None]:
# Frequency table over the distinct values of the 'hash' column.
unique_values_in_column(df,'hash').value_counts()

In [None]:
# Where no hashtag was found (blank 'hash'), fall back to the row's document text.
df.loc[df['hash'].str.strip() == '', 'hash'] = df['document_text']

In [None]:
# Display the frame after the 'hash' column was filled.
df

In [None]:
#remove punctuations
def find_punct(text):
    """Return the punctuation characters of `text`, in order, as a list of single characters."""
    # The pattern matches (possibly empty) runs of punctuation; joining the runs
    # and splitting into characters yields one list entry per punctuation mark.
    runs = re.findall(r'[!"\$%&\'()*+,\-.\/:;=#@?\[\\\]^_`{|}~]*', text)
    return list("".join(runs))

In [None]:
# Collect the punctuation characters of every question into a 'punctuation' column.
df['punctuation']=df['question_text'].apply(lambda x : find_punct(x))
# NOTE(review): blank 'hash' values were already replaced with document_text in an
# earlier cell, so this question_text fallback presumably matches no rows - confirm intent.
df.loc[df['hash'].str.strip() == '', 'hash'] = df['question_text']

In [None]:
# Display the frame with the new 'punctuation' column.
df

In [None]:
# Distinct values found in the 'punctuation' column.
np.unique(df['punctuation'])

In [None]:
# define training parameters
num_train_ques = 2000  # questions [0, num_train_ques) are requested for training
num_val_ques = 2050  # questions [num_train_ques, num_val_ques) are requested for validation
sample_rate = 15  # every sample_rate-th candidate is kept in addition to the correct one

In [None]:
# define model parameters
# NOTE: epochs, batch_size and class_weights are assigned again in a later cell
# before model.fit, so the values set here are not the ones used for training.
epochs = 40
batch_size = 64
class_weights = {0: 0.5, 1: 5.}

# **Part 1- identifying correct long answer to a question**

In [None]:
def get_question_and_document(line):
    """Split one parsed record into its question, document tokens and first annotation.

    Returns a tuple (question text, document text split on single spaces,
    first entry of the record's annotations).
    """
    tokens = line['document_text'].split(' ')
    first_annotation = line['annotations'][0]
    return line['question_text'], tokens, first_annotation
                
def get_long_candidate(i, annotations, candidate):
    """Describe one long-answer candidate.

    Returns (label, start, end): `label` is True when `i` is the annotated
    long-answer candidate index; `start` and `end` are the candidate's token
    boundaries in the document text.
    """
    is_correct = bool(i == annotations['long_answer']['candidate_index'])
    return is_correct, candidate['start_token'], candidate['end_token']

def form_data_row(question, label, text, long_start, long_end):
    """Build one training row: the question, the candidate's text and its label.

    The candidate text is the slice text[long_start:long_end] joined by single spaces.
    """
    candidate_text = ' '.join(text[long_start:long_end])
    return dict(question=question, long_answer=candidate_text, is_long_answer=label)

In [None]:
def load_data(file_path, questions_start, questions_end):
    """Build (question, long-answer candidate, label) rows for a range of questions.

    Parameters
    ----------
    file_path : str
        JSON-lines file with one question record per line.
    questions_start, questions_end : int
        Half-open range [questions_start, questions_end) of line numbers to read.

    Returns
    -------
    pandas.DataFrame
        One row per kept candidate. The correct candidate of every question is
        always kept; of the others, every `sample_rate`-th one is kept.
    """
    rows = []

    with open(file_path) as file:
        # Skip the questions before `questions_start`. Without this the file was
        # always read from the top, so a later range returned the first
        # questions again instead of unseen ones.
        for _ in range(questions_start):
            if not file.readline():
                break

        for _ in tqdm(range(questions_start, questions_end)):
            raw_line = file.readline()
            if not raw_line:
                # File has fewer lines than requested: stop cleanly.
                break
            line = json.loads(raw_line)
            question, text, annotations = get_question_and_document(line)

            # A separate name for the candidate index avoids shadowing the outer loop variable.
            for candidate_index, candidate in enumerate(line['long_answer_candidates']):
                label, long_start, long_end = get_long_candidate(candidate_index, annotations, candidate)

                if label == True or (candidate_index % sample_rate == 0):
                    rows.append(
                        form_data_row(question, label, text, long_start, long_end)
                    )
    return pd.DataFrame(rows)

In [None]:
# Build the long-answer candidate tables. Both are read from the training file:
# train_df is requested for questions [0, num_train_ques) and test_df for
# questions [num_train_ques, num_val_ques); test_df is later passed to model.fit
# as validation data.
train_df = load_data(train_path, 0, num_train_ques)
test_df = load_data(train_path, num_train_ques, num_val_ques)

In [None]:
# Preview the first training rows.
train_df.head(5)

In [None]:
# Preview the first validation rows.
test_df.head(5)

In [None]:
# Candidate texts of the first ten training rows.
train_df.head(10)['long_answer']

# **Pre-processing texts**

In [None]:
#cleaning texts by removing stopwords 
# English stop-word set, loaded on first use and then reused.
_ENGLISH_STOPWORDS = None

def remove_stopwords(sentence):
    """Return `sentence` without English stop words.

    The sentence is split on whitespace and the remaining words are joined by
    single spaces. Matching is case-sensitive, exactly as a direct membership
    test against `stopwords.words('english')`.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Loading the list once avoids re-reading the corpus for every single
        # word, and a set makes each membership test constant-time.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return ' '.join(word for word in sentence.split() if word not in _ENGLISH_STOPWORDS)

#removing html tags 
def remove_html(sentence):
    """Return `sentence` with HTML-style tags such as ``<P>`` or ``</Table>`` removed.

    Only the tags are removed; the text and whitespace around them are kept.
    """
    # The previous pattern r'<.*?>` `` ' required every tag to be followed by
    # stray backtick tokens, so ordinary tags were never removed.
    html = re.compile(r'<.*?>')
    return html.sub(r'', sentence)

#returns the pre-processed dataframe for long answers and questions
def preprocessed_df(df):
    """Clean the 'long_answer' and 'question' columns of `df` in place and return `df`.

    For each column, stop words are removed first and HTML tags second.
    """
    for column in ('long_answer', 'question'):
        without_stopwords = df[column].apply(lambda x : remove_stopwords(x))
        df[column] = without_stopwords
        df[column] = df[column].apply(lambda x : remove_html(x))

    return df

In [None]:
#Count vectorizer for N grams

def ngrams_top(corpus,ngram_range,n=None):
    """
    Rank the n-grams of a text corpus by how often they occur.

    The corpus is vectorised with a CountVectorizer (English stop words removed,
    n-gram sizes given by `ngram_range`). Returns a DataFrame with the columns
    'text' and 'count' holding the `n` most frequent n-grams, or all of them
    when `n` is None.
    """
    vectorizer = CountVectorizer(stop_words = 'english',ngram_range=ngram_range).fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()],
        key = lambda pair: pair[1],
        reverse=True,
    )
    return pd.DataFrame(ranked[:n],columns=['text','count'])

In [None]:
# Ten most frequent trigrams of the sampled questions.
ngrams_top(df['question_text'],(3,3),n=10)

In [None]:
# Clean the question and candidate texts of both frames, then preview the training rows.
train_df = preprocessed_df(train_df)
test_df = preprocessed_df(test_df)
train_df.head(5)

In [None]:
# Candidate texts after cleaning.
train_df['long_answer']

# Tokenizing the texts

In [None]:
# define tokenization parameters
# NOTE: '!"' and '#$%&...' below are two adjacent string literals, which Python
# concatenates; the resulting filter set does not contain the apostrophe character.
filters = '!"''#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
lower_case = True
max_len = 500 # maximum length of a sentence fed into the model

In [None]:
#assigning a numeric index to each unique word in the dataset so that an array of integers
def define_tokenizer(series):
    """Fit a Keras text tokenizer on the concatenation of the given Series.

    `series` is a list of pandas Series of texts. The tokenizer is configured
    with the module-level `lower_case` and `filters` settings.
    """
    fitted = tf.keras.preprocessing.text.Tokenizer(lower=lower_case, filters=filters)
    fitted.fit_on_texts(pd.concat(series))
    return fitted

#encoding, rest with 0 if max_len>no.of words in sentence 
def encode(sentences, tokenizer):
    """Convert texts to integer sequences of length `max_len`.

    Sequences shorter than `max_len` are padded at the end ('post' padding).
    """
    sequences = tokenizer.texts_to_sequences(sentences)
    return tf.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=max_len, padding='post'
    )

In [None]:
# Fit one shared vocabulary on the questions and candidate texts of both frames.
tokenizer = define_tokenizer([train_df.long_answer,train_df.question,
                              test_df.long_answer,test_df.question])

In [None]:
# Display the fitted tokenizer object.
tokenizer

In [None]:
# Get the word -> integer index mapping learned by the tokenizer and report its size.
word_index = tokenizer.word_index
print('Number of Unique Tokens: %d' % len(word_index))

In [None]:
# First 50 (word, index) pairs of the vocabulary.
list(enumerate(word_index.items()))[:50]

In [None]:
# Index of the word 'handle'; `.get` yields None instead of raising KeyError
# (and aborting a full run) when the word is not in the sampled vocabulary.
tokenizer.word_index.get('handle')

In [None]:
# Index of the word 'pandemic'; `.get` yields None instead of raising KeyError
# (and aborting a full run) when the word is not in the sampled vocabulary.
tokenizer.word_index.get('pandemic')

In [None]:
# Encode the candidate texts and questions of both frames as padded integer sequences.
train_long_answers = encode(train_df['long_answer'].values, tokenizer)
train_questions = encode(train_df['question'].values, tokenizer)

test_long_answers = encode(test_df['long_answer'].values, tokenizer)
test_questions = encode(test_df['question'].values, tokenizer)

In [None]:
# Encoded training candidates.
train_long_answers

In [None]:
# Encoded form of the first training candidate.
train_long_answers[0]

In [None]:
# Encoded training questions.
train_questions

In [None]:
# Binary targets: 1 when the candidate is the annotated long answer, else 0.
train_labels = train_df.is_long_answer.astype(int).values
test_labels = test_df.is_long_answer.astype(int).values

In [None]:
# Training targets.
train_labels

In [None]:
# Validation targets.
test_labels

# Define embedding layer

Loading the pre-trained GloVe word embeddings

In [None]:
embed_size = 200
embedding_dict = {}

# Parse the GloVe text file: each line holds a word followed by its vector components.
# The encoding is given explicitly because the vocabulary contains non-ASCII words
# and the platform default may differ; the `with` block closes the file on exit,
# so no separate close() call is needed.
with open('../input/glove6b/glove.6B.' + str(embed_size) + 'd.txt','r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:],'float32')
        embedding_dict[word] = vectors

In [None]:
# One row per vocabulary index plus one extra row, since the padding value 0 is not a word index.
num_words = len(word_index) + 1

# Initialise the embedding matrix with zeros; rows of words without a
# pre-trained vector stay zero.
embedding_matrix = np.zeros((num_words, embed_size))
print("num_words:", num_words)
print("embedding_matrix:", embedding_matrix)

In [None]:
# Copy the pre-trained vector of every known vocabulary word into its matrix row.
for word, i in word_index.items():
    # Valid rows are 0 .. num_words - 1, so an index equal to num_words is
    # already out of range (the previous `>` check let it through).
    if i >= num_words:
        continue

    emb_vec = embedding_dict.get(word)

    # Words missing from the pre-trained vocabulary keep their all-zero row.
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

# Defining the model

In [None]:
# Initialise the embedding layer from the pre-trained matrix; trainable=False
# keeps these weights fixed during training.
embedding_layer = tf.keras.layers.Embedding(
    len(word_index) + 1,
    embed_size,
    embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix),
    trainable = False
)

In [None]:
# Long-answer classifier: the question and the candidate answer are encoded by
# separate bidirectional LSTM branches that share one embedding layer; the two
# pooled encodings are concatenated and classified by a small dense network.

# question encoding-encodes the question
question_input = Input(shape=(None,))
question_x = embedding_layer(question_input)
question_x = SpatialDropout1D(0.2)(question_x)
question_x = Bidirectional(LSTM(100, return_sequences=True))(question_x)
question_x = GlobalMaxPooling1D()(question_x) # outputs an encoded array representing the question

# answer encoding-encodes the answer
answer_input = Input(shape=(None,))
answer_x = embedding_layer(answer_input)
answer_x = SpatialDropout1D(0.2)(answer_x)
answer_x = Bidirectional(LSTM(150, return_sequences=True))(answer_x)
answer_x = GlobalMaxPooling1D()(answer_x) #outputs an encoded array representing the answer

# classification
combined_x = concatenate([question_x, answer_x])
combined_x = Dense(300, activation='relu')(combined_x)
combined_x = Dropout(0.5)(combined_x)
combined_x = Dense(300, activation='relu')(combined_x)
combined_x = Dropout(0.5)(combined_x)
model_output = Dense(1, activation='sigmoid')(combined_x) # probability how close the potential answer is to the true answer to the question.

# defining model by combining above three parts
# NOTE: the input order is [answer, question]; fit/predict calls must pass arrays in that order.
model = tf.keras.models.Model(inputs=[answer_input, question_input], outputs=model_output)

In [None]:
# Print the layer-by-layer summary of the long-answer model.
model.summary()

In [None]:
# Binary cross-entropy for the single sigmoid output; accuracy, recall and
# precision are tracked during training.
model.compile(
    loss='binary_crossentropy', 
    optimizer='adam',
    metrics=['BinaryAccuracy', 'Recall', 'Precision'])

In [None]:
#define callbacks - to avoid plateauing & achieve early stopping
# Both callbacks monitor the training loss ('loss'), not the validation loss.
callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', patience=2, verbose=1),
    tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, verbose=1),]

In [None]:
# define model parameters
# These assignments replace the values set in the earlier parameter cell and are
# the ones used by model.fit.
epochs = 30
batch_size = 128
class_weights = {0: 0.5, 1: 5.}  # errors on class 1 (correct answers) weigh ten times those on class 0

In [None]:
# Train the long-answer classifier. Inputs are ordered [answer, question] to
# match the order of the model's `inputs`.
history = model.fit(
    x = [train_long_answers, train_questions], 
    y = train_labels,
    validation_data = (
        [test_long_answers, test_questions], test_labels),
    epochs = epochs,
    callbacks = callbacks,
    class_weight = class_weights,
    batch_size = batch_size,
    shuffle = True
)

In [None]:
# Save the trained long-answer model to the working directory.
model.save('long_model.h5')

# Model evaluation

Evaluating whether the model identifies the correct long answer to a question.

In [None]:
# Plot the training loss (left) and validation loss (right) per epoch.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

ax[0].set_title('Training Loss')
ax[0].plot(history.history['loss'],marker = '.',mec = 'r', mfc = 'r')

ax[1].set_title('Validation Loss')
ax[1].plot(history.history['val_loss'],marker = '.',mec = 'r', mfc = 'r',color="green")

In [None]:
# Plot accuracy, recall and precision per epoch: training curves in the left
# column, validation curves in the right column.
fig, ax = plt.subplots(3, 2, figsize=(15, 10))

ax[0,0].set_title('Training Accuracy')
ax[0,0].plot(history.history['binary_accuracy'],marker = '.',mec = 'r', mfc = 'r')

ax[0,1].set_title('Validation Accuracy')
ax[0,1].plot(history.history['val_binary_accuracy'],marker = '.',mec = 'r', mfc = 'r',color="green")

ax[1,0].set_title('Training Recall')
ax[1,0].plot(history.history['recall'],marker = '.',mec = 'r', mfc = 'r')

ax[1,1].set_title('Validation Recall')
ax[1,1].plot(history.history['val_recall'],marker = '.',mec = 'r', mfc = 'r',color="green")

ax[2,0].set_title('Training Precision')
ax[2,0].plot(history.history['precision'],marker = '.',mec = 'r', mfc = 'r')

ax[2,1].set_title('Validation Precision')
ax[2,1].plot(history.history['val_precision'],marker = '.',mec = 'r', mfc = 'r',color="green")

In [None]:
# Report the F1 score of the last epoch for the training and validation data.

def f1_from(precision, recall):
    """Harmonic mean of precision and recall; 0.0 when both are zero."""
    denominator = precision + recall
    # Precision and recall are both 0 when no positive was predicted correctly;
    # report 0.0 instead of failing with ZeroDivisionError.
    return 2 * (precision * recall) / denominator if denominator else 0.0

recall = history.history['recall'][-1]
precision = history.history['precision'][-1]

print('Train F1 score: {0:.4f}'.format(f1_from(precision, recall)))

recall = history.history['val_recall'][-1]
precision = history.history['val_precision'][-1]

print('Validation F1 score: {0:.4f}'.format(f1_from(precision, recall)))

In [None]:
# Score every validation (candidate, question) pair. The model expects the
# encoded [answer, question] token matrices as input; the previous call passed
# the label vector for both inputs instead.
predictions = model.predict([test_long_answers, test_questions])

In [None]:
# Model outputs for the validation inputs.
predictions

In [None]:
#evaluating with live inputs
def test_question(question, positive, negative):
    """Score a question against a matching and a non-matching passage.

    All three texts are cleaned (stop words, then HTML tags) and encoded, the
    model's score is printed for both passages, and the prediction array of
    the negative passage is returned.
    """
    cleaned = [
        remove_html(remove_stopwords(text))
        for text in (question, positive, negative)
    ]
    encoded = encode(cleaned, tokenizer)

    # The model takes batches ordered [answer, question]; add a batch axis of size 1.
    question_batch = np.expand_dims(encoded[0], axis=0)

    predictions = model.predict([np.expand_dims(encoded[1], axis=0), question_batch])
    print('Positive: {0:.2f}'.format(predictions[0][0]))

    predictions = model.predict([np.expand_dims(encoded[2], axis=0), question_batch])
    print('Negative: {0:.2f}'.format(predictions[0][0]))

    return predictions

In [None]:
# Live example 1: a question, a passage that answers it (positive) and an
# unrelated passage (negative).
question = 'who invented a portable handset '

positive='<P> Martin Cooper invented a portable handset in 1973, when he was a project manager at Motorola. It was almost three decades after the idea of cellular communications was introduced by Bell Laboratories. </P>'
negative = '<P> Email marketing has evolved rapidly alongside the technological growth of the 21st century . Prior to this growth , when emails were novelties to the majority of customers , email marketing was not as effective . In 1978 , Gary Thuerk of Digital Equipment Corporation ( DEC ) sent out the first mass email to approximately 400 potential clients via the Advanced Research Projects Agency Network ( ARPANET ) . This email resulted in $13 million worth of sales in DEC products , and highlighted the potential of marketing through mass emails . However , as email marketing developed as an effective means of direct communication , users began blocking out content from emails with filters and blocking programs . In order to effectively communicate a message through email , marketers had to develop a way of pushing content through to the end user , without being cut out by automatic filters and spam removing software . This resulted in the birth of triggered marketing emails , which are sent to specific users based on their tracked online browsing patterns . </P>'

In [None]:
# Print the model's scores for the positive and negative passage.
test_question(question, positive, negative)

In [None]:
# Live example 2.
# NOTE(review): both passages are about "How I Met Your Mother" and neither
# appears to answer this question - confirm that this pairing is intended.
question = 'who is the south african high commissioner in london'

positive = "<P> Tracy McConnell , better known as `` The Mother '' , is the title character from the CBS television sitcom How I Met Your Mother . The show , narrated by Future Ted , tells the story of how Ted Mosby met The Mother . Tracy McConnell appears in 8 episodes from `` Lucky Penny '' to `` The Time Travelers '' as an unseen character ; she was first seen fully in `` Something New '' and was promoted to a main character in season 9 . The Mother is played by Cristin Milioti . </P>"

negative = "<P> In `` Bass Player Wanted '' , the Mother picks up a hitchhiking Marshall , carrying his son Marvin , on her way to Farhampton Inn . On their way , it is revealed that the Mother is a bass player in the band , that is scheduled to play at the wedding reception . But the band 's leader , Darren , forced her to quit . The Mother ultimately decides to confront Darren and retake the band . She ends up alone at the bar , and while practicing a speech to give Darren , Darren walks up to her furious the groom 's best man punched him for `` no reason . '' Amused by this , the Mother laughs , and Darren quits the band in anger . </P>"

In [None]:
# Print the model's scores for the two passages of the second example.
test_question(question, positive, negative)

# **Part 2- extracting short answer from a long answer**

In [None]:
#filter records where short answers exists
def get_short_answer(annotations, long_start, long_end):
    """Return the first short answer's token span relative to the long answer's start.

    Both boundaries are shifted by `long_start`. When the annotation has no
    short answer, (0, 0) is returned. `long_end` is accepted but not used.
    """
    short_answers = annotations['short_answers']
    if len(short_answers) == 0:
        return 0, 0

    first = short_answers[0]
    return first['start_token'] - long_start, first['end_token'] - long_start
    
def form_short_data_row(question, text, long_start, long_end, short_start, short_end):
    """Build one short-answer training row.

    The long answer is text[long_start:long_end] joined by single spaces; the
    short answer is the [short_start:short_end] slice of the long answer's
    space-separated words.
    """
    long_answer = ' '.join(text[long_start:long_end])
    long_words = long_answer.split(' ')
    return dict(
        question=question,
        long_answer=long_answer,
        short_answer=' '.join(long_words[short_start:short_end]),
        short_start=short_start,
        short_end=short_end,
    )

In [None]:
#loading short answers
def load_short_data(file_path, questions_start, questions_end):
    """Build short-answer rows for the correct long answer of a range of questions.

    Parameters
    ----------
    file_path : str
        JSON-lines file with one question record per line.
    questions_start, questions_end : int
        Half-open range [questions_start, questions_end) of line numbers to read.

    Returns
    -------
    pandas.DataFrame
        One row per question whose annotated long-answer candidate exists,
        holding the question, the long answer, the short answer and the short
        answer's token span relative to the long answer.
    """
    rows = []
    with open(file_path) as file:
        # Skip the questions before `questions_start`. Without this the file was
        # always read from the top, so a later range returned the first
        # questions again instead of unseen ones.
        for _ in range(questions_start):
            if not file.readline():
                break

        for _ in tqdm(range(questions_start, questions_end)):
            raw_line = file.readline()
            if not raw_line:
                # File has fewer lines than requested: stop cleanly.
                break
            line = json.loads(raw_line)
            question, text, annotations = get_question_and_document(line)

            # A separate name for the candidate index avoids shadowing the outer loop variable.
            for candidate_index, candidate in enumerate(line['long_answer_candidates']):
                label, long_start, long_end = get_long_candidate(candidate_index, annotations, candidate)

                if label == True:
                    short_start, short_end = get_short_answer(annotations, long_start, long_end)

                    rows.append(
                        form_short_data_row(question, text, long_start, long_end, short_start, short_end)
                    )

    return pd.DataFrame(rows)

In [None]:
# Build the short-answer tables for the same question ranges as the long-answer tables.
train_short_df = load_short_data(train_path, 0, num_train_ques)
test_short_df = load_short_data(train_path, num_train_ques, num_val_ques)

In [None]:
# Preview the first short-answer training rows.
train_short_df.head(10)

In [None]:
# Counts and percentages of each short-answer string.
count_values_in_column(train_short_df,'short_answer')

In [None]:
# Encode the short-answer tables with the tokenizer fitted earlier.
# NOTE: this reassigns the train_*/test_* arrays that held the long-answer model's inputs.
train_long_answers = encode(train_short_df['long_answer'].values, tokenizer)
train_questions = encode(train_short_df['question'].values, tokenizer)

test_long_answers = encode(test_short_df['long_answer'].values, tokenizer)
test_questions = encode(test_short_df['question'].values, tokenizer)

In [None]:
#define 2 arrays for the start index and another for the end index
def form_short_labels(df, sentence_length):
    """One-hot encode the short-answer start and end positions of every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with integer 'short_start' and 'short_end' columns.
    sentence_length : int
        Width of the label arrays.

    Returns
    -------
    tuple of numpy.ndarray
        (start_labels, end_labels), each of shape (len(df), sentence_length).
        A row holds a single 1 at its start/end position; rows whose span does
        not fit inside `sentence_length` stay all-zero.
    """
    start_labels = np.zeros((len(df), sentence_length))
    end_labels = np.zeros((len(df), sentence_length))

    starts = df['short_start'].to_numpy()
    ends = df['short_end'].to_numpy()

    for row, (start, end) in enumerate(zip(starts, ends)):
        # Bound the span by the actual array width instead of a hard-coded 500,
        # and reject negative positions, which would otherwise wrap around and
        # mark a position counted from the end of the row.
        if 0 <= start < sentence_length and 0 <= end < sentence_length:
            start_labels[row, start] = 1
            end_labels[row, end] = 1

    return start_labels, end_labels

# Label arrays as wide as the padded model inputs (max_len).
train_start_labels, train_end_labels = form_short_labels(train_short_df, max_len)
test_start_labels, test_end_labels = form_short_labels(test_short_df, max_len)

In [None]:
# Inspect one training example (row 10): question, long answer, short answer
# and the one-hot start/end label rows.
print(train_short_df.loc[10].question)

print(train_short_df.loc[10].long_answer)
print(train_short_df.loc[10].short_answer)

print('Start index: {0}'.format(train_start_labels[10]))
print('End index: {0}'.format(train_end_labels[10]))

Defining the short model

In [None]:
# short answer model parameters
short_epochs = 100  # upper bound on the epochs passed to short_model.fit
short_batch_size = 64
embed_size_short=200  # width of the short model's embedding matrix

In [None]:
# load from file
# Parse the GloVe text file: each line holds a word followed by its vector components.
embedding_dict = {}

# The file name is derived from embed_size_short so the loaded vectors always
# match the width of the embedding matrix built below. The encoding is given
# explicitly, and the `with` block closes the file on exit, so no separate
# close() call is needed.
with open('../input/glove6b/glove.6B.' + str(embed_size_short) + 'd.txt','r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:],'float32')
        embedding_dict[word] = vectors

# write to matrix
# One row per vocabulary index plus one extra row, since the padding value 0 is not a word index.
num_words = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((num_words, embed_size_short))

for word, i in tokenizer.word_index.items():
    # Valid rows are 0 .. num_words - 1, so an index equal to num_words is
    # already out of range (the previous `>` check let it through).
    if i >= num_words:
        continue

    emb_vec = embedding_dict.get(word)

    # Words missing from the pre-trained vocabulary keep their all-zero row.
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

# load as tensorflow embedding
# Embedding layer for the short model; trainable=False keeps the weights fixed.
embedding_layer2 = tf.keras.layers.Embedding(
    len(tokenizer.word_index) + 1,
    embed_size_short,
    embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix),
    trainable = False
)

In [None]:
# Short-answer model: question and long answer are each encoded by two stacked
# bidirectional LSTMs that keep one output per position; the two sequences are
# concatenated and two heads predict a distribution over positions for the
# start and the end of the short answer.

# encoding question input
question_input = Input(shape=(None,))
question_x = embedding_layer2(question_input)
question_x = SpatialDropout1D(0.2)(question_x)
question_x = Bidirectional(LSTM(200, return_sequences=True))(question_x)
question_x = Bidirectional(LSTM(100, return_sequences=True))(question_x)

# encoding answer input
answer_input = Input(shape=(None,))
answer_x = embedding_layer2(answer_input)
answer_x = SpatialDropout1D(0.2)(answer_x)
answer_x = Bidirectional(LSTM(250, return_sequences=True))(answer_x)
answer_x = Bidirectional(LSTM(150, return_sequences=True))(answer_x)

# merge the encodings
# NOTE(review): presumably this relies on both inputs being padded to the same
# length (encode pads to max_len) so the sequences can be joined - confirm.
combined_x = concatenate([question_x, answer_x])

# predict start index of the short answer
start_x = Dropout(0.1)(combined_x) 
start_x = Conv1D(1,1)(start_x)
start_x = Flatten()(start_x)
start_x = Activation('softmax', name='start_token')(start_x)

# predict end index of the short answer
end_x = Dropout(0.1)(combined_x) 
end_x = Conv1D(1,1)(end_x)
end_x = Flatten()(end_x)
end_x = Activation('softmax', name='end_token')(end_x)

# merge the parts into one model
# NOTE: the input order is [answer, question]; the outputs are [start, end].
short_model = tf.keras.models.Model(inputs=[answer_input, question_input], outputs=[start_x, end_x])

In [None]:
# Categorical cross-entropy for the two softmax outputs (start and end position).
short_model.compile(
    loss='categorical_crossentropy', 
    optimizer='adam',
    metrics=['categorical_accuracy', 'Recall', 'Precision'])

In [None]:
# Print the layer-by-layer summary of the short-answer model.
short_model.summary()

In [None]:
# define callbacks for the short model
# The learning-rate reduction waits 4 epochs without improvement (2 for the long
# model); early stopping waits 5. Both monitor the training loss.
callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', patience=4, verbose=1),
    tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, verbose=1),]

In [None]:
# Train the short-answer model; inputs are ordered [answer, question] and the
# targets [start, end] to match the model definition.
# NOTE: this reassigns `history`, replacing the long-answer model's training history.
history = short_model.fit(
    x = [train_long_answers, train_questions], 
    y = [train_start_labels, train_end_labels],epochs = short_epochs, callbacks = callbacks,
    validation_data = ([test_long_answers, test_questions], [test_start_labels, test_end_labels]),
    batch_size = short_batch_size,  shuffle = True)

In [None]:
# Save the trained short-answer model to the working directory.
short_model.save('short_model.h5')

# **Model Evaluation**

Evaluating whether the model extracts the correct short answer from a long answer.

In [None]:
# Number of epochs actually run and the final training loss.
print('Epoch: {0}'.format(len(history.history['loss'])))
print('Loss: {0}'.format(history.history['loss'][-1]))

In [None]:
# Last-epoch training metrics for the start-token and end-token outputs.
# NOTE(review): the '_1' suffix on the end-token recall/precision keys is presumably
# assigned automatically by Keras; verify the names against history.history.keys().
# NOTE(review): the F1 expressions divide by (precision + recall) and fail when both are 0.
print('Training final results')

accuracy = history.history['start_token_categorical_accuracy'][-1]
recall = history.history['start_token_recall'][-1]
precision = history.history['start_token_precision'][-1]

print('--------------------------------------------------')
print('Start token accuracy: {0}'.format(accuracy))
print('Start token recall: {0}'.format(recall))
print('Start token precision: {0}'.format(precision))
print('Start token F1 score: {0:.4f}'.format(2 * (precision * recall) / (precision + recall)))

print('--------------------------------------------------')

accuracy = history.history['end_token_categorical_accuracy'][-1]
recall = history.history['end_token_recall_1'][-1]
precision = history.history['end_token_precision_1'][-1]

print('End token accuracy: {0}'.format(accuracy))
print('End token recall: {0}'.format(recall))
print('End token precision: {0}'.format(precision))
print('End token F1 score: {0:.4f}'.format(2 * (precision * recall) / (precision + recall)))

In [None]:
# Last-epoch validation metrics for the start-token and end-token outputs.
# NOTE(review): the '_1' suffix on the end-token recall/precision keys is presumably
# assigned automatically by Keras; verify the names against history.history.keys().
# NOTE(review): the F1 expressions divide by (precision + recall) and fail when both are 0.
print('Validation final results')
print('--------------------------------------------------')

accuracy = history.history['val_start_token_categorical_accuracy'][-1]
recall = history.history['val_start_token_recall'][-1]
precision = history.history['val_start_token_precision'][-1]

print('Start token accuracy: {0}'.format(accuracy))
print('Start token recall: {0}'.format(recall))
print('Start token precision: {0}'.format(precision))
print('Start token F1 score: {0:.4f}'.format( 2 * (precision * recall) / (precision + recall)))

print('--------------------------------------------------')

accuracy = history.history['val_end_token_categorical_accuracy'][-1]
recall = history.history['val_end_token_recall_1'][-1]
precision = history.history['val_end_token_precision_1'][-1]

print('End token accuracy: {0}'.format(accuracy))
print('End token recall: {0}'.format(recall))
print('End token precision: {0}'.format(precision))
print('End token F1 score: {0:.4f}'.format(2 * (precision * recall) / (precision + recall)))

In [None]:
# Plot the short model's training loss (left) and validation loss (right) per epoch.

fig, ax = plt.subplots(1, 2, figsize=(15, 5))

ax[0].set_title('Training Loss')
ax[0].plot(history.history['loss'],marker = '.',mec = 'r', mfc = 'r')

ax[1].set_title('Validation Loss')
ax[1].plot(history.history['val_loss'],marker = '.',mec = 'r', mfc = 'r',color="green")

In [None]:
# Plot accuracy, recall and precision per epoch for the first output (start token):
# training curves in the left column, validation curves in the right column.
fig, ax = plt.subplots(3, 2, figsize=(15, 10))

fig.suptitle('First output-start token')

ax[0,0].set_title('Training Accuracy')
ax[0,0].plot(history.history['start_token_categorical_accuracy'],marker = '.',mec = 'r', mfc = 'r')

ax[0,1].set_title('Validation Accuracy')
ax[0,1].plot(history.history['val_start_token_categorical_accuracy'],marker = '.',mec = 'r', mfc = 'r',color="green")

ax[1,0].set_title('Training Recall')
ax[1,0].plot(history.history['start_token_recall'],marker = '.',mec = 'r', mfc = 'r')

ax[1,1].set_title('Validation Recall')
ax[1,1].plot(history.history['val_start_token_recall'],marker = '.',mec = 'r', mfc = 'r',color="green")

ax[2,0].set_title('Training Precision')
ax[2,0].plot(history.history['start_token_precision'],marker = '.',mec = 'r', mfc = 'r')

ax[2,1].set_title('Validation Precision')
ax[2,1].plot(history.history['val_start_token_precision'],marker = '.',mec = 'r', mfc = 'r',color="green")

In [None]:
#plotting metrics; accuracy, precision, recall for the second output array
fig, ax = plt.subplots(3, 2, figsize=(15, 10))

fig.suptitle('Second output-end token')

# One grid row per metric: (display name, training key in history.history).
# The validation curve is stored under the same key prefixed with 'val_'.
# NOTE(review): the `_1` suffix on the recall/precision keys looks like a name
# auto-generated by Keras; confirm against history.history.keys().
end_token_metrics = [
    ('Accuracy', 'end_token_categorical_accuracy'),
    ('Recall', 'end_token_recall_1'),
    ('Precision', 'end_token_precision_1'),
]

for row, (metric_name, key) in enumerate(end_token_metrics):
    train_ax, val_ax = ax[row, 0], ax[row, 1]

    train_ax.set_title('Training ' + metric_name)
    train_ax.plot(history.history[key], marker='.', mec='r', mfc='r')

    val_ax.set_title('Validation ' + metric_name)
    val_ax.plot(history.history['val_' + key], marker='.', mec='r', mfc='r', color="green")

    # history.history holds one value per epoch, so the x axis is the epoch index.
    for axis in (train_ax, val_ax):
        axis.set_xlabel('Epoch')
        axis.set_ylabel(metric_name)

# Keep axis labels from overlapping neighbouring panels; leave room for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
#testing with live inputs
def test_short_answer(question, long_answer):
    """Predict a short-answer span for a single (question, long answer) pair.

    The passage and the question are encoded together with the notebook's
    `encode` helper and `tokenizer`, each is given a leading batch axis of
    size one, and both are passed to `short_model.predict`. The arg-max of
    the first model output is taken as the start index and the arg-max of the
    second output as the end index.

    Returns:
        A tuple ``(start, end, text)``. ``text`` is ``long_answer`` split on
        single spaces, sliced ``[start:end]`` and re-joined with spaces.
    """
    # Order matters: the passage is encoded first, the question second.
    encoded = encode([long_answer, question], tokenizer)
    model_inputs = [np.expand_dims(encoded[position], axis=0) for position in (0, 1)]

    # Stack the per-output predictions so they can be indexed as [output, batch].
    outputs = np.array(short_model.predict(model_inputs))

    start_index = np.argmax(outputs[0, 0])
    end_index = np.argmax(outputs[1, 0])

    # NOTE(review): assumes model token positions line up with a plain
    # whitespace split of the passage -- confirm against `encode`.
    words = long_answer.split(' ')
    answer_text = ' '.join(words[start_index:end_index])

    return start_index, end_index, answer_text

In [None]:
# Example 1: a question paired with a passage on the same topic (permission /
# opt-in e-mail marketing). The passage keeps its <P> ... </P> markup and has
# punctuation already separated from words by spaces.
question = 'which is the most common use of opt-in e-mail marketing'
long_answer = "<P> A common example of permission marketing is a newsletter sent to an advertising firm 's customers . Such newsletters inform customers of upcoming events or promotions , or new products . In this type of advertising , a company that wants to send a newsletter to their customers may ask them at the point of purchase if they would like to receive the newsletter . </P>"

In [None]:
# Run the short-answer model on the current `question` / `long_answer` pair
# and report the predicted span, one field per line.
start, end, short_answer = test_short_answer(question, long_answer)

print(
    'Start token: ' + str(start),
    'End token: ' + str(end),
    'Answer: ' + short_answer,
    sep='\n',
)

In [None]:
# Example 2: a hand-written passage whose first words answer the question.
# Unlike example 1, punctuation here is attached to the words ("1973,",
# "Motorola.").
question = 'who invented a portable handset'
long_answer="<P> Martin Cooper invented a portable handset in 1973, when he was a project manager at Motorola. It was almost three decades after the idea of cellular communications was introduced by Bell Laboratories. </P>"

In [None]:
# Run the short-answer model on the current `question` / `long_answer` pair
# and report the predicted span, one field per line.
start, end, short_answer = test_short_answer(question, long_answer)

print(
    'Start token: ' + str(start),
    'End token: ' + str(end),
    'Answer: ' + short_answer,
    sep='\n',
)

In [None]:
# Example 3: the passage (about a TV sitcom character) does not address the
# question (about a high commissioner).
# NOTE(review): presumably a deliberate "no answer in passage" check -- confirm intent.
question = 'who is the south african high commissioner in london'
long_answer = "<P> Tracy McConnell , better known as `` The Mother '' , is the title character from the CBS television sitcom How I Met Your Mother . The show , narrated by Future Ted , tells the story of how Ted Mosby met The Mother . Tracy McConnell appears in 8 episodes from `` Lucky Penny '' to `` The Time Travelers '' as an unseen character ; she was first seen fully in `` Something New '' and was promoted to a main character in season 9 . The Mother is played by Cristin Milioti . </P>"

In [None]:
# Run the short-answer model on the current `question` / `long_answer` pair
# and report the predicted span, one field per line.
start, end, short_answer = test_short_answer(question, long_answer)

print(
    'Start token: ' + str(start),
    'End token: ' + str(end),
    'Answer: ' + short_answer,
    sep='\n',
)

# **Horizontal and Vertical Ensembles**

In [None]:
import os

In [None]:
def _prune_checkpoints(directory, ensemble_size):
    """Keep the `ensemble_size` highest-epoch checkpoint files in `directory`.

    Only files named ``<epoch>_<anything>`` with an integer ``<epoch>`` prefix
    are treated as checkpoints; every other file is left untouched. Older
    checkpoints are deleted. Returns the kept file names, newest epoch first.
    """
    checkpoints = []
    for name in os.listdir(directory):
        epoch_text, separator, _rest = name.partition("_")
        if not separator:
            continue  # not a "<epoch>_<val_loss>" file
        try:
            checkpoints.append((int(epoch_text), name))
        except ValueError:
            continue  # prefix is not an epoch number

    # Rank by the epochs actually present on disk rather than by the requested
    # number of epochs, so the result is still right when training stops early.
    checkpoints.sort(reverse=True)

    for _epoch, name in checkpoints[ensemble_size:]:
        # os.listdir returns bare names; join with the directory before deleting.
        os.remove(os.path.join(directory, name))

    return [name for _epoch, name in checkpoints[:ensemble_size]]


def assemble_horizontal_ensemble(ensemble_size, min_val_loss, filepath, epochs, batch_size, class_weights):
    """Train the model, checkpointing every epoch, and keep the last few checkpoints.

    Reads `model`, `train_long_answers`, `train_questions`, `train_labels`,
    `test_long_answers`, `test_questions` and `test_labels` from the notebook's
    global namespace.
    NOTE(review): other cells in this part of the notebook use `short_model`;
    confirm that `model` is the network intended here.

    Args:
        ensemble_size: Number of most recent checkpoints to keep (at least 1).
        min_val_loss: Forwarded to ModelCheckpoint as `initial_value_threshold`.
            Per the Keras documentation this is only consulted when
            `save_best_only` is enabled, which it is not here.
        filepath: ModelCheckpoint file name template. Its directory part is
            where checkpoints are written and pruned; file names are expected
            to start with ``<epoch>_``.
        epochs: Maximum number of training epochs.
        batch_size: Training batch size.
        class_weights: Passed to `model.fit` as `class_weight`.

    Returns:
        The file names (relative to the checkpoint directory) of the kept
        checkpoints, newest epoch first.

    Raises:
        ValueError: If `ensemble_size` is smaller than 1.
    """
    if ensemble_size < 1:
        raise ValueError("ensemble_size must be at least 1")

    # Derive the directory from the template instead of hard-coding it, so the
    # pruning step always looks where the checkpoints were written.
    checkpoint_dir = os.path.dirname(filepath) or "."
    os.makedirs(checkpoint_dir, exist_ok=True)
    # NOTE: checkpoints left in this directory by an earlier run are ranked
    # together with the new ones; start from an empty directory for a clean ensemble.

    callbacks = [
        tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', patience=2, verbose=1),
        tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, verbose=1),
        tf.keras.callbacks.ModelCheckpoint(
            filepath=filepath,
            save_weights_only=True,
            # Monitor the validation loss so that mode='min' and the loss
            # threshold below refer to the same quantity.
            monitor='val_loss',
            mode='min',
            save_freq='epoch',
            initial_value_threshold=min_val_loss
        )
    ]

    # Save the model weight files after each epoch
    model.fit(
        x = [train_long_answers, train_questions],
        y = train_labels,
        validation_data = (
            [test_long_answers, test_questions], test_labels),
        epochs = epochs,
        callbacks = callbacks,
        class_weight = class_weights,
        batch_size = batch_size,
        shuffle = True
    )

    # Keep only the last {ensemble_size} checkpoints and delete the earlier ones.
    return _prune_checkpoints(checkpoint_dir, ensemble_size)

In [None]:
# Horizontal-ensemble settings.
# Checkpoints live in `directory_name`; `filepath` is the per-epoch file name
# template inside that directory (epoch number, then validation loss).
directory_name = "/tmp/checkpoints"
filepath = "/tmp/checkpoints/{epoch:02d}_{val_loss:.2f}.hdf5"
# Number of checkpoints to keep, and the validation-loss threshold handed to
# the checkpoint callback.
ensemble_size, min_val_loss = 10, 0.02

# Training settings.
epochs, batch_size = 10, 128
# Per-class loss weights: class 1 is weighted ten times as heavily as class 0.
class_weights = {0: 0.5, 1: 5.0}

In [None]:
# assemble_horizontal_ensemble takes six arguments and the checkpoint directory
# is not one of them; also passing `directory_name` raised a TypeError for the
# extra positional argument.
assemble_horizontal_ensemble(ensemble_size, min_val_loss, filepath, epochs, batch_size, class_weights)