In [None]:
from google.colab import drive
import json
import numpy as np
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras.backend as K


# Mount Google Drive so that its files are reachable under /content/drive.
drive.mount("/content/drive")

# Project folder on Drive: the directory holding the train and dev sets and the
# GloVe embeddings. Change it to match your own Drive layout (and update any
# cell that spells the same path out literally).
proj_dir = os.path.join("/content/drive", "MyDrive", "QA_project")

In [None]:
!unzip "/content/drive/MyDrive/QA_project/glove.6B.zip" -d "/content"

In [None]:
# Read the train and dev JSON files from the Drive project folder.
# The paths are built from proj_dir so that changing it in the setup cell is
# sufficient, and the encoding is explicit so decoding does not depend on the
# platform default.
with open(os.path.join(proj_dir, 'train-v2.0.json'), encoding='utf-8') as f:
  train = json.load(f)

with open(os.path.join(proj_dir, 'dev-v2.0.json'), encoding='utf-8') as f:
  dev = json.load(f)

In [None]:
# Load train data
# Flatten the nested JSON into parallel lists with one entry per
# (question, answer) pair. A question flagged is_impossible contributes exactly
# one entry with an empty answer text and a (0, 0) character span.
train_contexts = []
train_questions = []
train_questions_id = [] #used for evaluation script
train_is_imp = []
train_answers = []
train_answers_start = []
train_answers_end = []
train_index = []

i = 0  # running position of each entry in the flattened lists

for article in train['data']:
  for paragraph in article['paragraphs']:
    context = paragraph['context']

    for qa in paragraph['qas']:
      question = qa['question']
      question_id = qa['id']  # not called `id`, which would shadow the builtin

      if qa['is_impossible']:
        imp_flag = 1
        answer_spans = [("", 0, 0)]
      else:
        imp_flag = 0
        # The end offset is exclusive: start + len(text).
        answer_spans = [(answer['text'],
                         answer['answer_start'],
                         answer['answer_start'] + len(answer['text']))
                        for answer in qa['answers']]

      for text, start, end in answer_spans:
        train_is_imp.append(imp_flag)
        train_contexts.append(context)
        train_questions.append(question)
        # The id is recorded for every entry, impossible questions included,
        # so train_questions_id stays aligned with the other lists.
        train_questions_id.append(question_id)
        train_answers.append(text)
        train_answers_start.append(start)
        train_answers_end.append(end)
        train_index.append(i)
        i += 1

In [None]:
# Load dev data
# Flatten the nested JSON into parallel lists with one entry per
# (question, answer) pair; a question flagged is_impossible yields a single
# entry with an empty answer and a (0, 0) character span.
dev_contexts = []
dev_questions = []
dev_questions_id = [] #used for evaluation script
dev_is_imp = []
dev_answers = []
dev_answers_start = []
dev_answers_end = []

for article in dev['data']:
  for paragraph in article['paragraphs']:
    context = paragraph['context']

    for qas in paragraph['qas']:
      question, question_id = qas['question'], qas['id']

      if qas['is_impossible']:
        imp_flag = 1
        answer_spans = [("", 0)]
      else:
        imp_flag = 0
        answer_spans = [(answer['text'], answer['answer_start']) for answer in qas['answers']]

      for text, start in answer_spans:
        dev_is_imp.append(imp_flag)
        dev_contexts.append(context)
        dev_questions.append(question)
        dev_questions_id.append(question_id)
        dev_answers.append(text)
        dev_answers_start.append(start)
        # exclusive end offset of the answer inside the context string
        dev_answers_end.append(start + len(text))

In [None]:
# Tokenizer
from nltk.tokenize.regexp import RegexpTokenizer

# With gaps=True the pattern describes the separators, so tokens are the runs
# of characters between matches of this whitespace/punctuation class.
# The pattern is a raw string so that regex escapes such as \s reach the regex
# engine untouched; in a plain string "\s" is an invalid escape sequence and
# triggers a warning on recent Python versions. The hyphen and the square
# brackets are escaped explicitly; the set of matched characters is unchanged.
tkn = RegexpTokenizer(r'[\s!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~\t\n—]', gaps=True)

In [None]:
# Tokenize training set and build vocabulary
# Texts are lower-cased on the fly; the contexts are processed twice, once for
# the token strings and once for the (start, end) character span of each token.
tokenized_train_contexts = tkn.tokenize_sents(ctx.lower() for ctx in train_contexts)
span_train_contexts = list(tkn.span_tokenize_sents(ctx.lower() for ctx in train_contexts))

tokenized_train_questions = tkn.tokenize_sents(qst.lower() for qst in train_questions)

# Vocabulary: every distinct token seen in the training contexts and questions.
vocab = set()
for token_lists in (tokenized_train_contexts, tokenized_train_questions):
  for item in token_lists:
    vocab.update(item)

In [None]:
# Tokenize dev set
# Lower-cased contexts are tokenized twice: once for the token strings and once
# for the (start, end) character span of each token.
tokenized_dev_contexts = tkn.tokenize_sents(ctx.lower() for ctx in dev_contexts)
span_dev_contexts = list(tkn.span_tokenize_sents(ctx.lower() for ctx in dev_contexts))

tokenized_dev_questions = tkn.tokenize_sents(qst.lower() for qst in dev_questions)

In [None]:
# Build word index
# Known words are numbered from 2 upwards, leaving the indices 0 and 1 free.
# The vocabulary is sorted before numbering: iteration order of a set of
# strings changes between interpreter sessions (hash randomisation), so
# numbering the raw set would give every kernel restart a different mapping and
# make previously saved weights or predictions inconsistent with it.
word_index = {voc: idx + 2 for idx, voc in enumerate(sorted(vocab))}

In [None]:
# Build token-answer correspondence training
# For every example build two 0/1 vectors over the context tokens:
#   train_start marks the first token whose span starts at or after the answer
#   start offset; train_end marks the last token, from that one onwards, whose
#   span ends at or before the answer end offset.
# Examples flagged as impossible get all-zero vectors.
train_start = []
train_end = []

for answ_start, answ_end, span_context, is_imp in zip(train_answers_start, train_answers_end, span_train_contexts, train_is_imp):
  answer_start_enc = [0]*len(span_context)
  answer_end_enc = [0]*len(span_context)

  if is_imp != 1:
    start_idx = None
    end_idx = None

    for idx, span in enumerate(span_context):
      if start_idx is None:
        if span[0] >= answ_start:
          # first token of the answer; it is also the provisional last token
          start_idx = idx
          end_idx = idx
      elif span[1] <= answ_end:
        end_idx = idx

    # Compare against None rather than relying on truthiness: index 0 is a
    # valid position (answer ending on the very first token) and must still
    # be marked.
    if start_idx is not None:
      answer_start_enc[start_idx] = 1
      answer_end_enc[end_idx] = 1

  train_start.append(answer_start_enc)
  train_end.append(answer_end_enc)

In [None]:
# Build token-answer correspondence dev
# For every example build two 0/1 vectors over the context tokens:
#   dev_start marks the first token whose span starts at or after the answer
#   start offset; dev_end marks the last token, from that one onwards, whose
#   span ends at or before the answer end offset.
# Examples flagged as impossible get all-zero vectors.
dev_start = []
dev_end = []

for answ_start, answ_end, span_context, is_imp in zip(dev_answers_start, dev_answers_end, span_dev_contexts, dev_is_imp):
  answer_start_enc = [0]*len(span_context)
  answer_end_enc = [0]*len(span_context)

  if is_imp != 1:
    start_idx = None
    end_idx = None

    for idx, span in enumerate(span_context):
      if start_idx is None:
        if span[0] >= answ_start:
          # first token of the answer; it is also the provisional last token
          start_idx = idx
          end_idx = idx
      elif span[1] <= answ_end:
        end_idx = idx

    # Compare against None rather than relying on truthiness: index 0 is a
    # valid position (answer ending on the very first token) and must still
    # be marked.
    if start_idx is not None:
      answer_start_enc[start_idx] = 1
      answer_end_enc[end_idx] = 1

  dev_start.append(answer_start_enc)
  dev_end.append(answer_end_enc)

In [None]:
# The character spans are no longer referenced once the start/end targets have
# been built; drop them (presumably to release memory).
del span_train_contexts, span_dev_contexts

In [None]:
def tokenized_texts_to_sequences(tkn_texts, word_index):
  """Encode tokenized texts as lists of integer ids.

  Args:
    tkn_texts: iterable of token lists, one per text.
    word_index: mapping from token to its integer id.

  Returns:
    A list with one list of ids per input text, in the same order. Tokens that
    are not in word_index are encoded as 1.
  """
  return [[word_index.get(token, 1) for token in tokens] for tokens in tkn_texts]

# Integer encoding of the training and dev contexts/questions, all with the
# same word_index.
train_ctx, train_q, dev_ctx, dev_q = (
  tokenized_texts_to_sequences(token_lists, word_index)
  for token_lists in (
    tokenized_train_contexts,
    tokenized_train_questions,
    tokenized_dev_contexts,
    tokenized_dev_questions,
  )
)

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Pad train sequences and answer encoding
# Each name is rebound to the padded result of its own call, one at a time.
# padding='post' places the padding after the real values. No maxlen is passed,
# so every call is padded independently of the others.
# NOTE(review): train and dev arrays may end up with different widths - confirm
# that this is intended.
train_ctx = pad_sequences(train_ctx, padding='post')
train_q = pad_sequences(train_q, padding='post')
train_start = pad_sequences(train_start, padding='post')
train_end = pad_sequences(train_end, padding='post')

# Pad dev sequences
# Same treatment for the dev inputs and their start/end encodings.
dev_ctx = pad_sequences(dev_ctx, padding='post')
dev_q = pad_sequences(dev_q, padding='post')
dev_start = pad_sequences(dev_start, padding='post')
dev_end = pad_sequences(dev_end, padding='post')

In [None]:
# Convert the is-impossible target lists (train and dev) to numpy arrays
train_is_imp_arr, dev_is_imp_arr = (
  np.array(flags) for flags in (train_is_imp, dev_is_imp)
)

In [None]:
# shuffle train data
import random

# Seed both generators. The permutation below is drawn from NumPy's global
# generator, which random.seed() does not touch; without np.random.seed() the
# shuffle, and with it the samples that end up in the validation split of
# model.fit, differed on every run.
random.seed(42)
np.random.seed(42)

# One permutation applied to every training array keeps the rows aligned.
shuffler = np.random.permutation(train_ctx.shape[0])
train_ctx_shuffled = train_ctx[shuffler]
train_q_shuffled = train_q[shuffler]
train_start_shuffled = train_start[shuffler]
train_end_shuffled = train_end[shuffler]
train_imp_shuffled = train_is_imp_arr[shuffler]

# train_index_shuffled[k] is the original position of the k-th shuffled row.
train_index = np.array(train_index)
train_index_shuffled = train_index[shuffler]

In [None]:
# Preparing embedding
# Parse the GloVe text file into {word: float32 vector}. Each line is split on
# whitespace: the first field is the word, the remaining fields its coefficients.
# The encoding is explicit so that decoding does not depend on the platform
# default, and blank lines are skipped instead of raising on values[0].
embeddings_index = {}
with open('/content/glove.6B.300d.txt', encoding='utf-8') as f:
  for line in f:
    values = line.split()
    if not values:
      continue
    word = values[0]
    coefs = np.array(values[1:], dtype='float32')
    embeddings_index[word] = coefs

In [None]:
# Define embedding matrix
# Row r receives the pre-trained vector of the word whose index is r. Rows that
# are never assigned (indices not used by word_index, and words without a
# pre-trained vector) stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 2, 300))
for word, row in word_index.items():
  if word in embeddings_index:
    embedding_matrix[row] = embeddings_index[word]

In [None]:
# Number of rows the embedding layer needs: one per word in word_index plus
# two extra rows (same count as the first dimension of embedding_matrix).
num_words = len(word_index) + 2

MODEL DEFINITION

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import *
import keras.backend as K

In [None]:
class RepeatConcat(tf.keras.layers.Layer):
  """Tile the second input along axis 1 and concatenate it with the first.

  Called on a pair ``[ctx, q]``. ``q`` is tiled ``K.shape(ctx)[1]`` times along
  axis 1 and the result is concatenated with ``ctx`` on the last axis, with the
  tiled ``q`` features first.

  NOTE(review): assumes ctx is (batch, ctx_len, d_ctx) and q is (batch, 1, d_q)
  so that the tiled q lines up with ctx - confirm against the caller.
  """

  def __init__(self, **kwargs):
    # Forward the standard Layer arguments (name, dtype, trainable, ...) to the
    # base class instead of discarding them, so that a layer rebuilt from
    # get_config() keeps its configuration.
    super(RepeatConcat, self).__init__(**kwargs)

  def compute_mask(self, inputs, mask=None):
    # Propagate the mask of the first input; pass an empty/None mask through.
    if mask:
      return mask[0]
    return mask

  def call(self, inputs, training=None, mask=None):
    # `training` and `mask` are accepted for API compatibility but not used.
    ctx, q = inputs[0], inputs[1]

    # Repeat q once per position on axis 1 of ctx.
    q_repeated = K.tile(q, [1,K.shape(ctx)[1],1])

    return K.concatenate([q_repeated, ctx], axis=-1)

  def get_config(self):
    # No constructor arguments of its own: the base configuration is enough.
    return super().get_config().copy()

In [None]:
class Query2Context(tf.keras.layers.Layer):
  """Attention-weighted sum of the first input, driven by its best match in the second.

  Called on a pair ``[ctx, q]``:
    1. scores = ctx @ q^T,
    2. the maximum score over the last axis is taken for every ctx position,
    3. a softmax over the ctx positions turns these maxima into weights,
    4. the weights are applied to ctx, producing a single weighted-sum vector
       (an axis of length 1 is inserted before the matmul).

  NOTE(review): assumes ctx is (batch, ctx_len, d) and q is (batch, q_len, d),
  giving an output of (batch, 1, d) - confirm against the caller.
  NOTE(review): the mask of q (mask[1]) is never used, so masked q positions
  take part in the max; compute_mask also returns the ctx mask although the
  output has a single step on axis 1 - confirm both are intended.
  """

  def __init__(self, **kwargs):
    # Forward the standard Layer arguments (name, dtype, trainable, ...) to the
    # base class instead of discarding them, so that a layer rebuilt from
    # get_config() keeps its configuration.
    super(Query2Context, self).__init__(**kwargs)

  def compute_mask(self, inputs, mask=None):
    # Propagate the mask of the first input; pass an empty/None mask through.
    if mask:
      return mask[0]
    return mask

  def call(self, inputs, training=None, mask=None):
    # `training` is accepted for API compatibility but not used.
    ctx = inputs[0]
    q = inputs[1]
    # mask[0] is the mask of ctx (the first input). It matches the per-position
    # maxima below, which have one entry per ctx position.
    ctx_mask = mask[0] if mask else None

    scores = tf.matmul(ctx, q, transpose_b=True)
    best_score = tf.math.reduce_max(scores, axis=-1)
    if ctx_mask is not None:
      padding_mask = tf.logical_not(ctx_mask)
      # Bias so padding positions do not contribute to attention distribution.
      # Note 65504. is the max float16 value.
      if scores.dtype == tf.float16:
        best_score -= 65504. * tf.cast(padding_mask, dtype=scores.dtype)
      else:
        best_score -= 1.e9 * tf.cast(padding_mask, dtype=scores.dtype)

    weights = tf.nn.softmax(best_score, axis=-1)
    # Insert an axis so the weights act as a single row in the matmul with ctx.
    weights = tf.expand_dims(weights, -2)
    return tf.matmul(weights, ctx)

  def get_config(self):
    # No constructor arguments of its own: the base configuration is enough.
    return super().get_config().copy()

In [None]:
# define input layers
# Variable-length sequences of word indices; the layer names are the keys used
# when feeding data by name.
question_input = keras.Input(shape=(None,), name="question")
context_input = keras.Input(shape=(None,), name="context")

# embedding
# One frozen embedding layer, initialised from embedding_matrix, is shared by
# the question and the context; mask_zero=True makes index 0 a masked value.
token_emb = Embedding(num_words, 300, weights=[embedding_matrix], trainable=False, mask_zero=True)
q_emb = token_emb(question_input)
q_emb = Dropout(0.4)(q_emb)
c_emb = token_emb(context_input)
c_emb = Dropout(0.4)(c_emb)

# encoder
# Separate bidirectional GRU encoders for the question and the context.
q_encoding = Bidirectional(GRU(units=64, return_sequences=True, dropout=0.1))(q_emb)
c_encoding = Bidirectional(GRU(units=64, return_sequences=True, dropout=0.1))(c_emb)

# combination
# c2q: the context encoding attends over the question encoding.
# q2c: one vector built from the context by Query2Context, repeated along the
# context by RepeatConcat and concatenated with [c_encoding, c2q].
c2q, scores = Attention()([c_encoding, q_encoding], return_attention_scores=True)
q2c = Query2Context()([c_encoding, q_encoding])
concat = Concatenate(axis=-1)([c_encoding, c2q])
concat = RepeatConcat()([concat,q2c])

# decoding
# A sequence-level GRU followed by a GRU that returns only its final output,
# then a small dense head ending in a sigmoid for the is_impossible target.
decoding1 = Bidirectional(GRU(units=64, return_sequences=True, dropout=0.1))(concat)
imp_predictor = Bidirectional(GRU(units=64, dropout=0.1))(decoding1)
imp_predictor = Dense(100, activation='relu')(imp_predictor)
imp_predictor = Dropout(0.1)(imp_predictor)
imp_predictor = Dense(20, activation='relu')(imp_predictor)
imp_predictor = Dropout(0.1)(imp_predictor)
imp_predictor = Dense(1, activation='sigmoid', name="is_impossible")(imp_predictor)

# A functional model needs its inputs declared explicitly. The order
# (question first, then context) matches the positional list that the
# prediction cell passes to model.predict.
model = keras.Model(
    inputs=[question_input, context_input],
    outputs=[imp_predictor]
)

In [None]:
# Binary cross-entropy on the "is_impossible" output, optimised with Adam
# at a learning rate of 1e-3.
model.compile(
    loss={"is_impossible": keras.losses.BinaryCrossentropy()},
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
)

In [None]:
# Draw a diagram of the model's layer graph.
keras.utils.plot_model(model)

In [None]:
# Print the textual summary of the model.
model.summary()

In [None]:
# apply early stopping
import time

# One TensorBoard log directory per run, made unique with the current timestamp.
logdir = os.path.join(os.curdir, "logs", "run_{}".format(time.time()))

# restore_best_weights=True: training only stops after `patience` epochs
# without improvement, so without it the in-memory model that is later used
# for prediction would keep the weights of the last epoch rather than those
# of the best one.
callbacks = [keras.callbacks.TensorBoard(logdir),
             keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
             keras.callbacks.ModelCheckpoint("qa_attention_model.h5",save_best_only=True)]

In [None]:
# Train on the shuffled arrays, feeding inputs and target by layer name;
# validation_split=0.2 sets a fifth of the samples aside for validation.
history = model.fit(
    x=dict(context=train_ctx_shuffled, question=train_q_shuffled),
    y=dict(is_impossible=train_imp_shuffled),
    batch_size=512,
    epochs=30,
    validation_split=0.2,
    callbacks=callbacks,
)

In [None]:
# Feed the dev inputs by Input-layer name, exactly as model.fit does, so the
# result does not depend on the order in which the model's inputs are declared.
prediction = model.predict({"question": dev_q, "context": dev_ctx})

In [None]:
# Store the dev-set predictions in the Drive project folder; the path is built
# from proj_dir so it follows the directory configured in the setup cell.
np.save(os.path.join(proj_dir, 'is_imp.npy'), prediction)

In [None]:
# USE MODEL 5 TO PREDICT AFTER FILTERING