In [1]:
! git clone -q https://github.com/AUT-NLP/PQuAD.git
! pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import json
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.utils import to_categorical

from sklearn.metrics import f1_score

from transformers import AutoTokenizer, TFAutoModel

In [3]:
max_length = 128
doc_stride = 16
model_name = "HooshvareLab/bert-base-parsbert-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

## Loading Dataset

In [4]:
def simplify_dataset(dataset):
    simplified_data = []

    for document in dataset['data']:
        for paragraph in document['paragraphs']:
            for qa in paragraph['qas']:
                entry = {
                    'id': qa['id'],
                    'title': document['title'],
                    'context': paragraph['context'],
                    'question': qa['question'],
                    'is_impossible': qa['is_impossible']
                }
                if not qa['is_impossible']:
                    entry['answer'] = {
                        'text': qa['answers'][0]['text'],
                        'answer_start': qa['answers'][0]['answer_start']
                    }

                simplified_data.append(entry)

    return simplified_data

In [5]:
def load_pquad():

    with open('PQuAD/Dataset/Train.json') as f:
        train = json.load(f)
        train = simplify_dataset(train)

    with open('PQuAD/Dataset/Validation.json') as f:
        validation = json.load(f)
        validation = simplify_dataset(validation)

    with open('PQuAD/Dataset/Test.json') as f:
        test = json.load(f)
        test = simplify_dataset(test)

    return train, validation, test

In [6]:
train, validation, test = load_pquad()

## Dataset Statistics

In [7]:
def extract_dataset_statistics(dataset):
    tmp = pd.json_normalize(dataset)

    question_count = len(tmp)
    print('total question count =', question_count)

    impossible_answers = np.sum(tmp['is_impossible'] == True)
    answerability = (1 - impossible_answers / question_count) * 100
    print('impossible to answer =', impossible_answers)
    print(f'answerability = {answerability:.1f} %\n')

    context_word_count = tmp['context'].str.count(' ') + 1
    print('min context words =', int(context_word_count.min()))
    print('max context words =', int(context_word_count.max()))
    print(f'avg context words = {context_word_count.mean():.1f}\n')

    question_word_count = tmp['question'].str.count(' ') + 1
    print('min question words =', int(question_word_count.min()))
    print('max question words =', int(question_word_count.max()))
    print(f'avg question words = {question_word_count.mean():.1f}\n')

    answer_word_count = tmp['answer.text'].str.count(' ') + 1
    print('min answer words =', int(answer_word_count.min()))
    print('max answer words =', int(answer_word_count.max()))
    print(f'avg answer words = {answer_word_count.mean():.1f}\n')

    del tmp

In [8]:
extract_dataset_statistics(train)

total question count = 63994
impossible to answer = 15721
answerability = 75.4 %

min context words = 7
max context words = 274
avg context words = 129.1

min question words = 2
max question words = 256
avg question words = 10.4

min answer words = 1
max answer words = 127
avg answer words = 5.2



In [9]:
extract_dataset_statistics(validation)

total question count = 7976
impossible to answer = 1981
answerability = 75.2 %

min context words = 15
max context words = 256
avg context words = 125.3

min question words = 2
max question words = 51
avg question words = 10.7

min answer words = 1
max answer words = 157
avg answer words = 6.3



In [10]:
extract_dataset_statistics(test)

total question count = 8002
impossible to answer = 1914
answerability = 76.1 %

min context words = 17
max context words = 328
avg context words = 128.2

min question words = 2
max question words = 58
avg question words = 10.9

min answer words = 1
max answer words = 105
avg answer words = 5.5



## Preprocessing

In [11]:
def preprocess_element(entry):

    tokenized_entry = tokenizer(
        entry["question"],
        entry["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_entry.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_entry.pop("offset_mapping")

    tokenized_entry["start_positions"] = []
    tokenized_entry["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):

        input_ids = tokenized_entry["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_entry.sequence_ids(i)

        # if no answers are given, set the cls_index as answer
        if entry['is_impossible']:
            tokenized_entry["start_positions"].append(cls_index)
            tokenized_entry["end_positions"].append(cls_index)

        else:

            answer = entry["answer"]
            start_char = answer["answer_start"]
            end_char = start_char + len(answer["text"])

            # find index of the first context token
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # find index of the last context token
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # if the answer is out of the span it's impossible to answer
            if (start_char < offsets[token_start_index][0] or
                offsets[token_end_index][1] < end_char):
                
                tokenized_entry["start_positions"].append(cls_index)
                tokenized_entry["end_positions"].append(cls_index)

            else:

                while (token_start_index < len(offsets) and
                    offsets[token_start_index][0] <= start_char):
                    token_start_index += 1

                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1

                tokenized_entry["start_positions"].append(token_start_index - 1)
                tokenized_entry["end_positions"].append(token_end_index + 1)

    return tokenized_entry

In [12]:
def reformat_dataset(dataset):

    X = {
        'input_ids': np.stack(dataset['input_ids']),
        'token_type_ids': np.stack(dataset['token_type_ids']),
        'attention_mask': np.stack(dataset['attention_mask']),
    }

    Y = {
        'start_positions': np.stack(dataset['start_positions']),
        'end_positions': np.stack(dataset['end_positions']),
    }

    return X, Y

In [13]:
def preprocess_dataset(dataset):
    X = []

    for entry in dataset:
        x = preprocess_element(entry)
        for i in range(len(x['input_ids'])):
            X.append({
                'input_ids': np.array(x['input_ids'][i], dtype=np.int32),
                'token_type_ids': np.array(x['token_type_ids'][i], dtype=np.int32),
                'attention_mask': np.array(x['attention_mask'][i], dtype=np.int32),
                'start_positions': to_categorical(x['start_positions'][i], num_classes=max_length),
                'end_positions': to_categorical(x['end_positions'][i], num_classes=max_length)
            })

    return reformat_dataset(pd.DataFrame(X))

In [14]:
X_train, Y_train = preprocess_dataset(train)
X_val, Y_val = preprocess_dataset(validation)
X_test, Y_test = preprocess_dataset(test)

## Model

first, we need to load the pretrained base model into the tensorflow environment

In [15]:
base_model = TFAutoModel.from_pretrained(model_name)

Downloading tf_model.h5:   0%|          | 0.00/963M [00:00<?, ?B/s]

Some layers from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [16]:
def build_model():

    input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    token_type_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="token_type_ids")
    attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

    encoded = base_model(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask
    )[0]

    start_positions = tf.keras.layers.Dense(1)(encoded)
    start_positions = tf.keras.layers.Flatten()(start_positions)
    start_positions = tf.keras.layers.Softmax(name='start_positions')(start_positions)

    end_positions = tf.keras.layers.Dense(1)(encoded)
    end_positions = tf.keras.layers.Flatten()(end_positions)
    end_positions = tf.keras.layers.Softmax(name='end_positions')(end_positions)

    model = tf.keras.models.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_positions, end_positions]
    )

    return model

In [17]:
def custom_loss_function(y_true, y_pred):

    loss_1 = tf.keras.losses.categorical_crossentropy(y_true[0], y_pred[0])
    loss_2 = tf.keras.losses.categorical_crossentropy(y_true[1], y_pred[1])

    return loss_1 + loss_2

In [18]:
model = build_model()

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=custom_loss_function)

In [19]:
model.fit(X_train, Y_train,
    validation_data=(X_val, Y_val),
    batch_size=40, epochs=2)

Epoch 1/2




Epoch 2/2


<keras.callbacks.History at 0x7f33f0660bb0>

## Performance Metrics

In [32]:
model.evaluate(X_test, Y_test)



[4.1288628578186035, 2.164177417755127, 1.9646871089935303]

In [35]:
Y_pred = model.predict(X_test)



In [43]:
def compute_exact_match(Y_test, Y_pred):
    start_pred = np.argmax(Y_pred[0], axis=1)
    end_pred = np.argmax(Y_pred[1], axis=1)

    start_true = np.argmax(Y_test['start_positions'], axis=1)
    end_true = np.argmax(Y_test['end_positions'], axis=1)

    correct_predictions = 0
    total_questions = len(start_pred)

    for i in range(total_questions):
        if start_true[i] == start_pred[i] and end_true[i] == end_pred[i]:
            correct_predictions += 1

    accuracy = (correct_predictions / total_questions) * 100
    return accuracy

In [51]:
def mean_f1_score(Y_test, Y_pred):
    start_pred = np.argmax(Y_pred[0], axis=1)
    end_pred = np.argmax(Y_pred[1], axis=1)

    start_true = np.argmax(Y_test['start_positions'], axis=1)
    end_true = np.argmax(Y_test['end_positions'], axis=1)

    return 50 * (f1_score(start_true, start_pred, average='micro') + f1_score(end_true, end_pred, average='micro'))

In [52]:
print(f'exact match = {compute_exact_match(Y_test, Y_pred):.1f} %')
print(f'f1 score = {mean_f1_score(Y_test, Y_pred):.1f} %')

exact match = 67.8 %
f1 score = 74.4 %


## Example Tests

In [20]:
def display_example(i):

    print('question :', test[i]['question'])

    example = [test[i]]
    X_example, Y_example = preprocess_dataset(example)

    res = model(X_example)
    start = np.argmax(res[0][0])
    end = np.argmax(res[1][0])

    print('answer :', tokenizer.decode(X_example['input_ids'][0][start:end+1]))

In [21]:
display_example(0)

question : کتاب مقدس دین اسلام چیست؟
answer : قران


In [33]:
display_example(5)

question : قرآن به چه زبانی است؟
answer : عربی


In [22]:
display_example(10)

question : وحی چیست؟
answer : [CLS]


In [23]:
display_example(100)

question : از آغاز تأسیس دارالفنون چه کسانی به مخالفت با آن پرداختند؟
answer : برخی از درباریان


In [27]:
display_example(250)

question : افزوده شدن شمشیر به نقش شیر و خورشید به چه زمانی برمی‌گردد؟
answer : دوران قاجار
