In [1]:
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig


In [2]:
# Locations of the question-answering dataset splits.
path = 'dataset/'

train_path = path + 'QA_train.json'

eval_path = path + 'QA_test.json'

# Fixed length (in tokens) that every encoded example is padded to.
max_len = 512

configuration = BertConfig()  # default parameters and configuration for BERT

# NOTE(review): the sample text used later in this notebook is Persian while
# "bert-base-uncased" is presumably an English checkpoint - confirm that this
# is the intended model/vocabulary.

# Save the slow pretrained tokenizer so that its vocabulary file is on disk.
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",max_length=max_len)
save_path = "bert_base_uncased/"
# exist_ok=True replaces the check-then-create pattern and keeps the cell
# safe to re-run.
os.makedirs(save_path, exist_ok=True)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from the vocabulary written above. The path is
# derived from save_path so the two cannot drift apart.
tokenizer = BertWordPieceTokenizer(
    os.path.join(save_path, "vocab.txt"),
    lowercase=True,
    clean_text=True,
    strip_accents=True,
)


In [3]:
# Punctuation that string.punctuation (ASCII only) does not cover: Arabic
# comma, semicolon and question mark, guillemets, Arabic percent sign,
# decimal and thousands separators, and the horizontal ellipsis.
_EXTRA_PUNCTUATION = "\u060c\u061b\u061f\u00ab\u00bb\u066a\u066b\u066c\u2026"
_PUNCTUATION = frozenset(string.punctuation + _EXTRA_PUNCTUATION)

# Whole-word occurrences of the three function words removed before comparing
# answers. Compiled once instead of on every call.
_FUNCTION_WORDS_RE = re.compile(r"\b(و|با|در)\b", re.UNICODE)


def normalize_text(text):
    """Return a canonical form of ``text`` for exact-match comparison.

    The text is lower-cased, ASCII and Persian/Arabic punctuation characters
    are dropped, the whole words matched by ``_FUNCTION_WORDS_RE`` are
    replaced by a space, and runs of whitespace are collapsed to single
    spaces (leading/trailing whitespace is removed).

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; ``""`` for empty or whitespace-only input.
    """
    text = text.lower()

    # Remove punctuation (ASCII plus the extra characters listed above).
    text = "".join(ch for ch in text if ch not in _PUNCTUATION)

    # Remove the function words.
    text = _FUNCTION_WORDS_RE.sub(" ", text)

    # Remove extra white space.
    return " ".join(text.split())


class ExactMatch(keras.callbacks.Callback):
    """Keras callback that prints the exact-match score after every epoch.

    After each epoch the model predicts start/end distributions for
    ``x_eval``; the arg-max token positions are mapped back to a character
    span of the example's context, and the normalised predicted text is
    compared with the normalised reference answers.

    Args:
        x_eval: Model inputs for the evaluation examples.
        y_eval: Evaluation targets; ``len(y_eval[0])`` is used as the number
            of evaluated examples.
        squad_examples: Optional list of example objects exposing ``skip``,
            ``context``, ``context_token_to_char`` and ``all_answers``. When
            omitted, the notebook-level ``eval_squad_examples`` is looked up
            at the end of each epoch, as before.
    """

    def __init__(self, x_eval, y_eval, squad_examples=None):
        # Let the base class set up the attributes Keras expects on callbacks.
        super().__init__()
        self.x_eval = x_eval
        self.y_eval = y_eval
        self.squad_examples = squad_examples

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print the exact-match score for the finished epoch."""
        pred_start, pred_end = self.model.predict(self.x_eval)

        examples = self.squad_examples
        if examples is None:
            # Backward-compatible fallback to the notebook global.
            examples = eval_squad_examples
        # Only non-skipped examples were turned into model inputs.
        examples_no_skip = [eg for eg in examples if not eg.skip]

        count = 0
        for squad_eg, start_scores, end_scores in zip(
            examples_no_skip, pred_start, pred_end
        ):
            offsets = squad_eg.context_token_to_char
            start = np.argmax(start_scores)
            end = np.argmax(end_scores)
            # A predicted start outside the context tokens cannot be decoded.
            if start >= len(offsets):
                continue
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                # End falls outside the context tokens: take the rest of it.
                pred_ans = squad_eg.context[pred_char_start:]

            normalized_pred_ans = normalize_text(pred_ans)
            normalized_true_ans = [
                normalize_text(answer) for answer in squad_eg.all_answers
            ]

            if normalized_pred_ans in normalized_true_ans:
                count += 1

        total = len(self.y_eval[0])
        # Avoid ZeroDivisionError when the evaluation set is empty.
        acc = count / total if total else 0.0
        print(f"\nepoch={epoch+1}, exact match score={acc:.2f}")

In [4]:
class TokenizedData:
    """One question/answer example and the model features derived from it.

    The constructor only stores the raw fields. ``preprocessing()`` tokenizes
    them with the module-level ``tokenizer`` and either fills in the feature
    attributes (``input_ids``, ``token_type_ids``, ``attention_mask``,
    ``start_token_idx``, ``end_token_idx``, ``context_token_to_char``) or sets
    ``skip`` to True when the example cannot be used.
    """

    def __init__(self, question, context, start_char_idx, answer_text, all_answers):
        """Store the raw example.

        Args:
            question: Question text.
            context: Passage text.
            start_char_idx: Character offset at which the answer starts.
            answer_text: Text of the answer used for the span labels.
            all_answers: All reference answer texts for the question.
        """
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx

        self.answer_text = answer_text
        self.all_answers = all_answers
        # Set to True by preprocessing() when the example must be dropped.
        self.skip = False

    def preprocessing(self):
        """Tokenize the example and build padded model inputs.

        Sets ``self.skip = True`` and returns early when the answer span does
        not fit inside the context, when no token overlaps the answer, or
        when the encoded example is longer than ``max_len``.
        """
        start_char_idx = self.start_char_idx

        # Clean context, answer and question (collapse runs of whitespace).
        context = " ".join(str(self.context).split())
        question = " ".join(str(self.question).split())
        answer = " ".join(str(self.answer_text).split())

        # Exclusive end offset of the answer inside the context.
        end_char_idx = start_char_idx + len(answer)
        # A span ending exactly at len(context) is still inside the context;
        # reject only spans that start before 0 (which would wrap around via
        # negative indexing) or that run past the end.
        if start_char_idx < 0 or end_char_idx > len(context):
            self.skip = True
            return

        # NOTE(review): the context is cut right after the answer, so the
        # answer is always the final part of what the model sees. This uses
        # the gold answer position at both training and inference time -
        # confirm this is intended.
        context = context[0:end_char_idx]

        # Mark the character indexes in context that are in answer.
        is_char_in_ans = [0] * len(context)
        for idx in range(start_char_idx, end_char_idx):
            is_char_in_ans[idx] = 1

        # Tokenize context.
        tokenized_context = tokenizer.encode(context)

        # Find tokens whose character range overlaps the answer characters.
        ans_token_idx = [
            idx
            for idx, (start, end) in enumerate(tokenized_context.offsets)
            if sum(is_char_in_ans[start:end]) > 0
        ]

        if len(ans_token_idx) == 0:
            self.skip = True
            return

        # First and last token index covering the answer.
        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        # Tokenize question.
        tokenized_question = tokenizer.encode(question)

        # Create inputs: all context ids followed by the question ids without
        # their first id; segment 0 for the context part, 1 for the question.
        question_ids = tokenized_question.ids[1:]
        input_ids = tokenized_context.ids + question_ids
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        # Pad up to max_len; skip the example if truncation would be needed.
        padding_length = max_len - len(input_ids)

        if padding_length > 0:  # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:  # skip
            self.skip = True
            return

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx
        # Per-token (start, end) character offsets of the (truncated) context.
        self.context_token_to_char = tokenized_context.offsets


# JSON text is UTF-8 by specification; decode explicitly instead of relying
# on the platform's default locale encoding.
with open(train_path, encoding="utf-8") as f:
    raw_train_data = json.load(f)

with open(eval_path, encoding="utf-8") as f:
    raw_eval_data = json.load(f)


def data_parser(raw_data):
    """Turn a SQuAD-style dictionary into preprocessed ``TokenizedData`` items.

    Walks ``raw_data["data"][*]["paragraphs"][*]["qas"][*]``. Questions that
    are flagged ``is_impossible`` or that carry no answers are ignored. For
    every remaining question the first answer provides the span label and
    all answer texts are kept as references.

    Args:
        raw_data: Parsed dataset dictionary with a top-level ``"data"`` list.

    Returns:
        A list of ``TokenizedData`` objects on which ``preprocessing()`` has
        already been called (including those whose ``skip`` flag is set).
    """
    squad_examples = []
    for item in raw_data["data"]:
        for para in item["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                # A missing flag is treated as "answerable" so that datasets
                # without the key can still be parsed.
                if qa.get("is_impossible", False):
                    continue

                answers = qa.get("answers") or []
                if not answers:
                    # Nothing to label the span with.
                    continue

                first_answer = answers[0]
                squad_eg = TokenizedData(
                    qa["question"],
                    context,
                    first_answer["answer_start"],
                    first_answer["text"],
                    [answer["text"] for answer in answers],
                )
                squad_eg.preprocessing()
                squad_examples.append(squad_eg)
    return squad_examples


def create_inputs(squad_examples):
    """Stack the features of all non-skipped examples into NumPy arrays.

    Args:
        squad_examples: Iterable of example objects. Each has a ``skip`` flag
            and, when not skipped, the attributes ``input_ids``,
            ``token_type_ids``, ``attention_mask``, ``start_token_idx`` and
            ``end_token_idx``.

    Returns:
        A pair ``(x, y)`` where ``x`` is
        ``[input_ids, token_type_ids, attention_mask]`` and ``y`` is
        ``[start_token_idx, end_token_idx]``, each entry being a NumPy array
        with one row/element per kept example.
    """
    feature_names = (
        "input_ids",
        "token_type_ids",
        "attention_mask",
        "start_token_idx",
        "end_token_idx",
    )

    # Examples flagged as skipped never received their feature attributes.
    kept = [example for example in squad_examples if not example.skip]

    arrays = {
        name: np.array([getattr(example, name) for example in kept])
        for name in feature_names
    }

    x = [arrays[name] for name in feature_names[:3]]
    y = [arrays[name] for name in feature_names[3:]]
    return x, y


# Parse both splits, then convert each one into model-ready arrays.
train_squad_examples = data_parser(raw_train_data)
eval_squad_examples = data_parser(raw_eval_data)

x_train, y_train = create_inputs(train_squad_examples)
x_eval, y_eval = create_inputs(eval_squad_examples)

# Number of usable (non-skipped) examples: evaluation first, then training.
for split_inputs in (x_eval, x_train):
    print(len(split_inputs[0]))

469
4479


In [5]:
class BertBasedModel:
    """Factory for the BERT-based span-prediction (question answering) model."""
    
    def __init__(self):
        """No state is kept; the instance only exists to call model_builder()."""
        pass
    
    def model_builder(self):
        """Build, compile and return the Keras question-answering model.

        The model takes three int32 inputs of length ``max_len`` (input ids,
        token type ids, attention mask) and returns two softmax
        distributions over the ``max_len`` positions: one for the answer
        start and one for the answer end. ``model.summary()`` is printed as
        a side effect.

        Returns:
            The compiled ``keras.Model``.
        """

        ## BERT encoder
        encoder = TFBertModel.from_pretrained("bert-base-uncased")
        # Marks the first sub-layer of the loaded encoder as non-trainable.
        # NOTE(review): for TFBertModel this is presumably the main BERT
        # layer, which would freeze the whole encoder and leave only the two
        # Dense heads trainable - confirm that this is intended.
        encoder.layers[0].trainable = False


        ## QA Model
        input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
        token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
        attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
        # Only the first element of the encoder output is used; presumably
        # the per-token hidden states - TODO confirm.
        embedding = encoder(
            input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
        )[0]

        # One bias-free score per position for the answer start ...
        start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
        start_logits = layers.Flatten()(start_logits)

        # ... and one per position for the answer end.
        end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
        end_logits = layers.Flatten()(end_logits)

        # Softmax over positions turns the scores into probabilities.
        start_probs = layers.Activation(keras.activations.softmax)(start_logits)
        end_probs = layers.Activation(keras.activations.softmax)(end_logits)

        model = keras.Model(
            inputs=[input_ids, token_type_ids, attention_mask],
            outputs=[start_probs, end_probs],
        )

        
       
        # The outputs are already probabilities, hence from_logits=False.
        # The same loss object is applied to both outputs.
        loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
        optimizer = keras.optimizers.Adam(learning_rate=5e-5)
        # NOTE(review): run_eagerly=True is typically slower than graph
        # execution - confirm it is still needed.
        model.compile(optimizer=optimizer, loss=[loss, loss], run_eagerly=True)

        model.summary()

        return model

In [6]:
# If trained weights are already on disk, run a single inference demo and
# write the result to answer.txt; otherwise train the model and save its
# weights so that the next run takes the inference branch.
if os.path.exists("model_weights.h5"):

    model_class = BertBasedModel()
    model = model_class.model_builder()
    model.load_weights('model_weights.h5')

    context = 'من محمد ایزدی هستم. من دانشجوی ارشد رشته هوش مصنوعی هستم.'
    question = 'رشته من چیست؟'
    start_char_idx = 41  # character offset of answer_text inside context
    answer_text = 'هوش مصنوعی'
    all_answers = []

    # NOTE(review): TokenizedData.preprocessing() cuts the context at the end
    # of the known answer, so this demo depends on the gold answer position -
    # confirm this is acceptable for an inference example.
    squad_eg = TokenizedData(
        question, context, start_char_idx, answer_text, all_answers
    )
    squad_eg.preprocessing()

    if squad_eg.skip:
        # No features were produced, so there is nothing to feed the model.
        print('the example could not be preprocessed, so no prediction was made')
    else:
        x_test, y_test = create_inputs([squad_eg])

        pred_start, pred_end = model.predict(x_test)

        offsets = squad_eg.context_token_to_char
        start = np.argmax(pred_start)
        end = np.argmax(pred_end)

        if start >= len(offsets):
            # The predicted start is outside the context tokens. Report it
            # without calling quit(), which would shut down the notebook kernel.
            print('sorry the answer was not good enough :(')
        else:
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                # End falls outside the context tokens: take the rest of it.
                pred_ans = squad_eg.context[pred_char_start:]

            print(pred_ans)
            # The report contains Persian text, so write it as UTF-8
            # regardless of the platform's default encoding.
            with open('answer.txt', 'w', encoding='utf-8') as f:
                f.write('متن: '+ context +'\n')
                f.write('جواب صحیح: '+ answer_text +'\n')
                f.write('جواب مدل: '+ pred_ans +'\n')

else:
    exact_match_callback = ExactMatch(x_eval, y_eval)
    model_class = BertBasedModel()
    model = model_class.model_builder()
    model.fit(
        x_train,
        y_train,
        epochs=3,
        verbose=1,
        # This batch size is not ideal; it was chosen only so that training
        # runs on a weak machine. Prefer setting it to 64.
        batch_size=8,
        callbacks=[exact_match_callback],
    )

    model.save_weights('model_weights.h5')

    


2022-01-30 12:02:42.436273: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoi

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_3[0][0]',            