# Installing libraries

Installing HuggingFace Transformers (https://github.com/huggingface/transformers) together with scikit-learn, PyTorch and pandas

In [None]:
pip install transformers scikit-learn torch pandas



Importing the libraries


In [6]:
import json
from pathlib import Path
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast 
import torch

# Dataset processing

Loading the dataset, splitting the data into train, validation and test sets

In [64]:
import pandas as pd

# Read the raw JSON file; `data` holds the parsed top-level object.
path = Path('../data/original.json')
data = json.loads(path.read_text(encoding='utf-8'))
df = pd.DataFrame(data)

# Turn the entries of the 'data' column into a frame of their own
# (presumably one dict per entry, so each key becomes a column -- confirm
# against the file), then put every other top-level column next to it.
df1 = pd.DataFrame(df['data'].values.tolist())
col = df.columns.difference(['data'])
df = pd.concat([df[col], df1], axis=1)

In [97]:
# Flatten the per-row 'paragraphs' lists into one list with a single entry per
# paragraph. Note that `data` is rebound here: it no longer refers to the
# parsed JSON object loaded earlier.
data = df['paragraphs'].explode().tolist()

In [98]:
# Fixed seed so that the shuffled split is identical on every
# Restart Kernel -> Run All (without it each run yields different sets).
SPLIT_SEED = 42

# 70% train; the remaining 30% is halved into validation and test (15% each).
train, temp = train_test_split(data, test_size=0.3, shuffle=True, random_state=SPLIT_SEED)
val, test = train_test_split(temp, test_size=0.5, shuffle=True, random_state=SPLIT_SEED)

Getting contexts, questions and answers from the train and validation sets

In [100]:
def read_set(set):
    """Flatten paragraph groups into three parallel lists.

    Args:
        set: iterable of mappings, each with a 'context' value and a 'qas'
            iterable; every item of 'qas' has a 'question' value and an
            'answers' iterable.

    Returns:
        A ``(contexts, questions, answers)`` tuple of lists of equal length
        with one entry per answer. The context and the question are repeated
        for each of their answers; the answer objects are appended as-is
        (not copied).
    """
    rows = []
    for group in set:
        context = group['context']
        for qa in group['qas']:
            question = qa['question']
            rows.extend((context, question, answer) for answer in qa['answers'])

    # Split the (context, question, answer) triples back into columns.
    contexts = [row[0] for row in rows]
    questions = [row[1] for row in rows]
    answers = [row[2] for row in rows]
    return contexts, questions, answers

# Flatten the train and validation splits into parallel context / question /
# answer lists (one entry per answer).
train_contexts, train_questions, train_answers = read_set(train)
val_contexts, val_questions, val_answers = read_set(val)

Adding span tags to answers and contexts

In [101]:
def add_tags(answers, contexts):
    """Read the span fields of every answer that is paired with a context.

    For each ``(answer, context)`` pair the 'text', 'answer_start' and
    'answer_end' keys of the answer are looked up, in that order. Nothing is
    stored or modified and the function returns ``None``; the only observable
    effect is a ``KeyError`` when one of the keys is missing.

    NOTE(review): no tags are actually added here and the context is never
    used -- confirm whether span-alignment logic was meant to live in this
    function.

    Args:
        answers: iterable of answer mappings.
        contexts: iterable paired with ``answers``; iteration stops at the
            shorter of the two.
    """
    span_fields = ('text', 'answer_start', 'answer_end')
    for answer, _context in zip(answers, contexts):
        for field in span_fields:
            _ = answer[field]  # lookup only; raises KeyError if absent

# Run the span-field pass over both splits. As written, add_tags only reads
# the fields, so these calls leave the answers and contexts unchanged.
add_tags(train_answers, train_contexts)
add_tags(val_answers, val_contexts)

# Tokenization and vectorization

Initializing BertTokenizerFast from HuggingFace for the BERT base multilingual cased pre-trained model

In [102]:
# Fast tokenizer for the 'bert-base-multilingual-cased' checkpoint.
# NOTE(review): from_pretrained presumably fetches the vocabulary files on
# first use, so this cell may need network access -- confirm.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

Tokenizing and vectorizing questions and contexts with BertTokenizerFast

In [103]:
# Encode (context, question) pairs: the context is passed as the first
# sequence and the question as the second. Truncation and padding are enabled
# and no explicit max_length is given.
# NOTE(review): add_positions later maps character offsets of the context to
# tokens, which presumably relies on the context being the first sequence --
# confirm before changing the argument order.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

# Preparing the data for training

Adding token positions to answers

In [104]:
def add_positions(encodings, answers):
    """Attach answer token positions to ``encodings`` in place.

    For the i-th answer, ``encodings.char_to_token`` is called with ``i`` and
    the answer's 'answer_start' / 'answer_end' character offsets. The results
    are stored under the 'start_positions' and 'end_positions' keys via
    ``encodings.update``. Whatever ``char_to_token`` returns is stored
    unchanged.

    NOTE(review): 'answer_end' is passed as-is; if it is an exclusive offset
    the lookup would presumably need ``answer_end - 1`` -- confirm against the
    dataset.

    Args:
        encodings: tokenizer output providing ``char_to_token`` and ``update``.
        answers: sequence of mappings with 'answer_start' and 'answer_end',
            aligned by index with the encoded examples.
    """
    starts, ends = [], []
    for row, answer in enumerate(answers):
        starts.append(encodings.char_to_token(row, answer['answer_start']))
        ends.append(encodings.char_to_token(row, answer['answer_end']))
    encodings.update({'start_positions': starts, 'end_positions': ends})

# Adds 'start_positions' / 'end_positions' to both encodings in place.
add_positions(train_encodings, train_answers)
add_positions(val_encodings, val_answers)

Adapting the data for training with PyTorch

In [107]:
class Dataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset over tokenizer encodings.

    ``encodings`` is a mapping from feature name (e.g. 'input_ids',
    'start_positions') to a per-example sequence of values; it must contain
    an 'input_ids' entry, which defines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of int64 tensors, one per feature.

        A value of ``None`` is replaced by the sentinel ``-1``.
        """
        item = {}
        for key, values in self.encodings.items():
            value = values[idx]
            # Test for None explicitly: the previous `value or -1` also
            # rewrote a legitimate 0 (falsy) to -1.
            item[key] = torch.tensor(-1 if value is None else value, dtype=torch.int64)
        return item

    def __len__(self):
        """Number of examples, taken from the 'input_ids' feature."""
        # Key access works for plain dicts as well as tokenizer outputs that
        # also expose features as attributes.
        return len(self.encodings['input_ids'])

# Wrap both encodings (including the added position labels) for use with
# PyTorch data loading.
train_dataset = Dataset(train_encodings)
val_dataset = Dataset(val_encodings)


In [106]:
import pickle

# Persist both datasets with pickle.
# NOTE(review): Dataset is defined in this notebook, so loading these files
# presumably requires the same class to be defined by the consumer -- confirm.
for target, payload in (
    ("../data/train_dataset.pkl", train_dataset),
    ("../data/val_dataset.pkl", val_dataset),
):
    with open(target, "wb") as file:
        pickle.dump(payload, file)

In [109]:
# Persist the raw validation answers, questions and contexts next to the
# pickled datasets.
for target, payload in (
    ("../data/val_answers.pkl", val_answers),
    ("../data/val_questions.pkl", val_questions),
    ("../data/val_contexts.pkl", val_contexts),
):
    with open(target, "wb") as file:
        pickle.dump(payload, file)