In [1]:
!pip install transformers



You should consider upgrading via the 'c:\users\mohni\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.





In [2]:
import json
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, QuestionAnsweringPipeline
import torch
from torch.utils.data import DataLoader
from transformers import AdamW

In [3]:
class PolicyDataset(torch.utils.data.Dataset):
  """Map-style dataset over a set of batched encodings.

  ``encodings`` is a mapping from feature name to a per-example sequence of
  values and must contain an ``"input_ids"`` entry (used to determine the
  dataset length). Item ``idx`` is a dict holding the ``idx``-th value of
  every feature, converted to a ``torch.Tensor``.
  """

  def __init__(self, encodings):
    self.encodings = encodings

  def __getitem__(self, idx):
    """Return example ``idx`` as ``{feature_name: torch.Tensor}``."""
    return {
        name: torch.tensor(values[idx])
        for name, values in self.encodings.items()
    }

  def __len__(self):
    """Return the number of examples, taken from the ``input_ids`` feature."""
    # Key lookup (not attribute access) matches the mapping API already used
    # in __getitem__ and therefore also works for a plain dict of encodings.
    return len(self.encodings["input_ids"])

In [4]:
def read_json(filename):
    """Parse the JSON file at ``filename`` and return the decoded object.

    The file is always decoded as UTF-8 instead of the platform's default
    encoding, so non-ASCII text loads identically on every operating system.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(filename, encoding="utf-8") as file:
        return json.load(file)

In [None]:
# Load the tokenizer and the question-answering model from the same
# checkpoint; naming it once keeps the two from drifting apart.
checkpoint_name = "deepset/bert-base-cased-squad2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint_name)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=433294681.0), HTML(value='')))

In [None]:
def add_token_positions(encodings, answers, max_length=None):
    """Attach token-level answer spans to ``encodings`` in place.

    For example ``i`` the character offsets ``answers[i]["start_answer"]`` and
    ``answers[i]["end_answer"]`` are converted to token indices with
    ``encodings.char_to_token`` and stored under the ``"start_positions"`` and
    ``"end_positions"`` keys of ``encodings``.

    Args:
        encodings: object providing ``char_to_token(batch_index, char_index)``
            and ``update(dict)``.
        answers: sequence of dicts with integer ``"start_answer"`` and
            ``"end_answer"`` character offsets, one per encoded example.
        max_length: index stored when an offset cannot be mapped to a token.
            Defaults to the module-level ``tokenizer.model_max_length``.

    Returns:
        None. ``encodings`` is mutated.
    """

    def unmapped_position():
        # Resolved lazily so the global tokenizer is only needed when an
        # offset really could not be mapped and no override was given.
        return tokenizer.model_max_length if max_length is None else max_length

    start_positions, end_positions = [], []
    # The fix-ups below run for EVERY example, not only for the last one.
    for i, answer in enumerate(answers):
        start_token = encodings.char_to_token(i, answer["start_answer"])
        # If the start is None, the answer passage has been truncated.
        if start_token is None:
            start_token = unmapped_position()

        end_char = answer["end_answer"]
        end_token = encodings.char_to_token(i, end_char)
        # If the end is None, the offset may point at the character just after
        # the answer (e.g. a space), so retry one character earlier (- 1).
        if end_token is None and end_char > 0:
            end_token = encodings.char_to_token(i, end_char - 1)
        # Still None: treat it like a truncated start so that no None value
        # ends up in the position lists.
        if end_token is None:
            end_token = unmapped_position()

        start_positions.append(start_token)
        end_positions.append(end_token)

    encodings.update({
        'start_positions': start_positions,
        'end_positions': end_positions,
    })

In [None]:
# Read the raw training set from the notebook's working directory.
training_set_file = "training_set.json"
data = read_json(training_set_file)

In [None]:
# Flatten the nested {topic: {"context": ..., "qas": {id: {"question": ...,
# "answer": ...}}}} layout into three parallel lists, one entry per question.
contexts, questions, answers = [], [], []

for topic in data.values():
    context = topic["context"]
    for qa in topic["qas"].values():
        # The topic's context is repeated for each of its questions.
        contexts.append(context)
        questions.append(qa["question"])
        answers.append(qa["answer"])


In [None]:
# Sanity check: slicing each context with its answer's
# [start_answer:end_answer] offsets must reproduce the answer text.
# Prints True only when every pair agrees.
flag = all(
    ctx[ans["start_answer"]:ans["end_answer"]] == ans["answer"]
    for ans, ctx in zip(answers, contexts)
)
print(flag)

In [None]:
# Tokenize each (context, question) pair, truncating and padding the batch.
train_encodings = tokenizer(
    contexts,
    questions,
    truncation=True,
    padding=True,
)
# Adds the "start_positions" / "end_positions" entries in place.
add_token_positions(train_encodings, answers)

start_tokens = train_encodings["start_positions"]
end_tokens = train_encodings["end_positions"]
print(start_tokens, "\n", end_tokens)

In [None]:
# Wrap the encodings (including the answer positions) so a DataLoader can
# batch them.
train_dataset = PolicyDataset(encodings=train_encodings)

In [None]:
# Training hyper-parameters, gathered in one place so they are easy to tune.
NUM_EPOCHS = 5
BATCH_SIZE = 16
LEARNING_RATE = 5e-5

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
optim = AdamW(model.parameters(), lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The inputs are (context, question) pairs. When the tokenizer emitted
        # segment ids, forward them so the model can tell the two sequences
        # apart; without them every token is treated as segment 0.
        token_type_ids = batch.get('token_type_ids')
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            start_positions=start_positions,
            end_positions=end_positions,
        )
        # With start/end positions supplied, the first output is the loss.
        loss = outputs[0]
        loss.backward()
        optim.step()
    print("epoch-->", epoch)


epoch--> 0
epoch--> 0
epoch--> 0
epoch--> 0
epoch--> 0
epoch--> 0
epoch--> 1
epoch--> 1
epoch--> 1
epoch--> 1
epoch--> 1
epoch--> 1
epoch--> 2
