In [22]:
from transformers import BertForQuestionAnswering, AutoTokenizer, DefaultDataCollator, TrainingArguments, Trainer, BertTokenizer
import torch

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
context = "The University of California was founded in 1868, located in Berkeley."
question = "When was the University of California established?"

In [24]:
inputs = tokenizer(question, context, return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)

# Find the tokens with the highest `start` and `end` scores
answer_start = torch.argmax(outputs.start_logits)
answer_end = torch.argmax(outputs.end_logits) + 1

# Convert tokens to answer string
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs.input_ids[0, answer_start:answer_end]))
print("Answer:", answer)

Answer: ? [SEP] the university of california was founded in 1868


In [25]:
from datasets import load_dataset
squad = load_dataset("squad", split="train[:100]")
squad = squad.train_test_split(test_size=0.2)


In [26]:
squad["train"][0]

{'id': '57338724d058e614000b5ca1',
 'title': 'University_of_Notre_Dame',
 'context': "In 1919 Father James Burns became president of Notre Dame, and in three years he produced an academic revolution that brought the school up to national standards by adopting the elective system and moving away from the university's traditional scholastic and classical emphasis. By contrast, the Jesuit colleges, bastions of academic conservatism, were reluctant to move to a system of electives. Their graduates were shut out of Harvard Law School for that reason. Notre Dame continued to grow over the years, adding more colleges, programs, and sports teams. By 1921, with the addition of the College of Commerce, Notre Dame had grown from a small college to a university with five colleges and a professional law school. The university continued to expand and add new residence halls and buildings with each subsequent president.",
 'question': 'Which college did Notre Dame add in 1921?',
 'answers': {'text': 

In [27]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=128,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_squad = squad.map(preprocess_function, batched=True)
data_collator = DefaultDataCollator()

Map: 100%|██████████| 80/80 [00:00<00:00, 1228.33 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 2195.68 examples/s]


In [28]:
training_args = TrainingArguments(
    output_dir="qa_model",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

                                                 
 20%|██        | 5/25 [18:57<1:26:38, 259.94s/it]

{'eval_loss': 4.150496482849121, 'eval_runtime': 258.0795, 'eval_samples_per_second': 0.077, 'eval_steps_per_second': 0.008, 'epoch': 1.0}


                                                 
 40%|████      | 10/25 [35:03<46:51, 187.46s/it]
100%|██████████| 2/2 [00:47<00:00,  5.32s/it]

{'eval_loss': 3.6842856407165527, 'eval_runtime': 72.4365, 'eval_samples_per_second': 0.276, 'eval_steps_per_second': 0.028, 'epoch': 2.0}


 60%|██████    | 15/25 [57:49<48:36, 291.62s/it]
 60%|██████    | 15/25 [1:03:08<48:36, 291.62s/it]

{'eval_loss': 3.416550874710083, 'eval_runtime': 317.6756, 'eval_samples_per_second': 0.063, 'eval_steps_per_second': 0.006, 'epoch': 3.0}


 80%|████████  | 20/25 [1:33:44<28:16, 339.23s/it]  
 80%|████████  | 20/25 [1:36:07<28:16, 339.23s/it]

{'eval_loss': 3.3300864696502686, 'eval_runtime': 141.976, 'eval_samples_per_second': 0.141, 'eval_steps_per_second': 0.014, 'epoch': 4.0}


                                                  
100%|██████████| 25/25 [2:01:11<00:00, 290.86s/it]


{'eval_loss': 3.3137893676757812, 'eval_runtime': 96.2629, 'eval_samples_per_second': 0.208, 'eval_steps_per_second': 0.021, 'epoch': 5.0}
{'train_runtime': 7271.4806, 'train_samples_per_second': 0.055, 'train_steps_per_second': 0.003, 'train_loss': 3.5326968383789064, 'epoch': 5.0}


TrainOutput(global_step=25, training_loss=3.5326968383789064, metrics={'train_runtime': 7271.4806, 'train_samples_per_second': 0.055, 'train_steps_per_second': 0.003, 'total_flos': 26129675673600.0, 'train_loss': 3.5326968383789064, 'epoch': 5.0})

In [32]:
context = "The University of California was founded in 1868, located in Berkeley."
question = "When was the University of California established?"

# Tokenize the context to find the exact start and end position of the answer
encoded = tokenizer.encode_plus(question, context, return_tensors="pt").to("mps")
input_ids = encoded["input_ids"].tolist()[0]

model.eval()
with torch.no_grad():
    outputs = model(**encoded)

answer_start = torch.argmax(outputs.start_logits)
answer_end = torch.argmax(outputs.end_logits) + 1

# Convert tokens to answer string
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
print("Answer:", answer)

Answer: [CLS] when was the university of california established? [SEP] the university of california was founded in 1868
