In [1]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer,  TrainingArguments, Trainer
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


### Loading the model and the tokenizer

Here, I loaded the model, that I had downloaded locally along with its tokenizer. The model I used is Bert-Base-Uncased, downloaded from the HuggingFace Hub.

In [12]:
model_path = './model'

model = AutoModelForQuestionAnswering.from_pretrained(model_path)
model = model.to('cpu')
tokenizer = AutoTokenizer.from_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ./model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Downloading and Splitting the Dataset.
For finetuning the BERT model. I chose SQUAD dataset. 

#### Downloading the dataset

In [3]:
squad = load_dataset("squad", split="train[:5000]")

#### Splitting the dataset

In [4]:
squad = squad.train_test_split(test_size=0.2)

In [5]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1000
    })
})

In [6]:
squad["train"][0]

{'id': '56ce362aaab44d1400b885bc',
 'title': 'New_York_City',
 'context': 'The first non-Native American inhabitant of what would eventually become New York City was Dominican trader Juan Rodriguez (transliterated to Dutch as Jan Rodrigues). Born in Santo Domingo of Portuguese and African descent, he arrived in Manhattan during the winter of 1613–1614, trapping for pelts and trading with the local population as a representative of the Dutch. Broadway, from 159th Street to 218th Street, is named Juan Rodriguez Way in his honor.',
 'question': 'Where was Juan Rodriguez born?',
 'answers': {'text': ['Santo Domingo'], 'answer_start': [175]}}

In [7]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [8]:
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map: 100%|██████████| 4000/4000 [00:01<00:00, 3961.23 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 4009.65 examples/s]


In [9]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [10]:
training_args = TrainingArguments(
    output_dir="finetuned_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [11]:
trainer.train()

  0%|          | 0/750 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 18.00 MiB (GPU 0; 3.81 GiB total capacity; 3.13 GiB already allocated; 5.06 MiB free; 3.18 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

### Inference

In [None]:
question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [None]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model="finetuned_model")
question_answerer(question=question, context=context)