In [1]:
import torch
from datasets import DatasetDict, load_dataset
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Raise the print line width so printed tensors are not wrapped.
torch.set_printoptions(linewidth=1000000)

# Tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Load the "squad" dataset; the bare expression below displays its splits.
dataset = load_dataset("squad")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [2]:
def add_token_positions(example):
    """Attach answer-span token indices to a single example.

    The context and the question are tokenized separately with the
    module-level ``tokenizer``. The context-only token indices are then
    shifted so that they index into the joint
    ``tokenizer(question, context)`` encoding.

    Args:
        example: Mapping with ``'context'`` and ``'question'`` strings and an
            ``'answers'`` dict holding parallel ``'text'`` and
            ``'answer_start'`` lists. Only the first answer is used.

    Returns:
        The same ``example`` with two integer fields added:
        ``'start_positions'`` (index of the first answer token) and
        ``'end_positions'`` (one past the last answer token, i.e. an
        exclusive bound, so ``input_ids[start:end]`` is the answer). Both are
        0 when the example has no answer or when no character of the answer
        maps to a token.
    """
    # Default labels, used for unanswerable / unmappable examples.
    example['start_positions'] = 0
    example['end_positions'] = 0

    answer_starts = example['answers']['answer_start']
    if len(answer_starts) == 0:
        return example

    answer_start_char = answer_starts[0]
    answer_end_char = answer_start_char + len(example['answers']['text'][0])
    context_encoding = tokenizer(example['context'])

    # char_to_token returns None for characters that belong to no token
    # (e.g. whitespace at the edge of an answer). Scan inwards from both ends
    # of the answer until a character that maps to a token is found instead of
    # failing on `None + int`.
    answer_chars = range(answer_start_char, answer_end_char)
    start_token = next(
        (tok for tok in map(context_encoding.char_to_token, answer_chars) if tok is not None),
        None,
    )
    end_token = next(
        (tok for tok in map(context_encoding.char_to_token, reversed(answer_chars)) if tok is not None),
        None,
    )
    if start_token is None or end_token is None:
        return example

    # Offset between context-only indices and joint-encoding indices: the
    # question's own encoding length minus one.
    # NOTE(review): this assumes one special token before and one after each
    # sequence, as the decoded outputs in this notebook suggest.
    len_question = len(tokenizer(example['question'])['input_ids']) - 1
    example['start_positions'] = start_token + len_question
    example['end_positions'] = end_token + len_question + 1  # exclusive end

    return example

# Label every example of both splits with its answer-span token positions.
updated_dataset = DatasetDict({
    split: dataset[split].map(add_token_positions)
    for split in ('train', 'validation')
})

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [3]:
# Sanity check: for the first ten training rows, print the reference answer
# and the text decoded from the labelled token span of the joint encoding.
for idx in range(10):
    row = updated_dataset['train'][idx]
    print(row['answers'])
    joint_ids = tokenizer(row['question'], row['context'])['input_ids']
    labelled_span = joint_ids[row['start_positions']:row['end_positions']]
    print(tokenizer.decode(labelled_span))

{'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}
saint bernadette soubirous
{'text': ['a copper statue of Christ'], 'answer_start': [188]}
a copper statue of christ
{'text': ['the Main Building'], 'answer_start': [279]}
the main building
{'text': ['a Marian place of prayer and reflection'], 'answer_start': [381]}
a marian place of prayer and reflection
{'text': ['a golden statue of the Virgin Mary'], 'answer_start': [92]}
a golden statue of the virgin mary
{'text': ['September 1876'], 'answer_start': [248]}
september 1876
{'text': ['twice'], 'answer_start': [441]}
twice
{'text': ['The Observer'], 'answer_start': [598]}
the observer
{'text': ['three'], 'answer_start': [126]}
three
{'text': ['1987'], 'answer_start': [908]}
1987


In [4]:
def tokenize_function(example):
    """Jointly encode question/context pairs with the module-level tokenizer.

    Args:
        example: Mapping with ``'question'`` and ``'context'`` entries (lists
            of strings when used with ``map(..., batched=True)``).

    Returns:
        Whatever ``tokenizer`` returns for the pair, padded to the maximum
        length and truncated (presumably to 512 tokens for this checkpoint,
        per the tokenizer warning earlier in the notebook - TODO confirm).
    """
    questions = example['question']
    contexts = example['context']
    encoded = tokenizer(questions, contexts, padding='max_length', truncation=True)
    return encoded

# Encode every split in batches; the new columns are added next to the
# existing ones.
tokenized_dataset = updated_dataset.map(tokenize_function, batched=True)

# Expose only the model inputs and the span labels, as torch tensors.
tokenized_dataset.set_format(
    'torch',
    columns=[
        'input_ids',
        'token_type_ids',
        'attention_mask',
        'start_positions',
        'end_positions',
    ],
)
print(tokenized_dataset)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'start_positions', 'end_positions', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'start_positions', 'end_positions', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10570
    })
})


In [11]:
# Show the first processed training example, then decode its labelled answer
# span. The slice bounds come from the example's own labels rather than from
# hard-coded indices, so the check stays valid for any row.
sample = tokenized_dataset['train'][0]
print(sample)
answer_start = int(sample['start_positions'])
answer_end = int(sample['end_positions'])  # exclusive bound
print(tokenizer.decode(sample['input_ids'][answer_start:answer_end]))

{'start_positions': tensor(130), 'end_positions': tensor(138), 'input_ids': tensor([  101,  2000,  3183,  2106,  1996,  6261,  2984,  9382,  3711,  1999,  8517,  1999, 10223, 26371,  2605,  1029,   102,  6549,  2135,  1010,  1996,  2082,  2038,  1037,  3234,  2839,  1012, 10234,  1996,  2364,  2311,  1005,  1055,  2751,  8514,  2003,  1037,  3585,  6231,  1997,  1996,  6261,  2984,  1012,  3202,  1999,  2392,  1997,  1996,  2364,  2311,  1998,  5307,  2009,  1010,  2003,  1037,  6967,  6231,  1997,  4828,  2007,  2608,  2039, 14995,  6924,  2007,  1996,  5722,  1000,  2310,  3490,  2618,  4748,  2033, 18168,  5267,  1000,  1012,  2279,  2000,  1996,  2364,  2311,  2003,  1996, 13546,  1997,  1996,  6730,  2540,  1012,  3202,  2369,  1996, 13546,  2003,  1996, 24665, 23052,  1010,  1037, 14042,  2173,  1997,  7083,  1998,  9185,  1012,  2009,  2003,  1037, 15059,  1997,  1996, 24665, 23052,  2012, 10223, 26371,  1010,  2605,  2073,  1996,  6261,  2984, 22353,  2135,  2596,  2000,  3002,

In [5]:
# Batches of 8 examples; only the training split is shuffled.
train_loader = torch.utils.data.DataLoader(
    tokenized_dataset['train'],
    batch_size=8,
    shuffle=True,
)
val_loader = torch.utils.data.DataLoader(
    tokenized_dataset['validation'],
    batch_size=8,
)

# Peek at the first training batch only: the whole dict, each tensor, the keys.
for batch in train_loader:
    print(batch)
    for field in ('input_ids', 'token_type_ids', 'attention_mask',
                  'start_positions', 'end_positions'):
        print(batch[field])
    print(batch.keys())
    break

{'start_positions': tensor([44, 11, 70, 99, 30, 43, 14, 15]), 'end_positions': tensor([ 81,  16,  71, 103,  31,  50,  15,  16]), 'input_ids': tensor([[  101,  2054,  4177,  ...,     0,     0,     0],
        [  101,  2040,  2439,  ...,     0,     0,     0],
        [  101,  1996, 27838,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2003,  ...,     0,     0,     0],
        [  101,  2198, 14845,  ...,     0,     0,     0],
        [  101,  2054,  2406,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
tensor([[  101,  2054

In [6]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained encoder with a newly initialised question-answering head (see the
# "newly initialized" warning printed when this cell runs).
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased").to(device)

# AdamW defaults to lr=1e-3, which is far too large for fine-tuning a
# pretrained BERT encoder; use a rate in the commonly recommended 2e-5..5e-5
# range instead.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from tqdm.auto import tqdm

epochs = 1
# Decoding and printing every sample on every step floods the notebook output,
# so sample predictions are only shown every `log_every` optimisation steps.
log_every = 500

for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for i, batch in tqdm(enumerate(train_loader), total=len(train_loader)):
        optimizer.zero_grad()

        # Move the model inputs and the span labels to the training device.
        inputs = {
            name: batch[name].to(device)
            for name in ('input_ids', 'token_type_ids', 'attention_mask',
                         'start_positions', 'end_positions')
        }

        # Passing the labels makes the model return the loss with the logits.
        outputs = model(**inputs)

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

        if i % log_every == 0:
            start_logits = outputs.start_logits.detach()
            end_logits = outputs.end_logits.detach()
            for j in range(len(start_logits)):
                start = torch.argmax(start_logits[j]).item()
                # The end_positions labels are already exclusive (one past the
                # last answer token), so the predicted end index is used as
                # the slice bound directly; adding 1 would decode an extra
                # token.
                end = torch.argmax(end_logits[j]).item()

                answer = tokenizer.decode(batch['input_ids'][j][start:end])

                print(f"Prediction: {answer}")

    print(f'Epoch {epoch + 1} Loss {epoch_loss / len(train_loader)}')


  0%|          | 0/10950 [00:00<?, ?it/s]

Prediction: 
Prediction: 
Prediction: 
Prediction: whip, who is the chief vote counter for his or her party. the current chief deputy majority whip is republican patrick mchenry. within the house republican conference, the chief deputy whip is the highest appointed position and often a launching pad for future positions in the house leadership. the house democratic conference has multiple chief deputy whips, led by a senior chief deputy whip, which is the highest appointed position within the house democratic caucus. the current senior chief deputy
Prediction: , 66 portland place, london ; and the riba architecture study rooms in the henry cole wing
Prediction: san diego de alcala and old town san diego state historic park. also, the local craft brewing industry attracts an increasing number of visitors for " beer tours " and the annual san diego beer week in november ; san diego has been called " america's craft beer capital
Prediction: 
Prediction: were they? [SEP] the instruments us

KeyboardInterrupt: 

In [None]:
# Log in to the Hugging Face Hub from inside the notebook. This cell only
# authenticates; the model upload itself happens in the following cell.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Upload `model` to the Hub repository named below.
# NOTE(review): the repository name says "squad2", but this notebook loads the
# "squad" dataset - confirm the intended repository name.
# NOTE(review): only the model is pushed by this call; the tokenizer is not
# uploaded here - confirm whether it should be pushed as well.
model.push_to_hub("pgajo/bert-base-uncased-squad2-2")