In [1]:
print('Start')
from transformers import BertTokenizerFast, BertForQuestionAnswering, Trainer, TrainingArguments, DistilBertForSequenceClassification
from tqdm.notebook import tqdm
from datasets import load_dataset, Dataset
import torch
import numpy as np  # Linear algebra
import json  # To read json
import pandas as pd
from torch.utils.data import DataLoader
from torch.optim import AdamW  # AdamW instead of Adam because it's better for SQuAD
from collections import Counter
import os
from torch.utils.tensorboard import SummaryWriter  # For TensorBoard
from transformers import AutoTokenizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

Start


2024-09-27 22:05:23.069869: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-27 22:05:23.848428: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-27 22:05:24.054460: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-27 22:05:24.158807: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-27 22:05:24.963615: I tensorflow/core/platform/cpu_feature_guar

Device: cuda


## Read a json file

The dataset I used is the dev-v2 dataset. It is a json file. I didn't know how to read this file so I used a code from kaggle:  
https://www.kaggle.com/code/sanjay11100/squad-stanford-q-a-json-to-pandas-dataframe

In [2]:
def squad_json_to_dataframe_dev(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1
    """
    if verbose:
        print("Reading the json file")    
    file = json.loads(open(input_file_path).read())
    if verbose:
        print("processing...")
    # parsing different level's in the json file
    m = pd.json_normalize(file, record_path[:-1] )
    r = pd.json_normalize(file,record_path[:-2])
    
    #combining it into single dataframe
    idx = np.repeat(r['context'].values, r.qas.str.len())
    m['context'] = idx
    main = m[['id','question','context','answers']].set_index('id').reset_index()
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [3]:
input_file_path = 'dev-v2.0.json'
record_path = ['data','paragraphs','qas','answers']
dataset = squad_json_to_dataframe_dev(input_file_path=input_file_path,record_path=record_path)
dataset = Dataset.from_pandas(dataset)
dataset

Reading the json file
processing...


shape of the dataframe is (11873, 4)
Done


Dataset({
    features: ['id', 'question', 'context', 'answers'],
    num_rows: 11873
})

## Tokenization

The curent dataset can't be used for training. It must be tokenized before. I didn't know how to do it so I have used a code from CHATGPT that I have modidified.  
To simplify the dataset, only the first answer for each question is kept.

In [4]:
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
#CHATGPT
def preprocess_function(examples):
    questions = examples['question']
    contexts = examples['context']
    answers = examples['answers']
    
    inputs = tokenizer(questions, contexts, max_length=384, truncation=True, padding='max_length', return_offsets_mapping=True)
    
    start_positions = []
    end_positions = []
    
    for i in range(len(questions)):
        if not answers[i]:  # Si pas de réponse
            start_positions.append(0)  # Valeur par défaut
            end_positions.append(0)    # Valeur par défaut
            continue
        
        # Only the firs answer is used
        first_answer = answers[i][0]['text']
        first_answer_start = answers[i][0]['answer_start']
        
        offsets = inputs['offset_mapping'][i]

        start_token = None
        end_token = None
        for idx, (start, end) in enumerate(offsets):
            if start <= first_answer_start < end:
                start_token = idx
            if start < first_answer_start + len(first_answer) <= end:
                end_token = idx
                break
        
        if start_token is not None and end_token is not None:
            start_positions.append(start_token)
            end_positions.append(end_token)
        else:
            start_positions.append(-1)  # Default value
            end_positions.append(-1)    # Default value


    inputs.pop('offset_mapping') #offset_mapping is not necessary to train the model
    
    # Converting everything into tensors 
    inputs.update({
        'start_positions': torch.tensor(start_positions),
        'end_positions': torch.tensor(end_positions)
    })
    for column in inputs.keys():
        inputs[column] = torch.tensor(inputs[column])

    

    return inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['id', 'question', 'context', 'answers'])
tokenized_dataset = tokenized_dataset.select(range(4000))#Decrease the size of the dataset to have a longer training



Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

  inputs[column] = torch.tensor(inputs[column])


Everything in the tokenized dataset should be a tensor otherwise the training won't be possible.

In [5]:
for column in ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']:
    print(column, type(tokenized_dataset[column]))  # Devrait être torch.Tensor

input_ids <class 'list'>
token_type_ids <class 'list'>
attention_mask <class 'list'>
start_positions <class 'list'>
end_positions <class 'list'>


It seems that dataset.map didn't change the type of the dataset.  
To change the format I have used set_format.

In [6]:
tokenized_dataset.set_format(type='torch')

In [7]:
for column in ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']:
    print(column, type(tokenized_dataset[column]))  # Devrait être torch.Tensor

input_ids <class 'torch.Tensor'>
token_type_ids <class 'torch.Tensor'>
attention_mask <class 'torch.Tensor'>
start_positions <class 'torch.Tensor'>
end_positions <class 'torch.Tensor'>


In [8]:
splited_tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed = 42)
tokenized_dataset_train = splited_tokenized_dataset['train']
tokenized_dataset_validation = splited_tokenized_dataset['test']

## Dataloader

In [9]:
train_dataloader = DataLoader(dataset=tokenized_dataset_train, batch_size=16, shuffle=True)
validation_dataloader = DataLoader(dataset=tokenized_dataset_validation, batch_size=16, shuffle=True)

## Model and Optimizer

In [10]:
model = BertForQuestionAnswering.from_pretrained(model_name)
model.to(device)
optim = AdamW(model.parameters(),lr=5e-5)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model choosen for this task is bert base cased.  
The optimizer is AdamW which is similar the the usual Adam optimizer but with weight decay. It seems that this kind of optimizer is better for transformer models.  
The learning rate at the begning of the training is 5e-5 which is the usual learning rate for transformsers.

## Loss and Metrics

The default loss for this model is the binary crossentropy loss.  
To have something else from the loss to display on tensorboard I used the f1_score and a exact_match score.  
The f1_score wasn't imported from scikit_learn because to use it I had to use numpy arrays.  
However, to get numpy arrays it seems that it is necessary to transfer my results from the GPU to the CPU.

In [11]:
# CHATGPT for the implementation of the metric
def f1_score(pred_toks, true_toks):
    """
    Function to calculate F1-score
    """
    common = Counter(pred_toks) & Counter(true_toks)  # Find common tokens
    num_common = sum(common.values())  # Count how many tokens are in common
    if num_common == 0:
        return 0
    precision = num_common / len(pred_toks)
    recall = num_common / len(true_toks)
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

## Checkpoint

In [12]:
def save_checkpoint(model, optimizer, epoch, loss, checkpoint_dir):
    """
    Creates checkpoint
    """
    checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch+1}.pt')
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss
    }, checkpoint_path)
    print(f"Checkpoint saved: {checkpoint_path}")
checkpoint_dir = "./checkpoints"  # Directory where checkpoints will be saved
os.makedirs(checkpoint_dir, exist_ok=True)

## Training

In [13]:
def compute_predictions_and_metrics(outputs, input_ids, start_positions, end_positions):
    """
    Computes start and end predictions and calculates F1 score.
    """    
    start_preds = torch.argmax(outputs.start_logits, dim=1)
    end_preds = torch.argmax(outputs.end_logits, dim=1)

    total_f1 = 0
    batch_size = input_ids.size(0)

    for i in range(batch_size):
        # F1 Score for the predicted vs true tokens
        pred_tokens = input_ids[i][start_preds[i]:end_preds[i] + 1].tolist()
        true_tokens = input_ids[i][start_positions[i]:end_positions[i] + 1].tolist()
        total_f1 += f1_score(pred_tokens, true_tokens)

    return total_f1, batch_size

In [14]:
# TensorBoard setup: log directory
log_dir = "./logs_exclusive"
writer = SummaryWriter(log_dir=log_dir)

# Number of training epochs
epochs = 10
# Create a checkpoint every 2 epochs
save_checkpoint_every = 2

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.train()

    total_loss_train = 0
    total_f1_train = 0
    num_questions_train = 0

    total_loss_validation = 0
    total_f1_validation = 0
    num_questions_validation = 0

    # Training Loop
    for batch in tqdm(train_dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        optim.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions, end_positions=end_positions)
        # Loss
        loss = outputs.loss
        total_loss_train += loss.item()

        # Backward pass and weight update
        loss.backward()
        optim.step()

        # Metric
        f1_batch, batch_size = compute_predictions_and_metrics(outputs, input_ids, start_positions, end_positions)
        total_f1_train += f1_batch
        num_questions_train += batch_size


    # Validation Loop
    model.eval()
    with torch.no_grad():
        for batch in tqdm(validation_dataloader, desc="Validating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask,
                            start_positions=start_positions, end_positions=end_positions)
            # Loss
            loss = outputs.loss
            total_loss_validation += loss.item()
            # Metric
            f1_batch, batch_size = compute_predictions_and_metrics(outputs, input_ids, start_positions, end_positions)
            total_f1_validation += f1_batch
            num_questions_validation += batch_size

    # Calculate average loss and metrics for the training epoch
    avg_loss_train = total_loss_train / len(train_dataloader)
    avg_f1_train = total_f1_train / num_questions_train
    avg_loss_validation = total_loss_validation / len(validation_dataloader)
    avg_f1_validation = total_f1_validation / num_questions_validation

    # Print out training metrics for the epoch
    print(f"Loss at epoch {epoch + 1}: Train {avg_loss_train:.4f} | Validation {avg_loss_validation:.4f}")
    print(f"F1 Score at epoch {epoch + 1}: Train {avg_f1_train:.4f} | Validation {avg_f1_validation:.4f}")

    # Log training metrics to TensorBoard
    writer.add_scalar("Loss/Train", avg_loss_train, epoch)
    writer.add_scalar("Loss/Validation", avg_loss_validation, epoch)
    writer.add_scalar("Metrics/F1_Score/Train", avg_f1_train, epoch)
    writer.add_scalar("Metrics/F1_Score/Validation", avg_f1_validation, epoch)


    # Save a checkpoint every 'save_checkpoint_every' epochs
    if (epoch + 1) % save_checkpoint_every == 0:
        save_checkpoint(model, optim, epoch, avg_loss_validation, checkpoint_dir)

# Close the TensorBoard writer
writer.close()

print('Training completed.')

Epoch 1/10


Training:   0%|          | 0/200 [00:00<?, ?it/s]

Validating:   0%|          | 0/50 [00:00<?, ?it/s]

Loss at epoch 1: Train 2.7326 | Validation 2.1256
F1 Score at epoch 1: Train 0.4663 | Validation 0.3571
Epoch 2/10


Training:   0%|          | 0/200 [00:00<?, ?it/s]

Validating:   0%|          | 0/50 [00:00<?, ?it/s]

Loss at epoch 2: Train 1.7469 | Validation 1.8917
F1 Score at epoch 2: Train 0.4256 | Validation 0.3914
Checkpoint saved: ./checkpoints/checkpoint_epoch_2.pt
Epoch 3/10


Training:   0%|          | 0/200 [00:00<?, ?it/s]

Validating:   0%|          | 0/50 [00:00<?, ?it/s]

Loss at epoch 3: Train 1.1111 | Validation 2.0680
F1 Score at epoch 3: Train 0.5515 | Validation 0.3542
Epoch 4/10


Training:   0%|          | 0/200 [00:00<?, ?it/s]

Validating:   0%|          | 0/50 [00:00<?, ?it/s]

Loss at epoch 4: Train 0.7788 | Validation 2.4999
F1 Score at epoch 4: Train 0.6585 | Validation 0.3782
Checkpoint saved: ./checkpoints/checkpoint_epoch_4.pt
Epoch 5/10


Training:   0%|          | 0/200 [00:00<?, ?it/s]

Validating:   0%|          | 0/50 [00:00<?, ?it/s]

Loss at epoch 5: Train 0.5527 | Validation 2.7790
F1 Score at epoch 5: Train 0.7608 | Validation 0.3741
Epoch 6/10


Training:   0%|          | 0/200 [00:00<?, ?it/s]

Validating:   0%|          | 0/50 [00:00<?, ?it/s]

Loss at epoch 6: Train 0.4044 | Validation 2.9695
F1 Score at epoch 6: Train 0.8161 | Validation 0.3897
Checkpoint saved: ./checkpoints/checkpoint_epoch_6.pt
Epoch 7/10


Training:   0%|          | 0/200 [00:00<?, ?it/s]

Validating:   0%|          | 0/50 [00:00<?, ?it/s]

Loss at epoch 7: Train 0.3024 | Validation 3.1862
F1 Score at epoch 7: Train 0.8714 | Validation 0.4051
Epoch 8/10


Training:   0%|          | 0/200 [00:00<?, ?it/s]

Validating:   0%|          | 0/50 [00:00<?, ?it/s]

Loss at epoch 8: Train 0.2594 | Validation 3.4759
F1 Score at epoch 8: Train 0.8874 | Validation 0.3972
Checkpoint saved: ./checkpoints/checkpoint_epoch_8.pt
Epoch 9/10


Training:   0%|          | 0/200 [00:00<?, ?it/s]

Validating:   0%|          | 0/50 [00:00<?, ?it/s]

Loss at epoch 9: Train 0.2041 | Validation 3.4785
F1 Score at epoch 9: Train 0.9061 | Validation 0.4151
Epoch 10/10


Training:   0%|          | 0/200 [00:00<?, ?it/s]

Validating:   0%|          | 0/50 [00:00<?, ?it/s]

Loss at epoch 10: Train 0.1853 | Validation 3.8786
F1 Score at epoch 10: Train 0.9208 | Validation 0.4150
Checkpoint saved: ./checkpoints/checkpoint_epoch_10.pt
Training completed.
