# Model Preprocessing and Training

In [1]:
!pip install datasets
!pip install pandas
import pandas as pd
import torch
from transformers import BertTokenizer, BertTokenizerFast, BertForQuestionAnswering, Trainer, TrainingArguments
from datasets import load_dataset, Dataset


# Step 1: Load the SQuAD dataset and tokenizer
df = pd.read_csv('SQ_train.csv')

# Convert the 'Answer_Text' column to string type
df['Answer_Text'] = df['Answer_Text'].astype(str)

dataset = Dataset.from_pandas(df)
# Adjust the 'test_size' parameter to control the split ratio
dataset = dataset.train_test_split(test_size=0.2)
dataset['validation'] = dataset.pop('test')


tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Step 2: Preprocess the data (Tokenize and align the start/end positions)
def preprocess_function(examples):
    inputs = tokenizer(
        examples['Question'], examples['Context'],
        max_length=512, truncation="only_second",
        padding="max_length", return_offsets_mapping=True
    )
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(inputs["offset_mapping"]):
        start_char = examples["Answer_Start"][i]
        answer_text = examples["Answer_Text"][i]
        end_char = start_char + len(answer_text)
        sequence_ids = inputs.sequence_ids(i)
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # Set start and end positions
        if not (offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char):
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while offsets[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_start
            while offsets[idx][1] < end_char:
                idx += 1
            end_positions.append(idx)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Preprocess the datasets
train_dataset = dataset['train'].map(preprocess_function, batched=True, remove_columns=dataset['train'].column_names)
val_dataset = dataset['validation'].map(preprocess_function, batched=True, remove_columns=dataset['validation'].column_names)

# Step 3: Set the format for PyTorch
train_dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'start_positions', 'end_positions'])
val_dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

# Step 4: Load the BERT model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Step 5: Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Step 6: Initialize Trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Map:   0%|          | 0/9039 [00:00<?, ? examples/s]

Map:   0%|          | 0/2260 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,2.6894,1.619785
2,1.3138,1.417931
3,0.8807,1.519624
4,0.5846,1.770095


Epoch,Training Loss,Validation Loss
1,2.6894,1.619785
2,1.3138,1.417931
3,0.8807,1.519624
4,0.5846,1.770095
5,0.4169,1.961539


TrainOutput(global_step=2825, training_loss=1.0810592273273298, metrics={'train_runtime': 4663.5492, 'train_samples_per_second': 9.691, 'train_steps_per_second': 0.606, 'total_flos': 1.180930692068352e+16, 'train_loss': 1.0810592273273298, 'epoch': 5.0})

# Model Evaluation

In [2]:
!pip install evaluate
from evaluate import load
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Define the metric for Exact Match (EM) and F1
metric = load("squad")

# Define the compute_metrics function to include F1, Accuracy, Precision, Recall
def compute_metrics(p):
    start_preds, end_preds = p.predictions
    start_preds = start_preds.argmax(axis=-1)
    end_preds = end_preds.argmax(axis=-1)

    # Initialize empty lists for true values and predictions
    true_starts = []
    true_ends = []
    pred_starts = []
    pred_ends = []

    # Collect true and predicted start/end positions
    # Assuming p.label_ids is a tuple containing start and end positions
    true_starts = p.label_ids[0]  # Accessing start positions from the tuple
    true_ends = p.label_ids[1]  # Accessing end positions from the tuple

    # Collect predicted start and end positions for each example
    pred_starts = start_preds
    pred_ends = end_preds

    # Compute F1, Accuracy, Precision, Recall based on true and predicted values
    start_f1 = f1_score(true_starts, pred_starts, average='weighted')
    end_f1 = f1_score(true_ends, pred_ends, average='weighted')

    start_precision = precision_score(true_starts, pred_starts, average='weighted')
    end_precision = precision_score(true_ends, pred_ends, average='weighted')

    start_recall = recall_score(true_starts, pred_starts, average='weighted')
    end_recall = recall_score(true_ends, pred_ends, average='weighted')

    start_accuracy = accuracy_score(true_starts, pred_starts)
    end_accuracy = accuracy_score(true_ends, pred_ends)

    # Return all metrics
    return {
        "start_f1": start_f1,
        "end_f1": end_f1,
        "start_precision": start_precision,
        "end_precision": end_precision,
        "start_recall": start_recall,
        "end_recall": end_recall,
        "start_accuracy": start_accuracy,
        "end_accuracy": end_accuracy,
    }

# Trainer with updated compute_metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Evaluate the model
results = trainer.evaluate()
print("Evaluation results:", results)

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Evaluation results: {'eval_loss': 1.9615387916564941, 'eval_model_preparation_time': 0.0031, 'eval_start_f1': 0.5894067860026418, 'eval_end_f1': 0.6288559282267402, 'eval_start_precision': 0.6069294585756806, 'eval_end_precision': 0.6535813790172035, 'eval_start_recall': 0.5915929203539823, 'eval_end_recall': 0.6283185840707964, 'eval_start_accuracy': 0.5915929203539823, 'eval_end_accuracy': 0.6283185840707964, 'eval_runtime': 62.3838, 'eval_samples_per_second': 36.227, 'eval_steps_per_second': 2.276}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Model Saving

In [None]:
import pickle

# Save the model to a file
with open('model_filename.pkl', 'wb') as file:
    pickle.dump(model, file)

print("Model saved as 'model_filename.pkl'")

Model saved as 'model_filename.pkl'


# Model loading

In [None]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
from torchinfo import summary
import pickle

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the model from the saved file
with open('model_filename.pkl', 'rb') as file:
    model = pickle.load(file)

# Create a sample input
inputs = tokenizer("To what does yoga say personal experimentation and knowledge lead?", "moksha", return_tensors='pt')

# Move model and inputs to the same device (CPU in this case)
device = torch.device('cpu')  # Or 'cuda' if you have a GPU
model.to(device)  # Move the model to the device
inputs = {key: value.to(device) for key, value in inputs.items()} # Move inputs to the device

# Print the model summary
summary(model, input_data=inputs)



Layer (type:depth-idx)                                       Output Shape              Param #
BertForQuestionAnswering                                     [1, 16]                   --
├─BertModel: 1-1                                             [1, 16, 768]              --
│    └─BertEmbeddings: 2-1                                   [1, 16, 768]              --
│    │    └─Embedding: 3-1                                   [1, 16, 768]              23,440,896
│    │    └─Embedding: 3-2                                   [1, 16, 768]              1,536
│    │    └─Embedding: 3-3                                   [1, 16, 768]              393,216
│    │    └─LayerNorm: 3-4                                   [1, 16, 768]              1,536
│    │    └─Dropout: 3-5                                     [1, 16, 768]              --
│    └─BertEncoder: 2-2                                      [1, 16, 768]              --
│    │    └─ModuleList: 3-6                                  --             