# Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import tensorflow as tf
from datasets import Dataset, DatasetDict
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
import json

2025-08-22 01:03:55.572801: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755824635.790507      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755824635.853746      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data Reading

In [2]:
# Locations of the SQuAD v1.1 train and dev JSON files under the Kaggle input
# directory. These are absolute paths, so they must be adjusted when the
# notebook is run outside that environment.
train_file = '/kaggle/input/stanford-question-answering-dataset/train-v1.1.json'
dev_file = '/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json'

def load_prepare_data(file_path):
    """Flatten a SQuAD-format JSON file into one record per question.

    The file is expected to have the nested layout
    ``data -> paragraphs -> qas -> answers``. Every question becomes a dict
    carrying its own copy of the paragraph context and the article title.

    Args:
        file_path: Path to the SQuAD-format JSON file.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``context``,
        ``question`` and ``answers``. ``answers`` is itself a dict of two
        parallel lists, ``text`` and ``answer_start``, holding every annotated
        answer for the question.

    Raises:
        KeyError: If any of the expected keys is missing from the file.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default:
    # the contexts contain non-ASCII characters, and the character offsets in
    # ``answer_start`` are only meaningful if the text is decoded correctly.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    records = []
    for article in data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                # Column-oriented answers: two parallel lists rather than a
                # list of {"text", "answer_start"} dicts.
                answers = {
                    "text": [ans['text'] for ans in qa['answers']],
                    "answer_start": [ans['answer_start'] for ans in qa['answers']],
                }
                records.append({
                    "id": qa["id"],  # required: a missing id raises KeyError
                    "title": article["title"],
                    "context": context,
                    "question": question,
                    "answers": answers,
                })
    return records


# Flatten both SQuAD splits into per-question record lists.
train_data = load_prepare_data(train_file)
dev_data = load_prepare_data(dev_file)

# Convert each record list to a datasets.Dataset by way of a DataFrame.
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))
validation_dataset = Dataset.from_pandas(pd.DataFrame(dev_data))

# Collect the splits under the names used by the rest of the notebook.
squad = DatasetDict()
squad["train"] = train_dataset
squad["validation"] = validation_dataset

# BERT Preprocessing & Training

In [3]:
# Tokenizer for the 'bert-base-uncased' checkpoint, the same checkpoint name the
# model is loaded from later in the notebook. preprocess_function reads this
# module-level name.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
def preprocess_function(examples):
    """Tokenize question/context pairs and attach answer-span token labels.

    Intended for ``Dataset.map(..., batched=True)``. Each question is paired
    with its context, padded/truncated to 384 tokens (only the context is
    ever truncated), and the character span of the FIRST listed answer is
    converted to token indices.

    Args:
        examples: A batch with the columns ``question``, ``context`` and
            ``answers``; each ``answers`` entry holds parallel ``text`` and
            ``answer_start`` lists.

    Returns:
        The tokenizer output (with ``offset_mapping`` removed) extended with
        ``start_positions`` and ``end_positions``, one integer per example.
        Examples whose answer is missing or not fully contained in the
        (possibly truncated) context are labelled ``(0, 0)``.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Offsets are only needed to compute the labels, so drop them from the
    # features that are returned.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # No annotated answer for this example: use the (0, 0) label instead
        # of failing on the [0] lookups below.
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context
        # (sequence id 1; the question is sequence 0).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # Both ends of the answer must be checked: an answer that starts
        # inside the kept context but is cut off by truncation would otherwise
        # get an end label pointing one token past the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Start label: the last context token starting at or before start_char.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # End label: the first context token ending at or after end_char.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Run the preprocessing over both splits in batches. The original columns are
# removed so only the features returned by preprocess_function remain.
raw_columns = squad["train"].column_names
tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [5]:
from transformers import DefaultDataCollator

# Collator handed to prepare_tf_dataset as collate_fn; it is configured to
# return TensorFlow tensors. The examples were already padded to a fixed
# length during tokenization (padding="max_length").
data_collator = DefaultDataCollator(return_tensors="tf")

In [6]:
# Hyperparameters used to size the learning-rate schedule. They must describe
# the run that is actually launched further down: the tf.data pipelines are
# built with batch_size=16 and model.fit is called with epochs=5. With smaller
# values here the decay schedule would reach its final learning rate before
# training finishes, and the remaining steps would train at that floor.
batch_size = 16
num_epochs = 5

# Number of optimizer steps in the whole run: full batches per epoch x epochs.
total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs

# AdamW-style optimizer with no warmup and a learning rate that decays from
# init_lr over num_train_steps.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

I0000 00:00:1755824712.478774      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [7]:
# Load the pretrained "bert-base-uncased" encoder with a question-answering
# head. The captured log reports that qa_outputs.weight / qa_outputs.bias are
# newly initialized, so the head is only usable after fine-tuning.
model = TFAutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

Some weights or buffers of the TF 2.0 model TFBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Options common to both tf.data pipelines. The batch size here determines the
# number of optimizer steps per epoch, so it has to agree with the batch size
# used to compute the learning-rate schedule's total step count.
_tf_dataset_options = {"batch_size": 16, "collate_fn": data_collator}

# Training pipeline: shuffled.
tf_train_set = model.prepare_tf_dataset(
    tokenized_squad["train"],
    shuffle=True,
    **_tf_dataset_options,
)

# Validation pipeline: kept in its original order.
tf_validation_set = model.prepare_tf_dataset(
    tokenized_squad["validation"],
    shuffle=False,
    **_tf_dataset_options,
)

In [9]:
# Only the optimizer is supplied; no loss or metrics are passed here.
# NOTE(review): this presumably relies on the model's built-in loss computed
# from start_positions / end_positions - confirm against the Transformers docs.
model.compile(optimizer=optimizer)

In [10]:
# Fine-tune for 5 epochs, evaluating on the validation pipeline after each one.
# The epoch count determines the total number of optimizer steps, so it has to
# agree with the epoch count used to size the learning-rate schedule.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=5)

Epoch 1/5


I0000 00:00:1755824752.570777     105 service.cc:148] XLA service 0x7c91f1e8b430 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1755824752.571274     105 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1755824752.642611     105 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1755824752.762812     105 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7c9264e193d0>

In [11]:
# Persist the fine-tuned model together with its tokenizer files in the same
# directory, so the directory can be reloaded on its own (model + tokenizer)
# without going back to the original 'bert-base-uncased' checkpoint.
model.save_pretrained('my_first_QA_model')
tokenizer.save_pretrained('my_first_QA_model')