## Reference 

Reference : https://huggingface.co/docs/transformers/en/tasks/question_answering

In [1]:
from datasets import load_dataset, load_from_disk
import os
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load a QA dataset (e.g., SQuAD)
from datasets import load_dataset
squad = load_dataset("squad", split="train[:5000]")


In [3]:
squad = squad.train_test_split(test_size=0.3)

In [4]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 3500
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1500
    })
})

## Step 3. Data preprocessing

In [5]:
from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs



tokenized_data_folder = "../data/preprocessed"


# Check if the tokenized dataset already exists
if os.path.exists(tokenized_data_folder):
    # Load the tokenized data if it already exists
    tokenized_squad = load_from_disk(tokenized_data_folder)
    print("Loaded tokenized data from disk.")
else:
    # If the tokenized data doesn't exist, preprocess and save it
    print("Tokenized data not found. Tokenizing and saving to disk...")
    # Apply the preprocessing function to tokenize the dataset
    tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
    
    # Save the tokenized data to the specified directory in Arrow format
    tokenized_squad.save_to_disk(tokenized_data_folder)
    print("Tokenized data saved to disk.")


Loaded tokenized data from disk.


Now create a batch of examples using DefaultDataCollator

In [6]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [7]:
tokenized_squad

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 3500
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 1500
    })
})

# Step 4: Model Training and Experiment Tracking with MLflow

In [8]:
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer
import mlflow

# End any existing run if active
if mlflow.active_run():
    mlflow.end_run()

# Set up MLflow
mlflow.start_run(run_name="flan-t5-qa")

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

training_args = TrainingArguments(
    output_dir="my_awesome_qa_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

# Log model to MLflow
mlflow.log_param("model", "flan-t5-large")
mlflow.log_metric("epoch", training_args.num_train_epochs)
mlflow.pytorch.log_model(model, "flan-t5-qa-model")
mlflow.end_run()


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [90,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [90,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [90,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [90,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
