In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5"

In [2]:
%pip install -q evaluate peft
%pip install --upgrade accelerate>=0.26.0

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
from transformers import GPT2TokenizerFast, GPT2ForQuestionAnswering, Trainer, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig
import evaluate

In [4]:
# Load SQuAD dataset
dataset = load_dataset("squad")

In [5]:
# Load fast tokenizer and model
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2ForQuestionAnswering.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at gpt2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Add padding token to tokenizer (GPT-2 doesn't have one by default)
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

In [7]:
# Preprocess function for SQuAD
def preprocess_function(examples):
    questions = examples["question"]
    contexts = examples["context"]
    answers = examples["answers"]

    # Tokenize inputs (question + context)
    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
        return_offsets_mapping=True
    )

    # Prepare labels (start and end positions of answers)
    start_positions = []
    end_positions = []

    for i in range(len(answers)):
        answer = answers[i]
        start = answer["answer_start"][0]
        answer_text = answer["text"][0]
        end = start + len(answer_text)

        # Find token positions using offset mapping
        offset_mapping = inputs["offset_mapping"][i]
        token_start = 0
        token_end = 0

        for idx, (start_char, end_char) in enumerate(offset_mapping):
            if start_char <= start < end_char:
                token_start = idx
            if start_char < end <= end_char:
                token_end = idx
                break

        # Handle edge cases (e.g., answer not found in tokenized context)
        if token_start == 0 and token_end == 0 and start != 0:
            token_start = token_end = 0  # Set to 0 for unanswerable cases

        start_positions.append(token_start)
        end_positions.append(token_end)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs.pop("offset_mapping")  # Remove offset_mapping as it's not needed for training
    return inputs

# Apply preprocessing
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["id", "title", "context", "question", "answers"])


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [8]:
# Split dataset and select n samples for training demo
train_dataset = tokenized_dataset["train"]  # Use first n samples
eval_dataset = tokenized_dataset["validation"]

In [9]:
# Configure LoRA
lora_config = LoraConfig(
    r=8,  # Reduced rank for efficiency
    lora_alpha=16,  # Scaling factor
    target_modules=["c_attn", "c_proj"],  # Target GPT-2 attention and projection layers
    lora_dropout=0.05,  # Reduced dropout for stability
    bias="none",
    task_type="QUESTION_ANS"
)

# Apply LoRA to the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = get_peft_model(model, lora_config)
model.to(device)



PeftModelForQuestionAnswering(
  (base_model): LoraModel(
    (model): GPT2ForQuestionAnswering(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): Paramete

In [10]:
# Define training arguments with mixed precision
training_args = TrainingArguments(
    output_dir="./gpt2_squad_lora_demo",
    eval_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    fp16=True,  # Enable mixed precision training
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)


In [11]:
# Load evaluation metric (SQuAD)
metric = evaluate.load("squad")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    start_logits, end_logits = predictions
    start_positions, end_positions = labels

    # Convert logits to predicted start and end positions
    pred_starts = torch.argmax(torch.tensor(start_logits), dim=-1)
    pred_ends = torch.argmax(torch.tensor(end_logits), dim=-1)

    predictions = []
    references = []

    for i, (start, end) in enumerate(zip(pred_starts, pred_ends)):
        pred_tokens = range(start, end + 1)
        pred_text = tokenizer.decode([t for t in pred_tokens if t < tokenizer.vocab_size], skip_special_tokens=True)
        predictions.append({"prediction_text": pred_text, "id": dataset["validation"][i]["id"]})
        references.append({
            "answers": {
                "text": [dataset["validation"][i]["answers"]["text"][0]],
                "answer_start": [dataset["validation"][i]["answers"]["answer_start"][0]]
            },
            "id": dataset["validation"][i]["id"]
        })

    return metric.compute(predictions=predictions, references=references)

In [12]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)


[2025-05-31 03:49:51,182] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/thaole/miniconda3/envs/hifed/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/thaole/miniconda3/envs/hifed/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/thaole/miniconda3/envs/hifed/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/thaole/miniconda3/envs/hifed/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/thaole/miniconda3/envs/hifed/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/thaole/miniconda3/envs/hifed/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/home/thaole/minic

In [13]:
# Train the model
print("Starting LoRA fine-tuning...")
trainer.train()


Starting LoRA fine-tuning...




Epoch,Training Loss,Validation Loss,Exact Match,F1
1,3.5354,2.616277,0.018921,0.036063
2,2.6529,2.333976,0.018921,0.033724
3,2.4537,2.233581,0.018921,0.029164
4,2.3802,2.191426,0.018921,0.029164




TrainOutput(global_step=10952, training_loss=2.7555670839230277, metrics={'train_runtime': 1940.7846, 'train_samples_per_second': 180.543, 'train_steps_per_second': 5.643, 'total_flos': 6.932398222832026e+16, 'train_loss': 2.7555670839230277, 'epoch': 4.0})