In [1]:
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Path to your fine-tuning dataset
dataset_path = "../data/qa_datasets/t5_finetune.jsonl"

# Load JSONL dataset
dataset = load_dataset("json", data_files=dataset_path, split="train")

print(dataset[0])  # preview


Generating train split: 3752 examples [00:00, 313286.92 examples/s]

{'input': 'What was the Revenue from operations in Q4 FY24?', 'output': 'The Revenue from operations in Q4 FY24 was 61,237.'}





In [6]:
print(dataset[0])
print(dataset.column_names)


{'input': 'What was the Revenue from operations in Q4 FY24?', 'output': 'The Revenue from operations in Q4 FY24 was 61,237.'}
['input', 'output']


In [7]:
MODEL_NAME = "google/flan-t5-small"  # change to flan-t5-base if GPU allows

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(batch):
    questions = batch["input"]   # your JSON uses 'input'
    answers   = batch["output"]  # your JSON uses 'output'

    model_inputs = tokenizer(
        questions,
        max_length=256,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        answers,
        max_length=64,
        truncation=True,
        padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = dataset.map(preprocess, batched=True)


Map: 100%|███████████████████████████████████████████████████████████████████████████████████████| 3752/3752 [00:00<00:00, 5033.15 examples/s]


In [8]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# LoRA Config
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 344,064 || all params: 77,305,216 || trainable%: 0.4451


In [10]:
import transformers, sys
print("transformers", transformers.__version__)
print("python", sys.version)

transformers 4.55.2
python 3.12.5 (tags/v3.12.5:ff3bc82, Aug  6 2024, 20:45:27) [MSC v.1940 64 bit (AMD64)]


In [13]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./results/flan_t5_lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.0,
    logging_steps=20,
    save_total_limit=2,
    #evaluation_strategy="epoch",   # works in 4.55.2
    save_strategy="epoch",         # works in 4.55.2
    predict_with_generate=True,    # important for seq2seq
    logging_dir="./logs",
    report_to="none",              # disable wandb/hf logging unless you want them
    fp16=False                     # set True if using GPU with mixed precision
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Seq2SeqTrainer(


In [14]:
trainer.train()




Step,Training Loss
20,36.3865
40,36.1263
60,37.0001
80,35.672
100,33.176
120,33.819
140,32.6901
160,30.9622
180,29.2382
200,28.4711




TrainOutput(global_step=2814, training_loss=8.006063897811359, metrics={'train_runtime': 8745.5441, 'train_samples_per_second': 1.287, 'train_steps_per_second': 0.322, 'total_flos': 1052140357287936.0, 'train_loss': 8.006063897811359, 'epoch': 3.0})

In [16]:
model.save_pretrained("../results/flan_t5_lora_adapter")
tokenizer.save_pretrained("../results/flan_t5_lora_adapter")
print("✅ LoRA adapter saved!")


✅ LoRA adapter saved!


In [22]:
test_path = "../data/qa_datasets/t5_finetune.jsonl"
tests = [json.loads(l) for l in open(test_path, "r", encoding="utf-8").read().splitlines()]

def generate_answer(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(output[0], skip_special_tokens=True)

for ex in tests[:5]:
    q = ex["input"]        # ✅ correct key
    gold = ex["output"]    # ✅ correct key
    print(q)
    pred = generate_answer(q)

    print("Q:", q)
    print("Pred:", pred)
    print("Gold:", gold)
    print("---")

What was the Revenue from operations in Q4 FY24?
Q: What was the Revenue from operations in Q4 FY24?
Pred: Q4 FY24
Gold: The Revenue from operations in Q4 FY24 was 61,237.
---
What was the Revenue from operations in Q3 FY23?
Q: What was the Revenue from operations in Q3 FY23?
Pred: FY23
Gold: The Revenue from operations in Q3 FY23 was 60,583.
---
What was the Revenue from operations in Q4 FY23?
Q: What was the Revenue from operations in Q4 FY23?
Pred: The Revenue from operations in Q4 FY23 was 8.
Gold: The Revenue from operations in Q4 FY23 was 59,162.
---
What was the Revenue from operations in FY24?
Q: What was the Revenue from operations in FY24?
Pred: Revenue from operations in FY24 was.
Gold: The Revenue from operations in FY24 was 2,40,893.
---
What was the Revenue from operations in FY23?
Q: What was the Revenue from operations in FY23?
Pred: FY23.
Gold: The Revenue from operations in FY23 was 2,25,458.
---


In [23]:
def generate_answer(question):
    prompt = f"Question: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(output[0], skip_special_tokens=True)
q = "What was the Revenue from operations in FY23?"
print("Pred:", generate_answer(q))

Pred: The Revenue from operations in FY 23.
