In [None]:
import json
from datasets import load_dataset, Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from peft import get_peft_model, LoraConfig, TaskType

# ---------------------
# Step 1: Load dataset
# ---------------------
# Location of the QA file (Colab upload directory). Kept in one constant so
# the path can be changed without touching the loading logic.
DATA_PATH = "/content/qa_dataset_from_cars24.json"

# JSON text is UTF-8 by specification; decode explicitly instead of relying on
# the platform's default locale encoding.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    qa_data = json.load(f)

# Fail early with a readable message: the rest of the notebook indexes each
# record by "question", "context" and "answers", so an empty or non-list
# payload would otherwise only surface later as an obscure error in `map`.
if not isinstance(qa_data, list) or not qa_data:
    raise ValueError(
        f"Expected a non-empty JSON list of QA records in {DATA_PATH}, "
        f"got {type(qa_data).__name__}"
    )

dataset = Dataset.from_list(qa_data)

# ---------------------
# Step 2: Load T5 tokenizer and model
# ---------------------
# One checkpoint identifier is shared by both loaders below so the tokenizer
# and the weights always come from the same pretrained release.
model_name = "t5-base"

# Tokenizer first, then the conditional-generation model that is fine-tuned
# in the later steps.
tokenizer = T5Tokenizer.from_pretrained(
    model_name,
)
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
)

# ---------------------
# Step 3: Preprocessing for T5 (generative QA)
# ---------------------
def preprocess(example):
    """Tokenize one QA record into encoder inputs and decoder labels for T5.

    Args:
        example: A single dataset row providing ``question`` and ``context``
            strings and an ``answers`` mapping whose ``text`` entry is a list.
            Only the first answer text is used as the training target.

    Returns:
        The tokenizer output for the prompt with an extra ``labels`` entry
        holding the (unpadded) target token ids.

    Raises:
        IndexError: if ``example['answers']['text']`` is empty.
    """
    input_text = f"question: {example['question']} context: {example['context']}"
    target_text = example['answers']['text'][0]  # assuming one answer per example

    # Deliberately no padding here. Padding the labels to max_length with the
    # tokenizer fills them with the pad token id, and those positions are then
    # scored by the loss as if they were real targets. Leaving both sequences
    # unpadded lets DataCollatorForSeq2Seq pad each batch to its longest item
    # and fill label padding with -100, which the loss ignores.
    model_inputs = tokenizer(
        input_text,
        max_length=512,
        truncation=True
    )
    labels = tokenizer(
        target_text,
        max_length=64,
        truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run `preprocess` over every record and discard the raw text columns so only
# the tokenizer outputs remain in the training set.
tokenized_dataset = dataset.map(
    function=preprocess,
    remove_columns=["context", "question", "answers"],
)

# ---------------------
# Step 4: PEFT config (LoRA for T5)
# ---------------------
# LoRA adapter settings for a sequence-to-sequence model.
peft_config = LoraConfig(
    r=8,                  # adapter rank
    lora_alpha=16,        # adapter scaling factor
    lora_dropout=0.1,     # dropout on the adapter path
    bias="none",          # no bias parameters are trained
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# From here on `model` is the PEFT-wrapped model, not the bare T5 instance.
model = get_peft_model(
    model,
    peft_config,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

In [None]:

# ---------------------
# Step 5: Training Setup
# ---------------------
training_args = TrainingArguments(
    output_dir="./t5-qa-checkpoints",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    # The whole run is only a handful of optimizer steps on this dataset, so
    # log every step; a larger interval never fires and leaves the loss table
    # empty.
    logging_steps=1,
    save_steps=100,
    save_total_limit=2,
    fp16=False,
    report_to="none",
    # The PEFT wrapper hides the base model's forward signature, so Trainer
    # cannot discover the label column on its own; name it explicitly.
    label_names=["labels"],
)

# NOTE(review): recent transformers releases appear to deprecate the
# `tokenizer=` argument of Trainer in favour of `processing_class=`; switch
# once the minimum supported transformers version is pinned — TODO confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    # Pads inputs and labels per batch; passing the model lets the collator
    # prepare decoder inputs from the labels.
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
)

# ---------------------
# Step 6: Train!
# ---------------------
trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


TrainOutput(global_step=6, training_loss=25.362449645996094, metrics={'train_runtime': 3.5807, 'train_samples_per_second': 9.216, 'train_steps_per_second': 1.676, 'total_flos': 20185301385216.0, 'train_loss': 25.362449645996094, 'epoch': 3.0})

In [None]:
import torch

def t5_answer_question(question, context):
    """Generate an answer to ``question`` from ``context`` with the fine-tuned model.

    Builds the same ``"question: ... context: ..."`` prompt that is used for
    training, runs generation on the model's device and decodes the first
    returned sequence.

    Args:
        question: The question text.
        context: The passage the answer should be drawn from.

    Returns:
        The decoded answer string with special tokens removed.
    """
    input_text = f"question: {question} context: {context}"
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512  # Always specify max_length when truncation=True
    )

    # Move inputs to the same device as the model
    device = model.device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # After training the model may still be in training mode, which keeps
    # dropout active and makes generation noisy and non-deterministic. Switch
    # to eval mode for the call and restore the previous mode afterwards so
    # this helper has no lasting effect on the model.
    was_training = model.training
    model.eval()
    try:
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=64
        )
    finally:
        model.train(was_training)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [None]:
# Quick manual check: ask for a field that is stated verbatim in the context.
question = "What is the price of the 2008 Honda City ZX in New Delhi?"
context = "Name: Honda City ZX, Model: 2008, Engine: Petrol, Location: New Delhi, Price: 200900"

print(
    t5_answer_question(question=question, context=context)
)


200900


In [None]:
# Gradio interface
import torch
import gradio as gr
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Two free-text inputs (question, context) are passed positionally to
# `t5_answer_question`; its return value is shown in the answer box.
gr.Interface(
    title="LoRA QA: Cars24 (T5)",
    description="Ask questions about cars based on context (fine-tuned T5 with LoRA).",
    fn=t5_answer_question,
    inputs=[
        gr.Textbox(label="Question", lines=2, placeholder="Ask a question..."),
        gr.Textbox(label="Context", lines=8, placeholder="Paste your car info here..."),
    ],
    outputs=gr.Textbox(label="Answer"),
).launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4d4659b49f37c98008.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


