**Code used for fine-tuning of T5-Small models in Colab**

In [None]:
import json
from pathlib import Path
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from google.colab import drive


# Mount Google Drive at /content/drive so that files under that directory
# can be read and written from this runtime.
drive.mount('/content/drive')


# === Paths ===
# Root folder on the mounted Drive; the trailing "..." looks like a
# placeholder for the real project folder -- TODO confirm before running.
BASE_PATH = Path("/content/drive/My Drive/...")
# Destination of the final model and tokenizer files.
MODEL_PATH = BASE_PATH.joinpath("model")
# Trainer output directory (checkpoints).
RESULTS_PATH = BASE_PATH.joinpath("results")
# Logging directory handed to the training arguments.
LOGS_PATH = BASE_PATH.joinpath("logs")
# Training data file; relative, so it is resolved against the current
# working directory rather than the Drive folder.
DATA_PATH = Path("derived_dataset.json")


def load_data(path):
    """Read a JSON list of records and split it into two parallel lists.

    Args:
        path: Location of a UTF-8 JSON file whose top-level value is a
            sequence of objects, each with "input" and "output" keys.

    Returns:
        A ``(inputs, outputs)`` tuple of lists, in file order.

    Raises:
        KeyError: If a record lacks the "input" or "output" key.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        records = json.loads(handle.read())

    # Two separate passes: every "input" is collected before any "output".
    source_texts = [record["input"] for record in records]
    target_texts = [record["output"] for record in records]
    return source_texts, target_texts


def preprocess_data(batch, tokenizer):
    """Tokenize input/output text pairs into model-ready features.

    Both sides are truncated and padded to 512 tokens. Padding positions in
    the labels are replaced with -100, the index the Hugging Face seq2seq
    loss ignores, so the model is not trained to predict padding.

    Args:
        batch: Mapping with "input" and "output" entries. Each entry is
            either a list of strings (as passed by a batched
            ``Dataset.map``) or a single string.
        tokenizer: Callable tokenizer that returns "input_ids" and
            "attention_mask" and exposes ``pad_token_id``.

    Returns:
        A dict with "input_ids", "attention_mask" and "labels".
    """
    inputs = tokenizer(batch["input"], max_length=512, truncation=True, padding="max_length")
    outputs = tokenizer(batch["output"], max_length=512, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def mask_padding(token_ids):
        # -100 marks positions that must not contribute to the loss.
        return [-100 if token == pad_id else token for token in token_ids]

    label_ids = outputs["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one token-id sequence per example.
        labels = [mask_padding(sequence) for sequence in label_ids]
    else:
        # Single example: a flat sequence of token ids.
        labels = mask_padding(label_ids)

    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels,
    }


def prepare_datasets(inputs, outputs, tokenizer):
    """Build tokenized training and validation datasets from text pairs.

    The pairs are wrapped in a ``Dataset``, split 80/20 with a fixed seed
    (42), and each part is tokenized in batches with ``preprocess_data``.
    The raw "input"/"output" text columns are dropped from the result.

    Args:
        inputs: Source texts.
        outputs: Target texts, parallel to ``inputs``.
        tokenizer: Tokenizer forwarded to ``preprocess_data``.

    Returns:
        A ``(train_dataset, val_dataset)`` tuple.
    """
    text_columns = ("input", "output")

    def tokenize(batch):
        return preprocess_data(batch, tokenizer)

    paired = Dataset.from_dict({"input": inputs, "output": outputs})
    parts = paired.train_test_split(test_size=0.2, seed=42)

    # "train" is processed before "test", matching the key order below.
    tokenized = {
        part_name: parts[part_name].map(
            tokenize,
            batched=True,
            remove_columns=list(text_columns),
        )
        for part_name in ("train", "test")
    }
    return tokenized["train"], tokenized["test"]


def configure_training_args():
    """Build the ``TrainingArguments`` used for the fine-tuning run.

    Evaluation and checkpointing both happen every 3000 steps, and the best
    checkpoint is reloaded when training ends. Checkpoints go to
    ``RESULTS_PATH`` and TensorBoard logs to ``LOGS_PATH``.

    Returns:
        A configured ``TrainingArguments`` instance.
    """
    locations = {
        "output_dir": str(RESULTS_PATH),
        "logging_dir": str(LOGS_PATH),
        "report_to": "tensorboard",
        "logging_steps": 100,
    }
    # Evaluation and saving use the same strategy and interval.
    checkpointing = {
        "evaluation_strategy": "steps",
        "eval_steps": 3000,
        "save_strategy": "steps",
        "save_steps": 3000,
        "load_best_model_at_end": True,
    }
    optimisation = {
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 16,
        "gradient_accumulation_steps": 4,
        "num_train_epochs": 16,
        "warmup_steps": 500,
        "weight_decay": 0.01,
        "fp16": True,
    }
    return TrainingArguments(**locations, **checkpointing, **optimisation)


def train_model():
    """Fine-tune ``t5-small`` on the dataset at ``DATA_PATH`` and save it.

    Steps: load the text pairs, load the pretrained tokenizer and model,
    build the tokenized train/validation datasets, train with ``Trainer``,
    then write the model and tokenizer to ``MODEL_PATH`` and print where
    the artifacts were stored.
    """
    source_texts, target_texts = load_data(DATA_PATH)

    checkpoint_name = "t5-small"
    tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)
    model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

    train_split, eval_split = prepare_datasets(source_texts, target_texts, tokenizer)

    trainer = Trainer(
        model=model,
        args=configure_training_args(),
        train_dataset=train_split,
        eval_dataset=eval_split,
        tokenizer=tokenizer,
    )
    trainer.train()

    # Model first, then tokenizer, both into the same directory.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(MODEL_PATH)

    messages = [
        f"Model saved to: {MODEL_PATH}",
        f"Checkpoints stored in: {RESULTS_PATH}",
        f"Logs stored in: {LOGS_PATH}",
    ]
    for message in messages:
        print(message)


# === Entry Point ===
# Called unconditionally at top level: executing this cell/file starts the
# full fine-tuning run immediately.
train_model()